commit c9463641af
Merge https://github.com/ggerganov/llama.cpp into vulkan

104 changed files with 3684 additions and 2040 deletions
.github/ISSUE_TEMPLATE/010-bug-compilation.yml (vendored, 16 changes)

@@ -66,11 +66,21 @@ body:
     validations:
       required: false
   - type: textarea
-    id: logs
+    id: command
     attributes:
-      label: Relevant log output
+      label: Compile command
       description: >
-        Please copy and paste any relevant log output, including the command that you entered and any generated text.
+        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
+        This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+        Please copy and paste any relevant log output, including any generated text.
         This will be automatically formatted into code, so no need for backticks.
       render: shell
     validations:
.github/ISSUE_TEMPLATE/019-bug-misc.yml (vendored, 12 changes)

@@ -52,6 +52,16 @@ body:
         - Other (Please specify in the next section)
     validations:
       required: false
+  - type: textarea
+    id: command
+    attributes:
+      label: Command line
+      description: >
+        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
+        This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: false
   - type: textarea
     id: info
     attributes:
@@ -74,7 +84,7 @@ body:
     attributes:
       label: Relevant log output
       description: >
-        If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
+        If applicable, please copy and paste any relevant log output, including any generated text.
         This will be automatically formatted into code, so no need for backticks.
       render: shell
     validations:
.github/workflows/build.yml (vendored, 4 changes)

@@ -665,7 +665,7 @@ jobs:
           - build: 'llvm-arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
           - build: 'msvc-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=O'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
           - build: 'llvm-arm64-opencl-adreno'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

@@ -1237,7 +1237,7 @@ jobs:

       - name: Create release
         id: create_release
-        uses: anzz1/action-create-release@v1
+        uses: ggml-org/action-create-release@v1
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
.github/workflows/docker.yml (vendored, 3 changes)

@@ -97,10 +97,9 @@ jobs:
       GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
       GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

-      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
       - name: Free Disk Space (Ubuntu)
         if: ${{ matrix.config.free_disk_space == true }}
-        uses: jlumbroso/free-disk-space@main
+        uses: ggml-org/free-disk-space@v1.3.1
         with:
           # this might remove tools that are actually needed,
           # if set to "true" but frees about 6 GB
.github/workflows/editorconfig.yml (vendored, 4 changes)

@@ -23,5 +23,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v4
-    - uses: editorconfig-checker/action-editorconfig-checker@main
+    - uses: editorconfig-checker/action-editorconfig-checker@v2
+      with:
+        version: v3.0.3
     - run: editorconfig-checker
@@ -1,5 +1,11 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs

 /ci/ @ggerganov
-/.devops/ @ngxson
+/.devops/*.Dockerfile @ngxson
 /examples/server/ @ngxson
+/ggml/src/ggml-cuda/fattn* @JohannesGaessler
+/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
+/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
+/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
+/ggml/src/ggml-opt.cpp @JohannesGaessler
+/ggml/src/gguf.cpp @JohannesGaessler
@@ -22,6 +22,11 @@ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example>
     return *this;
 }

+common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
+    this->excludes = std::move(excludes);
+    return *this;
+}
+
 common_arg & common_arg::set_env(const char * env) {
     help = help + "\n(env: " + env + ")";
     this->env = env;
@@ -37,6 +42,10 @@ bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }

+bool common_arg::is_exclude(enum llama_example ex) {
+    return excludes.find(ex) != excludes.end();
+}
+
 bool common_arg::get_value_from_env(std::string & output) {
     if (env == nullptr) return false;
     char * value = std::getenv(env);
@@ -420,7 +429,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
     */
    auto add_opt = [&](common_arg arg) {
-        if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
+        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };
@@ -649,7 +658,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.prompt = value;
        }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--no-perf"},
        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -673,7 +682,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.prompt.pop_back();
            }
        }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--in-file"}, "FNAME",
        "an input file (repeat to specify multiple files)",
@@ -700,7 +709,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.prompt = ss.str();
            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
        }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-e", "--escape"},
        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
@@ -12,6 +12,7 @@

 struct common_arg {
     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::set<enum llama_example> excludes = {};
     std::vector<const char *> args;
     const char * value_hint   = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value
@@ -53,9 +54,11 @@ struct common_arg {
     ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}

     common_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
     bool in_example(enum llama_example ex);
+    bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output);
     bool has_value_from_env();
     std::string to_string();
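The two hunks above introduce an exclude set on common_arg so that an option registered for LLAMA_EXAMPLE_COMMON can still be hidden from a specific tool; later hunks in this merge use it to hide --prompt and related options from llama-server. Below is a standalone sketch of the same filtering idea. It is an illustrative analogue only: the enum values and option names are made up and are not the real llama.cpp types.

// Standalone analogue of the include/exclude filtering added above (assumptions:
// simplified types; the EX_* values and option names are illustrative, not from llama.cpp).
#include <cstdio>
#include <initializer_list>
#include <set>
#include <string>
#include <utility>
#include <vector>

enum example { EX_COMMON, EX_MAIN, EX_SERVER };

struct arg_def {
    std::string name;
    std::set<example> examples = {EX_COMMON};
    std::set<example> excludes = {};

    arg_def & set_excludes(std::initializer_list<example> ex) { excludes = ex; return *this; }
    bool in_example(example ex) const { return examples.count(ex) != 0; }
    bool is_exclude(example ex) const { return excludes.count(ex) != 0; }
};

int main() {
    const example ex = EX_SERVER; // the tool currently being initialized
    std::vector<arg_def> options;

    auto add_opt = [&](arg_def arg) {
        // same condition as the patched add_opt lambda: common options are kept
        // unless the current example is explicitly excluded
        if ((arg.in_example(ex) || arg.in_example(EX_COMMON)) && !arg.is_exclude(ex)) {
            options.push_back(std::move(arg));
        }
    };

    add_opt(arg_def{"--no-perf"});
    add_opt(arg_def{"--prompt"}.set_excludes({EX_SERVER})); // dropped for the server

    for (const auto & o : options) {
        printf("registered: %s\n", o.name.c_str()); // prints only --no-perf
    }
    return 0;
}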
@@ -2,6 +2,9 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif

+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
 #include "log.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
@@ -846,7 +849,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     } else if (!params.model_url.empty()) {
         model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
     }

     if (model == NULL) {
@@ -873,7 +876,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     }

     if (!ok) {
-        llama_free_model(model);
+        llama_model_free(model);

         return iparams;
     }
@@ -884,7 +887,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-        llama_free_model(model);
+        llama_model_free(model);
         return iparams;
     }

@@ -900,7 +903,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);

             return iparams;
         }
@@ -913,7 +916,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                                                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);

             return iparams;
         }
@@ -926,7 +929,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
             return iparams;
         }

@@ -982,7 +985,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_encoder(model)) {
             llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-            if (decoder_start_token_id == -1) {
+            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
                 decoder_start_token_id = bos;
             }
             tmp.clear();
@@ -1411,7 +1414,7 @@ struct llama_model * common_load_model_from_url(
         }
     }

-    return llama_load_model_from_file(local_path.c_str(), params);
+    return llama_model_load_from_file(local_path.c_str(), params);
 }

 struct llama_model * common_load_model_from_hf(
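Most of the C/C++ hunks in the rest of this merge are a mechanical rename of the model lifecycle API: llama_load_model_from_file becomes llama_model_load_from_file and llama_free_model becomes llama_model_free. A minimal load-and-free sketch against the renamed functions, assuming the post-merge llama.h; "model.gguf" is a placeholder path, not a file from the repository.

// Minimal sketch, assuming the renamed llama.cpp C API from this merge;
// "model.gguf" is a placeholder path.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // was llama_load_model_from_file()
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx != NULL) {
        llama_free(ctx);
    }

    llama_model_free(model); // was llama_free_model()
    llama_backend_free();
    return 0;
}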
@@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
 static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
     common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
     if (part_static_it == nc_static.end()) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     const common_ngram_cache_part part_static = part_static_it->second;

     int max_count_static  = 0;
     int sum_count_static  = 0;
-    llama_token max_token = -1;
+    llama_token max_token = LLAMA_TOKEN_NULL;

     for (std::pair<llama_token, int> token_count_static : part_static) {
         const llama_token token = token_count_static.first;
@@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
     }

     if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     return max_token;
 }
@@ -98,9 +98,9 @@ static llama_token try_draft(
     common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
     const int * min_sample_size, const int * min_percent) {

-    llama_token drafted_token = -1;
+    llama_token drafted_token = LLAMA_TOKEN_NULL;

-    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
+    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
         const common_ngram ngram_primary = ngrams_primary[i];

         common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
@@ -112,7 +112,7 @@ static llama_token try_draft(
         int max_count_primary = 0;
         int max_count_static  = 0;
         int sum_count_primary = 0;
-        llama_token max_token = -1;
+        llama_token max_token = LLAMA_TOKEN_NULL;

         for (std::pair<llama_token, int> token_count_primary : part_primary) {
             const llama_token token = token_count_primary.first;
@@ -154,7 +154,7 @@ void common_ngram_cache_draft(
     }

     while ((int) draft.size()-1 < n_draft) {
-        llama_token drafted_token = -1;
+        llama_token drafted_token = LLAMA_TOKEN_NULL;

         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
         common_ngram ngram_static;
@@ -177,17 +177,17 @@ void common_ngram_cache_draft(
             }
             ngrams_cd.push_back(ngram_cd);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_static, ngram_static);
         }

-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             break;
         }
@@ -17,13 +17,13 @@ struct common_ngram {

     common_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = -1;
+            tokens[i] = LLAMA_TOKEN_NULL;
         }
     }

     common_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = i < ngram_size ? input[i] : -1;
+            tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
         }
     }
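Several hunks here and below replace the bare -1 sentinel for "no token" with the named constant LLAMA_TOKEN_NULL. A tiny sketch of the resulting check pattern; pick_draft_token() is a hypothetical stand-in for the try_draft() helpers above, not code from the commit.

#include "llama.h"

// Hypothetical helper, used only to illustrate the named sentinel.
static llama_token pick_draft_token() {
    return LLAMA_TOKEN_NULL; // "no draft available", previously spelled as -1
}

int main() {
    const llama_token tok = pick_draft_token();
    return tok == LLAMA_TOKEN_NULL ? 0 : 1; // compare against the constant, not -1
}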
@@ -687,6 +687,9 @@ class Model:
         if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
             # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
             res = "megrez"
+        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+            res = "deepseek-v3"

         if res is None:
             logger.warning("\n")
@@ -3373,6 +3376,24 @@ class CommandR2Model(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)


+@Model.register("Cohere2ForCausalLM")
+class Cohere2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COHERE2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        rotary_pct = self.hparams["rotary_pct"]
+        hidden_size = self.hparams["hidden_size"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 @Model.register("OlmoForCausalLM")
 @Model.register("OLMoForCausalLM")
 class OlmoModel(Model):
@@ -3831,6 +3852,7 @@ class DeepseekModel(Model):


 @Model.register("DeepseekV2ForCausalLM")
+@Model.register("DeepseekV3ForCausalLM")
 class DeepseekV2Model(Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

@@ -3852,6 +3874,15 @@ class DeepseekV2Model(Model):
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
         self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if hparams["scoring_func"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif hparams["scoring_func"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@@ -3864,6 +3895,16 @@ class DeepseekV2Model(Model):
     _experts: list[dict[str, Tensor]] | None = None

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # rename e_score_correction_bias tensors
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers
+        block_count = self.hparams["num_hidden_layers"]
+        match = re.match(r"model.layers.(\d+)", name)
+        if match and int(match.group(1)) >= block_count:
+            return []
+
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
@@ -107,6 +107,7 @@ models = [
     {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
     {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
     {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
+    {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
 ]

@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     base_name = lora_tensor_name.replace("base_model.model.", "")
     base_name = base_name.replace(".lora_A.weight", ".weight")
     base_name = base_name.replace(".lora_B.weight", ".weight")
+    # models produced by mergekit-extract-lora have token embeddings in the adapter
+    base_name = base_name.replace(".lora_embedding_A", ".weight")
+    base_name = base_name.replace(".lora_embedding_B", ".weight")
     return base_name


@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
         "--base", type=Path,
         help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
     )
+    parser.add_argument(
+        "--base-model-id", type=str,
+        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+    )
     parser.add_argument(
         "lora_path", type=Path,
         help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -290,6 +297,7 @@ if __name__ == '__main__':

     dir_base_model: Path | None = args.base
     dir_lora: Path = args.lora_path
+    base_model_id: str | None = args.base_model_id
     lora_config = dir_lora / "adapter_config.json"
     input_model = dir_lora / "adapter_model.safetensors"

@@ -313,7 +321,10 @@ if __name__ == '__main__':
         lparams: dict[str, Any] = json.load(f)

     # load base model
-    if dir_base_model is None:
+    if base_model_id is not None:
+        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+        hparams = load_hparams_from_hf(base_model_id)
+    elif dir_base_model is None:
         if "base_model_name_or_path" in lparams:
             model_id = lparams["base_model_name_or_path"]
             logger.info(f"Loading base model from Hugging Face: {model_id}")
@@ -371,11 +382,16 @@ if __name__ == '__main__':
             if self.lazy:
                 tensor = LazyTorchTensor.from_eager(tensor)
             base_name = get_base_tensor_name(name)
-            is_lora_a = ".lora_A.weight" in name
-            is_lora_b = ".lora_B.weight" in name
+            # note: mergekit-extract-lora also adds token embeddings to the adapter
+            is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
+            is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
             if not is_lora_a and not is_lora_b:
                 if ".base_layer.weight" in name:
                     continue
+                # mergekit-extract-lora add these layernorm to the adapter, we need to keep them
+                if "_layernorm" in name or ".norm" in name:
+                    yield (base_name, tensor)
+                    continue
                 logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                 if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                     logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
@@ -407,9 +423,21 @@ if __name__ == '__main__':
             if name == "lm_head.weight" and len(dest) == 0:
                 raise ValueError("lm_head is present in adapter, but is ignored in base model")
             for dest_name, dest_data in dest:
+                # mergekit-extract-lora add these layernorm to the adapter
+                if "_norm" in dest_name:
+                    assert dest_data.dim() == 1
+                    yield (dest_name, dest_data)
+                    continue
+
+                # otherwise, we must get the lora_A and lora_B tensors
                 assert isinstance(dest_data, LoraTorchTensor)
                 lora_a, lora_b = dest_data.get_lora_A_B()
+
+                # note: mergekit-extract-lora flip and transpose A and B
+                # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
+                if "token_embd.weight" in dest_name:
+                    lora_a = lora_a.T
+
                 yield (dest_name + ".lora_a", lora_a)
                 yield (dest_name + ".lora_b", lora_b)
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {

     llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);

     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
     llama_batch_free(batch);

     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

     llama_backend_free();
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {

     llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);

     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n" , __func__);
@@ -120,7 +120,7 @@ int main(int argc, char ** argv) {
     }

     llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-    if (decoder_start_token_id == -1) {
+    if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
         decoder_start_token_id = llama_token_bos(model);
     }

@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {

     llama_sampler_free(smpl);
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

     llama_backend_free();
@@ -1,4 +1,6 @@
 #include "ggml.h"
+#include "gguf.h"
+
 #include "llama.h"
 #include "common.h"
 #include "log.h"
@@ -689,8 +691,8 @@ static void save_as_llama_model(
     gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
     gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
     gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
-    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
-    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);

     gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
     gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
@@ -1,7 +1,9 @@
+#include "ggml.h"
+#include "gguf.h"
+
 #include "arg.h"
 #include "common.h"
 #include "llama.h"
-#include "ggml.h"
 #include "pca.hpp"
 #include "mean.hpp"

@@ -1,7 +1,9 @@
-#include "arg.h"
-#include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "gguf.h"
+
+#include "arg.h"
+#include "common.h"

 #include <map>
 #include <vector>
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "gguf.h"

 #include <cstdlib> /* abort() */
 #include <cstddef>
@@ -1,16 +1,18 @@
+#include "ggml.h"
+#include "gguf.h"
 #include "llama.h"
 #include "common.h"

 #include <algorithm>
+#include <cinttypes>
+#include <climits>
+#include <cstdio>
 #include <cstdlib>
+#include <stdexcept>
+#include <cstring>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <climits>
-
-#include <cstdio>
-#include <cstring>
-#include <stdexcept>

 #if defined(_WIN32)
 #include <windows.h>
@@ -296,7 +298,7 @@ struct split_strategy {
             total_size += ggml_nbytes(t);
         }
         total_size = total_size / 1000 / 1000; // convert to megabytes
-        printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+        printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
         i_split++;
     }
 }
@@ -1,10 +1,9 @@
 #include "ggml.h"
+#include "gguf.h"

 #include <cstdio>
-#include <cinttypes>
 #include <string>
 #include <sstream>
-#include <fstream>
 #include <vector>

 #undef MIN
@@ -135,9 +134,10 @@ static bool gguf_ex_read_0(const std::string & fname) {

     for (int i = 0; i < n_tensors; ++i) {
         const char * name   = gguf_get_tensor_name  (ctx, i);
+        const size_t size   = gguf_get_tensor_size  (ctx, i);
         const size_t offset = gguf_get_tensor_offset(ctx, i);

-        printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
     }
 }
@@ -182,9 +182,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {

     for (int i = 0; i < n_tensors; ++i) {
         const char * name   = gguf_get_tensor_name  (ctx, i);
+        const size_t size   = gguf_get_tensor_size  (ctx, i);
         const size_t offset = gguf_get_tensor_offset(ctx, i);

-        printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
     }
 }
@@ -199,7 +200,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {

         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

-        printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
+        printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
+            __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);

         // print first 10 elements
         const float * data = (const float *) cur->data;
@@ -215,7 +217,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
             const float * data = (const float *) cur->data;
             for (int j = 0; j < ggml_nelements(cur); ++j) {
                 if (data[j] != 100 + i) {
-                    fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                    fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));
                     gguf_free(ctx);
                     return false;
                 }
@@ -245,6 +247,8 @@ int main(int argc, char ** argv) {
         check_data = false;
     }

+    srand(123456);
+
     const std::string fname(argv[1]);
     const std::string mode (argv[2]);

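The gguf example above now also reports per-tensor sizes via gguf_get_tensor_size(). A minimal standalone sketch of the same enumeration, assuming the post-merge gguf.h API; "model.gguf" is a placeholder file name.

// Minimal sketch using the gguf API exercised above; "model.gguf" is a placeholder.
#include "ggml.h"
#include "gguf.h"

#include <cstdio>

int main() {
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to open GGUF file\n");
        return 1;
    }

    const int n_tensors = (int) gguf_get_n_tensors(ctx);
    for (int i = 0; i < n_tensors; ++i) {
        const char * name   = gguf_get_tensor_name  (ctx, i);
        const size_t size   = gguf_get_tensor_size  (ctx, i);
        const size_t offset = gguf_get_tensor_offset(ctx, i);
        printf("tensor[%d]: name = %s, size = %zu, offset = %zu\n", i, name, size, offset);
    }

    gguf_free(ctx);
    return 0;
}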
@@ -165,7 +165,7 @@ int main(int argc, char * argv[]) {

     llama_backend_init();

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);

     // create generation context
     llama_context * ctx = llama_new_context_with_model(model, cparams);
@@ -219,7 +219,7 @@ int main(int argc, char * argv[]) {

     llama_sampler_free(smpl);
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
     llama_backend_free();

     return 0;
@@ -1526,10 +1526,10 @@ int main(int argc, char ** argv) {
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
             if (lmodel) {
-                llama_free_model(lmodel);
+                llama_model_free(lmodel);
             }

-            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
             if (lmodel == NULL) {
                 fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                 return 1;
@@ -1540,7 +1540,7 @@ int main(int argc, char ** argv) {
         llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
-            llama_free_model(lmodel);
+            llama_model_free(lmodel);
             return 1;
         }

@@ -1626,7 +1626,7 @@ int main(int argc, char ** argv) {
         ggml_threadpool_free_fn(threadpool);
     }

-    llama_free_model(lmodel);
+    llama_model_free(lmodel);

     if (p) {
         p->print_footer();
@@ -7,6 +7,7 @@
 #include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "gguf.h"

 //#ifdef GGML_USE_CUDA
 //#include "ggml-cuda.h"
@@ -262,7 +263,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
         {
             const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
             int arr_n = gguf_get_arr_n(ctx_gguf, i);
-            const void * data = gguf_get_arr_data(ctx_gguf, i);
+            const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
             std::stringstream ss;
             ss << "[";
             for (int j = 0; j < arr_n; j++) {
@@ -2734,7 +2735,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         total_size_org += orig_size;
         total_size_new += new_size;
         gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
+        gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
         fout.write((const char *)new_data, new_size);
         size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
         for (size_t j = 0; j < pad; ++j) {
@@ -221,7 +221,7 @@ static struct llama_model * llava_init(common_params * params) {

     llama_model_params model_params = common_model_params_to_llama(*params);

-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -265,7 +265,7 @@ static void llava_free(struct llava_context * ctx_llava) {
     }

     llama_free(ctx_llava->ctx_llama);
-    llama_free_model(ctx_llava->model);
+    llama_model_free(ctx_llava->model);
     llama_backend_free();
 }

@@ -323,7 +323,7 @@ int main(int argc, char ** argv) {
         }
     }

-    llama_free_model(model);
+    llama_model_free(model);

     return 0;
 }
@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {

     llama_model_params model_params = common_model_params_to_llama(*params);

-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -75,7 +75,7 @@ static void llava_free(struct llava_context * ctx_llava) {
     }

     llama_free(ctx_llava->ctx_llama);
-    llama_free_model(ctx_llava->model);
+    llama_model_free(ctx_llava->model);
     llama_backend_free();
 }
@@ -310,7 +310,7 @@ static struct llama_model * llava_init(common_params * params) {

     llama_model_params model_params = common_model_params_to_llama(*params);

-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -354,7 +354,7 @@ static void llava_free(struct llava_context * ctx_llava) {
     }

     llama_free(ctx_llava->ctx_llama);
-    llama_free_model(ctx_llava->model);
+    llama_model_free(ctx_llava->model);
     llama_backend_free();
 }

@@ -575,7 +575,7 @@ int main(int argc, char ** argv) {
         }
     }

-    llama_free_model(model);
+    llama_model_free(model);

     return 0;
 }
@@ -494,7 +494,7 @@ int main(int argc, char ** argv) {
     }

     llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-    if (decoder_start_token_id == -1) {
+    if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
         decoder_start_token_id = llama_token_bos(model);
     }

@@ -831,7 +831,7 @@ int main(int argc, char ** argv) {
             // if user stop generation mid-way, we must add EOT to finish model's last response
             if (need_insert_eot && format_chat) {
                 llama_token eot = llama_token_eot(model);
-                embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
+                embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot);
                 need_insert_eot = false;
             }

@@ -63,7 +63,7 @@ int main(int argc, char ** argv) {

     llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);

     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
@@ -266,7 +266,7 @@ int main(int argc, char ** argv) {
     llama_batch_free(batch);

     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

     llama_backend_free();
|
|
@ -309,7 +309,7 @@ int main(int argc, char ** argv) {
|
||||||
auto mparams = llama_model_default_params();
|
auto mparams = llama_model_default_params();
|
||||||
mparams.use_mlock = false;
|
mparams.use_mlock = false;
|
||||||
|
|
||||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
model = llama_model_load_from_file(params.model.c_str(), mparams);
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||||
|
@ -323,7 +323,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||||
llama_free_model(model);
|
llama_model_free(model);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -347,7 +347,7 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                 "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
             llama_free(ctx);
-            llama_free_model(model);
+            llama_model_free(model);
             return 1;
         }
         included_layers++;
@@ -409,7 +409,7 @@ int main(int argc, char ** argv) {
 
 
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
     // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();
@@ -11,6 +11,8 @@
 #    include <curl/curl.h>
 #endif
 
+#include <signal.h>
+
 #include <climits>
 #include <cstdarg>
 #include <cstdio>
@@ -25,6 +27,13 @@
 #include "json.hpp"
 #include "llama-cpp.h"
 
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
+[[noreturn]] static void sigint_handler(int) {
+    printf("\n");
+    exit(0); // not ideal, but it's the only way to guarantee exit in all cases
+}
+#endif
+
 GGML_ATTRIBUTE_FORMAT(1, 2)
 static std::string fmt(const char * fmt, ...) {
     va_list ap;
@@ -83,6 +92,7 @@ class Opt {
         }
 
         ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
+        ctx_params.n_ctx = ctx_params.n_batch;
         model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
         temperature = temperature >= 0 ? temperature : temperature_default;
 
@@ -664,7 +674,7 @@ class LlamaData {
                "\r%*s"
                "\rLoading model",
                get_terminal_width(), " ");
-        llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params));
+        llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
         if (!model) {
             printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
         }
@@ -800,7 +810,20 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
 
 static int read_user_input(std::string & user) {
     std::getline(std::cin, user);
-    return user.empty(); // Should have data in happy path
+    if (std::cin.eof()) {
+        printf("\n");
+        return 1;
+    }
+
+    if (user == "/bye") {
+        return 1;
+    }
+
+    if (user.empty()) {
+        return 2;
+    }
+
+    return 0; // Should have data in happy path
 }
 
 // Function to generate a response based on the prompt
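Note (not part of the diff itself): the rewritten read_user_input establishes a small return-code contract — 0 means a line was read, 1 means stop (end-of-file or the "/bye" command), and 2 means the line was empty and the caller should prompt again. The get_user_input helper added in the next hunk loops on exactly these codes.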
@@ -867,7 +890,25 @@ static bool is_stdout_a_terminal() {
 #endif
 }
 
-// Function to tokenize the prompt
+// Function to handle user input
+static int get_user_input(std::string & user_input, const std::string & user) {
+    while (true) {
+        const int ret = handle_user_input(user_input, user);
+        if (ret == 1) {
+            return 1;
+        }
+
+        if (ret == 2) {
+            continue;
+        }
+
+        break;
+    }
+
+    return 0;
+}
+
+// Main chat loop function
 static int chat_loop(LlamaData & llama_data, const std::string & user) {
     int prev_len = 0;
     llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
@@ -875,7 +916,8 @@ static int chat_loop(LlamaData & llama_data, const std::string & user) {
     while (true) {
         // Get user input
         std::string user_input;
-        while (handle_user_input(user_input, user)) {
+        if (get_user_input(user_input, user) == 1) {
+            return 0;
         }
 
         add_message("user", user.empty() ? user_input : user, llama_data);
@@ -916,7 +958,23 @@ static std::string read_pipe_data() {
     return result.str();
 }
 
+static void ctrl_c_handling() {
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = sigint_handler;
+    sigemptyset(&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+#elif defined(_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+}
+
 int main(int argc, const char ** argv) {
+    ctrl_c_handling();
     Opt opt;
     const int ret = opt.init(argc, argv);
     if (ret == 2) {
@@ -45,10 +45,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
 | `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
-| `-p, --prompt PROMPT` | prompt to start generation with |
 | `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
-| `-f, --file FNAME` | a file containing the prompt (default: none) |
-| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
 | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
 | `--no-escape` | do not process escape sequences |
 | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
@@ -3797,7 +3797,7 @@ int main(int argc, char ** argv) {
             data["input_extra"] = input_extra; // default to empty array if it's not exist
 
             std::string prompt = json_value(data, "prompt", std::string());
-            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
+            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, false, true);
             SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
             data["prompt"] = format_infill(
                 ctx_server.ctx,
@@ -18,7 +18,7 @@ def test_infill_without_input_extra():
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert match_regex("(Ann|small|shiny)+", res.body["content"])
+    assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])
 
 
 def test_infill_with_input_extra():
@@ -507,7 +507,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
 
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
-    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
+    std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token);
 
     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
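A hedged illustration (not the server's exact code) of the rule the comment above describes: a single returned byte with the high bit set is an incomplete UTF-8 sequence, so it is rendered as an escaped byte value instead of raw output. The helper name is hypothetical.

#include <cstdio>
#include <string>

// Returns the piece unchanged unless it is a lone byte with the high bit set,
// in which case it is escaped as \xNN so the output stays valid text.
static std::string format_partial_utf8(const std::string & piece) {
    if (piece.size() == 1 && (piece[0] & 0x80) == 0x80) {
        char buf[16];
        snprintf(buf, sizeof(buf), "\\x%02hhx", (unsigned char) piece[0]);
        return buf;
    }
    return piece;
}

int main() {
    printf("%s\n", format_partial_utf8("\xe6").c_str()); // prints \xe6
    printf("%s\n", format_partial_utf8("hi").c_str());   // prints hi
    return 0;
}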
@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
     llama_model_params model_params = llama_model_default_params();
     model_params.n_gpu_layers = ngl;
 
-    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
     if (!model) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
     }
     llama_sampler_free(smpl);
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
 
     return 0;
 }
@@ -83,7 +83,7 @@ int main(int argc, char ** argv) {
     llama_model_params model_params = llama_model_default_params();
     model_params.n_gpu_layers = ngl;
 
-    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler_free(smpl);
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
 
     return 0;
 }
@@ -31,6 +31,7 @@ static void print_usage_information(const char * argv0) {
     printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
     printf(" --stdin read prompt from standard input.\n");
     printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    printf(" --no-escape do not escape input (such as \\n, \\t, etc.).\n");
     printf(" --no-parse-special do not parse control tokens.\n");
     printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
     printf(" --show-count print the total number of tokens.\n");
@@ -198,6 +199,7 @@ int main(int raw_argc, char ** raw_argv) {
     // variables where to put any arguments we see.
     bool printing_ids = false;
     bool no_bos = false;
+    bool no_escape = false;
     bool no_parse_special = false;
     bool disable_logging = false;
     bool show_token_count = false;
@@ -233,6 +235,9 @@ int main(int raw_argc, char ** raw_argv) {
         else if (arg == "--no-bos") {
             no_bos = true;
         }
+        else if (arg == "--no-escape") {
+            no_escape = true;
+        }
         else if (arg == "--no-parse-special") {
             no_parse_special = true;
         }
@@ -333,7 +338,7 @@ int main(int raw_argc, char ** raw_argv) {
 
     llama_model_params model_params = llama_model_default_params();
     model_params.vocab_only = true;
-    llama_model * model = llama_load_model_from_file(model_path, model_params);
+    llama_model * model = llama_model_load_from_file(model_path, model_params);
     if (!model) {
         fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
         return 1;
@@ -363,6 +368,11 @@ int main(int raw_argc, char ** raw_argv) {
     const bool model_wants_add_bos = llama_add_bos_token(model);
     const bool add_bos = model_wants_add_bos && !no_bos;
     const bool parse_special = !no_parse_special;
+    const bool escape = !no_escape;
+
+    if (escape) {
+        string_process_escapes(prompt);
+    }
 
     std::vector<llama_token> tokens;
     tokens = common_tokenize(model, prompt, add_bos, parse_special);
@@ -398,7 +408,7 @@ int main(int raw_argc, char ** raw_argv) {
     }
     // silence valgrind
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
 
     return 0;
 }
@@ -243,7 +243,8 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
-    include/ggml-vulkan.h)
+    include/ggml-vulkan.h
+    include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 #if (GGML_METAL)
@@ -252,26 +253,6 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml LIBRARY PUBLIC_HEADER)
 install(TARGETS ggml-base LIBRARY)
 
-# FIXME: this should be done in the backend cmake files
-if (GGML_METAL)
-    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
-    install(
-        FILES src/ggml-metal/ggml-metal.metal
-        PERMISSIONS
-            OWNER_READ
-            OWNER_WRITE
-            GROUP_READ
-            WORLD_READ
-        DESTINATION ${CMAKE_INSTALL_BINDIR})
-
-    if (NOT GGML_METAL_EMBED_LIBRARY)
-        install(
-            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DESTINATION ${CMAKE_INSTALL_BINDIR}
-        )
-    endif()
-endif()
-
 if (GGML_STANDALONE)
     configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
         ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
@@ -7,6 +7,7 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "gguf.h"
 #include <memory>
 
 // Smart pointers for ggml types
@@ -241,12 +241,6 @@
 #define GGML_ROPE_TYPE_MROPE 8
 #define GGML_ROPE_TYPE_VISION 24
 
-#define GGUF_MAGIC "GGUF"
-
-#define GGUF_VERSION 3
-
-#define GGUF_DEFAULT_ALIGNMENT 32
-
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -403,12 +397,6 @@ extern "C" {
         GGML_PREC_F32,
     };
 
-    enum ggml_backend_type {
-        GGML_BACKEND_TYPE_CPU = 0,
-        GGML_BACKEND_TYPE_GPU = 10,
-        GGML_BACKEND_TYPE_GPU_SPLIT = 20,
-    };
-
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
@@ -587,8 +575,6 @@ extern "C" {
     struct ggml_tensor {
         enum ggml_type type;
 
-        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
-
         struct ggml_backend_buffer * buffer;
 
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@ -2111,132 +2097,6 @@ extern "C" {
|
||||||
int64_t n_per_row,
|
int64_t n_per_row,
|
||||||
const float * imatrix);
|
const float * imatrix);
|
||||||
|
|
||||||
//
|
|
||||||
// gguf
|
|
||||||
//
|
|
||||||
|
|
||||||
enum gguf_type {
|
|
||||||
GGUF_TYPE_UINT8 = 0,
|
|
||||||
GGUF_TYPE_INT8 = 1,
|
|
||||||
GGUF_TYPE_UINT16 = 2,
|
|
||||||
GGUF_TYPE_INT16 = 3,
|
|
||||||
GGUF_TYPE_UINT32 = 4,
|
|
||||||
GGUF_TYPE_INT32 = 5,
|
|
||||||
GGUF_TYPE_FLOAT32 = 6,
|
|
||||||
GGUF_TYPE_BOOL = 7,
|
|
||||||
GGUF_TYPE_STRING = 8,
|
|
||||||
GGUF_TYPE_ARRAY = 9,
|
|
||||||
GGUF_TYPE_UINT64 = 10,
|
|
||||||
GGUF_TYPE_INT64 = 11,
|
|
||||||
GGUF_TYPE_FLOAT64 = 12,
|
|
||||||
GGUF_TYPE_COUNT, // marks the end of the enum
|
|
||||||
};
|
|
||||||
|
|
||||||
struct gguf_context;
|
|
||||||
|
|
||||||
struct gguf_init_params {
|
|
||||||
bool no_alloc;
|
|
||||||
|
|
||||||
// if not NULL, create a ggml_context and allocate the tensor data in it
|
|
||||||
struct ggml_context ** ctx;
|
|
||||||
};
|
|
||||||
|
|
||||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
|
||||||
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
|
||||||
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
|
||||||
|
|
||||||
GGML_API void gguf_free(struct gguf_context * ctx);
|
|
||||||
|
|
||||||
GGML_API const char * gguf_type_name(enum gguf_type type);
|
|
||||||
|
|
||||||
GGML_API int gguf_get_version (const struct gguf_context * ctx);
|
|
||||||
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
|
|
||||||
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
|
|
||||||
GGML_API void * gguf_get_data (const struct gguf_context * ctx);
|
|
||||||
|
|
||||||
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
|
||||||
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
|
||||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
|
|
||||||
|
|
||||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
|
|
||||||
|
|
||||||
// will abort if the wrong type is used for the key
|
|
||||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
|
||||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
|
||||||
|
|
||||||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
|
||||||
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
|
||||||
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
|
||||||
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
|
||||||
GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
|
|
||||||
|
|
||||||
// removes key if it exists
|
|
||||||
GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
|
|
||||||
|
|
||||||
// overrides existing values or adds a new one
|
|
||||||
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
|
||||||
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
|
|
||||||
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
|
|
||||||
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
|
|
||||||
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
|
||||||
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
|
||||||
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
|
||||||
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
|
||||||
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
|
||||||
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
|
||||||
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
|
||||||
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
|
||||||
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
|
||||||
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
|
|
||||||
|
|
||||||
// set or add KV pairs from another context
|
|
||||||
GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
|
|
||||||
|
|
||||||
// manage tensor info
|
|
||||||
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
|
|
||||||
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
|
|
||||||
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
|
|
||||||
|
|
||||||
// writing gguf files can be done in 2 ways:
|
|
||||||
//
|
|
||||||
// - write the entire gguf_context to a binary file in a single pass:
|
|
||||||
//
|
|
||||||
// gguf_write_to_file(ctx, fname);
|
|
||||||
//
|
|
||||||
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
|
|
||||||
//
|
|
||||||
// FILE * f = fopen(fname, "wb");
|
|
||||||
// fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
|
|
||||||
// fwrite(f, ...);
|
|
||||||
// void * data = gguf_meta_get_meta_data(ctx);
|
|
||||||
// fseek(f, 0, SEEK_SET);
|
|
||||||
// fwrite(f, data, gguf_get_meta_size(ctx));
|
|
||||||
// free(data);
|
|
||||||
// fclose(f);
|
|
||||||
//
|
|
||||||
|
|
||||||
// write the entire context to a binary file
|
|
||||||
GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
|
||||||
|
|
||||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
|
||||||
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
|
||||||
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
// restrict not standard in C++
|
// restrict not standard in C++
|
||||||
# if defined(__GNUC__)
|
# if defined(__GNUC__)
|
||||||
|
|
202
ggml/include/gguf.h
Normal file
@ -0,0 +1,202 @@
|
||||||
|
// This file contains functionality related to "GGUF" files, the binary file format used by ggml.
|
||||||
|
// GGUF files have the following structure:
|
||||||
|
//
|
||||||
|
// 1. File magic "GGUF" (4 bytes).
|
||||||
|
// 2. File version (uint32_t).
|
||||||
|
// 3. Number of ggml tensors in file (int64_t).
|
||||||
|
// 4. Number of key-value-pairs in file (int64_t).
|
||||||
|
// 5. For each KV pair:
|
||||||
|
// 1. The key (string).
|
||||||
|
// 2. The value type (gguf_type).
|
||||||
|
// 3a. If the value type is GGUF_TYPE_ARRAY:
|
||||||
|
// 1. The type of the array (gguf_type).
|
||||||
|
// 2. The number of elements in the array (uint64_t).
|
||||||
|
// 3. The binary representation of each element in the array.
|
||||||
|
// 3b. Otherwise:
|
||||||
|
// 1. The binary representation of the value.
|
||||||
|
// 6. For each ggml tensor:
|
||||||
|
// 1. The tensor name (string).
|
||||||
|
// 2. The number of dimensions of the tensor (uint32_t).
|
||||||
|
// 3. For each dimension:
|
||||||
|
// 1. The size of the tensor in the dimension (int64_t).
|
||||||
|
// 4. The tensor data type (ggml_type).
|
||||||
|
// 5. The tensor data offset in the tensor data binary blob (uint64_t).
|
||||||
|
// 7. The tensor data binary blob (optional, aligned).
|
||||||
|
//
|
||||||
|
// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
|
||||||
|
// All enums are stored as int32_t.
|
||||||
|
// All bool values are stored as int8_t.
|
||||||
|
// If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
|
||||||
|
// otherwise GGUF_DEFAULT_ALIGNMENT is used.
|
||||||
|
//
|
||||||
|
// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#define GGUF_MAGIC "GGUF"
|
||||||
|
#define GGUF_VERSION 3
|
||||||
|
|
||||||
|
#define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"
|
||||||
|
|
||||||
|
#define GGUF_DEFAULT_ALIGNMENT 32
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// types that can be stored as GGUF KV data
|
||||||
|
enum gguf_type {
|
||||||
|
GGUF_TYPE_UINT8 = 0,
|
||||||
|
GGUF_TYPE_INT8 = 1,
|
||||||
|
GGUF_TYPE_UINT16 = 2,
|
||||||
|
GGUF_TYPE_INT16 = 3,
|
||||||
|
GGUF_TYPE_UINT32 = 4,
|
||||||
|
GGUF_TYPE_INT32 = 5,
|
||||||
|
GGUF_TYPE_FLOAT32 = 6,
|
||||||
|
GGUF_TYPE_BOOL = 7,
|
||||||
|
GGUF_TYPE_STRING = 8,
|
||||||
|
GGUF_TYPE_ARRAY = 9,
|
||||||
|
GGUF_TYPE_UINT64 = 10,
|
||||||
|
GGUF_TYPE_INT64 = 11,
|
||||||
|
GGUF_TYPE_FLOAT64 = 12,
|
||||||
|
GGUF_TYPE_COUNT, // marks the end of the enum
|
||||||
|
};
|
||||||
|
|
||||||
|
struct gguf_context;
|
||||||
|
|
||||||
|
struct gguf_init_params {
|
||||||
|
bool no_alloc;
|
||||||
|
|
||||||
|
// if not NULL, create a ggml_context and allocate the tensor data in it
|
||||||
|
struct ggml_context ** ctx;
|
||||||
|
};
|
||||||
|
|
||||||
|
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||||
|
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
||||||
|
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
||||||
|
|
||||||
|
GGML_API void gguf_free(struct gguf_context * ctx);
|
||||||
|
|
||||||
|
GGML_API const char * gguf_type_name(enum gguf_type type);
|
||||||
|
|
||||||
|
GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx);
|
||||||
|
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
|
||||||
|
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
|
||||||
|
|
||||||
|
GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx);
|
||||||
|
GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
|
||||||
|
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
|
||||||
|
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
|
||||||
|
// will abort if the wrong type is used for the key
|
||||||
|
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
GGML_API size_t gguf_get_arr_n (const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
|
||||||
|
// get raw pointer to the first element of the array with the given key_id
|
||||||
|
// for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
|
||||||
|
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id);
|
||||||
|
|
||||||
|
// get ith C string from array with given key_id
|
||||||
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
|
||||||
|
|
||||||
|
GGML_API int64_t gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||||
|
GGML_API int64_t gguf_find_tensor (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
|
||||||
|
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id);
|
||||||
|
GGML_API const char * gguf_get_tensor_name (const struct gguf_context * ctx, int64_t tensor_id);
|
||||||
|
GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int64_t tensor_id);
|
||||||
|
GGML_API size_t gguf_get_tensor_size (const struct gguf_context * ctx, int64_t tensor_id);
|
||||||
|
|
||||||
|
// removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
|
||||||
|
GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key);
|
||||||
|
|
||||||
|
// overrides an existing KV pair or adds a new one, the new KV pair is always at the back
|
||||||
|
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
||||||
|
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
|
||||||
|
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
|
||||||
|
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
|
||||||
|
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
||||||
|
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
||||||
|
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
||||||
|
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
||||||
|
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
||||||
|
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
||||||
|
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
||||||
|
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
||||||
|
|
||||||
|
// creates a new array with n elements of the given type and copies the corresponding number of bytes from data
|
||||||
|
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n);
|
||||||
|
|
||||||
|
// creates a new array with n strings and copies the corresponding strings from data
|
||||||
|
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n);
|
||||||
|
|
||||||
|
// set or add KV pairs from another context
|
||||||
|
GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src);
|
||||||
|
|
||||||
|
// add tensor to GGUF context, tensor name must be unique
|
||||||
|
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
|
||||||
|
|
||||||
|
// after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
|
||||||
|
// in such a way that the tensor data remains as one contiguous block (except for padding)
|
||||||
|
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
|
||||||
|
|
||||||
|
// assumes that at least gguf_get_tensor_size bytes can be read from data
|
||||||
|
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);
|
||||||
|
|
||||||
|
// writing gguf files can be done in 3 ways:
|
||||||
|
//
|
||||||
|
// - write the entire gguf_context to a binary file in a single pass:
|
||||||
|
//
|
||||||
|
// gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
|
||||||
|
//
|
||||||
|
// - write only the meta data to a file, then re-open the file and append the tensor data:
|
||||||
|
//
|
||||||
|
// gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
|
||||||
|
// FILE * f = fopen(fname, "ab");
|
||||||
|
// fwrite(f, ...); // write tensor data
|
||||||
|
// fclose(f);
|
||||||
|
//
|
||||||
|
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
|
||||||
|
//
|
||||||
|
// FILE * f = fopen(fname, "wb");
|
||||||
|
// const size_t size_meta = gguf_get_meta_size(ctx);
|
||||||
|
// fseek(f, size_meta, SEEK_SET);
|
||||||
|
// fwrite(f, ...); // write tensor data
|
||||||
|
// void * data = malloc(size_meta);
|
||||||
|
// gguf_get_meta_data(ctx, data);
|
||||||
|
// rewind(f);
|
||||||
|
// fwrite(data, 1, data, f);
|
||||||
|
// free(data);
|
||||||
|
// fclose(f);
|
||||||
|
//
|
||||||
|
|
||||||
|
// write the entire context to a binary file
|
||||||
|
GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||||
|
|
||||||
|
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||||
|
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
||||||
|
|
||||||
|
// writes the meta data to pointer "data"
|
||||||
|
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
|
@@ -208,6 +208,7 @@ add_library(ggml-base
             ../include/ggml-backend.h
             ../include/ggml-cpp.h
             ../include/ggml-opt.h
+            ../include/gguf.h
             ggml.c
             ggml-alloc.c
             ggml-backend.cpp
@@ -215,7 +216,8 @@ add_library(ggml-base
             ggml-threading.cpp
             ggml-threading.h
             ggml-quants.c
-            ggml-quants.h)
+            ggml-quants.h
+            gguf.cpp)
 
 target_include_directories(ggml-base PRIVATE .)
 
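The GGUF reader/writer now lives in ggml/include/gguf.h and is built into ggml-base via gguf.cpp. A minimal sketch, not part of this diff, of reading a file's metadata through that header; "model.gguf" is a placeholder path:

#include "gguf.h"
#include <cstdio>

int main() {
    // no_alloc = true: read metadata only, do not allocate tensor data
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (!ctx) {
        fprintf(stderr, "failed to open GGUF file\n");
        return 1;
    }
    printf("version: %u, kv pairs: %lld, tensors: %lld\n",
           gguf_get_version(ctx),
           (long long) gguf_get_n_kv(ctx),
           (long long) gguf_get_n_tensors(ctx));
    for (int64_t i = 0; i < gguf_get_n_tensors(ctx); ++i) {
        printf("  %s\n", gguf_get_tensor_name(ctx, i));
    }
    gguf_free(ctx);
    return 0;
}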
@@ -574,4 +574,9 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("opencl", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);
     ggml_backend_load_best("cpu", silent, dir_path);
+    // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
+    const char * backend_path = std::getenv("GGML_BACKEND_PATH");
+    if (backend_path) {
+        ggml_backend_load(backend_path);
+    }
 }
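A hedged sketch (not from the diff) of calling the same entry point the new GGML_BACKEND_PATH check uses, to load one out-of-tree backend library directly; the .so path is a placeholder and the exact registry accessors are assumed from ggml-backend.h:

#include "ggml-backend.h"
#include <cstdio>

int main() {
    // Equivalent to what the loader does when GGML_BACKEND_PATH is set.
    ggml_backend_reg_t reg = ggml_backend_load("/path/to/libggml-vulkan.so");
    if (!reg) {
        fprintf(stderr, "could not load backend\n");
        return 1;
    }
    printf("loaded backend registry: %s\n", ggml_backend_reg_name(reg));
    return 0;
}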
@@ -764,7 +764,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
-            if (src_backend_id == sched->n_backends - 1) {
+            if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
                 for (int b = 0; b < src_backend_id; b++) {
                     if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
@@ -795,9 +795,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                if (j == 0) {
+                    GGML_LOG_DEBUG(": ");
+                }
                 GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
@@ -4169,6 +4169,8 @@ static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(g
     buffer->buft = buft;
     buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
     buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
+    buffer->iface.get_tensor = nullptr;
+    buffer->iface.cpy_tensor = nullptr;
     return buffer;
 }
 
@@ -54,6 +54,7 @@
 #include "ggml-quants.h"
 
 #include <atomic>
+#include <array>
 
 #ifdef _MSC_VER
 #define NOINLINE __declspec(noinline)
@ -1051,6 +1052,704 @@ class tinyBLAS_Q0_AVX {
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
|
|
||||||
|
template <typename TA, typename TB, typename TC>
|
||||||
|
class tinyBLAS_Q0_PPC {
|
||||||
|
public:
|
||||||
|
tinyBLAS_Q0_PPC(int64_t k,
|
||||||
|
const TA *A, int64_t lda,
|
||||||
|
const TB *B, int64_t ldb,
|
||||||
|
TC *C, int64_t ldc,
|
||||||
|
int ith, int nth)
|
||||||
|
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||||
|
}
|
||||||
|
|
||||||
|
void matmul(int64_t m, int64_t n) {
|
||||||
|
mnpack(0, m, 0, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
template<int RM, int RN>
|
||||||
|
inline void save_res(int ii, int jj, int idx, vector float* fin_res) {
|
||||||
|
for (int I = 0; I < RM; I++) {
|
||||||
|
for (int J = 0; J < RN; J++) {
|
||||||
|
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<int size>
|
||||||
|
inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
|
||||||
|
vector signed int vec_C[4];
|
||||||
|
vector float CA[4] = {0};
|
||||||
|
vector float res[4] = {0};
|
||||||
|
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
|
||||||
|
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
|
||||||
|
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename VA, typename VB>
|
||||||
|
void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
|
||||||
|
int64_t i, j;
|
||||||
|
TA *aoffset = NULL;
|
||||||
|
VA *vecOffset = NULL;
|
||||||
|
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||||
|
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||||
|
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
|
||||||
|
VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
|
||||||
|
VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
|
||||||
|
VB t1, t2, t3, t4, t5, t6, t7, t8;
|
||||||
|
vector unsigned char xor_vector;
|
||||||
|
uint8_t flip_vec = 0x80;
|
||||||
|
xor_vector = vec_splats(flip_vec);
|
||||||
|
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||||
|
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||||
|
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
|
||||||
|
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
|
||||||
|
|
||||||
|
aoffset = const_cast<TA*>(a);
|
||||||
|
vecOffset = vec;
|
||||||
|
j = (rows >> 3);
|
||||||
|
if (j > 0) {
|
||||||
|
do {
|
||||||
|
aoffset1 = aoffset;
|
||||||
|
aoffset2 = aoffset1 + lda;
|
||||||
|
aoffset3 = aoffset2 + lda;
|
||||||
|
aoffset4 = aoffset3 + lda;
|
||||||
|
aoffset5 = aoffset4 + lda;
|
||||||
|
aoffset6 = aoffset5 + lda;
|
||||||
|
aoffset7 = aoffset6 + lda;
|
||||||
|
aoffset8 = aoffset7 + lda;
|
||||||
|
aoffset += 8 * lda;
|
||||||
|
|
||||||
|
i = (cols >> 3);
|
||||||
|
if (i > 0) {
|
||||||
|
do {
|
||||||
|
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
|
||||||
|
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
|
||||||
|
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||||
|
C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
|
||||||
|
C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs);
|
||||||
|
C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs);
|
||||||
|
C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs);
|
||||||
|
C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs);
|
||||||
|
|
||||||
|
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||||
|
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||||
|
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||||
|
__builtin_vsx_disassemble_pair(c4, &C4);
|
||||||
|
__builtin_vsx_disassemble_pair(c5, &C5);
|
||||||
|
__builtin_vsx_disassemble_pair(c6, &C6);
|
||||||
|
__builtin_vsx_disassemble_pair(c7, &C7);
|
||||||
|
__builtin_vsx_disassemble_pair(c8, &C8);
|
||||||
|
|
||||||
|
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||||
|
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||||
|
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||||
|
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||||
|
t5 = vec_perm(t1, t3, swiz3);
|
||||||
|
t6 = vec_perm(t1, t3, swiz4);
|
||||||
|
t7 = vec_perm(t2, t4, swiz3);
|
||||||
|
t8 = vec_perm(t2, t4, swiz4);
|
||||||
|
if (flip == true) {
|
||||||
|
t5 = vec_xor(t5, xor_vector);
|
||||||
|
t6 = vec_xor(t6, xor_vector);
|
||||||
|
t7 = vec_xor(t7, xor_vector);
|
||||||
|
t8 = vec_xor(t8, xor_vector);
|
||||||
|
}
|
||||||
|
vec_xst(t5, 0, vecOffset);
|
||||||
|
vec_xst(t6, 0, vecOffset+16);
|
||||||
|
vec_xst(t7, 0, vecOffset+32);
|
||||||
|
vec_xst(t8, 0, vecOffset+48);
|
||||||
|
|
||||||
|
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||||
|
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||||
|
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||||
|
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||||
|
t5 = vec_perm(t1, t3, swiz3);
|
||||||
|
t6 = vec_perm(t1, t3, swiz4);
|
||||||
|
t7 = vec_perm(t2, t4, swiz3);
|
||||||
|
t8 = vec_perm(t2, t4, swiz4);
|
||||||
|
if (flip == true) {
|
||||||
|
t5 = vec_xor(t5, xor_vector);
|
||||||
|
t6 = vec_xor(t6, xor_vector);
|
||||||
|
t7 = vec_xor(t7, xor_vector);
|
||||||
|
t8 = vec_xor(t8, xor_vector);
|
||||||
|
}
|
||||||
|
vec_xst(t5, 0, vecOffset+64);
|
||||||
|
vec_xst(t6, 0, vecOffset+80);
|
||||||
|
vec_xst(t7, 0, vecOffset+96);
|
||||||
|
vec_xst(t8, 0, vecOffset+112);
|
||||||
|
|
||||||
|
t1 = vec_perm(c5[0], c6[0], swiz1);
|
||||||
|
t2 = vec_perm(c5[0], c6[0], swiz2);
|
||||||
|
t3 = vec_perm(c7[0], c8[0], swiz1);
|
||||||
|
t4 = vec_perm(c7[0], c8[0], swiz2);
|
||||||
|
t5 = vec_perm(t1, t3, swiz3);
|
||||||
|
t6 = vec_perm(t1, t3, swiz4);
|
||||||
|
t7 = vec_perm(t2, t4, swiz3);
|
||||||
|
t8 = vec_perm(t2, t4, swiz4);
|
||||||
|
if (flip == true) {
|
||||||
|
t5 = vec_xor(t5, xor_vector);
|
||||||
|
t6 = vec_xor(t6, xor_vector);
|
||||||
|
t7 = vec_xor(t7, xor_vector);
|
||||||
|
t8 = vec_xor(t8, xor_vector);
|
||||||
|
}
|
||||||
|
vec_xst(t5, 0, vecOffset+128);
|
||||||
|
vec_xst(t6, 0, vecOffset+144);
|
||||||
|
vec_xst(t7, 0, vecOffset+160);
|
||||||
|
vec_xst(t8, 0, vecOffset+176);
|
||||||
|
|
||||||
|
t1 = vec_perm(c5[1], c6[1], swiz1);
|
||||||
|
t2 = vec_perm(c5[1], c6[1], swiz2);
|
||||||
|
t3 = vec_perm(c7[1], c8[1], swiz1);
|
||||||
|
t4 = vec_perm(c7[1], c8[1], swiz2);
|
||||||
|
t5 = vec_perm(t1, t3, swiz3);
|
||||||
|
t6 = vec_perm(t1, t3, swiz4);
|
||||||
|
t7 = vec_perm(t2, t4, swiz3);
|
||||||
|
t8 = vec_perm(t2, t4, swiz4);
|
||||||
|
if (flip == true) {
|
||||||
|
t5 = vec_xor(t5, xor_vector);
|
||||||
|
t6 = vec_xor(t6, xor_vector);
|
||||||
|
t7 = vec_xor(t7, xor_vector);
|
||||||
|
t8 = vec_xor(t8, xor_vector);
|
||||||
|
}
|
||||||
|
vec_xst(t5, 0, vecOffset+192);
|
||||||
|
vec_xst(t6, 0, vecOffset+208);
|
||||||
|
vec_xst(t7, 0, vecOffset+224);
|
||||||
|
vec_xst(t8, 0, vecOffset+240);
|
||||||
|
|
||||||
|
aoffset1 += lda;
|
||||||
|
aoffset2 += lda;
|
||||||
|
aoffset3 += lda;
|
||||||
|
aoffset4 += lda;
|
||||||
|
aoffset5 += lda;
|
||||||
|
aoffset6 += lda;
|
||||||
|
aoffset7 += lda;
|
||||||
|
aoffset8 += lda;
|
||||||
|
vecOffset += 256;
|
||||||
|
i--;
|
||||||
|
} while(i > 0);
|
||||||
|
}
|
||||||
|
j--;
|
||||||
|
} while(j > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rows & 4) {
|
||||||
|
aoffset1 = aoffset;
|
||||||
|
aoffset2 = aoffset1 + lda;
|
||||||
|
aoffset3 = aoffset2 + lda;
|
||||||
|
aoffset4 = aoffset3 + lda;
|
||||||
|
aoffset += 4 * lda;
|
||||||
|
|
||||||
|
i = (cols >> 3);
|
||||||
|
if (i > 0) {
|
||||||
|
do {
|
||||||
|
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
|
||||||
|
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
|
||||||
|
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||||
|
C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
|
||||||
|
|
||||||
|
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||||
|
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||||
|
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||||
|
__builtin_vsx_disassemble_pair(c4, &C4);
|
||||||
|
|
||||||
|
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||||
|
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||||
|
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||||
|
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||||
|
t5 = vec_perm(t1, t3, swiz3);
|
||||||
|
t6 = vec_perm(t1, t3, swiz4);
|
||||||
|
t7 = vec_perm(t2, t4, swiz3);
|
||||||
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
                        t5 = vec_xor(t5, xor_vector);
                        t6 = vec_xor(t6, xor_vector);
                        t7 = vec_xor(t7, xor_vector);
                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset);
                    vec_xst(t6, 0, vecOffset+16);
                    vec_xst(t7, 0, vecOffset+32);
                    vec_xst(t8, 0, vecOffset+48);

                    t1 = vec_perm(c1[1], c2[1], swiz1);
                    t2 = vec_perm(c1[1], c2[1], swiz2);
                    t3 = vec_perm(c3[1], c4[1], swiz1);
                    t4 = vec_perm(c3[1], c4[1], swiz2);
                    t5 = vec_perm(t1, t3, swiz3);
                    t6 = vec_perm(t1, t3, swiz4);
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
                        t5 = vec_xor(t5, xor_vector);
                        t6 = vec_xor(t6, xor_vector);
                        t7 = vec_xor(t7, xor_vector);
                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset+64);
                    vec_xst(t6, 0, vecOffset+80);
                    vec_xst(t7, 0, vecOffset+96);
                    vec_xst(t8, 0, vecOffset+112);

                    aoffset1 += lda;
                    aoffset2 += lda;
                    aoffset3 += lda;
                    aoffset4 += lda;
                    vecOffset += 128;
                    i--;
                } while(i > 0);
            }
        }
        if (rows & 3) {
            aoffset1 = aoffset;
            aoffset2 = aoffset1 + lda;
            aoffset3 = aoffset2 + lda;
            i = (cols >> 3);
            if (i > 0) {
                do {
                    switch(rows) {
                        case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
                                __builtin_vsx_disassemble_pair(c3, &C3);
                        case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
                                __builtin_vsx_disassemble_pair(c2, &C2);
                        case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
                                __builtin_vsx_disassemble_pair(c1, &C1);
                                break;
                    }
                    t1 = vec_perm(c1[0], c2[0], swiz1);
                    t2 = vec_perm(c1[0], c2[0], swiz2);
                    t3 = vec_perm(c3[0], c4[0], swiz1);
                    t4 = vec_perm(c3[0], c4[0], swiz2);
                    t5 = vec_perm(t1, t3, swiz3);
                    t6 = vec_perm(t1, t3, swiz4);
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
                        t5 = vec_xor(t5, xor_vector);
                        t6 = vec_xor(t6, xor_vector);
                        t7 = vec_xor(t7, xor_vector);
                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset);
                    vec_xst(t6, 0, vecOffset+16);
                    vec_xst(t7, 0, vecOffset+32);
                    vec_xst(t8, 0, vecOffset+48);

                    t1 = vec_perm(c1[1], c2[1], swiz1);
                    t2 = vec_perm(c1[1], c2[1], swiz2);
                    t3 = vec_perm(c3[1], c4[1], swiz1);
                    t4 = vec_perm(c3[1], c4[1], swiz2);
                    t5 = vec_perm(t1, t3, swiz3);
                    t6 = vec_perm(t1, t3, swiz4);
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
                        t5 = vec_xor(t5, xor_vector);
                        t6 = vec_xor(t6, xor_vector);
                        t7 = vec_xor(t7, xor_vector);
                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset+64);
                    vec_xst(t6, 0, vecOffset+80);
                    vec_xst(t7, 0, vecOffset+96);
                    vec_xst(t8, 0, vecOffset+112);

                    aoffset1 += lda;
                    aoffset2 += lda;
                    aoffset3 += lda;
                    vecOffset += 128;
                    i--;
                } while(i > 0);
            }
        }
    }

    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t mc, nc, mp, np;
        int m_rem = MIN(m - m0, 8);
        int n_rem = MIN(n - n0, 8);
        // TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance
        // issues. After resolving them, below code will be enabled.
        /*if (m_rem >= 16 && n_rem >= 8) {
            mc = 16;
            nc = 8;
            gemm<16,8>(m0, m, n0, n);
        } else if(m_rem >= 8 && n_rem >= 16) {
            mc = 8;
            nc = 16;
            gemm<8,16>(m0, m, n0, n);
        }*/
        if (m_rem >= 8 && n_rem >= 8) {
            mc = 8;
            nc = 8;
            gemm<8,8>(m0, m, n0, n);
        } else if (m_rem >= 4 && n_rem >= 8) {
            mc = 4;
            nc = 8;
            gemm<4,8>(m0, m, n0, n);
        } else if (m_rem >= 8 && n_rem >= 4) {
            mc = 8;
            nc = 4;
            gemm<8,4>(m0, m, n0, n);
        } else if (m_rem >= 4 && n_rem >= 4) {
            mc = 4;
            nc = 4;
            gemm_small<4, 4>(m0, m, n0, n);
        } else if ((m_rem < 4) && (n_rem > 4)) {
            nc = 4;
            switch(m_rem) {
                case 1:
                    mc = 1;
                    gemm_small<1, 4>(m0, m, n0, n);
                    break;
                case 2:
                    mc = 2;
                    gemm_small<2, 4>(m0, m, n0, n);
                    break;
                case 3:
                    mc = 3;
                    gemm_small<3, 4>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else if ((m_rem > 4) && (n_rem < 4)) {
            mc = 4;
            switch(n_rem) {
                case 1:
                    nc = 1;
                    gemm_small<4, 1>(m0, m, n0, n);
                    break;
                case 2:
                    nc = 2;
                    gemm_small<4, 2>(m0, m, n0, n);
                    break;
                case 3:
                    nc = 3;
                    gemm_small<4, 3>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch((m_rem << 4) | n_rem) {
                case 0x43:
                    mc = 4;
                    nc = 3;
                    gemm_small<4, 3>(m0, m, n0, n);
                    break;
                case 0x42:
                    mc = 4;
                    nc = 2;
                    gemm_small<4, 2>(m0, m, n0, n);
                    break;
                case 0x41:
                    mc = 4;
                    nc = 1;
                    gemm_small<4, 1>(m0, m, n0, n);
                    break;
                case 0x34:
                    mc = 3;
                    nc = 4;
                    gemm_small<3, 4>(m0, m, n0, n);
                    break;
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm_small<3, 3>(m0, m, n0, n);
                    break;
                case 0x32:
                    mc = 3;
                    nc = 2;
                    gemm_small<3, 2>(m0, m, n0, n);
                    break;
                case 0x31:
                    mc = 3;
                    nc = 1;
                    gemm_small<3, 1>(m0, m, n0, n);
                    break;
                case 0x24:
                    mc = 2;
                    nc = 4;
                    gemm_small<2, 4>(m0, m, n0, n);
                    break;
                case 0x23:
                    mc = 2;
                    nc = 3;
                    gemm_small<2, 3>(m0, m, n0, n);
                    break;
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm_small<2, 2>(m0, m, n0, n);
                    break;
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm_small<2, 1>(m0, m, n0, n);
                    break;
                case 0x14:
                    mc = 1;
                    nc = 4;
                    gemm_small<1, 4>(m0, m, n0, n);
                    break;
                case 0x13:
                    mc = 1;
                    nc = 3;
                    gemm_small<1, 3>(m0, m, n0, n);
                    break;
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm_small<1, 2>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm_small<1, 1>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }
        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }
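    // Illustrative walk-through (informal note, not part of the upstream code; assumes
    // gemm<RM,RN> and gemm_small<RM,RN> tile [m0,m) x [n0,n) in RM x RN steps). For m = n = 10:
    //   mnpack(0, 10, 0, 10): m_rem = n_rem = 8 -> gemm<8,8> covers rows 0-7, cols 0-7;
    //   with mc = nc = 8 this gives mp = np = 8, and the two tail calls sweep the fringes:
    //     mnpack(8, 10, 0, 8):  m_rem = 2 -> gemm_small<2,4> covers rows 8-9, cols 0-7
    //     mnpack(0, 10, 8, 10): n_rem = 2 -> gemm_small<4,2> covers rows 0-7, cols 8-9,
    //       and its own tail call reaches gemm_small<2,2> for the rows 8-9, cols 8-9 corner.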

    void KERNEL_4x8(int64_t ii, int64_t jj) {
        vec_t vec_A[8], vec_B[16] = {0};
        acc_t acc_0, acc_1;
        std::array<int, 4> comparray;
        vector float fin_res[8] = {0};
        vector float vs[8] = {0};
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
            }
            for (int I = 0; I<4; I++) {
                for (int J = 0; J<4; J++) {
                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                    *((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
                }
            }
            auto aoffset = A+(ii*lda)+l;
            for (int i = 0; i < 4; i++) {
                comparray[i] = 0;
                int ca = 0;
                const int8_t *at = aoffset->qs;
                for (int j = 0; j < 32; j++)
                    ca += (int)*at++;
                comparray[i] = ca;
                aoffset += lda;
            }
            compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
        }
        save_res<4, 4>(ii, jj, 0, fin_res);
        save_res<4, 4>(ii, jj+4, 4, fin_res);
    }

    void KERNEL_8x4(int64_t ii, int64_t jj) {
        vec_t vec_A[16], vec_B[8] = {0};
        acc_t acc_0, acc_1;
        std::array<int, 8> comparray;
        vector float fin_res[8] = {0};
        vector float vs[8] = {0};
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
            }
            for (int I = 0; I<8; I++) {
                for (int J = 0; J<4; J++) {
                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                }
            }
            auto aoffset = A+(ii*lda)+l;
            for (int i = 0; i < 8; i++) {
                comparray[i] = 0;
                int ca = 0;
                const int8_t *at = aoffset->qs;
                for (int j = 0; j < 32; j++)
                    ca += (int)*at++;
                comparray[i] = ca;
                aoffset += lda;
            }
            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
        }
        save_res<4, 4>(ii, jj, 0, fin_res);
        save_res<4, 4>(ii+4, jj, 4, fin_res);
    }

    void KERNEL_8x8(int64_t ii, int64_t jj) {
        vec_t vec_A[16], vec_B[16] = {0};
        acc_t acc_0, acc_1, acc_2, acc_3;
        std::array<int, 8> comparray;
        vector float fin_res[16] = {0};
        vector float vs[16] = {0};
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
            __builtin_mma_xxsetaccz(&acc_2);
            __builtin_mma_xxsetaccz(&acc_3);
            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
                __builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]);
                __builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]);
            }
            for (int I = 0; I<8; I++) {
                for (int J = 0; J<4; J++) {
                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                    *((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
                }
            }
            auto aoffset = A+(ii*lda)+l;
            for (int i = 0; i < 8; i++) {
                comparray[i] = 0;
                int ca = 0;
                const int8_t *at = aoffset->qs;
                for (int j = 0; j < 32; j++)
                    ca += (int)*at++;
                comparray[i] = ca;
                aoffset += lda;
            }
            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
            compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
            compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
        }
        save_res<4, 4>(ii, jj, 0, fin_res);
        save_res<4, 4>(ii+4, jj, 4, fin_res);
        save_res<4, 4>(ii, jj+4, 8, fin_res);
        save_res<4, 4>(ii+4, jj+4, 12, fin_res);
    }

    template<int RM, int RN>
    void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        vec_t vec_A[8], vec_B[8] = {0};
        vector signed int vec_C[4];
        acc_t acc_0;

        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            std::array<int, RM> comparray;
            vector float res[4] = {0};
            vector float fin_res[4] = {0};
            vector float vs[4] = {0};
            vector float CA[4] = {0};
            __builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value
            __builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value
            for (int l = 0; l < k; l++) {
                __builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
                __builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
                __builtin_mma_xxsetaccz(&acc_0);
                packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
                packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
                for(int x = 0; x < 8; x+=4) {
                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]);
                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]);
                }
                for (int I = 0; I<RM; I++) {
                    for (int J = 0; J<RN; J++) {
                        *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                    }
                }
                __builtin_mma_disassemble_acc(vec_C, &acc_0);
                auto aoffset = A+(ii*lda)+l;
                for (int i = 0; i < RM; i++) {
                    comparray[i] = 0;
                    int ca = 0;
                    const int8_t *at = aoffset->qs;
                    for (int j = 0; j < 32; j++)
                        ca += (int)*at++;
                    comparray[i] = ca;
                    aoffset += lda;
                }

                for (int i = 0; i < RM; i++) {
                    CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
                    res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
                    fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]);
                }
            }
            save_res<RM, RN>(ii, jj, 0, fin_res);
        }
    }
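    // Informal note (my reading of the code above, not authoritative): packNormal() is called
    // with flip == true for B, i.e. every signed q8 value is XORed with the xor_vector (adds 128)
    // so the xvi8ger4pp accumulate can treat it as unsigned. That inflates each dot product by
    // 128 * sum(A_row); comparray[] holds those per-row sums, so CA[i] = comparray[i] * -128
    // removes the bias before the per-block scales vs = d_A * d_B are applied:
    //     acc     = sum_j a_j * (b_j + 128) = sum_j a_j * b_j + 128 * sum_j a_j
    //     res     = acc + (-128) * sum_j a_j
    //     fin_res = fin_res + res * d_A * d_B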

    template<int RM, int RN>
    inline void kernel(int64_t ii, int64_t jj) {
        if constexpr(RM == 4 && RN == 8) {
            KERNEL_4x8(ii,jj);
        } else if constexpr(RM == 8 && RN == 4) {
            KERNEL_8x4(ii,jj);
        } else if constexpr(RM == 8 && RN == 8) {
            KERNEL_8x8(ii,jj);
        } else {
            static_assert(false, "RN/RM values not supported");
        }
    }

    template <int RM, int RN>
    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            kernel<RM, RN>(ii, jj);
        }
    }

    const TA *const A;
    const TB *const B;
    TC *C;
    TA *At;
    TB *Bt;
    const int64_t k;
    const int64_t lda;
    const int64_t ldb;
    const int64_t ldc;
    const int ith;
    const int nth;
};
template <typename TA, typename TB, typename TC>
class tinyBLAS_PPC {
  public:
@ -1070,13 +1769,17 @@ class tinyBLAS_PPC {

    void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);

    void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) {
    template<typename VA>
    void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) {
        int64_t i, j;
        float *aoffset = NULL, *boffset = NULL;
        TA *aoffset = NULL, *boffset = NULL;
        float *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
        TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
        float *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
        TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
        __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
        aoffset = const_cast<float*>(a);
        VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
        VA c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
        VA t1, t2, t3, t4, t5, t6, t7, t8;
        aoffset = const_cast<TA*>(a);
        boffset = vec;
        j = (rows >> 3);
        if (j > 0) {
@ -1092,9 +1795,6 @@ class tinyBLAS_PPC {
            aoffset += 8 * lda;
            i = (cols >> 3);
            if (i > 0) {
                __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
                vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2];
                vector float t1, t2, t3, t4, t5, t6, t7, t8;
                do {
                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
@ -1174,21 +1874,19 @@ class tinyBLAS_PPC {
                } while(i > 0);
            }
            if (cols & 4) {
                vector float c1, c2, c3, c4, c5, c6, c7, c8;
                c1[0] = vec_xl(0, aoffset1);
                vector float t1, t2, t3, t4, t5, t6, t7, t8;
                c2[0] = vec_xl(0, aoffset2);
                c1 = vec_xl(0, aoffset1);
                c3[0] = vec_xl(0, aoffset3);
                c2 = vec_xl(0, aoffset2);
                c4[0] = vec_xl(0, aoffset4);
                c3 = vec_xl(0, aoffset3);
                c5[0] = vec_xl(0, aoffset5);
                c4 = vec_xl(0, aoffset4);
                c6[0] = vec_xl(0, aoffset6);
                c5 = vec_xl(0, aoffset5);
                c7[0] = vec_xl(0, aoffset7);
                c6 = vec_xl(0, aoffset6);
                c8[0] = vec_xl(0, aoffset8);
                c7 = vec_xl(0, aoffset7);
                c8 = vec_xl(0, aoffset8);

                t1 = vec_mergeh(c1, c2);
                t1 = vec_mergeh(c1[0], c2[0]);
                t2 = vec_mergeh(c3, c4);
                t2 = vec_mergeh(c3[0], c4[0]);
                t3 = vec_mergeh(c5, c6);
                t3 = vec_mergeh(c5[0], c6[0]);
                t4 = vec_mergeh(c7, c8);
                t4 = vec_mergeh(c7[0], c8[0]);
                t5 = vec_xxpermdi(t1, t2, 0);
                t6 = vec_xxpermdi(t3, t4, 0);
                t7 = vec_xxpermdi(t1, t2, 3);
@ -1198,10 +1896,10 @@ class tinyBLAS_PPC {
                vec_xst(t7, 0, boffset+8);
                vec_xst(t8, 0, boffset+12);

                t1 = vec_mergel(c1, c2);
                t1 = vec_mergel(c1[0], c2[0]);
                t2 = vec_mergel(c3, c4);
                t2 = vec_mergel(c3[0], c4[0]);
                t3 = vec_mergel(c5, c6);
                t3 = vec_mergel(c5[0], c6[0]);
                t4 = vec_mergel(c7, c8);
                t4 = vec_mergel(c7[0], c8[0]);
                t5 = vec_xxpermdi(t1, t2, 0);
                t6 = vec_xxpermdi(t3, t4, 0);
                t7 = vec_xxpermdi(t1, t2, 3);
@ -1223,9 +1921,6 @@ class tinyBLAS_PPC {
            aoffset += 4 * lda;
            i = (cols >> 3);
            if (i > 0) {
                __vector_pair C1, C2, C3, C4;
                vector float c1[2], c2[2], c3[2], c4[2];
                vector float t1, t2, t3, t4, t5, t6, t7, t8;
                do {
                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
@ -1272,22 +1967,20 @@ class tinyBLAS_PPC {
            }

            if (cols & 4) {
                vector float c1, c2, c3, c4;
                c1[0] = vec_xl(0, aoffset1);
                vector float t1, t2, t3, t4;
                c2[0] = vec_xl(0, aoffset2);
                c1 = vec_xl(0, aoffset1);
                c3[0] = vec_xl(0, aoffset3);
                c2 = vec_xl(0, aoffset2);
                c4[0] = vec_xl(0, aoffset4);
                c3 = vec_xl(0, aoffset3);
                c4 = vec_xl(0, aoffset4);

                t1 = vec_mergeh(c1, c2);
                t1 = vec_mergeh(c1[0], c2[0]);
                t2 = vec_mergeh(c3, c4);
                t2 = vec_mergeh(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset);
                vec_xst(t4, 0, boffset+4);

                t1 = vec_mergel(c1, c2);
                t1 = vec_mergel(c1[0], c2[0]);
                t2 = vec_mergel(c3, c4);
                t2 = vec_mergel(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset+8);
@ -1299,21 +1992,19 @@ class tinyBLAS_PPC {
            aoffset2 = aoffset1 + lda;
            aoffset3 = aoffset2 + lda;
            if (cols & 4) {
                vector float c1, c2, c3, c4 = {0};
                c1[0] = vec_xl(0, aoffset1);
                vector float t1, t2, t3, t4;
                c2[0] = vec_xl(0, aoffset2);
                c1 = vec_xl(0, aoffset1);
                c3[0] = vec_xl(0, aoffset3);
                c2 = vec_xl(0, aoffset2);
                c3 = vec_xl(0, aoffset3);

                t1 = vec_mergeh(c1, c2);
                t1 = vec_mergeh(c1[0], c2[0]);
                t2 = vec_mergeh(c3, c4);
                t2 = vec_mergeh(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset);
                vec_xst(t4, 0, boffset+4);

                t1 = vec_mergel(c1, c2);
                t1 = vec_mergel(c1[0], c2[0]);
                t2 = vec_mergel(c3, c4);
                t2 = vec_mergel(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset+8);
@ -1321,14 +2012,13 @@ class tinyBLAS_PPC {
            }
        }
    }

    void KERNEL_4x4(int64_t ii, int64_t jj) {
        vec_t vec_A[4], vec_B[4], vec_C[4];
        acc_t acc_0;
        __builtin_mma_xxsetaccz(&acc_0);
        for (int l = 0; l < k; l+=4) {
            READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
            packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
            READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
@ -1343,8 +2033,8 @@ class tinyBLAS_PPC {
        __builtin_mma_xxsetaccz(&acc_0);
        __builtin_mma_xxsetaccz(&acc_1);
        for (int64_t l = 0; l < k; l+=4) {
            READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
            packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
            READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B);
            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
@ -1364,8 +2054,8 @@ class tinyBLAS_PPC {
        __builtin_mma_xxsetaccz(&acc_0);
        __builtin_mma_xxsetaccz(&acc_1);
        for (int64_t l = 0; l < k; l+=4) {
            READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A);
            packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A);
            READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
@ -1387,8 +2077,8 @@ class tinyBLAS_PPC {
        __builtin_mma_xxsetaccz(&acc_2);
        __builtin_mma_xxsetaccz(&acc_3);
        for (int l = 0; l < k; l+=8) {
            READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A);
            packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
            READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B);
            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
            for(int x = 0; x < 16; x+=2) {
                __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
                __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
@ -1571,15 +2261,15 @@ class tinyBLAS_PPC {
        vec_t vec_A[4], vec_B[4];
        for (int l=0; l<k; l+=4) {
            if (RN >= 4 && RM == 1) {
                float* a = const_cast<float*>(A+(ii)*lda+l);
                TA* a = const_cast<TA*>(A+(ii)*lda+l);
                READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
                packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
                vec_A[0] = (vec_t)vec_xl(0,a);
                vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1));
                vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
                vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2));
                vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
                vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3));
                vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
            } else {
                READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
                packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
                READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
                packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
            }
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
@ -1589,7 +2279,7 @@ class tinyBLAS_PPC {
        __builtin_mma_disassemble_acc(vec_C, &acc_0);
        for (int I = 0; I < RM; I++) {
            for (int J = 0; J < RN; J++) {
                *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
                *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
            }
        }
    }
@ -1812,6 +2502,20 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;

#elif defined(__MMA__)
        if (n < 8 && n != 4)
            return false;
        if (m < 8 && m != 4)
            return false;
        tinyBLAS_Q0_PPC<block_q8_0, block_q8_0, float> tb{
            k, (const block_q8_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;

#else
        return false;
#endif

@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
    uint64_t nb1,
    uint64_t nb2,
    uint64_t nb3){
    static_assert(dim >= 0 && dim <= 3);
    static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");

    const int64_t i3 = blockIdx.z;
    const int64_t i2 = blockIdx.y;

@ -680,6 +680,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F16:
            return convert_unary_cuda<half>;
        case GGML_TYPE_BF16:
            return convert_unary_cuda<nv_bfloat16>;
        default:
            return nullptr;
    }

@ -1728,7 +1728,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);

    bool use_mul_mat_vec = src0->type == GGML_TYPE_F16
    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
@ -2869,6 +2869,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            case GGML_TYPE_IQ3_XXS:
            case GGML_TYPE_IQ4_NL:
            case GGML_TYPE_IQ4_XS:
            case GGML_TYPE_BF16:
#ifdef GGML_USE_MUSA
                if (a->type == GGML_TYPE_Q3_K) {
                    return false;

@ -1,9 +1,9 @@
#include "common.cuh"
#include "mmv.cuh"

template <typename type_acc, int block_size>
template <typename T, typename type_acc, int block_size>
static __global__ void mul_mat_vec(
        const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
        const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
        const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
    const int64_t row = blockIdx.x;
    const int64_t channel = blockIdx.z;
@ -13,7 +13,6 @@ static __global__ void mul_mat_vec(
    y += channel *stride_channel_y;
    dst += channel *stride_channel_dst;

    const half2 * x2 = (const half2 *) x;
    const float2 * y2 = (const float2 *) y;

    extern __shared__ char data_mmv[];
@ -28,6 +27,9 @@ static __global__ void mul_mat_vec(

    float sumf;

    if constexpr (std::is_same<T, half>::value) {
        const half2 * x2 = (const half2 *) x;

        if (std::is_same<type_acc, float>::value) {
            sumf = 0.0f;

@ -51,6 +53,19 @@ static __global__ void mul_mat_vec(
            NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
        }
    } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
        const int * x2 = (const int *) x;
        sumf = 0.0f;

        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
            const int tmpx = x2[col2];
            const float2 tmpy = y2[col2];
            sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
            sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
        }
    } else {
        static_assert(std::is_same<T, void>::value, "unsupported type");
    }
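    // Note (descriptive comment, added for clarity rather than taken from the patch): the bf16
    // branch above loads two nv_bfloat16 values per iteration through a single 32-bit load
    // (x2[col2]) and converts each lane to float before multiplying with the float2 from y, so
    // this path always accumulates in fp32 regardless of type_acc; presumably this mirrors the
    // half2/float2 layout of the fp16 path without requiring native bf16 arithmetic.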

    sumf = warp_reduce_sum(sumf);

@ -71,9 +86,9 @@ static __global__ void mul_mat_vec(
    dst[row] = sumf;
}

template <typename type_acc>
template <typename T, typename type_acc>
static void launch_mul_mat_vec_cuda(
        const half * x, const float * y, float * dst,
        const T * x, const float * y, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
        cudaStream_t stream) {
@ -97,35 +112,35 @@ static void launch_mul_mat_vec_cuda(
    const dim3 block_dims(block_size_best, 1, 1);
    switch (block_size_best) {
        case 32: {
            mul_mat_vec<type_acc, 32><<<block_nums, block_dims, smem, stream>>>
            mul_mat_vec<T, type_acc, 32><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 64: {
            mul_mat_vec<type_acc, 64><<<block_nums, block_dims, smem, stream>>>
            mul_mat_vec<T, type_acc, 64><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 96: {
            mul_mat_vec<type_acc, 96><<<block_nums, block_dims, smem, stream>>>
            mul_mat_vec<T, type_acc, 96><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 128: {
            mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
            mul_mat_vec<T, type_acc, 128><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 160: {
            mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
            mul_mat_vec<T, type_acc, 160><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 192: {
            mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
            mul_mat_vec<T, type_acc, 192><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 224: {
            mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
            mul_mat_vec<T, type_acc, 224><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 256: {
            mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
            mul_mat_vec<T, type_acc, 256><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        default: {
@ -134,25 +149,25 @@ static void launch_mul_mat_vec_cuda(
        }
    }
}

template<typename T>
static void mul_mat_vec_cuda(
        const half * x, const float * y, float * dst,
        const T * x, const float * y, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
        enum ggml_prec prec, cudaStream_t stream) {
    switch (prec) {
        case GGML_PREC_DEFAULT: {
            launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
            launch_mul_mat_vec_cuda<T, half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
        } break;
        case GGML_PREC_F32: {
            launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
            launch_mul_mat_vec_cuda<T, float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
        } break;
    }
}

void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

@ -164,7 +179,6 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;

    const half * src0_d = (const half *) src0->data;
    const float * src1_d = (const float *) src1->data;
    float * dst_d = (float *) dst->data;

@ -181,7 +195,20 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
    const int64_t channel_stride_y = src1->nb[2] / ggml_type_size(src1->type);
    const int64_t channel_stride_dst = dst->nb[2] / ggml_type_size( dst->type);

    mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
    switch (src0->type) {
        case GGML_TYPE_F16: {
            const half * src0_d = (const half *) src0->data;
            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
                channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
        } break;
        case GGML_TYPE_BF16: {
            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
                channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
        } break;
        default:
            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
    }
}

void ggml_cuda_op_mul_mat_vec(
@ -190,7 +217,6 @@ void ggml_cuda_op_mul_mat_vec(
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

@ -211,8 +237,20 @@ void ggml_cuda_op_mul_mat_vec(
    const int64_t channel_stride_y = 0;
    const int64_t channel_stride_dst = 0;

    mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
    switch (src0->type) {
        case GGML_TYPE_F16: {
            const half * src0_d = (const half *) src0_dd_i;
            mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
                nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
        } break;
        case GGML_TYPE_BF16: {
            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
            mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
                nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
        } break;
        default:
            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
    }

    GGML_UNUSED(ctx);
    GGML_UNUSED(src1);

1
ggml/src/ggml-cuda/vendors/cuda.h
vendored
@ -3,6 +3,7 @@
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020

3
ggml/src/ggml-cuda/vendors/hip.h
vendored
@ -3,6 +3,7 @@
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
@ -121,6 +122,8 @@
#define __has_builtin(x) 0
#endif

typedef hip_bfloat16 nv_bfloat16;

typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {

3
ggml/src/ggml-cuda/vendors/musa.h
vendored
@ -3,6 +3,7 @@
#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_bf16.h>
#include <musa_fp16.h>
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
@ -132,3 +133,5 @@
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture

typedef mt_bfloat16 nv_bfloat16;

@ -3,6 +3,8 @@
// GGML internal header

#include "ggml.h"
#include "gguf.h"

#include <assert.h>
#include <math.h>
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
@ -551,22 +553,15 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)

// expose GGUF internals for test code

GGML_API size_t gguf_type_size(enum gguf_type type);

GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);

struct gguf_buf {
    void * data;
    size_t size;
    size_t offset;
};
GGML_API struct gguf_buf gguf_buf_init(size_t size);
GGML_API void gguf_buf_free(struct gguf_buf buf);

GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus
#include <vector>

// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
#endif // __cplusplus

@ -103,3 +103,19 @@ else()
        DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
        )
endif() # GGML_METAL_EMBED_LIBRARY

if (NOT GGML_METAL_EMBED_LIBRARY)
    install(
        FILES src/ggml-metal/ggml-metal.metal
        PERMISSIONS
            OWNER_READ
            OWNER_WRITE
            GROUP_READ
            WORLD_READ
        DESTINATION ${CMAKE_INSTALL_BINDIR})

    install(
        FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
        DESTINATION ${CMAKE_INSTALL_BINDIR}
    )
endif()

@ -27,15 +27,6 @@
#endif
#include <cstring>

#define UNUSED GGML_UNUSED

#define GGML_DEBUG 0
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#ifdef _WIN32
typedef SOCKET sockfd_t;
using ssize_t = __int64;
@ -93,9 +84,23 @@ enum rpc_cmd {
    RPC_CMD_COPY_TENSOR,
    RPC_CMD_GRAPH_COMPUTE,
    RPC_CMD_GET_DEVICE_MEMORY,
    RPC_CMD_INIT_TENSOR,
    RPC_CMD_GET_ALLOC_SIZE,
    RPC_CMD_COUNT,
};

struct rpc_msg_get_alloc_size_req {
    rpc_tensor tensor;
};

struct rpc_msg_get_alloc_size_rsp {
    uint64_t alloc_size;
};

struct rpc_msg_init_tensor_req {
    rpc_tensor tensor;
};

struct rpc_msg_alloc_buffer_req {
    uint64_t size;
};
@ -397,7 +402,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
        initialized = true;
    }
#else
    UNUSED(initialized);
    GGML_UNUSED(initialized);
#endif
    auto sock = socket_connect(host.c_str(), port);
    if (sock == nullptr) {
@ -461,10 +466,18 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
}

static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    UNUSED(buffer);
    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
    if (ggml_is_quantized(tensor->type)) {
        // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized
    // CUDA backend on the server pads everything to 512 due to CUDA limitations.
        GGML_ASSERT(tensor->ne[0] % 512 == 0 && "unsupported quantized tensor");
    // Due to bandwidth constraints, we only call the server init tensor functions if necessary.
    // In particular, only quantized tensors need padding
    if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
        rpc_msg_init_tensor_req request;

        request.tensor = serialize_tensor(tensor);

        bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
        GGML_ASSERT(status);
    }
}

@ -577,9 +590,24 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
}

static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    UNUSED(buft);
    // See comments in init_tensor.
    if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
        ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
        auto sock = get_socket(buft_ctx->endpoint);

        rpc_msg_get_alloc_size_req request;

        request.tensor = serialize_tensor(tensor);

        rpc_msg_get_alloc_size_rsp response;
        bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
        GGML_ASSERT(status);

        return response.alloc_size;
    } else {
        return ggml_nbytes(tensor);
    }
}
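// Informal summary (added for orientation, not part of the patch): the client-side
// get_alloc_size above serializes the tensor, sends RPC_CMD_GET_ALLOC_SIZE, and uses the
// server's ggml_backend_buft_get_alloc_size() answer (see rpc_server::get_alloc_size below).
// The extra round trip is only paid for quantized tensors whose first dimension is not a
// multiple of 512; everything else falls back to ggml_nbytes() locally.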

static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
    /* .get_name = */ ggml_backend_rpc_buffer_type_name,
@ -603,7 +631,7 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
}

static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
    UNUSED(backend);
    GGML_UNUSED(backend);
    // this is no-op because we don't have any async operations
}

@ -757,6 +785,8 @@ public:
    bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
    bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
    bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
    bool init_tensor(const rpc_msg_init_tensor_req & request);
    bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response);

private:
    ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
@ -770,6 +800,36 @@ private:
    std::unordered_set<ggml_backend_buffer_t> buffers;
};

bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
    ggml_backend_buffer_type_t buft;
    struct ggml_init_params params {
        /*.mem_size =*/ ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc =*/ true,
    };

    struct ggml_context * ctx = ggml_init(params);
    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);

    if (tensor == nullptr) {
        GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
        ggml_free(ctx);
        return false;
    }

    if (tensor->buffer == nullptr) {
        //No buffer allocated.
        buft = ggml_backend_get_default_buffer_type(backend);
    } else {
        buft = tensor->buffer->buft;
    }

    response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);

    ggml_free(ctx);
    return true;
}

void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
@ -781,7 +841,7 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_
        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
        buffers.insert(buffer);
    } else {
        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
        GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
    }
}

@ -803,7 +863,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
    if (buffers.find(buffer) == buffers.end()) {
        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
        return false;
    }
    void * base = ggml_backend_buffer_get_base(buffer);
@ -815,7 +875,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
    if (buffers.find(buffer) == buffers.end()) {
        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
        return false;
    }
    ggml_backend_buffer_free(buffer);
@ -827,7 +887,7 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
    if (buffers.find(buffer) == buffers.end()) {
        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
|
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
ggml_backend_buffer_clear(buffer, request.value);
|
ggml_backend_buffer_clear(buffer, request.value);
|
||||||
|
@ -883,7 +943,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||||
struct ggml_context * ctx = ggml_init(params);
|
struct ggml_context * ctx = ggml_init(params);
|
||||||
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
||||||
if (tensor == nullptr) {
|
if (tensor == nullptr) {
|
||||||
GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
|
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -905,6 +965,40 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
|
||||||
|
struct ggml_init_params params {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead(),
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
struct ggml_context * ctx = ggml_init(params);
|
||||||
|
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
||||||
|
if (tensor == nullptr) {
|
||||||
|
GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
|
||||||
|
ggml_free(ctx);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call the backend's buffer_init_tensor function
|
||||||
|
ggml_backend_buffer_t buffer = tensor->buffer;
|
||||||
|
if (buffer && buffer->iface.init_tensor) {
|
||||||
|
buffer->iface.init_tensor(buffer, tensor);
|
||||||
|
} else {
|
||||||
|
GGML_LOG_ERROR("Null buffer for tensor passed to init_tensor function\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tensor->extra != nullptr) {
|
||||||
|
// This pointer can either be passed around client/server, or probably better stored server-side and kept track of.
|
||||||
|
// Currently unimplemented.
|
||||||
|
GGML_LOG_ERROR("tensor->extra populated by the backend, this is currently unsupported.\n");
|
||||||
|
ggml_free(ctx);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_free(ctx);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response) {
|
bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response) {
|
||||||
struct ggml_init_params params {
|
struct ggml_init_params params {
|
||||||
/*.mem_size =*/ ggml_tensor_overhead(),
|
/*.mem_size =*/ ggml_tensor_overhead(),
|
||||||
|
@ -914,7 +1008,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
|
||||||
struct ggml_context * ctx = ggml_init(params);
|
struct ggml_context * ctx = ggml_init(params);
|
||||||
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
||||||
if (tensor == nullptr) {
|
if (tensor == nullptr) {
|
||||||
GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
|
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -948,7 +1042,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
||||||
ggml_tensor * src = deserialize_tensor(ctx, &request.src);
|
ggml_tensor * src = deserialize_tensor(ctx, &request.src);
|
||||||
ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
|
ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
|
||||||
if (src == nullptr || dst == nullptr) {
|
if (src == nullptr || dst == nullptr) {
|
||||||
GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
|
GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1058,6 +1152,18 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case RPC_CMD_GET_ALLOC_SIZE: {
|
||||||
|
rpc_msg_get_alloc_size_req request;
|
||||||
|
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
rpc_msg_get_alloc_size_rsp response;
|
||||||
|
server.get_alloc_size(request, response);
|
||||||
|
if (!send_msg(sockfd, &response, sizeof(response))) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
case RPC_CMD_GET_ALIGNMENT: {
|
case RPC_CMD_GET_ALIGNMENT: {
|
||||||
if (!recv_msg(sockfd, nullptr, 0)) {
|
if (!recv_msg(sockfd, nullptr, 0)) {
|
||||||
return;
|
return;
|
||||||
|
@ -1133,6 +1239,19 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case RPC_CMD_INIT_TENSOR: {
|
||||||
|
rpc_msg_init_tensor_req request;
|
||||||
|
if (!recv_msg(sockfd, &request,sizeof(request))) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!server.init_tensor(request)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!send_msg(sockfd, nullptr, 0)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
case RPC_CMD_GET_TENSOR: {
|
case RPC_CMD_GET_TENSOR: {
|
||||||
rpc_msg_get_tensor_req request;
|
rpc_msg_get_tensor_req request;
|
||||||
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
||||||
|
@ -1257,14 +1376,14 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||||
|
|
||||||
ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
|
ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
|
||||||
|
|
||||||
UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
|
||||||
// TODO: obtain value from the server
|
// TODO: obtain value from the server
|
||||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||||
|
|
||||||
UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||||
|
@ -1285,7 +1404,7 @@ static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const
|
||||||
|
|
||||||
return ggml_backend_rpc_init(ctx->endpoint.c_str());
|
return ggml_backend_rpc_init(ctx->endpoint.c_str());
|
||||||
|
|
||||||
UNUSED(params);
|
GGML_UNUSED(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
|
static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||||
|
@ -1293,12 +1412,12 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b
|
||||||
|
|
||||||
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
||||||
|
|
||||||
UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||||
UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
UNUSED(op);
|
GGML_UNUSED(op);
|
||||||
//TODO: call the remote backend and cache the results
|
//TODO: call the remote backend and cache the results
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -1335,20 +1454,20 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
||||||
static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
|
static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
|
||||||
return "RPC";
|
return "RPC";
|
||||||
|
|
||||||
UNUSED(reg);
|
GGML_UNUSED(reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
|
static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
UNUSED(reg);
|
GGML_UNUSED(reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
||||||
GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
|
GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
|
||||||
|
|
||||||
UNUSED(reg);
|
GGML_UNUSED(reg);
|
||||||
UNUSED(index);
|
GGML_UNUSED(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||||
|
@ -1357,7 +1476,7 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
UNUSED(reg);
|
GGML_UNUSED(reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
|
static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
|
||||||
|
|
|
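The new GET_ALLOC_SIZE and INIT_TENSOR commands follow the same request/response pattern as the existing ones: the client serializes the tensor, calls send_rpc_cmd, and the server handlers above do the work. A minimal client-side sketch, assuming the existing serialize_tensor() helper and a connected sock as used by the other client helpers in this file (INIT_TENSOR carries no response payload):

// Sketch only: the request struct and command name come from the handlers above;
// serialize_tensor() and the empty-response convention are assumptions.
rpc_msg_init_tensor_req request;
request.tensor = serialize_tensor(tensor);
bool status = send_rpc_cmd(sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status);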
@ -131,7 +131,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
[=](sycl::nd_item<3> item_ct1) {
rwkv_wkv_f32_kernel(
B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
item_ct1, shared_mem_acc.get_pointer()
item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
);
});
});
@ -8,6 +8,20 @@ if (Vulkan_FOUND)
../../include/ggml-vulkan.h
)

# Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported.
# If it's not, there will be an error to stderr.
# If it's supported, set a define to indicate that we should compile those shaders
execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
OUTPUT_VARIABLE glslc_output
ERROR_VARIABLE glslc_error)

if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*")
message(STATUS "GL_KHR_cooperative_matrix not supported by glslc")
else()
message(STATUS "GL_KHR_cooperative_matrix supported by glslc")
add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
endif()

# Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported.
# If it's not, there will be an error to stderr.
# If it's supported, set a define to indicate that we should compile those shaders

@ -69,6 +83,10 @@ if (Vulkan_FOUND)

file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")

if (NOT CMAKE_CROSSCOMPILING)
set(_ggml_vk_genshaders_cmd "$<TARGET_FILE_DIR:vulkan-shaders-gen>/${_ggml_vk_genshaders_cmd}")
endif ()

add_custom_command(
OUTPUT ${_ggml_vk_header}
${_ggml_vk_source}
@ -1645,6 +1645,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
#undef CREATE_MM2
} else
#endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
if (device->coopmat_support) {
// Create 6 variants, {s,m,l}x{unaligned,aligned}
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \

@ -1739,7 +1740,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
}
#undef CREATE_MM2
#undef CREATE_MM
} else if (device->fp16) {
} else
#endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
if (device->fp16) {
// Create 6 variants, {s,m,l}x{unaligned,aligned}
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
if (device->mul_mat ## ID ## _l) \

@ -2040,6 +2043,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
std::cerr << "Done!" << std::endl;
}

static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);

static vk_device ggml_vk_get_device(size_t idx) {
VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");

@ -2175,9 +2180,7 @@ static vk_device ggml_vk_get_device(size_t idx) {

device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;

if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) {
// Intel drivers don't support coopmat properly yet
// Only RADV supports coopmat properly on AMD
device->coopmat_support = false;
}

@ -2242,6 +2245,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
}

#if defined(VK_KHR_cooperative_matrix)
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
coopmat_features.pNext = nullptr;
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;

@ -2251,6 +2255,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
last_struct->pNext = (VkBaseOutStructure *)&coopmat_features;
last_struct = (VkBaseOutStructure *)&coopmat_features;
}
#endif

#if defined(VK_NV_cooperative_matrix2)
VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {};

@ -2283,7 +2288,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
device_extensions.push_back("VK_EXT_subgroup_size_control");
}

#if defined(VK_KHR_cooperative_matrix)
device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
#endif

if (coopmat2_support) {
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)

@ -2376,6 +2383,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
device_extensions.push_back("VK_KHR_shader_float16_int8");
}

#if defined(VK_KHR_cooperative_matrix)
if (device->coopmat_support) {
// Query supported shapes
std::vector<VkCooperativeMatrixPropertiesKHR> cm_props;

@ -2442,7 +2450,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
if (device->coopmat_support) {
device_extensions.push_back("VK_KHR_cooperative_matrix");
}
#endif
device->name = GGML_VK_NAME + std::to_string(idx);

device_create_info = {

@ -2515,7 +2523,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
return vk_instance.devices[idx];
}


static void ggml_vk_print_gpu_info(size_t idx) {
GGML_ASSERT(idx < vk_instance.device_indices.size());
size_t dev_num = vk_instance.device_indices[idx];

@ -2554,9 +2561,11 @@ static void ggml_vk_print_gpu_info(size_t idx) {
fp16_storage = true;
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
fp16_compute = true;
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT")) {
coopmat_support = true;
#endif
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
} else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT2")) {

@ -2565,9 +2574,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
}
}

if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) {
// Intel drivers don't support coopmat properly yet
// Only RADV supports coopmat properly on AMD
coopmat_support = false;
}

@ -2596,6 +2603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
// Pointer to the last chain element
VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_features;

#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
coopmat_features.pNext = nullptr;
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;

@ -2611,6 +2619,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
fp16 = fp16 && vk12_features.shaderFloat16;

coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix;
#endif

std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";

@ -8088,6 +8097,25 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
UNUSED(instance_extensions);
}

static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) {
switch (props.vendorID) {
case VK_VENDOR_ID_INTEL:
// Intel drivers don't support coopmat properly yet
return false;
case VK_VENDOR_ID_AMD:
if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
// Workaround for AMD proprietary driver reporting support on all GPUs
const std::string name = props.deviceName;
return name.rfind("AMD Radeon RX 7", 0) == 0 || name.rfind("AMD Radeon(TM) RX 7", 0) == 0 || // RDNA 3 consumer GPUs
name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 || // RDNA 3 workstation GPUs
name.rfind("AMD Radeon 7", 0) == 0 || name.rfind("AMD Radeon(TM) 7", 0) == 0; // RDNA 3 APUs
}
return true;
default:
return true;
}
}

// checks

#ifdef GGML_VULKAN_CHECK_RESULTS
@ -0,0 +1,7 @@
#version 460

#extension GL_KHR_cooperative_matrix : require

void main()
{
}
@ -342,9 +342,11 @@ void process_shaders() {
matmul_shaders(true, matmul_id, false, false, false);
matmul_shaders(true, matmul_id, false, false, true);

#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
// Coopmat, fp32acc and fp16acc
matmul_shaders(true, matmul_id, true, false, false);
matmul_shaders(true, matmul_id, true, false, true);
#endif

#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
// Coopmat2, fp32acc and fp16acc
1276
ggml/src/ggml.c
File diff suppressed because it is too large
1325
ggml/src/gguf.cpp
Normal file
File diff suppressed because it is too large
@ -15,13 +15,13 @@ pip install gguf

[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.

[scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.

[scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.

[scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.

[scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.

## Development
Maintainers who participate in development of this package are advised to install it in editable mode:
@ -102,6 +102,8 @@ class Keys:
EXPERT_USED_COUNT = "{arch}.expert_used_count"
EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale"
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"

@ -255,6 +257,7 @@ class MODEL_ARCH(IntEnum):
MAMBA = auto()
XVERSE = auto()
COMMAND_R = auto()
COHERE2 = auto()
DBRX = auto()
OLMO = auto()
OLMO2 = auto()

@ -312,6 +315,7 @@ class MODEL_TENSOR(IntEnum):
FFN_GATE_SHEXP = auto()
FFN_DOWN_SHEXP = auto()
FFN_UP_SHEXP = auto()
FFN_EXP_PROBS_B = auto()
ATTN_Q_NORM = auto()
ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto()

@ -437,6 +441,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.COHERE2: "cohere2",
MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMO2: "olmo2",

@ -496,6 +501,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",

@ -1136,6 +1142,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_Q_NORM,
],
MODEL_ARCH.COHERE2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.DBRX: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,

@ -1276,6 +1294,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
MODEL_ARCH.CHATGLM : [
MODEL_TENSOR.TOKEN_EMBD,

@ -1576,6 +1595,11 @@ class GGMLQuantizationType(IntEnum):
TQ2_0 = 35


class ExpertGatingFuncType(IntEnum):
SOFTMAX = 1
SIGMOID = 2


# TODO: add GGMLFileType from ggml_ftype in ggml.h
@ -26,6 +26,7 @@ from .constants import (
RopeScalingType,
PoolingType,
TokenType,
ExpertGatingFuncType,
)

from .quants import quant_shape_from_byte_shape

@ -715,6 +716,12 @@ class GGUFWriter:
def add_expert_weights_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)

def add_expert_weights_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.EXPERT_WEIGHTS_NORM.format(arch=self.arch), value)

def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)

def add_swin_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
@ -276,6 +276,10 @@ class TensorNameMap:
"model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
),

MODEL_TENSOR.FFN_EXP_PROBS_B: (
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
),

# Feed-forward up
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
@ -1,12 +1,11 @@
[tool.poetry]
name = "gguf"
version = "0.13.0"
version = "0.14.0"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
{include = "gguf"},
{include = "gguf/py.typed"},
{include = "scripts"},
]
readme = "README.md"
homepage = "https://ggml.ai"

@ -33,7 +32,7 @@ requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint"
gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
gguf-dump = "scripts:gguf_dump_entrypoint"
gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint"
gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint"
gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
@ -9,7 +9,7 @@
#include "llama.h"

struct llama_model_deleter {
void operator()(llama_model * model) { llama_free_model(model); }
void operator()(llama_model * model) { llama_model_free(model); }
};

struct llama_context_deleter {
@ -34,7 +34,6 @@

#define LLAMA_DEFAULT_SEED 0xFFFFFFFF

// TODO: use everywhere in the implementation
#define LLAMA_TOKEN_NULL -1

#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'

@ -105,6 +104,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
};

enum llama_rope_type {

@ -413,12 +413,19 @@ extern "C" {
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);

LLAMA_API struct llama_model * llama_load_model_from_file(
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params),
"use llama_model_load_from_file instead");

LLAMA_API struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params);

// TODO: rename to llama_model_free
DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
LLAMA_API void llama_free_model(struct llama_model * model);
"use llama_model_free instead");

LLAMA_API void llama_model_free(struct llama_model * model);

// TODO: rename to llama_init_from_model
LLAMA_API struct llama_context * llama_new_context_with_model(
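The deprecations above are pure renames, so migration is mechanical. A minimal sketch, assuming the usual llama_model_default_params() entry point; "model.gguf" is a placeholder path:

#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();

    // previously: llama_load_model_from_file(...) / llama_free_model(...)
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);  // placeholder path
    if (model == NULL) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}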
@ -1 +1 @@
e6d93f40dffe8733d5d72f1d8fa6b3ca27ae899f
c8bd0fee71dc8328d93be301bbee06bc10d30429
@ -242,6 +242,10 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
} else {
ab_map[name].b = cur;
}
} else if (str_endswith(name, "_norm.weight")) {
// TODO: add support for norm vector
// for now, we don't really care because most adapters still work fine without it
continue;
} else {
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
}

@ -251,6 +255,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
for (auto & it : ab_map) {
const std::string & name = it.first;
llama_lora_weight & w = it.second;
bool is_token_embd = str_endswith(name, "token_embd.weight");

if (!w.a || !w.b) {
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");

@ -259,17 +264,24 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
// device buft and device ctx
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
if (!model_tensor) {
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
}

struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
// validate tensor shape
if (is_token_embd) {
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
} else {
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
if (w.a->ne[1] != w.b->ne[0]) {
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
}
}

// save tensor to adapter
struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
@ -45,6 +45,13 @@ struct llama_lora_weight {
struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr;

// get actual scale based on rank and alpha
float get_scale(float alpha, float adapter_scale) {
const float rank = (float) b->ne[0];
const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
return scale;
}

llama_lora_weight() = default;
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
};
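The scale returned by get_scale() follows the usual LoRA convention: adapter_scale * alpha / rank when alpha is set, and plain adapter_scale otherwise. A small worked illustration with made-up numbers:

// Illustration only: the rank and alpha values below are hypothetical.
float alpha         = 32.0f;   // lora_alpha stored in the adapter
float adapter_scale = 1.0f;    // user-requested scale
float rank          = 16.0f;   // b->ne[0] in the struct above
float scale         = alpha ? adapter_scale * alpha / rank : adapter_scale;  // == 2.0f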
@ -39,6 +39,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
{ LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_COHERE2, "cohere2" },
{ LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" },
{ LLM_ARCH_OLMO2, "olmo2" },

@ -91,6 +92,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },

@ -807,6 +810,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
{
LLM_ARCH_COHERE2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_DBRX,
{

@ -968,6 +986,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
},
},
{

@ -1350,6 +1369,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
@ -43,6 +43,7 @@ enum llm_arch {
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
LLM_ARCH_COMMAND_R,
LLM_ARCH_COHERE2,
LLM_ARCH_DBRX,
LLM_ARCH_OLMO,
LLM_ARCH_OLMO2,

@ -95,6 +96,8 @@ enum llm_kv {
LLM_KV_EXPERT_USED_COUNT,
LLM_KV_EXPERT_SHARED_COUNT,
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,

@ -230,6 +233,7 @@ enum llm_tensor {
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM,
@ -45,6 +45,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
{ "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
{ "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },

@ -148,6 +149,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_MINICPM;
} else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
} else if (tmpl_contains(LU8("'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'"))) {
return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
// EXAONE-3.0-7.8B-Instruct

@ -453,6 +456,21 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "Assistant:";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
// DeepSeek-V3
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << message->content << "\n\n";
} else if (role == "user") {
ss << LU8("<|User|>") << message->content;
} else if (role == "assistant") {
ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
}
}
if (add_ass) {
ss << LU8("<|Assistant|>");
}
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
// EXAONE-3.0-7.8B-Instruct
@ -25,6 +25,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_VICUNA_ORCA,
LLM_CHAT_TEMPLATE_DEEPSEEK,
LLM_CHAT_TEMPLATE_DEEPSEEK_2,
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
LLM_CHAT_TEMPLATE_COMMAND_R,
LLM_CHAT_TEMPLATE_LLAMA_3,
LLM_CHAT_TEMPLATE_CHATGML_3,
@ -6,7 +6,13 @@

// bump if necessary
#define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3

enum llama_expert_gating_func_type {
LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
};

struct llama_hparams_posnet {
uint32_t n_embd;

@ -55,6 +61,8 @@ struct llama_hparams {
uint32_t n_norm_groups = 0;

float expert_weights_scale = 0.0;
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;

float f_norm_eps;
float f_norm_rms_eps;
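The new expert_gating_func hparam selects between softmax routing (DeepSeek-V2 style) and sigmoid routing (DeepSeek-V3 style) for the MoE router. A schematic of how a graph builder could branch on it using the standard ggml_soft_max/ggml_sigmoid operators; this is an illustrative sketch under those assumptions, not the literal llama.cpp graph code:

// Sketch only: the function name and code shape are hypothetical.
static ggml_tensor * build_expert_gate(ggml_context * ctx, ggml_tensor * logits, uint32_t gating_func) {
    switch (gating_func) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
            return ggml_soft_max(ctx, logits);   // DeepSeek-V2 style routing
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
            return ggml_sigmoid(ctx, logits);    // DeepSeek-V3 style routing
        default:
            return logits;                       // NONE: leave the router logits untouched
    }
}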
@ -1,5 +1,6 @@
#include "llama-impl.h"

#include "gguf.h"
#include "llama.h"

#include <cinttypes>

@ -138,7 +139,7 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = gguf_get_arr_data(ctx_gguf, i);
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
@@ -119,10 +119,10 @@ bool llama_kv_cache_init(
 
 struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
-       const struct llama_ubatch & batch) {
-    const uint32_t n_tokens = batch.n_tokens;
-    const uint32_t n_seqs   = batch.n_seqs;
-    const uint32_t n_seq_tokens = batch.n_seq_tokens;
+       const struct llama_ubatch & ubatch) {
+    const uint32_t n_tokens = ubatch.n_tokens;
+    const uint32_t n_seqs   = ubatch.n_seqs;
+    const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
 
     if (cache.recurrent) {
         // For recurrent state architectures (like Mamba or RWKV),

@@ -130,16 +130,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
         // A slot should be always be contiguous.
 
         // can only process batches with an equal number of new tokens in each sequence
-        GGML_ASSERT(batch.equal_seqs);
+        GGML_ASSERT(ubatch.equal_seqs);
 
         int32_t min = cache.size - 1;
         int32_t max = 0;
 
         // everything should fit if all seq_ids are smaller than the max
         for (uint32_t s = 0; s < n_seqs; ++s) {
-            const uint32_t n_seq_id = batch.n_seq_id[s];
+            const uint32_t n_seq_id = ubatch.n_seq_id[s];
             for (uint32_t j = 0; j < n_seq_id; ++j) {
-                const llama_seq_id seq_id = batch.seq_id[s][j];
+                const llama_seq_id seq_id = ubatch.seq_id[s][j];
 
                 if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
                     // too big seq_id

@@ -198,7 +198,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
 
         // find usable cell range
         for (uint32_t s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[s][0];
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
             llama_kv_cell & seq_meta = cache.cells[seq_id];
             bool has_cell = false;
             if (seq_meta.tail >= 0) {

@@ -237,7 +237,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
         // gather and re-order
         for (uint32_t s = 0; s < n_seqs; ++s) {
             int32_t dst_id = s + min;
-            int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
+            int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
             if (dst_id != src_id) {
                 llama_kv_cell & dst_cell = cache.cells[dst_id];
                 llama_kv_cell & src_cell = cache.cells[src_id];

@@ -258,7 +258,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
 
         // update the pos of the used seqs
         for (uint32_t s = 0; s < n_seqs; ++s) {
-            const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
+            const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
             int32_t cell_id = s + min;
             llama_kv_cell & cell = cache.cells[cell_id];
 

@@ -266,12 +266,12 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
                 // What should happen when the pos backtracks or skips a value?
                 // Clearing the state mid-batch would require special-casing which isn't done.
                 LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
-                    __func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
+                    __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
             }
             cell.pos = last_pos;
             cell.seq_id.clear();
-            for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
-                const llama_seq_id seq_id = batch.seq_id[s][j];
+            for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
+                const llama_seq_id seq_id = ubatch.seq_id[s][j];
                 cell.seq_id.insert(seq_id);
                 cache.cells[seq_id].tail = cell_id;
             }

@@ -325,10 +325,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
     for (uint32_t s = 0; s < n_seqs; s++) {
         for (uint32_t i = 0; i < n_seq_tokens; ++i) {
             uint32_t k = s*n_seq_tokens + i;
-            cache.cells[cache.head + k].pos = batch.pos[k];
+            cache.cells[cache.head + k].pos = ubatch.pos[k];
 
-            for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
-                cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
+            for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
+                cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
             }
         }
     }

@@ -241,12 +241,16 @@ llama_file::~llama_file() = default;
 size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }
 
-int llama_file::fileno() const {
+int llama_file::file_id() const {
 #ifdef _WIN32
     return _fileno(pimpl->fp);
+#else
+#if defined(fileno)
+    return fileno(pimpl->fp);
 #else
     return ::fileno(pimpl->fp);
 #endif
+#endif
 }
 
 void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }

@@ -265,7 +269,7 @@ struct llama_mmap::impl {
 
     impl(struct llama_file * file, size_t prefetch, bool numa) {
         size = file->size();
-        int fd = file->fileno();
+        int fd = file->file_id();
         int flags = MAP_SHARED;
         if (numa) { prefetch = 0; }
 #ifdef __linux__

@@ -357,7 +361,7 @@ struct llama_mmap::impl {
 
         size = file->size();
 
-        HANDLE hFile = (HANDLE) _get_osfhandle(file->fileno());
+        HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());
 
         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
 

@@ -18,7 +18,7 @@ struct llama_file {
     size_t tell() const;
     size_t size() const;
 
-    int fileno() const;
+    int file_id() const; // fileno overload
 
     void seek(size_t offset, int whence) const;
 

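The extra #if defined(fileno) branch accounts for C libraries that provide fileno as a macro, where the :: qualification would not compile; renaming the member to file_id also keeps the declaration clear of such a macro. A minimal POSIX-only sketch of the same idea (hypothetical helper, not the diffed code):

    #include <cstdio>

    // Return the descriptor behind a FILE*, whether fileno is a macro or a plain function.
    static int file_descriptor(std::FILE * fp) {
    #if defined(fileno)
        return fileno(fp);    // macro form cannot be prefixed with ::
    #else
        return ::fileno(fp);  // ordinary POSIX function
    #endif
    }

    int main() {
        std::printf("stdin fd = %d\n", file_descriptor(stdin));
        return 0;
    }
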
@@ -18,7 +18,7 @@ const char * llama_file_version_name(llama_fver version) {
 }
 
 namespace GGUFMeta {
-    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
     struct GKV_Base_Type {
         static constexpr gguf_type gt = gt_;
 

@@ -60,10 +60,11 @@ namespace GGUFMeta {
         public:
         static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
         static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
             return ArrayInfo {
-                gguf_get_arr_type(ctx, k),
+                arr_type,
                 size_t(gguf_get_arr_n(ctx, k)),
-                gguf_get_arr_data(ctx, k),
+                arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
             };
         }
     };

@@ -553,7 +554,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
             const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
             const std::string type_name =
                 type == GGUF_TYPE_ARRAY
-                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
+                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
                 : gguf_type_name(type);
 
             std::string value = gguf_kv_to_str(meta.get(), i);
 

@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
         case MODEL_70B: return "70B";
         case MODEL_236B: return "236B";
         case MODEL_314B: return "314B";
+        case MODEL_671B: return "671B";
         case MODEL_SMALL: return "0.1B";
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE: return "0.8B";

@@ -125,6 +126,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     }
 }
 
+static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
+    switch (type) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
+        default:                                    return "unknown";
+    }
+}
+
 std::string llama_model_arch_name (const llama_model & model) {
     return llm_arch_name(model.arch);
 }

@@ -786,6 +795,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_COHERE2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DBRX:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

@@ -923,11 +942,19 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
+                    // that have no expert_gating_func model parameter set
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+                }
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
 
                 switch (hparams.n_layer) {
                     case 27: model.type = e_model::MODEL_16B; break;
                     case 60: model.type = e_model::MODEL_236B; break;
+                    case 61: model.type = e_model::MODEL_671B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;

@@ -1249,6 +1276,10 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
                 tokenizer_pre == "deepseek-coder") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
             vocab.tokenizer_clean_spaces = false;
+        } else if (
+                tokenizer_pre == "deepseek-v3") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+            vocab.tokenizer_clean_spaces = false;
         } else if (
                 tokenizer_pre == "falcon") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;

@@ -1892,24 +1923,24 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-    if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
-    if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
-    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+    if (vocab.special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+    if (vocab.special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+    if (vocab.special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.special_cls_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+    if (vocab.special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
 
-    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+    if (vocab.linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 
-    if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
-    if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
-    if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
-    if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
-    if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
-    if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
+    if (vocab.special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+    if (vocab.special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+    if (vocab.special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+    if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+    if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+    if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
 
     for (const auto & id : vocab.special_eog_ids) {
         LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );

@@ -1931,6 +1962,8 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
 

@@ -1976,6 +2009,10 @@ struct llama_model_params llama_model_default_params() {
 }
 
 void llama_free_model(struct llama_model * model) {
+    llama_model_free(model);
+}
+
+void llama_model_free(struct llama_model * model) {
     delete model;
 }
 

@@ -2031,6 +2068,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
+        case LLM_ARCH_COHERE2:
         case LLM_ARCH_OLMO:
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK:

@@ -63,6 +63,7 @@ enum llm_type {
     MODEL_70B,
     MODEL_236B,
     MODEL_314B,
+    MODEL_671B,
     MODEL_SMALL,
     MODEL_MEDIUM,
     MODEL_LARGE,

@@ -213,6 +214,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_b = nullptr; // b2
     struct ggml_tensor * ffn_up_b = nullptr; // b3
     struct ggml_tensor * ffn_act = nullptr;
+    struct ggml_tensor * ffn_exp_probs_b = nullptr;
 
     // mamba proj
     struct ggml_tensor * ssm_in = nullptr;

@@ -7,14 +7,12 @@
 #include <algorithm>
 #include <cmath>
 #include <cstring>
+#include <cinttypes>
 #include <fstream>
 #include <mutex>
 #include <thread>
 #include <unordered_map>
 
-// TODO: replace with ggml API call
-#define QK_K 256
-
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {

@@ -22,7 +20,7 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
-struct quantize_state_internal {
+struct quantize_state_impl {
     const llama_model & model;
     const llama_model_quantize_params * params;
 

@@ -43,13 +41,13 @@ struct quantize_state_internal {
     // used to figure out if a model shares tok_embd with the output weight
     bool has_output = false;
 
-    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
         {}
 };
 
-static void llama_tensor_dequantize_internal(
+static void llama_tensor_dequantize_impl(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {

@@ -121,7 +119,7 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
-static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
     // TODO: avoid hardcoded tensor names - use the TN_* constants

@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
             new_type = qs.params->output_tensor_type;
         } else {
-            int nx = tensor->ne[0];
-            if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            const int64_t nx = tensor->ne[0];
+            const int64_t qk_k = ggml_blck_size(new_type);
+
+            if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                 new_type = GGML_TYPE_Q8_0;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||

@@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
     //}
     bool convert_incompatible_tensor = false;
-    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
-        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
-        new_type == GGML_TYPE_IQ1_M) {
-        int nx = tensor->ne[0];
-        int ny = tensor->ne[1];
-        if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+    {
+        const int64_t nx = tensor->ne[0];
+        const int64_t ny = tensor->ne[1];
+        const int64_t qk_k = ggml_blck_size(new_type);
+
+        if (nx % qk_k != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
         } else {
             ++qs.n_k_quantized;
         }
     }
 
     if (convert_incompatible_tensor) {
         switch (new_type) {
             case GGML_TYPE_TQ1_0:

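With the local QK_K define gone, the divisibility check follows the block size of whichever quant type is actually being considered, which ggml reports per type. A small hedged example of querying it (example values; the exact block sizes depend on the ggml build):

    #include "ggml.h"
    #include <cstdio>

    int main() {
        // k-quants use 256-wide blocks, while e.g. Q4_0 uses 32-wide blocks
        const int64_t nx = 4096; // example row width
        for (ggml_type t : { GGML_TYPE_Q6_K, GGML_TYPE_Q4_0 }) {
            const int64_t blck = ggml_blck_size(t);
            std::printf("%s: block size %lld, 4096 %% blck == %lld\n",
                        ggml_type_name(t), (long long) blck, (long long) (nx % blck));
        }
        return 0;
    }
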
@@ -410,7 +409,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }
 
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     if (nthread < 2) {
         // single-thread
         size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);

@@ -464,7 +463,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     return new_size;
 }
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
 

@@ -534,7 +533,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_hparams(ml, model);
     llm_load_stats (ml, model);
 
-    struct quantize_state_internal qs(model, params);
+    struct quantize_state_impl qs(model, params);
 
     if (params->only_copy) {
         ftype = model.ftype;

@@ -837,7 +836,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
             throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
         } else {
-            llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+            llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
             f32_data = (float *) f32_conv_buf.data();
         }
 

@@ -866,7 +865,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                 const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
-                new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
             }
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         }

@@ -875,7 +874,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // update the gguf meta data as we go
         gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
+        GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
 
         // write tensor data + padding
         fout.write((const char *) new_data, new_size);

@@ -919,7 +919,7 @@ uint32_t llama_model_quantize(
         const char * fname_out,
         const llama_model_quantize_params * params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, params);
+        llama_model_quantize_impl(fname_inp, fname_out, params);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
         return 1;

@@ -382,6 +382,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "\\p{N}+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+                regex_exprs = {
+                    "\\p{N}{1,3}",
+                    "[一-龥぀-ゟ゠-ヿ]+",
+                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                 regex_exprs = {
                     "[\r\n]",

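The \\p{N}{1,3} rule pre-splits long digit runs into groups of at most three digits before BPE merges are applied. A standalone illustration of that grouping (plain chunking rather than the tokenizer's actual regex engine):

    #include <cstdio>
    #include <string>

    int main() {
        const std::string digits = "1234567";
        // greedy left-to-right matching of \p{N}{1,3} yields "123", "456", "7"
        for (size_t i = 0; i < digits.size(); i += 3) {
            std::printf("'%s' ", digits.substr(i, 3).c_str());
        }
        std::printf("\n");
        return 0;
    }
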
@@ -490,7 +497,7 @@ struct llm_tokenizer_bpe_session {
 
     bool append_bos(std::vector<llama_vocab::id> & output) const {
         if (vocab.tokenizer_add_bos) {
-            GGML_ASSERT(vocab.special_bos_id != -1);
+            GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
             output.push_back(vocab.special_bos_id);
             return true;
         }

@@ -499,7 +506,7 @@ struct llm_tokenizer_bpe_session {
 
     bool append_eos(std::vector<llama_vocab::id> & output) const {
         if (vocab.tokenizer_add_eos) {
-            GGML_ASSERT(vocab.special_eos_id != -1);
+            GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
             output.push_back(vocab.special_eos_id);
             return true;
         }

@@ -1447,7 +1454,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 bool is_prev_special = true; // prefix with space if first token
 
                 if (add_special && vocab.tokenizer_add_bos) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_bos_id);
                     is_prev_special = true;
                 }

@@ -1482,7 +1489,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 }
 
                 if (add_special && vocab.tokenizer_add_eos) {
-                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;

@@ -1515,7 +1522,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
         case LLAMA_VOCAB_TYPE_WPM:
             {
                 if (add_special) {
-                    GGML_ASSERT(vocab.special_cls_id != -1);
+                    GGML_ASSERT(vocab.special_cls_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_cls_id);
                 }
 

@@ -1535,14 +1542,14 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 }
 
                 if (add_special) {
-                    GGML_ASSERT(vocab.special_sep_id != -1);
+                    GGML_ASSERT(vocab.special_sep_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_sep_id);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_UGM:
             {
                 if (add_special && vocab.tokenizer_add_bos) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_bos_id);
                 }
                 llm_tokenizer_ugm_session session(vocab);

@@ -1567,7 +1574,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 }
 
                 if (add_special && vocab.tokenizer_add_eos) {
-                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;

@@ -1635,7 +1642,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 }
 
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
-    return token != -1 && vocab.special_eog_ids.count(token) > 0;
+    return token != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(token) > 0;
 }
 
 bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {

299
src/llama.cpp

@@ -8,7 +8,6 @@
 #include "llama-kv-cache.h"
 #include "llama-model-loader.h"
 #include "llama-model.h"
-#include "llama-quant.h"
 
 #include "ggml.h"
 #include "ggml-alloc.h"

@@ -18,12 +17,8 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
-#include <cctype>
 #include <cfloat>
-#include <cinttypes>
-#include <climits>
 #include <cmath>
-#include <cstdarg>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>

@@ -31,10 +26,7 @@
 #include <ctime>
 #include <functional>
 #include <initializer_list>
-#include <locale>
 #include <map>
-#include <numeric>
-#include <type_traits>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data

@@ -1552,6 +1544,32 @@ static bool llm_load_tensors(
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 } break;
+            case LLM_ARCH_COHERE2:
+                {
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    // init output from the input tok embed
+                    model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
+                                                 llama_model_loader::TENSOR_DUPLICATED);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+                    }
+                }
+                break;
             case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -1831,6 +1849,7 @@ static bool llm_load_tensors(
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     } else {
                         layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                         if (n_expert == 0) {
                             throw std::runtime_error("n_expert must be > 0");

@@ -2513,21 +2532,36 @@ static struct ggml_tensor * llm_build_inp_embd(
         struct ggml_context * ctx,
         struct llama_context & lctx,
         const llama_hparams & hparams,
-        const llama_ubatch & batch,
+        const llama_ubatch & ubatch,
         struct ggml_tensor * tok_embd,
         const llm_build_cb & cb) {
     const int64_t n_embd = hparams.n_embd;
 
     struct ggml_tensor * inpL;
 
-    if (batch.token) {
-        lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+    if (ubatch.token) {
+        lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens);
         cb(lctx.inp_tokens, "inp_tokens", -1);
         ggml_set_input(lctx.inp_tokens);
 
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
+
+        // apply lora for embedding tokens if needed
+        for (auto & it : lctx.lora_adapters) {
+            struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
+            if (lora == nullptr) {
+                continue;
+            }
+            const float adapter_scale = it.second;
+            const float scale = lora->get_scale(it.first->alpha, adapter_scale);
+            struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
+                ctx, lora->b, // non-transposed lora_b
+                ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
+            ), scale);
+            inpL = ggml_add(ctx, inpL, inpL_delta);
+        }
     } else {
-        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }

@@ -2598,9 +2632,8 @@ static struct ggml_tensor * llm_build_lora_mm(
         if (lora == nullptr) {
             continue;
         }
-        const float alpha = it.first->alpha;
-        const float rank  = (float) lora->b->ne[0];
-        const float scale = alpha ? it.second * alpha / rank : it.second;
+        const float adapter_scale = it.second;
+        const float scale = lora->get_scale(it.first->alpha, adapter_scale);
         struct ggml_tensor * ab_cur = ggml_mul_mat(
             ctx0, lora->b,
             ggml_mul_mat(ctx0, lora->a, cur)

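The two deleted lines show the scaling rule that the new llama_lora_weight::get_scale helper is expected to centralize: the user-provided adapter scale, multiplied by alpha / rank when alpha is non-zero (rank coming from lora->b->ne[0]). A sketch of that rule under this assumption:

    // scale applied to the LoRA delta B(Ax); rank is the adapter's low-rank dimension
    static float lora_scale(float alpha, float adapter_scale, float rank) {
        return alpha != 0.0f ? adapter_scale * alpha / rank : adapter_scale;
    }
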
@@ -2811,12 +2844,14 @@ static struct ggml_tensor * llm_build_moe_ffn(
         struct ggml_tensor * up_exps,
         struct ggml_tensor * gate_exps,
         struct ggml_tensor * down_exps,
+        struct ggml_tensor * exp_probs_b,
         int64_t n_expert,
         int64_t n_expert_used,
         llm_ffn_op_type type_op,
         bool norm_w,
         bool scale_w,
         float w_scale,
+        llama_expert_gating_func_type gating_op,
         const llm_build_cb & cb,
         int il) {
     int64_t n_embd = cur->ne[0];

@@ -2825,11 +2860,31 @@ static struct ggml_tensor * llm_build_moe_ffn(
     ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);
 
-    ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+    ggml_tensor * probs = nullptr;
+    switch (gating_op) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
+            {
+                probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+            } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
+            {
+                probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
+            } break;
+        default:
+            GGML_ABORT("fatal error");
+    }
     cb(probs, "ffn_moe_probs", il);
 
+    // add experts selection bias - introduced in DeepSeek V3
+    // leave probs unbiased as it's later used to get expert weights
+    ggml_tensor * selection_probs = probs;
+    if (exp_probs_b != nullptr) {
+        selection_probs = ggml_add(ctx, probs, exp_probs_b);
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
     // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+    ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
 

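DeepSeek-V3's gating sigmoid-scores the experts and adds a learned per-expert bias only when picking the top-k; the unbiased scores are what later become the mixture weights, exactly as the selection_probs/probs split above encodes. A scalar sketch of that flow with made-up numbers (illustrative only, not model data):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        const std::vector<float> logits = { 0.2f, -1.0f, 1.5f, 0.1f };
        const std::vector<float> bias   = { 0.0f,  2.5f, 0.0f, 0.0f }; // per-expert selection bias
        const int n_expert_used = 2;

        std::vector<float> probs(logits.size()), biased(logits.size());
        for (size_t i = 0; i < logits.size(); ++i) {
            probs[i]  = 1.0f / (1.0f + std::exp(-logits[i])); // sigmoid gating
            biased[i] = probs[i] + bias[i];                   // used for selection only
        }

        // pick the top-k experts by the biased score
        std::vector<int> idx(logits.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return biased[a] > biased[b]; });

        for (int k = 0; k < n_expert_used; ++k) {
            // the weight that feeds the mixture stays unbiased
            std::printf("expert %d weight %.3f\n", idx[k], probs[idx[k]]);
        }
        return 0;
    }
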
@@ -3100,7 +3155,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
 static struct ggml_tensor * llm_build_mamba(
         struct ggml_context * ctx,
         struct llama_context & lctx,
-        const llama_ubatch & batch,
+        const llama_ubatch & ubatch,
         struct ggml_cgraph * graph,
         struct ggml_tensor * cur,
         struct ggml_tensor * state_copy,

@@ -3116,17 +3171,17 @@ static struct ggml_tensor * llm_build_mamba(
     const int64_t d_inner = hparams.ssm_d_inner;
     const int64_t d_state = hparams.ssm_d_state;
     const int64_t dt_rank = hparams.ssm_dt_rank;
-    const int64_t n_seqs  = batch.n_seqs;
+    const int64_t n_seqs  = ubatch.n_seqs;
     // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
     const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
     // Use the same RMS norm as the final layer norm
     const float norm_rms_eps = hparams.f_norm_rms_eps;
 
-    const int64_t n_seq_tokens = batch.n_seq_tokens;
+    const int64_t n_seq_tokens = ubatch.n_seq_tokens;
 
     GGML_ASSERT(n_seqs != 0);
-    GGML_ASSERT(batch.equal_seqs);
-    GGML_ASSERT(batch.n_tokens == n_seq_tokens * n_seqs);
+    GGML_ASSERT(ubatch.equal_seqs);
+    GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
 
     struct ggml_tensor * conv_states_all = kv.k_l[il];
     struct ggml_tensor * ssm_states_all  = kv.v_l[il];

@@ -3926,6 +3981,7 @@ struct llm_build_context {
 
             // feed-forward network
             if (model.layers[il].ffn_gate_inp == nullptr) {
+
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                         model.layers[il].ffn_norm, NULL,
                         LLM_NORM_RMS, cb, il);

@@ -3950,9 +4006,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
             }

@@ -4602,9 +4660,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_GELU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
 

@@ -4743,9 +4803,11 @@ struct llm_build_context {
                        model.layers[il].ffn_up_exps,
                        model.layers[il].ffn_gate_exps,
                        model.layers[il].ffn_down_exps,
+                       nullptr,
                        n_expert, n_expert_used,
                        LLM_FFN_SILU, true,
                        false, 0.0,
+                       LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                        cb, il);
                 cb(cur, "ffn_moe_out", il);
 

@@ -5991,9 +6053,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
 

@@ -7633,6 +7697,137 @@ struct llm_build_context {
 
         }
 
+    struct ggml_cgraph * build_cohere2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        const float f_logit_scale = hparams.f_logit_scale;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        // cohere2 requires different mask for layers using sliding window (SWA)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+
+        // sliding window switch pattern
+        const int32_t sliding_window_pattern = 4;
+
+        for (int il = 0; il < n_layer; ++il) {
+            // three layers sliding window attention (window size 4096) and ROPE
+            // fourth layer uses global attention without positional embeddings
+            const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
+            struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+            struct ggml_tensor * ffn_inp = cur;
+
+            // self-attention
+            {
+                // rope freq factors for 128k context
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                if (is_sliding) {
+                    Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
+                                         beta_fast, beta_slow);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                                         rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
+                                         attn_factor, beta_fast, beta_slow);
+                    cb(Kcur, "Kcur", il);
+                } else {
+                    // For non-sliding layers, just reshape without applying RoPE
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
+                                   KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            }
+
+            struct ggml_tensor * attn_out = cur;
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
+                                    NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
+                                    cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // add together residual + FFN + self-attention
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = ggml_add(ctx0, cur, attn_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        if (f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     // ref: https://allenai.org/olmo
     // based on the original build_llama() function, changes:
     //   * non-parametric layer norm
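Note on the sliding-window pattern above: with sliding_window_pattern = 4, three out of every four layers use the SWA mask plus RoPE and the fourth uses the global mask without positional embeddings. The following is a minimal standalone sketch of that layer-selection rule only (illustrative, not part of the diff):

// Sketch: reproduce the is_sliding selection from build_cohere2() for a few layers.
#include <cstdio>

int main() {
    const int n_layer = 8;
    const int sliding_window_pattern = 4;
    for (int il = 0; il < n_layer; ++il) {
        // layers 0..2 of each group of 4 are sliding-window layers, layer 3 is global
        const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
        std::printf("layer %d: %s\n", il, is_sliding ? "sliding-window + RoPE" : "global, no RoPE");
    }
    return 0;
}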
@@ -7985,9 +8180,11 @@ struct llm_build_context {
                     model.layers[il].ffn_up_exps,
                     model.layers[il].ffn_gate_exps,
                     model.layers[il].ffn_down_exps,
+                    nullptr,
                     n_expert, n_expert_used,
                     LLM_FFN_SILU, false,
                     false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                     cb, il);
             cb(cur, "ffn_moe_out", il);

@@ -8382,9 +8579,11 @@ struct llm_build_context {
                     model.layers[il].ffn_up_exps,
                     model.layers[il].ffn_gate_exps,
                     model.layers[il].ffn_down_exps,
+                    nullptr,
                     n_expert, n_expert_used,
                     LLM_FFN_SILU, true,
                     false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                     cb, il);
             cb(cur, "ffn_moe_out", il);

@@ -8523,9 +8722,11 @@ struct llm_build_context {
                     model.layers[il].ffn_up_exps,
                     model.layers[il].ffn_gate_exps,
                     model.layers[il].ffn_down_exps,
+                    nullptr,
                     n_expert, n_expert_used,
                     LLM_FFN_SILU, false,
                     false, hparams.expert_weights_scale,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                     cb, il);
             cb(moe_out, "ffn_moe_out", il);

@@ -8752,9 +8953,11 @@ struct llm_build_context {
                     model.layers[il].ffn_up_exps,
                     model.layers[il].ffn_gate_exps,
                     model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
                     n_expert, n_expert_used,
-                    LLM_FFN_SILU, false,
+                    LLM_FFN_SILU, hparams.expert_weights_norm,
                     true, hparams.expert_weights_scale,
+                    (enum llama_expert_gating_func_type) hparams.expert_gating_func,
                     cb, il);
             cb(moe_out, "ffn_moe_out", il);
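Note: the new arguments threaded through these calls pass an optional expert-bias tensor and an expert gating function into the MoE FFN; most architectures keep softmax gating, while the last hunk takes the gating function from the model hyperparameters. As a hedged, self-contained illustration of what a gating function choice means for router weights (softmax vs. an element-wise sigmoid; which values the enum actually carries beyond SOFTMAX is not shown in this hunk):

// Illustrative sketch only, not the llama.cpp implementation.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// softmax gating: weights are a probability distribution over experts
static std::vector<float> gate_softmax(const std::vector<float> & logits) {
    float max = logits[0];
    for (float v : logits) max = std::max(max, v);
    std::vector<float> w(logits.size());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) { w[i] = std::exp(logits[i] - max); sum += w[i]; }
    for (float & v : w) v /= sum;
    return w;
}

// sigmoid gating: each expert weight is scored independently
static std::vector<float> gate_sigmoid(const std::vector<float> & logits) {
    std::vector<float> w(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) w[i] = 1.0f / (1.0f + std::exp(-logits[i]));
    return w;
}

int main() {
    const std::vector<float> logits = {0.5f, -1.0f, 2.0f};
    for (float v : gate_softmax(logits)) std::printf("softmax %.3f\n", v);
    for (float v : gate_sigmoid(logits)) std::printf("sigmoid %.3f\n", v);
    return 0;
}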
@@ -10384,6 +10587,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_command_r();
             } break;
+        case LLM_ARCH_COHERE2:
+            {
+                result = llm.build_cohere2();
+            } break;
         case LLM_ARCH_DBRX:
             {
                 result = llm.build_dbrx();
@@ -10517,7 +10724,7 @@ static enum ggml_status llama_graph_compute(
 // return positive int on warning
 // return negative int on error
 //
-static int llama_decode_internal(
+static int llama_decode_impl(
         llama_context & lctx,
         llama_batch inp_batch) {

@@ -10852,7 +11059,7 @@ static int llama_decode_internal(
 // return positive int on warning
 // return negative int on error
 //
-static int llama_encode_internal(
+static int llama_encode_impl(
        llama_context & lctx,
        llama_batch inp_batch) {

@@ -11034,7 +11241,7 @@ static int llama_encode_internal(
 }
 
 // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
     auto & kv_self = lctx.kv_self;
 
     const auto & hparams = lctx.model.hparams;

@@ -11254,7 +11461,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
 }
 
-static void llama_kv_cache_update_internal(struct llama_context & lctx) {
+static void llama_kv_cache_update_impl(struct llama_context & lctx) {
     bool need_reserve = false;
 
     if (lctx.kv_self.has_shift) {

@@ -11290,7 +11497,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
     // defragment the KV cache if needed
     if (lctx.kv_self.do_defrag) {
-        llama_kv_cache_defrag_internal(lctx);
+        llama_kv_cache_defrag_impl(lctx);
 
         need_reserve = true;
 
@@ -11319,13 +11526,7 @@ int32_t llama_lora_adapter_set(
         struct llama_context * ctx,
         struct llama_lora_adapter * adapter,
         float scale) {
-    if (ctx->cparams.flash_attn) {
-        LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
-        return -1;
-    }
-
     ctx->lora_adapters[adapter] = scale;
-
     return 0;
 }
@@ -11456,6 +11657,12 @@ int64_t llama_time_us(void) {
 struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_model_params params) {
+    return llama_model_load_from_file(path_model, params);
+}
+
+struct llama_model * llama_model_load_from_file(
+        const char * path_model,
+        struct llama_model_params params) {
     ggml_time_init();
 
     llama_model * model = new llama_model;
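Note: llama_load_model_from_file is kept as a thin wrapper around the new llama_model_load_from_file, and the matching destructor is renamed to llama_model_free in the hunks below. A hedged usage sketch of the new names, assuming the usual llama.h entry points shown elsewhere in this diff ("model.gguf" is a placeholder path):

// Sketch: load and free a model with the renamed API.
#include "llama.h"

int main() {
    llama_backend_init();
    llama_model_params params = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", params);
    if (model != nullptr) {
        llama_model_free(model);
    }
    llama_backend_free();
    return 0;
}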
@@ -11494,7 +11701,7 @@ struct llama_model * llama_load_model_from_file(
         ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
         if (!rpc_reg) {
             LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
-            llama_free_model(model);
+            llama_model_free(model);
             return nullptr;
         }

@@ -11502,7 +11709,7 @@ struct llama_model * llama_load_model_from_file(
         ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
         if (!ggml_backend_rpc_add_device_fn) {
             LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
-            llama_free_model(model);
+            llama_model_free(model);
             return nullptr;
         }

@@ -11512,7 +11719,7 @@ struct llama_model * llama_load_model_from_file(
                 model->devices.push_back(dev);
             } else {
                 LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
-                llama_free_model(model);
+                llama_model_free(model);
                 return nullptr;
             }
         }

@@ -11544,7 +11751,7 @@ struct llama_model * llama_load_model_from_file(
     if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
         if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
             LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
-            llama_free_model(model);
+            llama_model_free(model);
             return nullptr;
         }
         ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];

@@ -11567,7 +11774,7 @@ struct llama_model * llama_load_model_from_file(
             LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
         }
 
-        llama_free_model(model);
+        llama_model_free(model);
         return nullptr;
     }
 
@@ -11991,7 +12198,7 @@ void llama_kv_cache_defrag(struct llama_context * ctx) {
 }
 
 void llama_kv_cache_update(struct llama_context * ctx) {
-    llama_kv_cache_update_internal(*ctx);
+    llama_kv_cache_update_impl(*ctx);
 }
 
 bool llama_kv_cache_can_shift(struct llama_context * ctx) {

@@ -12003,7 +12210,7 @@ bool llama_kv_cache_can_shift(struct llama_context * ctx) {
 int32_t llama_encode(
         struct llama_context * ctx,
         struct llama_batch batch) {
-    const int ret = llama_encode_internal(*ctx, batch);
+    const int ret = llama_encode_impl(*ctx, batch);
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
     }

@@ -12014,7 +12221,7 @@ int32_t llama_encode(
 int32_t llama_decode(
         struct llama_context * ctx,
         struct llama_batch batch) {
-    const int ret = llama_decode_internal(*ctx, batch);
+    const int ret = llama_decode_impl(*ctx, batch);
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
@@ -12234,16 +12441,16 @@ int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix,
     return 0;
 }
 
-int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
+int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
     std::string str_split_path(split_path);
     char postfix[32];
     snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
     std::string str_postfix(postfix);
 
-    // check if dest ends with postfix
+    // check if split_prefix ends with postfix
     int size_prefix = str_split_path.size() - str_postfix.size();
     if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
-        snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
+        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
         return size_prefix;
     }
 
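Note: only the output parameter is renamed here; behaviour is unchanged. A short hedged usage sketch of the signature shown above (hypothetical file name): the postfix built from split_no + 1 = 2 and split_count = 5 is "-00002-of-00005.gguf", so for "model-00002-of-00005.gguf" the function copies "model" into the destination buffer and returns its length, 5.

// Sketch: recover the split prefix from a split file name.
#include <cstdio>
#include "llama.h"

int main() {
    char prefix[128] = {0};
    const int n = llama_split_prefix(prefix, sizeof(prefix), "model-00002-of-00005.gguf", 1, 5);
    std::printf("%d '%s'\n", n, prefix); // expected: 5 'model'
    return 0;
}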
@@ -12252,6 +12459,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
 
 const char * llama_print_system_info(void) {
     static std::string s;
+    s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
+
     for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
         auto * reg = ggml_backend_reg_get(i);
@@ -667,18 +667,24 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         { "\\p{N}", unicode_cpt_flags::NUMBER },
         { "\\p{L}", unicode_cpt_flags::LETTER },
         { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
+        { "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
+        { "\\p{S}", unicode_cpt_flags::SYMBOL },
     };
 
     static const std::map<int, int> k_ucat_cpt = {
         { unicode_cpt_flags::NUMBER, 0xD1 },
         { unicode_cpt_flags::LETTER, 0xD2 },
         { unicode_cpt_flags::PUNCTUATION, 0xD3 },
+        { unicode_cpt_flags::ACCENT_MARK, 0xD4 },
+        { unicode_cpt_flags::SYMBOL, 0xD5 },
     };
 
     static const std::map<int, std::string> k_ucat_map = {
         { unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9
         { unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
         { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+        { unicode_cpt_flags::ACCENT_MARK, "" }, // no sub-128 codepoints
+        { unicode_cpt_flags::SYMBOL, "\\\x24\\\x2B\x3C-\x3E\x5E\x60\\\x7C" }, // $+<=>^`|
     };
 
     // compute collapsed codepoints only if needed by at least one regex
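Note: the new entries let the std::regex fallback handle \p{M} (accent marks) and \p{S} (symbols) by collapsing codepoints in those categories to the single codepoints 0xD4 and 0xD5; the sub-128 class listed for \p{S} covers the ASCII characters $ + < = > ^ ` |. A small illustrative sketch of that ASCII class only (not the llama.cpp code path):

// Illustrative: test the ASCII symbol class used for \p{S} above with std::regex.
#include <cstdio>
#include <regex>
#include <string>

int main() {
    const std::regex symbol("[$+<=>^`|]"); // same set as the sub-128 range in the table
    for (const std::string s : {"$", "a", "|", "?"}) {
        std::printf("%s -> %s\n", s.c_str(), std::regex_search(s, symbol) ? "symbol" : "not a symbol");
    }
    return 0;
}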
@@ -13,10 +13,10 @@ int main(int argc, char ** argv) {
 
     std::thread([&model_path]() {
         llama_backend_init();
-        auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
+        auto * model = llama_model_load_from_file(model_path, llama_model_default_params());
         auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
         llama_free(ctx);
-        llama_free_model(model);
+        llama_model_free(model);
         llama_backend_free();
     }).join();
 
@@ -24,8 +24,9 @@ enum handcrafted_file_type {
 
     HANDCRAFTED_KV_BAD_KEY_SIZE = 10 + offset_has_kv,
     HANDCRAFTED_KV_BAD_TYPE = 20 + offset_has_kv,
-    HANDCRAFTED_KV_BAD_VALUE_SIZE = 30 + offset_has_kv,
+    // HANDCRAFTED_KV_BAD_VALUE_SIZE = 30 + offset_has_kv, // removed because it can result in allocations > 1 TB (default sanitizer limit)
     HANDCRAFTED_KV_DUPLICATE_KEY = 40 + offset_has_kv,
+    HANDCRAFTED_KV_BAD_ALIGN = 50 + offset_has_kv,
     HANDCRAFTED_KV_SUCCESS = 800 + offset_has_kv,
 
     HANDCRAFTED_TENSORS_BAD_NAME_SIZE = 10 + offset_has_tensors,

@@ -35,12 +36,14 @@ enum handcrafted_file_type {
     HANDCRAFTED_TENSORS_BAD_TYPE = 50 + offset_has_tensors,
     HANDCRAFTED_TENSORS_BAD_OFFSET = 60 + offset_has_tensors,
     HANDCRAFTED_TENSORS_DUPLICATE_NAME = 70 + offset_has_tensors,
-    HANDCRAFTED_TENSORS_BAD_ALIGNMENT = 80 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_BAD_ALIGN = 75 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN = 80 + offset_has_tensors,
     HANDCRAFTED_TENSORS_SUCCESS = 800 + offset_has_tensors,
     HANDCRAFTED_TENSORS_CUSTOM_ALIGN = 810 + offset_has_tensors,
 
     HANDCRAFTED_DATA_NOT_ENOUGH_DATA = 10 + offset_has_data,
-    HANDCRAFTED_DATA_BAD_ALIGNMENT = 20 + offset_has_data,
+    HANDCRAFTED_DATA_BAD_ALIGN = 15 + offset_has_data,
+    HANDCRAFTED_DATA_INCONSISTENT_ALIGN = 20 + offset_has_data,
     HANDCRAFTED_DATA_SUCCESS = 800 + offset_has_data,
     HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data,
 };

@@ -56,8 +59,8 @@ std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
 
         case HANDCRAFTED_KV_BAD_KEY_SIZE: return "KV_BAD_KEY_SIZE";
         case HANDCRAFTED_KV_BAD_TYPE: return "KV_BAD_TYPE";
-        case HANDCRAFTED_KV_BAD_VALUE_SIZE: return "KV_BAD_VALUE_SIZE";
         case HANDCRAFTED_KV_DUPLICATE_KEY: return "KV_DUPLICATE_KEY";
+        case HANDCRAFTED_KV_BAD_ALIGN: return "KV_BAD_ALIGN";
         case HANDCRAFTED_KV_SUCCESS: return "KV_RANDOM_KV";
 
         case HANDCRAFTED_TENSORS_BAD_NAME_SIZE: return "TENSORS_BAD_NAME_SIZE";

@@ -67,12 +70,14 @@ std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
         case HANDCRAFTED_TENSORS_BAD_TYPE: return "TENSORS_BAD_TYPE";
         case HANDCRAFTED_TENSORS_BAD_OFFSET: return "TENSORS_BAD_OFFSET";
         case HANDCRAFTED_TENSORS_DUPLICATE_NAME: return "TENSORS_DUPLICATE_NAME";
-        case HANDCRAFTED_TENSORS_BAD_ALIGNMENT: return "TENSORS_BAD_ALIGNMENT";
+        case HANDCRAFTED_TENSORS_BAD_ALIGN: return "TENSORS_BAD_ALIGN";
+        case HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN: return "TENSORS_INCONSISTENT_ALIGN";
         case HANDCRAFTED_TENSORS_SUCCESS: return "TENSORS_SUCCESS";
         case HANDCRAFTED_TENSORS_CUSTOM_ALIGN: return "TENSORS_CUSTOM_ALIGN";
 
         case HANDCRAFTED_DATA_NOT_ENOUGH_DATA: return "DATA_NOT_ENOUGH_DATA";
-        case HANDCRAFTED_DATA_BAD_ALIGNMENT: return "DATA_BAD_ALIGNMENT";
+        case HANDCRAFTED_DATA_BAD_ALIGN: return "DATA_BAD_ALIGN";
+        case HANDCRAFTED_DATA_INCONSISTENT_ALIGN: return "DATA_INCONSISTENT_ALIGN";
         case HANDCRAFTED_DATA_SUCCESS: return "DATA_SUCCESS";
         case HANDCRAFTED_DATA_CUSTOM_ALIGN: return "DATA_CUSTOM_ALIGN";
     }
@@ -140,31 +145,41 @@ std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937
     return kv_types;
 }
 
-static void helper_write(const void * data, const size_t nbytes, FILE * file) {
+template <typename T>
+static void helper_write(FILE * file, const T & val) {
+    GGML_ASSERT(fwrite(&val, 1, sizeof(val), file) == sizeof(val));
+}
+
+static void helper_write(FILE * file, const void * data, const size_t nbytes) {
     GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes);
 }
 
 static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
     FILE * file = tmpfile();
 
+    if (!file) {
+        return file;
+    }
+
     std::mt19937 rng(seed);
+    uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
 
     if (hft == HANDCRAFTED_HEADER_BAD_MAGIC) {
         const char bad_magic[4] = {'F', 'U', 'G', 'G'};
-        helper_write(bad_magic, sizeof(bad_magic), file);
+        helper_write(file, bad_magic, sizeof(bad_magic));
     } else {
-        helper_write(GGUF_MAGIC, 4, file);
+        helper_write(file, GGUF_MAGIC, 4);
     }
 
     if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) {
         const uint32_t version = 1;
-        helper_write(&version, sizeof(version), file);
+        helper_write(file, version);
     } else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_FUTURE) {
         const uint32_t version = GGUF_VERSION + 1;
-        helper_write(&version, sizeof(version), file);
+        helper_write(file, version);
     } else {
         const uint32_t version = GGUF_VERSION;
-        helper_write(&version, sizeof(version), file);
+        helper_write(file, version);
     }
 
     std::vector<tensor_config_t> tensor_configs;

@@ -174,10 +189,10 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
 
     if (hft == HANDCRAFTED_HEADER_BAD_N_TENSORS) {
         const uint64_t n_tensors = -1;
-        helper_write(&n_tensors, sizeof(n_tensors), file);
+        helper_write(file, n_tensors);
     } else {
         const uint64_t n_tensors = tensor_configs.size();
-        helper_write(&n_tensors, sizeof(n_tensors), file);
+        helper_write(file, n_tensors);
     }
 
     std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
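Note: the rewritten helper takes the FILE * first and adds a typed overload, so fixed-size values can be written without spelling out their byte count. A hedged, standalone usage sketch mirroring the two signatures above (the magic bytes and version value are placeholders, not GGUF constants):

// Sketch: write a typed value and a raw byte range with the two helper_write overloads.
#include <cstdint>
#include <cstdio>

template <typename T>
static void helper_write(FILE * file, const T & val) {
    if (fwrite(&val, 1, sizeof(val), file) != sizeof(val)) {
        std::perror("fwrite"); // typed overload: size comes from sizeof(val)
    }
}

static void helper_write(FILE * file, const void * data, const size_t nbytes) {
    if (fwrite(data, 1, nbytes, file) != nbytes) {
        std::perror("fwrite"); // raw overload: caller supplies the byte count
    }
}

int main() {
    FILE * file = tmpfile();
    if (!file) return 1;
    const uint32_t version = 3;
    helper_write(file, version);               // writes sizeof(uint32_t) bytes
    const char magic[4] = {'G', 'G', 'U', 'F'};
    helper_write(file, magic, sizeof(magic));  // writes exactly 4 bytes
    fclose(file);
    return 0;
}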
|
@ -186,41 +201,49 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
uint64_t n_kv = kv_types.size();
|
uint64_t n_kv = kv_types.size();
|
||||||
if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
|
if (hft == HANDCRAFTED_KV_BAD_ALIGN ||
|
||||||
|
hft == HANDCRAFTED_TENSORS_BAD_ALIGN || hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN ||
|
||||||
|
hft == HANDCRAFTED_DATA_BAD_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
|
||||||
|
|
||||||
n_kv += 1;
|
n_kv += 1;
|
||||||
} else if (hft == HANDCRAFTED_HEADER_BAD_N_KV) {
|
} else if (hft == HANDCRAFTED_HEADER_BAD_N_KV) {
|
||||||
n_kv = -1;
|
n_kv = -1;
|
||||||
}
|
}
|
||||||
helper_write(&n_kv, sizeof(n_kv), file);
|
helper_write(file, n_kv);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hft < offset_has_kv) {
|
if (hft < offset_has_kv) {
|
||||||
|
while (ftell(file) % alignment != 0) {
|
||||||
|
const char pad = 0;
|
||||||
|
helper_write(file, pad);
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < extra_bytes; ++i) {
|
for (int i = 0; i < extra_bytes; ++i) {
|
||||||
const char tmp = 0;
|
const char tmp = 0;
|
||||||
helper_write(&tmp, sizeof(tmp), file);
|
helper_write(file, tmp);
|
||||||
}
|
}
|
||||||
rewind(file);
|
rewind(file);
|
||||||
return file;
|
return file;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < int(kv_types.size()); ++i) {
|
for (int i = 0; i < int(kv_types.size()); ++i) {
|
||||||
const enum gguf_type type = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].first);
|
const enum gguf_type type = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? GGUF_TYPE_COUNT : kv_types[i].first);
|
||||||
const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].second);
|
const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? GGUF_TYPE_COUNT : kv_types[i].second);
|
||||||
|
|
||||||
const std::string key = "my_key_" + std::to_string((hft == HANDCRAFTED_KV_DUPLICATE_KEY ? i/2 : i));
|
const std::string key = "my_key_" + std::to_string((hft == HANDCRAFTED_KV_DUPLICATE_KEY ? i/2 : i));
|
||||||
|
|
||||||
if (hft == HANDCRAFTED_KV_BAD_KEY_SIZE) {
|
if (hft == HANDCRAFTED_KV_BAD_KEY_SIZE) {
|
||||||
const uint64_t n = -1;
|
const uint64_t n = -1;
|
||||||
helper_write(&n, sizeof(n), file);
|
helper_write(file, n);
|
||||||
} else {
|
} else {
|
||||||
const uint64_t n = key.length();
|
const uint64_t n = key.length();
|
||||||
helper_write(&n, sizeof(n), file);
|
helper_write(file, n);
|
||||||
}
|
}
|
||||||
helper_write(key.data(), key.length(), file);
|
helper_write(file, key.data(), key.length());
|
||||||
|
|
||||||
{
|
{
|
||||||
const int32_t type32 = int32_t(type);
|
const int32_t type32 = int32_t(type);
|
||||||
helper_write(&type32, sizeof(type32), file);
|
helper_write(file, type32);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t data[16];
|
uint32_t data[16];
|
||||||
|
@@ -233,69 +256,67 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
 
         if (type == GGUF_TYPE_STRING) {
             const uint64_t n = rng() % sizeof(data);
-            helper_write(&n, sizeof(n), file);
-            helper_write(data, n, file);
+            helper_write(file, n);
+            helper_write(file, data, n);
             continue;
         }
 
         if (type == GGUF_TYPE_ARRAY) {
             {
                 const int32_t type32 = int32_t(type_arr);
-                helper_write(&type32, sizeof(type32), file);
+                helper_write(file, type32);
             }
             if (type_arr == GGUF_TYPE_STRING) {
                 const uint64_t nstr = rng() % (16 + 1);
-                helper_write(&nstr, sizeof(nstr), file);
+                helper_write(file, nstr);
                 for (uint64_t istr = 0; istr < nstr; ++istr) {
                     const uint64_t n = rng() % (sizeof(uint32_t) + 1);
-                    helper_write(&n, sizeof(n), file);
-                    helper_write(&data[istr], n, file);
+                    helper_write(file, n);
+                    helper_write(file, &data[istr], n);
                 }
                 continue;
             }
             const size_t type_size = gguf_type_size(type_arr);
             const uint64_t n = (rng() % sizeof(data)) / type_size;
-            helper_write(&n, sizeof(n), file);
-            helper_write(&data, n*type_size, file);
+            helper_write(file, n);
+            helper_write(file, &data, n*type_size);
             continue;
         }
 
-        size_t type_size = hft == HANDCRAFTED_KV_BAD_TYPE ? 1 : gguf_type_size(type);
-        if (hft == HANDCRAFTED_KV_BAD_VALUE_SIZE) {
-            type_size += rng() % 3;
-        }
-        helper_write(data, type_size, file);
+        helper_write(file, data, hft == HANDCRAFTED_KV_BAD_TYPE ? 1 : gguf_type_size(type));
     }
 
-    if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
-        const std::string key = "general.alignment";
-        {
-            const uint64_t n = key.length();
-            helper_write(&n, sizeof(n), file);
-        }
-        helper_write(key.data(), key.length(), file);
+    if (hft == HANDCRAFTED_KV_BAD_ALIGN ||
+        hft == HANDCRAFTED_TENSORS_BAD_ALIGN || hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN ||
+        hft == HANDCRAFTED_DATA_BAD_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+
+        const uint64_t n = strlen(GGUF_KEY_GENERAL_ALIGNMENT);
+        helper_write(file, n);
+        helper_write(file, GGUF_KEY_GENERAL_ALIGNMENT, n);
 
         const int32_t type = gguf_type(GGUF_TYPE_UINT32);
-        helper_write(&type, sizeof(type), file);
+        helper_write(file, type);
 
-        const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT + 1;
-        helper_write(&alignment, sizeof(alignment), file);
+        alignment = expect_context_not_null(hft) ? 1 : 13;
+        helper_write(file, alignment);
     }
 
     if (hft < offset_has_tensors) {
+        while (ftell(file) % alignment != 0) {
+            const char pad = 0;
+            helper_write(file, pad);
+        }
+
         for (int i = 0; i < extra_bytes; ++i) {
             const char tmp = 0;
-            helper_write(&tmp, sizeof(tmp), file);
+            helper_write(file, tmp);
         }
         rewind(file);
         return file;
     }
 
-    uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
-    if (hft == HANDCRAFTED_TENSORS_BAD_ALIGNMENT || hft == HANDCRAFTED_DATA_BAD_ALIGNMENT) {
-        alignment -= 1;
-    } else if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
-        alignment += 1;
+    if (hft == HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN || hft == HANDCRAFTED_DATA_INCONSISTENT_ALIGN) {
+        alignment = 1;
     }
 
     uint64_t offset = 0;
@@ -313,9 +334,9 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
         }
         {
             const uint64_t n = name.length();
-            helper_write(&n, sizeof(n), file);
+            helper_write(file, n);
         }
-        helper_write(name.data(), name.length(), file);
+        helper_write(file, name.data(), name.length());
 
         uint32_t n_dims = hft == HANDCRAFTED_TENSORS_NE_TOO_BIG ? 2 : 1;
         for (int i = GGML_MAX_DIMS-1; i >= 1; --i) {

@@ -326,35 +347,35 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
         }
         if (hft == HANDCRAFTED_TENSORS_BAD_N_DIMS) {
             const uint32_t n_dims_bad = GGML_MAX_DIMS + 1;
-            helper_write(&n_dims_bad, sizeof(n_dims_bad), file);
+            helper_write(file, n_dims_bad);
         } else {
-            helper_write(&n_dims, sizeof(n_dims), file);
+            helper_write(file, n_dims);
         }
 
         if (hft == HANDCRAFTED_TENSORS_BAD_SHAPE) {
             for (uint32_t j = 0; j < n_dims; ++j) {
                 const int64_t bad_dim = -1;
-                helper_write(&bad_dim, sizeof(bad_dim), file);
+                helper_write(file, bad_dim);
             }
         } else if (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG){
             for (uint32_t j = 0; j < n_dims; ++j) {
                 const int64_t big_dim = 4*int64_t(INT32_MAX);
-                helper_write(&big_dim, sizeof(big_dim), file);
+                helper_write(file, big_dim);
             }
         } else {
-            helper_write(shape.data(), n_dims*sizeof(int64_t), file);
+            helper_write(file, shape.data(), n_dims*sizeof(int64_t));
        }
 
         {
-            const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? -1 : int32_t(type);
-            helper_write(&type32, sizeof(type32), file);
+            const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? GGML_TYPE_COUNT : int32_t(type);
+            helper_write(file, type32);
         }
 
         if (hft == HANDCRAFTED_TENSORS_BAD_OFFSET) {
             const uint64_t bad_offset = -1;
-            helper_write(&bad_offset, sizeof(bad_offset), file);
+            helper_write(file, bad_offset);
         } else {
-            helper_write(&offset, sizeof(offset), file);
+            helper_write(file, offset);
         }
 
         int64_t ne = shape[0];
|
||||||
offset += GGML_PAD(ggml_row_size(type, ne), alignment);
|
offset += GGML_PAD(ggml_row_size(type, ne), alignment);
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint32_t alignment_overshoot = ftell(file) % alignment;
|
while (ftell(file) % alignment != 0) {
|
||||||
if (alignment_overshoot != 0) {
|
|
||||||
for (size_t i = alignment_overshoot; i < alignment; ++i) {
|
|
||||||
const char pad = 0;
|
const char pad = 0;
|
||||||
helper_write(&pad, sizeof(pad), file);
|
helper_write(file, pad);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hft >= offset_has_data) {
|
if (hft >= offset_has_data) {
|
||||||
|
@ -380,13 +398,13 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
|
||||||
}
|
}
|
||||||
for (uint64_t i = 0; i < nbytes; ++i) {
|
for (uint64_t i = 0; i < nbytes; ++i) {
|
||||||
const uint8_t random_byte = i % 256;
|
const uint8_t random_byte = i % 256;
|
||||||
helper_write(&random_byte, sizeof(random_byte), file);
|
helper_write(file, random_byte);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < extra_bytes; ++i) {
|
for (int i = 0; i < extra_bytes; ++i) {
|
||||||
const char tmp = 0;
|
const char tmp = 0;
|
||||||
helper_write(&tmp, sizeof(tmp), file);
|
helper_write(file, tmp);
|
||||||
}
|
}
|
||||||
rewind(file);
|
rewind(file);
|
||||||
return file;
|
return file;
|
||||||
|
@@ -505,6 +523,16 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i
             }
 
             const char * data_gguf = reinterpret_cast<const char *>(gguf_get_arr_data(gguf_ctx, id));
+
+            if (type_arr == GGUF_TYPE_BOOL) {
+                for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
+                    if (bool(data8[arr_i]) != bool(data_gguf[arr_i])) {
+                        ok = false;
+                    }
+                }
+                continue;
+            }
+
             if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) {
                 ok = false;
             }

@@ -512,12 +540,20 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i
         }
 
         const char * data_gguf = reinterpret_cast<const char *>(gguf_get_val_data(gguf_ctx, id));
+
+        if (type == GGUF_TYPE_BOOL) {
+            if (bool(*data8) != bool(*data_gguf)) {
+                ok = false;
+            }
+            continue;
+        }
+
         if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) {
             ok = false;
         }
     }
 
-    const uint32_t expected_alignment = alignment_defined ? GGUF_DEFAULT_ALIGNMENT + 1 : GGUF_DEFAULT_ALIGNMENT;
+    const uint32_t expected_alignment = alignment_defined ? 1 : GGUF_DEFAULT_ALIGNMENT;
     if (gguf_get_alignment(gguf_ctx) != expected_alignment) {
         ok = false;
     }
|
@ -539,7 +575,7 @@ static bool handcrafted_check_tensors(const gguf_context * gguf_ctx, const unsig
|
||||||
|
|
||||||
bool ok = true;
|
bool ok = true;
|
||||||
|
|
||||||
const int id_alignment = gguf_find_key(gguf_ctx, "general.alignment");
|
const int id_alignment = gguf_find_key(gguf_ctx, GGUF_KEY_GENERAL_ALIGNMENT);
|
||||||
const uint32_t alignment = id_alignment >= 0 ? gguf_get_val_u32(gguf_ctx, id_alignment) : GGUF_DEFAULT_ALIGNMENT;
|
const uint32_t alignment = id_alignment >= 0 ? gguf_get_val_u32(gguf_ctx, id_alignment) : GGUF_DEFAULT_ALIGNMENT;
|
||||||
|
|
||||||
uint64_t expected_offset = 0;
|
uint64_t expected_offset = 0;
|
||||||
|
@ -607,7 +643,7 @@ static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const u
|
||||||
|
|
||||||
std::vector<uint8_t> data(size);
|
std::vector<uint8_t> data(size);
|
||||||
GGML_ASSERT(fseek(file, gguf_get_data_offset(gguf_ctx) + offset, SEEK_SET) == 0);
|
GGML_ASSERT(fseek(file, gguf_get_data_offset(gguf_ctx) + offset, SEEK_SET) == 0);
|
||||||
GGML_ASSERT(fread(data.data(), 1, size, file) == size);
|
GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size());
|
||||||
|
|
||||||
for (size_t j = 0; j < size; ++j) {
|
for (size_t j = 0; j < size; ++j) {
|
||||||
const uint8_t expected_byte = (j + offset) % 256;
|
const uint8_t expected_byte = (j + offset) % 256;
|
||||||
|
@@ -627,15 +663,15 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
     const std::vector<handcrafted_file_type> hfts = {
         HANDCRAFTED_HEADER_BAD_MAGIC,
         HANDCRAFTED_HEADER_BAD_VERSION_1,
-        // HANDCRAFTED_FILE_TYPE_BAD_VERSION_FUTURE, // FIXME
+        HANDCRAFTED_HEADER_BAD_VERSION_FUTURE,
         HANDCRAFTED_HEADER_BAD_N_KV,
         HANDCRAFTED_HEADER_BAD_N_TENSORS,
         HANDCRAFTED_HEADER_EMPTY,
 
         HANDCRAFTED_KV_BAD_KEY_SIZE,
         HANDCRAFTED_KV_BAD_TYPE,
-        // HANDCRAFTED_KV_BAD_VALUE_SIZE, // FIXME sanitizer limit
-        // HANDCRAFTED_FILE_TYPE_DUPLICATE_KEY, // FIXME
+        HANDCRAFTED_KV_DUPLICATE_KEY,
+        HANDCRAFTED_KV_BAD_ALIGN,
         HANDCRAFTED_KV_SUCCESS,
 
         HANDCRAFTED_TENSORS_BAD_NAME_SIZE,

@@ -643,14 +679,16 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
         HANDCRAFTED_TENSORS_BAD_SHAPE,
         HANDCRAFTED_TENSORS_NE_TOO_BIG,
         HANDCRAFTED_TENSORS_BAD_TYPE,
-        // HANDCRAFTED_TENSORS_BAD_OFFSET, // FIXME
+        HANDCRAFTED_TENSORS_BAD_OFFSET,
         HANDCRAFTED_TENSORS_DUPLICATE_NAME,
-        // HANDCRAFTED_TENSORS_BAD_ALIGNMENT, // FIXME
+        HANDCRAFTED_TENSORS_BAD_ALIGN,
+        HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN,
         HANDCRAFTED_TENSORS_SUCCESS,
         HANDCRAFTED_TENSORS_CUSTOM_ALIGN,
 
         HANDCRAFTED_DATA_NOT_ENOUGH_DATA,
-        // HANDCRAFTED_DATA_BAD_ALIGNMENT, // FIXME
+        HANDCRAFTED_DATA_BAD_ALIGN,
+        HANDCRAFTED_DATA_INCONSISTENT_ALIGN,
         HANDCRAFTED_DATA_SUCCESS,
         HANDCRAFTED_DATA_CUSTOM_ALIGN,
     };
|
@ -674,6 +712,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
|
||||||
/*no_alloc =*/ false,
|
/*no_alloc =*/ false,
|
||||||
/*ctx =*/ hft >= offset_has_data ? &ctx : nullptr,
|
/*ctx =*/ hft >= offset_has_data ? &ctx : nullptr,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
|
struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
|
||||||
|
|
||||||
if (expect_context_not_null(hft)) {
|
if (expect_context_not_null(hft)) {
|
||||||
|
@ -689,7 +728,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
|
||||||
}
|
}
|
||||||
ntest++;
|
ntest++;
|
||||||
|
|
||||||
if (false && hft >= offset_has_data && !expect_context_not_null(hft)) { // FIXME
|
if (hft >= offset_has_data && !expect_context_not_null(hft)) {
|
||||||
printf("%s: - no_dangling_ggml_context_pointer: ", __func__);
|
printf("%s: - no_dangling_ggml_context_pointer: ", __func__);
|
||||||
if (ctx) {
|
if (ctx) {
|
||||||
printf("\033[1;31mFAIL\033[0m\n");
|
printf("\033[1;31mFAIL\033[0m\n");
|
||||||
|
@@ -700,23 +739,6 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
             ntest++;
         }
 
-        if (false && expect_context_not_null(hft)) { // FIXME
-            FILE * file_eb = get_handcrafted_file(seed, hft, /*extra_bytes =*/ 1);
-            struct gguf_context * gguf_ctx_eb = gguf_init_from_file_impl(file_eb, gguf_params);
-
-            printf("%s: - context_null_with_extra_bytes: ", __func__);
-            if (gguf_ctx_eb) {
-                printf("\033[1;31mFAIL\033[0m\n");
-            } else {
-                printf("\033[1;32mOK\033[0m\n");
-                npass++;
-            }
-            ntest++;
-
-            gguf_free(gguf_ctx_eb);
-            fclose(file_eb);
-        }
-
         const bool alignment_defined = hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN;
 
         if (expect_context_not_null(hft)) {
@@ -763,14 +785,15 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
             ntest++;
         }
 
+        fclose(file);
         if (gguf_ctx) {
             ggml_free(ctx);
             gguf_free(gguf_ctx);
         }
-        fclose(file);
         printf("\n");
     }
 
 
     return std::make_pair(npass, ntest);
 }
@@ -789,10 +812,6 @@ static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t
         const std::string key = "my_key_" + std::to_string(rng() % 1024);
         const enum gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT);
 
-        if (type == GGUF_TYPE_STRING || type == GGUF_TYPE_ARRAY) {
-            continue; // FIXME memory leak
-        }
-
         switch (type) {
             case GGUF_TYPE_UINT8: gguf_set_val_u8 (gguf_ctx, key.c_str(), rng() % (1 << 7)); break;
             case GGUF_TYPE_INT8: gguf_set_val_i8 (gguf_ctx, key.c_str(), rng() % (1 << 7) - (1 << 6)); break;

@@ -826,6 +845,9 @@ static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t
                 std::vector<uint32_t> random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t));
                 for (size_t j = 0; j < random_data.size(); ++j) {
                     random_data[j] = rng();
+                    if (type_arr == GGUF_TYPE_BOOL) {
+                        random_data[j] &= 0x01010101; // the sanitizer complains if booleans are not 0 or 1
+                    }
                 }
                 gguf_set_arr_data(gguf_ctx, key.c_str(), type_arr, random_data.data(), ne);
             } break;
@@ -928,6 +950,17 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
             continue;
         }
 
+        if (type_arr == GGUF_TYPE_BOOL) {
+            const int8_t * data = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx, id));
+            const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other));
+            for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
+                if (bool(data[arr_i]) != bool(data_other[arr_i])) {
+                    ok = false;
+                }
+            }
+            continue;
+        }
+
         if (type_arr == GGUF_TYPE_STRING) {
             for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
                 const std::string str = gguf_get_arr_str(ctx, id, arr_i);

@@ -939,8 +972,8 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
             continue;
         }
 
-        const char * data = reinterpret_cast<const char *>(gguf_get_arr_data(ctx, id));
-        const char * data_other = reinterpret_cast<const char *>(gguf_get_arr_data(other, idx_other));
+        const int8_t * data = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx, id));
+        const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other));
         if (!std::equal(data, data + arr_n*gguf_type_size(type_arr), data_other)) {
             ok = false;
         }
@@ -1028,21 +1061,6 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
 }
 
 static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
-    FILE * file = tmpfile();
-#ifdef _WIN32
-    if (!file) {
-        printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
-        printf("%s: skipping tests");
-        return std::make_pair(0, 0);
-    }
-#else
-    GGML_ASSERT(file);
-#endif // _WIN32
-
-    if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
-        return std::make_pair(0, 0); // FIXME
-    }
-
     ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
     printf("%s: device=%s, backend=%s, only_meta=%s\n",
            __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no");
@@ -1060,10 +1078,24 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
         bbuf = result.buffer;
     }
 
-    struct gguf_buf gbuf = gguf_buf_init(16 * 1024);
-    gguf_write_to_buf(gguf_ctx_0, &gbuf, only_meta);
-    helper_write(gbuf.data, gbuf.offset, file);
+    FILE * file = tmpfile();
+
+#ifdef _WIN32
+    if (!file) {
+        printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
+        printf("%s: skipping tests");
+        return std::make_pair(0, 0);
+    }
+#else
+    GGML_ASSERT(file);
+#endif // _WIN32
+
+    {
+        std::vector<int8_t> buf;
+        gguf_write_to_buf(gguf_ctx_0, buf, only_meta);
+        GGML_ASSERT(fwrite(buf.data(), 1, buf.size(), file) == buf.size());
         rewind(file);
+    }
 
     struct ggml_context * ctx_1 = nullptr;
     struct gguf_init_params gguf_params = {
|
||||||
ggml_free(ctx_1);
|
ggml_free(ctx_1);
|
||||||
gguf_free(gguf_ctx_0);
|
gguf_free(gguf_ctx_0);
|
||||||
gguf_free(gguf_ctx_1);
|
gguf_free(gguf_ctx_1);
|
||||||
gguf_buf_free(gbuf);
|
|
||||||
ggml_backend_free(backend);
|
ggml_backend_free(backend);
|
||||||
GGML_ASSERT(fclose(file) == 0);
|
fclose(file);
|
||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
return std::make_pair(npass, ntest);
|
return std::make_pair(npass, ntest);
|
||||||
|
|
|
@@ -21,7 +21,7 @@ int main(int argc, char *argv[] ) {
         (void) ctx;
         return progress > 0.50;
     };
-    auto * model = llama_load_model_from_file(model_path, params);
+    auto * model = llama_model_load_from_file(model_path, params);
     llama_backend_free();
     return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
 }
Some files were not shown because too many files have changed in this diff.