Merge branch 'master' into compilade/mamba2
This commit is contained in:
commit
038d958333
132 changed files with 6559 additions and 5146 deletions
|
@ -10,12 +10,12 @@
|
|||
#include <cassert>
|
||||
|
||||
int main(void) {
|
||||
gpt_params params;
|
||||
common_params params;
|
||||
|
||||
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
|
||||
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
|
||||
try {
|
||||
auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
|
||||
auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
|
||||
std::unordered_set<std::string> seen_args;
|
||||
std::unordered_set<std::string> seen_env_vars;
|
||||
for (const auto & opt : ctx_arg.options) {
|
||||
|
@ -58,44 +58,44 @@ int main(void) {
|
|||
|
||||
// missing value
|
||||
argv = {"binary_name", "-m"};
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
|
||||
// wrong value (int)
|
||||
argv = {"binary_name", "-ngl", "hello"};
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
|
||||
// wrong value (enum)
|
||||
argv = {"binary_name", "-sm", "hello"};
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
|
||||
// non-existent arg in specific example (--draft cannot be used outside llama-speculative)
|
||||
argv = {"binary_name", "--draft", "123"};
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
|
||||
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
|
||||
|
||||
|
||||
printf("test-arg-parser: test valid usage\n\n");
|
||||
|
||||
argv = {"binary_name", "-m", "model_file.gguf"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.model == "model_file.gguf");
|
||||
|
||||
argv = {"binary_name", "-t", "1234"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.cpuparams.n_threads == 1234);
|
||||
|
||||
argv = {"binary_name", "--verbose"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.verbosity > 1);
|
||||
|
||||
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.model == "abc.gguf");
|
||||
assert(params.n_predict == 6789);
|
||||
assert(params.n_batch == 9090);
|
||||
|
||||
// --draft cannot be used outside llama-speculative
|
||||
argv = {"binary_name", "--draft", "123"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
|
||||
assert(params.n_draft == 123);
|
||||
|
||||
// skip this part on windows, because setenv is not supported
|
||||
|
@ -106,12 +106,12 @@ int main(void) {
|
|||
|
||||
setenv("LLAMA_ARG_THREADS", "blah", true);
|
||||
argv = {"binary_name"};
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
|
||||
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
||||
setenv("LLAMA_ARG_THREADS", "1010", true);
|
||||
argv = {"binary_name"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.model == "blah.gguf");
|
||||
assert(params.cpuparams.n_threads == 1010);
|
||||
|
||||
|
@ -121,7 +121,7 @@ int main(void) {
|
|||
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
||||
setenv("LLAMA_ARG_THREADS", "1010", true);
|
||||
argv = {"binary_name", "-m", "overwritten.gguf"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.model == "overwritten.gguf");
|
||||
assert(params.cpuparams.n_threads == 1010);
|
||||
#endif // _WIN32
|
||||
|
|
|
@ -116,6 +116,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
|||
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
||||
// This is going to create some weird integers though.
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
|
||||
} else if (tensor->type == GGML_TYPE_I64) {
|
||||
// Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful.
|
||||
const size_t nbytes_half = ggml_nbytes(tensor)/2;
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
|
||||
ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
@ -128,7 +133,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|||
std::vector<uint8_t> buf(ggml_nbytes(t));
|
||||
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
|
||||
|
||||
ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
|
||||
const auto * tt = ggml_get_type_traits(t->type);
|
||||
size_t bs = ggml_blck_size(t->type);
|
||||
std::vector<float> vq(ggml_blck_size(t->type));
|
||||
bool quantized = ggml_is_quantized(t->type);
|
||||
|
@ -145,6 +150,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|||
tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
|
||||
} else if (t->type == GGML_TYPE_F32) {
|
||||
tv.push_back(*(float *) &buf[i]);
|
||||
} else if (t->type == GGML_TYPE_I64) {
|
||||
tv.push_back((float)*(int64_t *) &buf[i]);
|
||||
} else if (t->type == GGML_TYPE_I32) {
|
||||
tv.push_back((float)*(int32_t *) &buf[i]);
|
||||
} else if (t->type == GGML_TYPE_I16) {
|
||||
|
@ -152,7 +159,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|||
} else if (t->type == GGML_TYPE_I8) {
|
||||
tv.push_back((float)*(int8_t *) &buf[i]);
|
||||
} else if (quantized) {
|
||||
tt.to_float(&buf[i], vq.data(), bs);
|
||||
tt->to_float(&buf[i], vq.data(), bs);
|
||||
tv.insert(tv.end(), vq.begin(), vq.end());
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
|
@ -672,14 +679,11 @@ struct test_case {
|
|||
}
|
||||
|
||||
// run
|
||||
ggml_backend_synchronize(backend);
|
||||
|
||||
int64_t total_time_us = 0;
|
||||
int total_runs = 0;
|
||||
do {
|
||||
int64_t start_time = ggml_time_us();
|
||||
ggml_backend_graph_compute(backend, gf);
|
||||
ggml_backend_synchronize(backend);
|
||||
int64_t end_time = ggml_time_us();
|
||||
|
||||
total_time_us += end_time - start_time;
|
||||
|
@ -1119,6 +1123,71 @@ struct test_get_rows : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
// GGML_OP_ARGMAX
|
||||
struct test_argmax : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
}
|
||||
|
||||
test_argmax(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 100, 1, 1})
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * out = ggml_argmax(ctx, a);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 0.0;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_COUNT_EQUAL
|
||||
struct test_count_equal : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
}
|
||||
|
||||
test_count_equal(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {4, 500, 1, 1})
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * a_argmax = ggml_argmax(ctx, a);
|
||||
ggml_set_name(a_argmax, "a_argmax");
|
||||
|
||||
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_set_name(b, "b");
|
||||
|
||||
ggml_tensor * b_argmax = ggml_argmax(ctx, a);
|
||||
ggml_set_name(b_argmax, "b_argmax");
|
||||
|
||||
ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 0.0;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_REPEAT
|
||||
struct test_repeat : public test_case {
|
||||
const ggml_type type;
|
||||
|
@ -3294,6 +3363,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
|
||||
|
||||
test_cases.emplace_back(new test_argmax());
|
||||
test_cases.emplace_back(new test_count_equal());
|
||||
|
||||
for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
|
||||
|
@ -3312,8 +3384,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
|
||||
|
||||
for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
|
||||
test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
|
||||
|
@ -3755,20 +3827,22 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// enumerate backends
|
||||
printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
|
||||
printf("Testing %zu devices\n\n", ggml_backend_dev_count());
|
||||
|
||||
size_t n_ok = 0;
|
||||
|
||||
for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
|
||||
printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
|
||||
if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
|
||||
printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), ggml_backend_dev_name(dev));
|
||||
|
||||
if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) {
|
||||
printf(" Skipping\n");
|
||||
n_ok++;
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
|
||||
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
|
||||
GGML_ASSERT(backend != NULL);
|
||||
|
||||
if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
|
||||
|
@ -3778,12 +3852,18 @@ int main(int argc, char ** argv) {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (ggml_backend_is_cpu(backend)) {
|
||||
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
||||
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
|
||||
if (ggml_backend_set_n_threads_fn) {
|
||||
// TODO: better value for n_threads
|
||||
ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
|
||||
ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
printf(" Backend name: %s\n", ggml_backend_name(backend));
|
||||
printf(" Device description: %s\n", ggml_backend_dev_description(dev));
|
||||
size_t free, total; // NOLINT
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
|
||||
printf("\n");
|
||||
|
||||
bool ok = test_backend(backend, mode, op_name_filter);
|
||||
|
||||
|
@ -3800,9 +3880,9 @@ int main(int argc, char ** argv) {
|
|||
ggml_backend_free(backend);
|
||||
}
|
||||
|
||||
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
|
||||
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
|
||||
|
||||
if (n_ok != ggml_backend_reg_get_count()) {
|
||||
if (n_ok != ggml_backend_dev_count()) {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
return 1;
|
||||
}
|
||||
|
|
|
@ -140,11 +140,11 @@ int main(void) {
|
|||
|
||||
// test llama_chat_format_single for system message
|
||||
printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
|
||||
std::vector<llama_chat_msg> chat2;
|
||||
llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
|
||||
std::vector<common_chat_msg> chat2;
|
||||
common_chat_msg sys_msg{"system", "You are a helpful assistant"};
|
||||
|
||||
auto fmt_sys = [&](std::string tmpl) {
|
||||
auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
|
||||
auto output = common_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
|
||||
printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
|
||||
printf("-------------------------\n");
|
||||
return output;
|
||||
|
@ -160,10 +160,10 @@ int main(void) {
|
|||
chat2.push_back({"system", "You are a helpful assistant"});
|
||||
chat2.push_back({"user", "Hello"});
|
||||
chat2.push_back({"assistant", "I am assistant"});
|
||||
llama_chat_msg new_msg{"user", "How are you"};
|
||||
common_chat_msg new_msg{"user", "How are you"};
|
||||
|
||||
auto fmt_single = [&](std::string tmpl) {
|
||||
auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
|
||||
auto output = common_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
|
||||
printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
|
||||
printf("-------------------------\n");
|
||||
return output;
|
||||
|
|
|
@ -24,8 +24,8 @@ int main() {
|
|||
}
|
||||
|
||||
if (rand () % 10 < 5) {
|
||||
gpt_log_set_timestamps(gpt_log_main(), rand() % 2);
|
||||
gpt_log_set_prefix (gpt_log_main(), rand() % 2);
|
||||
common_log_set_timestamps(common_log_main(), rand() % 2);
|
||||
common_log_set_prefix (common_log_main(), rand() % 2);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
|
|
@ -44,26 +44,26 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
|
|||
}
|
||||
|
||||
// Total quantization error on test data
|
||||
static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
||||
static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
|
||||
std::vector<uint8_t> tmp_q(2*test_size);
|
||||
std::vector<float> tmp_out(test_size);
|
||||
|
||||
qfns.from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||
qfns->from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||
return array_rmse(test_data, tmp_out.data(), test_size);
|
||||
}
|
||||
|
||||
// Quantization error of from_float relative to from_float_ref on test data
|
||||
static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
||||
static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
|
||||
std::vector<uint8_t> tmp_q(2*test_size);
|
||||
std::vector<float> tmp_out(test_size);
|
||||
std::vector<float> tmp_out_ref(test_size);
|
||||
|
||||
qfns.from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||
qfns->from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||
|
||||
qfns.from_float_ref(test_data, tmp_q.data(), test_size);
|
||||
qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
||||
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
||||
|
||||
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
||||
}
|
||||
|
@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
|||
|
||||
// Total dot product error
|
||||
static float dot_product_error(
|
||||
ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
|
||||
const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
|
||||
) {
|
||||
std::vector<uint8_t> tmp_q1(2*test_size);
|
||||
std::vector<uint8_t> tmp_q2(2*test_size);
|
||||
|
||||
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
|
||||
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
|
||||
|
||||
qfns.from_float(test_data1, tmp_q1.data(), test_size);
|
||||
vdot.from_float(test_data2, tmp_q2.data(), test_size);
|
||||
qfns->from_float(test_data1, tmp_q1.data(), test_size);
|
||||
vdot->from_float(test_data2, tmp_q2.data(), test_size);
|
||||
|
||||
float result = INFINITY;
|
||||
qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||
qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||
|
||||
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
||||
|
||||
|
@ -131,10 +131,10 @@ int main(int argc, char * argv[]) {
|
|||
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
|
||||
// deprecated - skip
|
||||
if (qfns.blck_size == 0) {
|
||||
if (qfns->blck_size == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -143,7 +143,7 @@ int main(int argc, char * argv[]) {
|
|||
printf("Testing %s\n", ggml_type_name((ggml_type) i));
|
||||
ggml_quantize_init(ei);
|
||||
|
||||
if (qfns.from_float && qfns.to_float) {
|
||||
if (qfns->from_float && qfns->to_float) {
|
||||
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
||||
const float max_quantization_error =
|
||||
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
||||
|
|
|
@ -122,9 +122,9 @@ static void usage(char * argv[]) {
|
|||
printf(" --type TYPE set test type as");
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
if (ggml_type_name(type) != NULL) {
|
||||
if (qfns.from_float && qfns.to_float) {
|
||||
if (qfns->from_float && qfns->to_float) {
|
||||
printf(" %s", ggml_type_name(type));
|
||||
}
|
||||
}
|
||||
|
@ -270,12 +270,12 @@ int main(int argc, char * argv[]) {
|
|||
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (qfns.from_float && qfns.to_float) {
|
||||
if (qfns->from_float && qfns->to_float) {
|
||||
printf("%s\n", ggml_type_name(type));
|
||||
|
||||
ggml_quantize_init(type);
|
||||
|
@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
|
|||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
qfns.from_float_ref(test_data1, test_q1, size);
|
||||
qfns->from_float_ref(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
@ -299,7 +299,7 @@ int main(int argc, char * argv[]) {
|
|||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
qfns.from_float(test_data1, test_q1, size);
|
||||
qfns->from_float(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
@ -310,11 +310,11 @@ int main(int argc, char * argv[]) {
|
|||
|
||||
if (params.op_dequantize_row_q) {
|
||||
printf(" dequantize_row_q\n");
|
||||
qfns.from_float(test_data1, test_q1, largest);
|
||||
qfns->from_float(test_data1, test_q1, largest);
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
qfns.to_float(test_q1, test_out, size);
|
||||
qfns->to_float(test_q1, test_out, size);
|
||||
return test_out[0];
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
@ -328,8 +328,8 @@ int main(int argc, char * argv[]) {
|
|||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
|
||||
vdot.from_float(test_data1, test_q1, size);
|
||||
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
|
||||
vdot->from_float(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
@ -340,13 +340,13 @@ int main(int argc, char * argv[]) {
|
|||
|
||||
if (params.op_vec_dot_q) {
|
||||
printf(" vec_dot_q\n");
|
||||
qfns.from_float(test_data1, test_q1, largest);
|
||||
qfns.from_float(test_data2, test_q2, largest);
|
||||
qfns->from_float(test_data1, test_q1, largest);
|
||||
qfns->from_float(test_data2, test_q2, largest);
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
float result;
|
||||
qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||
qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||
return result;
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
|
|
@ -202,7 +202,7 @@ int main(int argc, char **argv) {
|
|||
for (int i = 0; i < nthread; i++) {
|
||||
threads[i] = std::thread([&, i]() {
|
||||
for (const auto & test_kv : k_tests) {
|
||||
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
|
||||
const std::vector<llama_token> res = common_tokenize(ctx, test_kv.first, add_special, false);
|
||||
|
||||
// here only print the result of the first thread
|
||||
// because the other threads are running the same tests
|
||||
|
@ -212,7 +212,7 @@ int main(int argc, char **argv) {
|
|||
|
||||
printf("\n");
|
||||
printf("src: '%s'\n", test_kv.first.c_str());
|
||||
printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
|
||||
printf("res: '%s'\n", common_detokenize(ctx, res).c_str());
|
||||
printf("tok: ");
|
||||
for (const auto & tok : res) {
|
||||
printf("%d ", tok);
|
||||
|
@ -229,16 +229,16 @@ int main(int argc, char **argv) {
|
|||
if (!correct) {
|
||||
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
||||
llama_detokenize(ctx, res).c_str(),
|
||||
llama_detokenize(ctx, test_kv.second).c_str());
|
||||
common_detokenize(ctx, res).c_str(),
|
||||
common_detokenize(ctx, test_kv.second).c_str());
|
||||
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||
for (const auto & t : test_kv.second) {
|
||||
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
|
||||
fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s : got tokens: ", __func__);
|
||||
for (const auto & t : res) {
|
||||
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
|
||||
fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
|
@ -273,7 +273,7 @@ int main(int argc, char **argv) {
|
|||
{
|
||||
const auto t_start = ggml_time_us();
|
||||
|
||||
res = llama_tokenize(ctx, text, add_special, false);
|
||||
res = common_tokenize(ctx, text, add_special, false);
|
||||
|
||||
const auto t_end = ggml_time_us();
|
||||
|
||||
|
|
|
@ -78,10 +78,10 @@ int main(int argc, char **argv) {
|
|||
const int n_vocab = llama_n_vocab(model);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
std::string str = llama_detokenize(ctx, std::vector<int>(1, i));
|
||||
std::string str = common_detokenize(ctx, std::vector<int>(1, i));
|
||||
try {
|
||||
auto cps = unicode_cpts_from_utf8(str);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
|
||||
if (ignore_merges && tokens.size() > 1) {
|
||||
fprintf(stderr,
|
||||
"%s : error: token %d detokenizes to '%s'(%zu) but "
|
||||
|
@ -94,7 +94,7 @@ int main(int argc, char **argv) {
|
|||
fprintf(stderr, "]\n");
|
||||
return 2;
|
||||
}
|
||||
std::string check = llama_detokenize(ctx, tokens);
|
||||
std::string check = common_detokenize(ctx, tokens);
|
||||
if (check != str) {
|
||||
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
|
||||
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
|
||||
|
@ -123,8 +123,8 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
|
||||
std::string str = unicode_cpt_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize(ctx, tokens);
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx, str, false);
|
||||
std::string check = common_detokenize(ctx, tokens);
|
||||
if (cp != 9601 && str != check) {
|
||||
fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
|
|
|
@ -66,9 +66,9 @@ int main(int argc, char ** argv) {
|
|||
const int n_vocab = llama_n_vocab(model);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
|
||||
std::string check = llama_detokenize(ctx, tokens);
|
||||
std::string str = common_detokenize(ctx, std::vector<int>(1, i), true);
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
|
||||
std::string check = common_detokenize(ctx, tokens);
|
||||
if (check != str) {
|
||||
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
|
||||
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
|
||||
|
@ -93,8 +93,8 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
std::string str = unicode_cpt_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
|
||||
std::string check = llama_detokenize(ctx, tokens);
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
|
||||
std::string check = common_detokenize(ctx, tokens);
|
||||
if (cp != 9601 && str != check) {
|
||||
fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue