Merge branch 'master' into compilade/mamba2

This commit is contained in:
Francis Couture-Harpin 2024-10-12 16:12:06 -04:00
commit 038d958333
132 changed files with 6559 additions and 5146 deletions

View file

@ -10,12 +10,12 @@
#include <cassert>
int main(void) {
gpt_params params;
common_params params;
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
try {
auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
std::unordered_set<std::string> seen_args;
std::unordered_set<std::string> seen_env_vars;
for (const auto & opt : ctx_arg.options) {
@ -58,44 +58,44 @@ int main(void) {
// missing value
argv = {"binary_name", "-m"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
// wrong value (int)
argv = {"binary_name", "-ngl", "hello"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
// wrong value (enum)
argv = {"binary_name", "-sm", "hello"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
// non-existence arg in specific example (--draft cannot be used outside llama-speculative)
argv = {"binary_name", "--draft", "123"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
printf("test-arg-parser: test valid usage\n\n");
argv = {"binary_name", "-m", "model_file.gguf"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "model_file.gguf");
argv = {"binary_name", "-t", "1234"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.cpuparams.n_threads == 1234);
argv = {"binary_name", "--verbose"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.verbosity > 1);
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "abc.gguf");
assert(params.n_predict == 6789);
assert(params.n_batch == 9090);
// --draft cannot be used outside llama-speculative
argv = {"binary_name", "--draft", "123"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
assert(params.n_draft == 123);
// skip this part on windows, because setenv is not supported
@ -106,12 +106,12 @@ int main(void) {
setenv("LLAMA_ARG_THREADS", "blah", true);
argv = {"binary_name"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "blah.gguf");
assert(params.cpuparams.n_threads == 1010);
@ -121,7 +121,7 @@ int main(void) {
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name", "-m", "overwritten.gguf"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "overwritten.gguf");
assert(params.cpuparams.n_threads == 1010);
#endif // _WIN32

View file

@ -116,6 +116,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
// This is going to create some weird integers though.
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
} else if (tensor->type == GGML_TYPE_I64) {
// Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful.
const size_t nbytes_half = ggml_nbytes(tensor)/2;
ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
} else {
GGML_ABORT("fatal error");
}
@ -128,7 +133,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
std::vector<uint8_t> buf(ggml_nbytes(t));
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
const auto * tt = ggml_get_type_traits(t->type);
size_t bs = ggml_blck_size(t->type);
std::vector<float> vq(ggml_blck_size(t->type));
bool quantized = ggml_is_quantized(t->type);
@ -145,6 +150,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
} else if (t->type == GGML_TYPE_F32) {
tv.push_back(*(float *) &buf[i]);
} else if (t->type == GGML_TYPE_I64) {
tv.push_back((float)*(int64_t *) &buf[i]);
} else if (t->type == GGML_TYPE_I32) {
tv.push_back((float)*(int32_t *) &buf[i]);
} else if (t->type == GGML_TYPE_I16) {
@ -152,7 +159,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
} else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) {
tt.to_float(&buf[i], vq.data(), bs);
tt->to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ABORT("fatal error");
@ -672,14 +679,11 @@ struct test_case {
}
// run
ggml_backend_synchronize(backend);
int64_t total_time_us = 0;
int total_runs = 0;
do {
int64_t start_time = ggml_time_us();
ggml_backend_graph_compute(backend, gf);
ggml_backend_synchronize(backend);
int64_t end_time = ggml_time_us();
total_time_us += end_time - start_time;
@ -1119,6 +1123,71 @@ struct test_get_rows : public test_case {
}
};
// GGML_OP_ARGMAX
struct test_argmax : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
std::string vars() override {
return VARS_TO_STR2(type, ne);
}
test_argmax(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 100, 1, 1})
: type(type), ne(ne) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_name(a, "a");
ggml_tensor * out = ggml_argmax(ctx, a);
ggml_set_name(out, "out");
return out;
}
double max_nmse_err() override {
return 0.0;
}
};
// GGML_OP_COUNT_EQUAL
struct test_count_equal : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
std::string vars() override {
return VARS_TO_STR2(type, ne);
}
test_count_equal(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {4, 500, 1, 1})
: type(type), ne(ne) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_name(a, "a");
ggml_tensor * a_argmax = ggml_argmax(ctx, a);
ggml_set_name(a_argmax, "a_argmax");
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_name(b, "b");
ggml_tensor * b_argmax = ggml_argmax(ctx, a);
ggml_set_name(b_argmax, "b_argmax");
ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax);
ggml_set_name(out, "out");
return out;
}
double max_nmse_err() override {
return 0.0;
}
};
// GGML_OP_REPEAT
struct test_repeat : public test_case {
const ggml_type type;
@ -3294,6 +3363,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
test_cases.emplace_back(new test_argmax());
test_cases.emplace_back(new test_count_equal());
for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
@ -3312,8 +3384,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
@ -3755,20 +3827,22 @@ int main(int argc, char ** argv) {
}
// enumerate backends
printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
printf("Testing %zu devices\n\n", ggml_backend_dev_count());
size_t n_ok = 0;
for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), ggml_backend_dev_name(dev));
if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) {
printf(" Skipping\n");
n_ok++;
continue;
}
ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
GGML_ASSERT(backend != NULL);
if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
@ -3778,12 +3852,18 @@ int main(int argc, char ** argv) {
continue;
}
if (ggml_backend_is_cpu(backend)) {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
if (ggml_backend_set_n_threads_fn) {
// TODO: better value for n_threads
ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
}
printf(" Backend name: %s\n", ggml_backend_name(backend));
printf(" Device description: %s\n", ggml_backend_dev_description(dev));
size_t free, total; // NOLINT
ggml_backend_dev_memory(dev, &free, &total);
printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
printf("\n");
bool ok = test_backend(backend, mode, op_name_filter);
@ -3800,9 +3880,9 @@ int main(int argc, char ** argv) {
ggml_backend_free(backend);
}
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
if (n_ok != ggml_backend_reg_get_count()) {
if (n_ok != ggml_backend_dev_count()) {
printf("\033[1;31mFAIL\033[0m\n");
return 1;
}

View file

@ -140,11 +140,11 @@ int main(void) {
// test llama_chat_format_single for system message
printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
std::vector<llama_chat_msg> chat2;
llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
std::vector<common_chat_msg> chat2;
common_chat_msg sys_msg{"system", "You are a helpful assistant"};
auto fmt_sys = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
auto output = common_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
printf("-------------------------\n");
return output;
@ -160,10 +160,10 @@ int main(void) {
chat2.push_back({"system", "You are a helpful assistant"});
chat2.push_back({"user", "Hello"});
chat2.push_back({"assistant", "I am assistant"});
llama_chat_msg new_msg{"user", "How are you"};
common_chat_msg new_msg{"user", "How are you"};
auto fmt_single = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
auto output = common_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
printf("-------------------------\n");
return output;

View file

@ -24,8 +24,8 @@ int main() {
}
if (rand () % 10 < 5) {
gpt_log_set_timestamps(gpt_log_main(), rand() % 2);
gpt_log_set_prefix (gpt_log_main(), rand() % 2);
common_log_set_timestamps(common_log_main(), rand() % 2);
common_log_set_prefix (common_log_main(), rand() % 2);
}
}
});

View file

@ -44,26 +44,26 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
}
// Total quantization error on test data
static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
qfns.from_float(test_data, tmp_q.data(), test_size);
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
qfns->from_float(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
return array_rmse(test_data, tmp_out.data(), test_size);
}
// Total quantization error on test data
static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
std::vector<float> tmp_out_ref(test_size);
qfns.from_float(test_data, tmp_q.data(), test_size);
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
qfns->from_float(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
qfns.from_float_ref(test_data, tmp_q.data(), test_size);
qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
}
@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
// Total dot product error
static float dot_product_error(
ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
) {
std::vector<uint8_t> tmp_q1(2*test_size);
std::vector<uint8_t> tmp_q2(2*test_size);
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
qfns.from_float(test_data1, tmp_q1.data(), test_size);
vdot.from_float(test_data2, tmp_q2.data(), test_size);
qfns->from_float(test_data1, tmp_q1.data(), test_size);
vdot->from_float(test_data2, tmp_q2.data(), test_size);
float result = INFINITY;
qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
const float dot_ref = dot_product(test_data1, test_data2, test_size);
@ -131,10 +131,10 @@ int main(int argc, char * argv[]) {
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
const auto * qfns = ggml_get_type_traits(type);
// deprecated - skip
if (qfns.blck_size == 0) {
if (qfns->blck_size == 0) {
continue;
}
@ -143,7 +143,7 @@ int main(int argc, char * argv[]) {
printf("Testing %s\n", ggml_type_name((ggml_type) i));
ggml_quantize_init(ei);
if (qfns.from_float && qfns.to_float) {
if (qfns->from_float && qfns->to_float) {
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
const float max_quantization_error =
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :

View file

@ -122,9 +122,9 @@ static void usage(char * argv[]) {
printf(" --type TYPE set test type as");
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
const auto * qfns = ggml_get_type_traits(type);
if (ggml_type_name(type) != NULL) {
if (qfns.from_float && qfns.to_float) {
if (qfns->from_float && qfns->to_float) {
printf(" %s", ggml_type_name(type));
}
}
@ -270,12 +270,12 @@ int main(int argc, char * argv[]) {
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
const auto * qfns = ggml_get_type_traits(type);
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
continue;
}
if (qfns.from_float && qfns.to_float) {
if (qfns->from_float && qfns->to_float) {
printf("%s\n", ggml_type_name(type));
ggml_quantize_init(type);
@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
qfns.from_float_ref(test_data1, test_q1, size);
qfns->from_float_ref(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = ggml_row_size(type, size);
@ -299,7 +299,7 @@ int main(int argc, char * argv[]) {
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
qfns.from_float(test_data1, test_q1, size);
qfns->from_float(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = ggml_row_size(type, size);
@ -310,11 +310,11 @@ int main(int argc, char * argv[]) {
if (params.op_dequantize_row_q) {
printf(" dequantize_row_q\n");
qfns.from_float(test_data1, test_q1, largest);
qfns->from_float(test_data1, test_q1, largest);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
qfns.to_float(test_q1, test_out, size);
qfns->to_float(test_q1, test_out, size);
return test_out[0];
};
size_t quantized_size = ggml_row_size(type, size);
@ -328,8 +328,8 @@ int main(int argc, char * argv[]) {
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
vdot.from_float(test_data1, test_q1, size);
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
vdot->from_float(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = ggml_row_size(type, size);
@ -340,13 +340,13 @@ int main(int argc, char * argv[]) {
if (params.op_vec_dot_q) {
printf(" vec_dot_q\n");
qfns.from_float(test_data1, test_q1, largest);
qfns.from_float(test_data2, test_q2, largest);
qfns->from_float(test_data1, test_q1, largest);
qfns->from_float(test_data2, test_q2, largest);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
float result;
qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
return result;
};
size_t quantized_size = ggml_row_size(type, size);

View file

@ -202,7 +202,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < nthread; i++) {
threads[i] = std::thread([&, i]() {
for (const auto & test_kv : k_tests) {
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
const std::vector<llama_token> res = common_tokenize(ctx, test_kv.first, add_special, false);
// here only print the result of the first thread
// because the other threads are running the same tests
@ -212,7 +212,7 @@ int main(int argc, char **argv) {
printf("\n");
printf("src: '%s'\n", test_kv.first.c_str());
printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
printf("res: '%s'\n", common_detokenize(ctx, res).c_str());
printf("tok: ");
for (const auto & tok : res) {
printf("%d ", tok);
@ -229,16 +229,16 @@ int main(int argc, char **argv) {
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
llama_detokenize(ctx, res).c_str(),
llama_detokenize(ctx, test_kv.second).c_str());
common_detokenize(ctx, res).c_str(),
common_detokenize(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
}
fprintf(stderr, "\n");
@ -273,7 +273,7 @@ int main(int argc, char **argv) {
{
const auto t_start = ggml_time_us();
res = llama_tokenize(ctx, text, add_special, false);
res = common_tokenize(ctx, text, add_special, false);
const auto t_end = ggml_time_us();

View file

@ -78,10 +78,10 @@ int main(int argc, char **argv) {
const int n_vocab = llama_n_vocab(model);
for (int i = 0; i < n_vocab; ++i) {
std::string str = llama_detokenize(ctx, std::vector<int>(1, i));
std::string str = common_detokenize(ctx, std::vector<int>(1, i));
try {
auto cps = unicode_cpts_from_utf8(str);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
if (ignore_merges && tokens.size() > 1) {
fprintf(stderr,
"%s : error: token %d detokenizes to '%s'(%zu) but "
@ -94,7 +94,7 @@ int main(int argc, char **argv) {
fprintf(stderr, "]\n");
return 2;
}
std::string check = llama_detokenize(ctx, tokens);
std::string check = common_detokenize(ctx, tokens);
if (check != str) {
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@ -123,8 +123,8 @@ int main(int argc, char **argv) {
}
std::string str = unicode_cpt_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize(ctx, tokens);
std::vector<llama_token> tokens = common_tokenize(ctx, str, false);
std::string check = common_detokenize(ctx, tokens);
if (cp != 9601 && str != check) {
fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
cp, check.c_str(), check.length(), str.c_str(), str.length());

View file

@ -66,9 +66,9 @@ int main(int argc, char ** argv) {
const int n_vocab = llama_n_vocab(model);
for (int i = 0; i < n_vocab; ++i) {
std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
std::string check = llama_detokenize(ctx, tokens);
std::string str = common_detokenize(ctx, std::vector<int>(1, i), true);
std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
std::string check = common_detokenize(ctx, tokens);
if (check != str) {
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@ -93,8 +93,8 @@ int main(int argc, char ** argv) {
}
std::string str = unicode_cpt_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
std::string check = llama_detokenize(ctx, tokens);
std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
std::string check = common_detokenize(ctx, tokens);
if (cp != 9601 && str != check) {
fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
cp, check.c_str(), check.length(), str.c_str(), str.length());