Merge branch 'master' into compilade/mamba2

This commit is contained in:
Francis Couture-Harpin 2024-10-12 16:12:06 -04:00
commit 038d958333
132 changed files with 6559 additions and 5146 deletions

View file

@ -116,6 +116,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
// This is going to create some weird integers though.
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
} else if (tensor->type == GGML_TYPE_I64) {
// Integers with a size of 8 bytes can be set by mirroring the float data; the specific values are again not really meaningful.
const size_t nbytes_half = ggml_nbytes(tensor)/2;
ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
} else {
GGML_ABORT("fatal error");
}
@ -128,7 +133,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
std::vector<uint8_t> buf(ggml_nbytes(t));
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
const auto * tt = ggml_get_type_traits(t->type);
size_t bs = ggml_blck_size(t->type);
std::vector<float> vq(ggml_blck_size(t->type));
bool quantized = ggml_is_quantized(t->type);
@ -145,6 +150,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
} else if (t->type == GGML_TYPE_F32) {
tv.push_back(*(float *) &buf[i]);
} else if (t->type == GGML_TYPE_I64) {
tv.push_back((float)*(int64_t *) &buf[i]);
} else if (t->type == GGML_TYPE_I32) {
tv.push_back((float)*(int32_t *) &buf[i]);
} else if (t->type == GGML_TYPE_I16) {
@ -152,7 +159,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
} else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) {
tt.to_float(&buf[i], vq.data(), bs);
tt->to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ABORT("fatal error");
@ -672,14 +679,11 @@ struct test_case {
}
// run
ggml_backend_synchronize(backend);
int64_t total_time_us = 0;
int total_runs = 0;
do {
int64_t start_time = ggml_time_us();
ggml_backend_graph_compute(backend, gf);
ggml_backend_synchronize(backend);
int64_t end_time = ggml_time_us();
total_time_us += end_time - start_time;
@ -1119,6 +1123,71 @@ struct test_get_rows : public test_case {
}
};
// GGML_OP_ARGMAX
// Test case whose graph applies ggml_argmax to one freshly created tensor.
struct test_argmax : public test_case {
    const ggml_type type;             // element type handed to ggml_new_tensor
    const std::array<int64_t, 4> ne;  // extents handed to ggml_new_tensor (4 dims)

    test_argmax(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 100, 1, 1})
        : type(type), ne(ne) {}

    // Describes this case through VARS_TO_STR2 over (type, ne).
    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }

    // Error bound reported for this case: 0.0.
    double max_nmse_err() override {
        return 0.0;
    }

    // Creates the input tensor (named "a"), feeds it to ggml_argmax and
    // returns the resulting node (named "out").
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * const input = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_set_name(input, "a");

        ggml_tensor * const result = ggml_argmax(ctx, input);
        ggml_set_name(result, "out");

        return result;
    }
};
// GGML_OP_COUNT_EQUAL
// Test case whose graph takes the argmax of two separately created tensors of
// the same type and shape and passes both results to ggml_count_equal.
struct test_count_equal : public test_case {
    const ggml_type type;             // element type of both input tensors
    const std::array<int64_t, 4> ne;  // extents of both input tensors (4 dims)

    // Describes this case through VARS_TO_STR2 over (type, ne).
    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }

    test_count_equal(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {4, 500, 1, 1})
        : type(type), ne(ne) {}

    // Builds: out = count_equal(argmax(a), argmax(b)).
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_set_name(a, "a");

        ggml_tensor * a_argmax = ggml_argmax(ctx, a);
        ggml_set_name(a_argmax, "a_argmax");

        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_set_name(b, "b");

        // b_argmax must be derived from b. Deriving it from a (as before)
        // left b out of the graph entirely and compared argmax(a) against
        // another argmax(a), so the op was only ever fed identical operands.
        ggml_tensor * b_argmax = ggml_argmax(ctx, b);
        ggml_set_name(b_argmax, "b_argmax");

        ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax);
        ggml_set_name(out, "out");

        return out;
    }

    // Error bound reported for this case: 0.0.
    double max_nmse_err() override {
        return 0.0;
    }
};
// GGML_OP_REPEAT
struct test_repeat : public test_case {
const ggml_type type;
@ -3294,6 +3363,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
test_cases.emplace_back(new test_argmax());
test_cases.emplace_back(new test_count_equal());
for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
@ -3312,8 +3384,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
@ -3755,20 +3827,22 @@ int main(int argc, char ** argv) {
}
// enumerate backends
printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
printf("Testing %zu devices\n\n", ggml_backend_dev_count());
size_t n_ok = 0;
for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), ggml_backend_dev_name(dev));
if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) {
printf(" Skipping\n");
n_ok++;
continue;
}
ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
GGML_ASSERT(backend != NULL);
if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
@ -3778,12 +3852,18 @@ int main(int argc, char ** argv) {
continue;
}
if (ggml_backend_is_cpu(backend)) {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
if (ggml_backend_set_n_threads_fn) {
// TODO: better value for n_threads
ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
}
printf(" Backend name: %s\n", ggml_backend_name(backend));
printf(" Device description: %s\n", ggml_backend_dev_description(dev));
size_t free, total; // NOLINT
ggml_backend_dev_memory(dev, &free, &total);
printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
printf("\n");
bool ok = test_backend(backend, mode, op_name_filter);
@ -3800,9 +3880,9 @@ int main(int argc, char ** argv) {
ggml_backend_free(backend);
}
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
if (n_ok != ggml_backend_reg_get_count()) {
if (n_ok != ggml_backend_dev_count()) {
printf("\033[1;31mFAIL\033[0m\n");
return 1;
}