ggml/examples: add backend support for numerical optimization (ggml/949)

* CUDA eval works

* stochastic gradient descent op

* Adam optimizer, except weight decay

* CUDA CROSS_ENTROPY_LOSS_BACK

* CUDA mnist-fc training works

* backend CLI arg

* refactor gguf load

* remove sched from opt_step_adam

* implement weight decay (L2 regularization)

* extra call to add optimizer

* initialize gradients with ggml_graph_reset

* gradient accumulation

* increment iter per eval instead of epoch

* adjust backend interfaces

* fix ggml_graph_reset without backend

* fix ggml graph export/import

* fixup

* rename

* revert ggml_opt changes

* more general CUDA repeat_back

* update documentation, fix CNN

* validation split

* add clarifying comment

* optimize PyTorch training

* adjust buffer size, thread count

* fix 0.0f validation split

* Update examples/mnist/mnist-common.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix gradient accumulation

* tensor flag for accumulators -> tensor hash set

* Update include/ggml.h

Co-authored-by: slaren <slarengh@gmail.com>

* Update tests/test-backend-ops.cpp

Co-authored-by: slaren <slarengh@gmail.com>

* Update tests/test-backend-ops.cpp

Co-authored-by: slaren <slarengh@gmail.com>

* fix test prints

* Update src/ggml-backend.c

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* better CUDA support for noncontiguous out_prod

* add comment

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
Johannes Gäßler 2024-09-20 19:04:44 +03:00 committed by Georgi Gerganov
parent a6809c6a2e
commit 424c5d00a9
24 changed files with 883 additions and 129 deletions

tests/test-backend-ops.cpp

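Editor's note: the commits above add GGML_OP_OPT_STEP_ADAMW plus weight decay. For reference, assuming the op follows the standard AdamW formulation (Loshchilov & Hutter, 2019) with the hyperparameters alpha, beta1, beta2, eps and wd that the tests below pass in, the per-iteration update for a parameter theta with gradient g is:

    m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
    v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
    \hat{m}_t = m_t / (1 - \beta_1^t), \quad \hat{v}_t = v_t / (1 - \beta_2^t)
    \theta_t = (1 - \alpha \, wd) \, \theta_{t-1} - \alpha \, \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)

The wd term is applied directly to the parameters (decoupled weight decay) rather than folded into the gradient; with wd = 0 the update reduces to plain Adam.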
@@ -799,10 +799,11 @@ struct test_case {
             out = ggml_sum(ctx, out);
             ggml_set_name(out, "sum_of_out");
         }
+        ggml_set_loss(out);
 
         ggml_build_forward_expand(gf, out);
         ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, false);
+        ggml_build_backward_expand(ctx, gf, gb, false, false);
 
         if (expect.size() != 1 || expect[0] != 0.0f) {
             GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
@@ -837,22 +838,11 @@ struct test_case {
             return false;
         }
 
-        // randomize tensors
-        initialize_tensors(ctx);
-
-        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
-            if (!t->grad) {
-                continue;
-            }
-
-            std::vector<float> tmp(ggml_nelements(t->grad));
-            ggml_backend_tensor_set(t->grad, tmp.data(), 0, ggml_nbytes(t->grad));
-        }
-
-        // build graphs
-        const float onef = 1.0f;
+        initialize_tensors(ctx); // Randomizes all tensors (including gradients).
+        ggml_graph_reset(gb);    // Sets gradients to 1 if loss, 0 otherwise.
+
         ggml_backend_graph_compute(backend, gf);
-        ggml_backend_tensor_set(out->grad, &onef, 0, ggml_nbytes(out->grad));
         ggml_backend_graph_compute(backend, gb);
 
         bool ok = true;
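In short: instead of zeroing every gradient by hand and writing 1.0f into out->grad, the test now marks the output as the loss and lets ggml_graph_reset() seed the gradients. A minimal sketch of the resulting flow, using only calls that appear in the two hunks above (surrounding boilerplate elided):

    // Build forward graph gf and backward graph gb for the output tensor out.
    ggml_set_loss(out);                                    // mark the scalar output as the loss
    ggml_build_forward_expand(gf, out);
    ggml_graph_cpy(gf, gb);
    ggml_build_backward_expand(ctx, gf, gb, false, false);

    initialize_tensors(ctx);                  // randomize inputs (and gradients)
    ggml_graph_reset(gb);                     // loss gradient <- 1, all other gradients <- 0
    ggml_backend_graph_compute(backend, gf);  // forward pass
    ggml_backend_graph_compute(backend, gb);  // backward pass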
@@ -1681,6 +1671,50 @@ struct test_mul_mat_id : public test_case {
     }
 };
 
+// GGML_OP_OUT_PROD
+struct test_out_prod : public test_case {
+    const ggml_type type_a;
+    const ggml_type type_b;
+    const int64_t m;
+    const int64_t n;
+    const int64_t k;
+    const std::array<int64_t, 2> bs; // dims 3 and 4
+    const bool trans_b;
+
+    std::string vars() override {
+        return VARS_TO_STR7(type_a, type_b, m, n, k, bs, trans_b);
+    }
+
+    double max_nmse_err() override {
+        return 5e-4;
+    }
+
+    test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
+            int64_t m = 32, int64_t n = 32, int64_t k = 32,
+            std::array<int64_t, 2> bs = {10, 10},
+            bool trans_b = false)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), trans_b(trans_b) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * b;
+        if (trans_b) {
+            b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0], bs[1]);
+            b = ggml_transpose(ctx, b);
+        } else {
+            b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0], bs[1]);
+        }
+        ggml_set_name(b, "b");
+
+        ggml_tensor * out = ggml_out_prod(ctx, a, b);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
 // GGML_OP_SQR
 struct test_sqr : public test_case {
     const ggml_type type;
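Reading the shapes in build_graph — a is m×k, b is n×k (or a transposed view when trans_b is set), both broadcast over bs[0]×bs[1] batches in dims 3 and 4 — ggml_out_prod evidently computes a sum of k outer products, i.e. out = A·B^T of shape m×n, which is the shape needed when differentiating a matrix product. As a math statement of that reading:

    out_{ij} = \sum_{c=0}^{k-1} a_{ic} \, b_{jc}, \qquad out \in \mathbb{R}^{m \times n}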
@@ -2666,6 +2700,51 @@ struct test_cross_entropy_loss : public test_case {
     }
 };
 
+// GGML_OP_OPT_STEP_ADAMW
+struct test_opt_step_adamw : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const float alpha;
+    const float beta1;
+    const float beta2;
+    const float eps;
+    const float wd;
+
+    std::string vars() override {
+        return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd);
+    }
+
+    test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3},
+            float alpha = 1e-3f,
+            float beta1 = 0.9f,
+            float beta2 = 0.999f,
+            float eps = 1e-8f,
+            float wd = 0.0f)
+        : type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, alpha, beta1, beta2, eps, wd);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values.
+        }
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
 enum llm_norm_type {
     LLM_NORM,
     LLM_NORM_RMS,
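A hypothetical usage sketch for the new op, based only on the signature exercised above — ggml_opt_step_adamw(ctx, a, alpha, beta1, beta2, eps, wd) — with the tensor names (w, step) and sizes invented for illustration:

    // w is a parameter tensor; ggml_set_param gives it a gradient.
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 784, 10);
    ggml_set_param(ctx, w);
    // ... build the loss, forward graph gf, backward graph gb ...
    // Append one AdamW update to the backward graph; it runs like any other op
    // once the gradients for w have been computed.
    struct ggml_tensor * step = ggml_opt_step_adamw(
        ctx, w, 1e-3f /*alpha*/, 0.9f /*beta1*/, 0.999f /*beta2*/, 1e-8f /*eps*/, 1e-2f /*wd*/);
    ggml_build_forward_expand(gb, step);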
@@ -3159,14 +3238,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, 3}, {2, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, 3}, {1, 1, 1, 2}));
+    for (int ne3 : {1, 3}) { // CUDA backwards pass only supports ne3 == 1
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2}));
+    }
 
     test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
     test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
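The ne3 loop above relates to the "more general CUDA repeat_back" commit: the backward op of repeat reduces the repeated copies back onto the source, so for b = repeat(a) each source element accumulates the gradients of all of its copies,

    \partial L / \partial a_i = \sum_{j \,:\, b_j \text{ is a copy of } a_i} \partial L / \partial b_j

and per the in-code comment the CUDA backward pass still only supports ne3 == 1, so the ne3 == 3 cases cover the remaining backends.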
@@ -3350,6 +3430,27 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         }
     }
 
+    for (ggml_type type_a : base_types) {
+        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, { 1,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1,  1}, true));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10,  1}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+            test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+        }
+    }
+
     test_cases.emplace_back(new test_sqr());
     test_cases.emplace_back(new test_sqrt());
     test_cases.emplace_back(new test_log());
@@ -3476,6 +3577,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 
     test_cases.emplace_back(new test_cross_entropy_loss());
+    for (float wd : {0.0f, 1e-2f}) {
+        test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
+    }
 
     // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
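A closing note on the wd sweep above: with wd = 0.0f the decoupled decay term in the update rule quoted before the diff vanishes and the op degenerates to plain Adam, while wd = 1e-2f covers the decay path:

    \theta_t = (1 - \alpha \, wd) \, \theta_{t-1} - \alpha \, \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)
    \xrightarrow{\, wd = 0 \,} \theta_t = \theta_{t-1} - \alpha \, \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)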