ggml/examples: add backend support for numerical optimization (ggml/949)
* CUDA eval works * stochastic gradient descent op * Adam except decay * CUDA CROSS_ENTROPY_LOSS_BACK * CUDA mnist-fc training works * backend CLI arg * refactor gguf load * remove sched from opt_step_adam * implement l1 regularization (weight decay) * extra call to add optimizer * initialize gradients with ggml_graph_reset * gradient accumulation * increment iter per eval instead of epoch * adjust backend interfaces * fix ggml_graph_reset without backend * fix ggml graph export/import * fixup * rename * revert ggml_opt changes * more general CUDA repeat_back * update documentation, fix CNN * validation split * add clarifying comment * optimize PyTorch training * adjust buffer size, thread count * fix 0.0f validation split * Update examples/mnist/mnist-common.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * fix gradient accumulation * tensor flag for accumulators -> tensor hash set * Update include/ggml.h Co-authored-by: slaren <slarengh@gmail.com> * Update tests/test-backend-ops.cpp Co-authored-by: slaren <slarengh@gmail.com> * Update tests/test-backend-ops.cpp Co-authored-by: slaren <slarengh@gmail.com> * fix test prints * Update src/ggml-backend.c Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * better CUDA support for noncontiguous out_prod * add comment --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
parent
a6809c6a2e
commit
424c5d00a9
24 changed files with 883 additions and 129 deletions
|
@ -799,10 +799,11 @@ struct test_case {
|
|||
out = ggml_sum(ctx, out);
|
||||
ggml_set_name(out, "sum_of_out");
|
||||
}
|
||||
ggml_set_loss(out);
|
||||
|
||||
ggml_build_forward_expand(gf, out);
|
||||
ggml_graph_cpy(gf, gb);
|
||||
ggml_build_backward_expand(ctx, gf, gb, false);
|
||||
ggml_build_backward_expand(ctx, gf, gb, false, false);
|
||||
if (expect.size() != 1 || expect[0] != 0.0f) {
|
||||
GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
|
@ -837,22 +838,11 @@ struct test_case {
|
|||
return false;
|
||||
}
|
||||
|
||||
// randomize tensors
|
||||
initialize_tensors(ctx);
|
||||
|
||||
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (!t->grad) {
|
||||
continue;
|
||||
}
|
||||
initialize_tensors(ctx); // Randomizes all tensors (including gradients).
|
||||
ggml_graph_reset(gb); // Sets gradients to 1 if loss, 0 otherwise.
|
||||
|
||||
std::vector<float> tmp(ggml_nelements(t->grad));
|
||||
ggml_backend_tensor_set(t->grad, tmp.data(), 0, ggml_nbytes(t->grad));
|
||||
}
|
||||
|
||||
// build graphs
|
||||
const float onef = 1.0f;
|
||||
ggml_backend_graph_compute(backend, gf);
|
||||
ggml_backend_tensor_set(out->grad, &onef, 0, ggml_nbytes(out->grad));
|
||||
ggml_backend_graph_compute(backend, gb);
|
||||
|
||||
bool ok = true;
|
||||
|
@ -1681,6 +1671,50 @@ struct test_mul_mat_id : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
// GGML_OP_OUT_PROD
|
||||
struct test_out_prod : public test_case {
|
||||
const ggml_type type_a;
|
||||
const ggml_type type_b;
|
||||
const int64_t m;
|
||||
const int64_t n;
|
||||
const int64_t k;
|
||||
const std::array<int64_t, 2> bs; // dims 3 and 4
|
||||
const bool trans_b;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR7(type_a, type_b, m, n, k, bs, trans_b);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 5e-4;
|
||||
}
|
||||
|
||||
test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
||||
int64_t m = 32, int64_t n = 32, int64_t k = 32,
|
||||
std::array<int64_t, 2> bs = {10, 10},
|
||||
bool trans_b = false)
|
||||
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), trans_b(trans_b) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]);
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * b;
|
||||
if (trans_b) {
|
||||
b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0], bs[1]);
|
||||
b = ggml_transpose(ctx, b);
|
||||
} else {
|
||||
b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0], bs[1]);
|
||||
}
|
||||
ggml_set_name(b, "b");
|
||||
|
||||
ggml_tensor * out = ggml_out_prod(ctx, a, b);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_SQR
|
||||
struct test_sqr : public test_case {
|
||||
const ggml_type type;
|
||||
|
@ -2666,6 +2700,51 @@ struct test_cross_entropy_loss : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
// GGML_OP_OPT_STEP_ADAMW
|
||||
struct test_opt_step_adamw : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const float alpha;
|
||||
const float beta1;
|
||||
const float beta2;
|
||||
const float eps;
|
||||
const float wd;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd);
|
||||
}
|
||||
|
||||
test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 5, 4, 3},
|
||||
float alpha = 1e-3f,
|
||||
float beta1 = 0.9f,
|
||||
float beta2 = 0.999f,
|
||||
float eps = 1e-8f,
|
||||
float wd = 0.0f)
|
||||
: type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
||||
ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * out = ggml_opt_step_adamw(ctx, a, alpha, beta1, beta2, eps, wd);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values.
|
||||
}
|
||||
}
|
||||
|
||||
bool grad_precise() override {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
enum llm_norm_type {
|
||||
LLM_NORM,
|
||||
LLM_NORM_RMS,
|
||||
|
@ -3159,14 +3238,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
|
||||
|
||||
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, 3}, {2, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, 3}, {1, 1, 1, 2}));
|
||||
for (int ne3 : {1, 3}) { // CUDA backwards pass only supports ne3 == 1
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2}));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
|
||||
|
@ -3350,6 +3430,27 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
}
|
||||
}
|
||||
|
||||
for (ggml_type type_a : base_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, { 1, 1}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
||||
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}, true));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
||||
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_sqr());
|
||||
test_cases.emplace_back(new test_sqrt());
|
||||
test_cases.emplace_back(new test_log());
|
||||
|
@ -3476,6 +3577,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
}
|
||||
|
||||
test_cases.emplace_back(new test_cross_entropy_loss());
|
||||
for (float wd : {0.0f, 1e-2f}) {
|
||||
test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
|
||||
}
|
||||
|
||||
// these tests are disabled to save execution time, but they can be handy for debugging
|
||||
#if 0
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue