Merge branch 'master' into compilade/mamba2
This commit is contained in:
commit
1ee6c482d0
343 changed files with 61682 additions and 30750 deletions
|
@ -110,24 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
|
|||
# llama_target_and_test(test-double-float.cpp) # SLOW
|
||||
llama_target_and_test(test-log.cpp)
|
||||
llama_target_and_test(test-arg-parser.cpp)
|
||||
llama_target_and_test(test-quantize-fns.cpp)
|
||||
llama_target_and_test(test-quantize-perf.cpp)
|
||||
llama_target_and_test(test-sampling.cpp)
|
||||
llama_target_and_test(test-chat-template.cpp)
|
||||
|
||||
llama_target_and_test(test-grammar-parser.cpp)
|
||||
llama_target_and_test(test-llama-grammar.cpp)
|
||||
llama_target_and_test(test-grammar-integration.cpp)
|
||||
llama_target_and_test(test-grad0.cpp)
|
||||
llama_target_and_test(test-barrier.cpp)
|
||||
llama_target_and_test(test-llama-grammar.cpp)
|
||||
# llama_target_and_test(test-opt.cpp) # SLOW
|
||||
llama_target_and_test(test-backend-ops.cpp)
|
||||
|
||||
llama_target_and_test(test-rope.cpp)
|
||||
|
||||
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
|
||||
llama_target_and_test(test-autorelease.cpp LABEL "model")
|
||||
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these tests use the backends directly and cannot be built with dynamic loading
|
||||
llama_target_and_test(test-barrier.cpp)
|
||||
llama_target_and_test(test-quantize-fns.cpp)
|
||||
llama_target_and_test(test-quantize-perf.cpp)
|
||||
llama_target_and_test(test-rope.cpp)
|
||||
endif()
|
||||
|
||||
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
|
||||
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import { readFileSync } from "fs"
|
||||
import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"
|
||||
import { SchemaConverter } from "../examples/server/public_legacy/json-schema-to-grammar.mjs"
|
||||
|
||||
const [, , file] = process.argv
|
||||
const url = `file://${file}`
|
||||
|
|
|
@ -70,7 +70,7 @@ int main(void) {
|
|||
|
||||
// non-existence arg in specific example (--draft cannot be used outside llama-speculative)
|
||||
argv = {"binary_name", "--draft", "123"};
|
||||
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
|
||||
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING));
|
||||
|
||||
|
||||
printf("test-arg-parser: test valid usage\n\n");
|
||||
|
@ -96,7 +96,7 @@ int main(void) {
|
|||
// --draft cannot be used outside llama-speculative
|
||||
argv = {"binary_name", "--draft", "123"};
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
|
||||
assert(params.n_draft == 123);
|
||||
assert(params.speculative.n_max == 123);
|
||||
|
||||
// skip this part on windows, because setenv is not supported
|
||||
#ifdef _WIN32
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
|
||||
|
||||
#include <ggml.h>
|
||||
#include <ggml-cpu.h>
|
||||
#include <ggml-alloc.h>
|
||||
#include <ggml-backend.h>
|
||||
|
||||
|
@ -26,7 +25,6 @@
|
|||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <stdio.h>
|
||||
|
@ -639,19 +637,20 @@ struct test_case {
|
|||
|
||||
// determine number of runs
|
||||
int n_runs;
|
||||
bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
if (op_flops(out) > 0) {
|
||||
// based on flops
|
||||
const uint64_t GFLOP = 1000 * 1000 * 1000;
|
||||
const uint64_t target_flops_cpu = 8ULL * GFLOP;
|
||||
const uint64_t target_flops_gpu = 100ULL * GFLOP;
|
||||
uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
|
||||
uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
|
||||
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
|
||||
} else {
|
||||
// based on memory size
|
||||
const size_t GB = 1ULL << 30;
|
||||
const size_t target_size_cpu = 8 * GB;
|
||||
const size_t target_size_gpu = 32 * GB;
|
||||
size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
|
||||
size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
|
||||
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
|
||||
}
|
||||
|
||||
|
@ -681,6 +680,7 @@ struct test_case {
|
|||
|
||||
// run
|
||||
int64_t total_time_us = 0;
|
||||
int64_t total_mem = 0;
|
||||
int total_runs = 0;
|
||||
do {
|
||||
int64_t start_time = ggml_time_us();
|
||||
|
@ -688,6 +688,7 @@ struct test_case {
|
|||
int64_t end_time = ggml_time_us();
|
||||
|
||||
total_time_us += end_time - start_time;
|
||||
total_mem += mem;
|
||||
total_runs += n_runs;
|
||||
} while (total_time_us < 1000*1000); // run for at least 1 second
|
||||
|
||||
|
@ -717,7 +718,7 @@ struct test_case {
|
|||
} else {
|
||||
printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
|
||||
op_size(out) / 1024,
|
||||
mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
|
||||
total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
|
@ -809,15 +810,14 @@ struct test_case {
|
|||
|
||||
ggml_build_forward_expand(gf, out);
|
||||
ggml_graph_cpy(gf, gb);
|
||||
ggml_build_backward_expand(ctx, gf, gb, false);
|
||||
ggml_build_backward_expand(ctx, ctx, gb, false);
|
||||
if (expect.size() != 1 || expect[0] != 0.0f) {
|
||||
GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
|
||||
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || ggml_graph_get_grad(gb, t)->op != GGML_OP_NONE);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: refactor so that this check is only needed once
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (!ggml_backend_supports_op(backend, t)) {
|
||||
printf("not supported [%s] ", ggml_backend_name(backend));
|
||||
|
@ -860,7 +860,13 @@ struct test_case {
|
|||
const char * bn = ggml_backend_name(backend);
|
||||
const int64_t ne = ggml_nelements(t);
|
||||
|
||||
std::vector<float> ga = tensor_to_float(t->grad);
|
||||
std::vector<float> ga;
|
||||
struct ggml_tensor * grad = ggml_graph_get_grad(gb, t);
|
||||
if (grad) {
|
||||
ga = tensor_to_float(grad);
|
||||
} else {
|
||||
ga.resize(ne); // default value is 0.0f
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < ne; ++i) { // gradient algebraic
|
||||
// check for nans
|
||||
|
@ -1147,6 +1153,26 @@ struct test_argmax : public test_case {
|
|||
return out;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
std::random_device rd;
|
||||
std::default_random_engine rng(rd());
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (t->type == GGML_TYPE_F32) {
|
||||
// initialize with unique values to avoid ties
|
||||
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
||||
std::vector<float> data(t->ne[0]);
|
||||
for (int i = 0; i < t->ne[0]; i++) {
|
||||
data[i] = i;
|
||||
}
|
||||
std::shuffle(data.begin(), data.end(), rng);
|
||||
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
|
||||
}
|
||||
} else {
|
||||
init_tensor_uniform(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 0.0;
|
||||
}
|
||||
|
@ -1644,8 +1670,8 @@ struct test_ssm_scan : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
// GGML_OP_RWKV_WKV
|
||||
struct test_rwkv_wkv : public test_case {
|
||||
// GGML_OP_RWKV_WKV6
|
||||
struct test_rwkv_wkv6 : public test_case {
|
||||
const ggml_type type;
|
||||
|
||||
const int64_t head_count;
|
||||
|
@ -1657,7 +1683,7 @@ struct test_rwkv_wkv : public test_case {
|
|||
return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
|
||||
}
|
||||
|
||||
test_rwkv_wkv(ggml_type type = GGML_TYPE_F32,
|
||||
test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32,
|
||||
int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
|
||||
: type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
|
||||
|
||||
|
@ -1669,7 +1695,7 @@ struct test_rwkv_wkv : public test_case {
|
|||
ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size, head_count }.data());
|
||||
ggml_tensor * td = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
|
||||
ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
|
||||
ggml_tensor * out = ggml_rwkv_wkv(ctx, k, v, r, tf, td, s);
|
||||
ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
@ -2528,6 +2554,35 @@ struct test_sum_rows : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
// GGML_OP_MEAN
|
||||
struct test_mean : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
}
|
||||
|
||||
test_mean(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_set_param(ctx, a);
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * out = ggml_mean(ctx, a);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
float grad_eps() override {
|
||||
return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_UPSCALE
|
||||
struct test_upscale : public test_case {
|
||||
const ggml_type type;
|
||||
|
@ -2770,6 +2825,13 @@ struct test_flash_attn_ext : public test_case {
|
|||
return 5e-4;
|
||||
}
|
||||
|
||||
uint64_t op_flops(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
// Just counting matmul costs:
|
||||
// Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head
|
||||
return 2 * 2 * nh * nb * hs * kv;
|
||||
}
|
||||
|
||||
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
|
||||
bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
|
||||
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
|
||||
|
@ -2855,24 +2917,14 @@ struct test_cross_entropy_loss : public test_case {
|
|||
struct test_opt_step_adamw : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const float alpha;
|
||||
const float beta1;
|
||||
const float beta2;
|
||||
const float eps;
|
||||
const float wd;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd);
|
||||
return VARS_TO_STR2(type, ne);
|
||||
}
|
||||
|
||||
test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 5, 4, 3},
|
||||
float alpha = 1e-3f,
|
||||
float beta1 = 0.9f,
|
||||
float beta2 = 0.999f,
|
||||
float eps = 1e-8f,
|
||||
float wd = 0.0f)
|
||||
: type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
|
||||
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
||||
|
@ -2882,7 +2934,16 @@ struct test_opt_step_adamw : public test_case {
|
|||
ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
||||
ggml_set_name(grad, "grad");
|
||||
|
||||
ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, alpha, beta1, beta2, eps, wd);
|
||||
ggml_tensor * grad_m = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
||||
ggml_set_name(grad_m, "grad_m");
|
||||
|
||||
ggml_tensor * grad_v = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
||||
ggml_set_name(grad_v, "grad_v");
|
||||
|
||||
ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7);
|
||||
ggml_set_name(adamw_params, "adamw_params");
|
||||
|
||||
ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, grad_m, grad_v, adamw_params);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
|
@ -2890,7 +2951,7 @@ struct test_opt_step_adamw : public test_case {
|
|||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values.
|
||||
init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values.
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3428,6 +3489,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
|
||||
|
||||
test_cases.emplace_back(new test_argmax());
|
||||
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100, 10, 1, 1}));
|
||||
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
|
||||
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {2000, 10, 1, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_count_equal());
|
||||
|
||||
for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
|
||||
|
@ -3530,10 +3596,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1, 1024, 1, 32, 4)); // Mamba-1
|
||||
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 32, 32, 2, 32, 4)); // Mamba-2
|
||||
|
||||
test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 1, 1));
|
||||
test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 1));
|
||||
test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 4));
|
||||
test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 128, 4));
|
||||
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1));
|
||||
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1));
|
||||
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4));
|
||||
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4));
|
||||
|
||||
#if 1
|
||||
for (ggml_type type_a : base_types) {
|
||||
|
@ -3630,7 +3696,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
for (int n_mats : {4}) {
|
||||
for (int n_used : {2}) {
|
||||
for (bool b : {false}) {
|
||||
for (int n : {1}) {
|
||||
for (int n : {1, 32}) {
|
||||
int m = 512;
|
||||
int k = 256;
|
||||
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
|
||||
|
@ -3757,6 +3823,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
|
||||
test_cases.emplace_back(new test_sum());
|
||||
test_cases.emplace_back(new test_sum_rows());
|
||||
test_cases.emplace_back(new test_mean());
|
||||
test_cases.emplace_back(new test_upscale());
|
||||
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
|
||||
test_cases.emplace_back(new test_upscale_ext());
|
||||
|
@ -3776,7 +3843,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
for (int nh : { 32, }) {
|
||||
for (int kv : { 512, 1024, }) {
|
||||
for (int nb : { 1, 3, 32, 35, }) {
|
||||
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
||||
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
|
||||
}
|
||||
}
|
||||
|
@ -3788,9 +3855,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
}
|
||||
|
||||
test_cases.emplace_back(new test_cross_entropy_loss());
|
||||
for (float wd : {0.0f, 1e-2f}) {
|
||||
test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
|
||||
}
|
||||
test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
|
||||
|
||||
// these tests are disabled to save execution time, but they can be handy for debugging
|
||||
#if 0
|
||||
|
@ -3810,6 +3875,20 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
|||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f));
|
||||
|
||||
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1}));
|
||||
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
|
||||
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1}));
|
||||
|
||||
for (int bs : {1, 512}) {
|
||||
for (ggml_type type_a : all_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
||||
|
@ -3824,7 +3903,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
|||
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
|
||||
if (mode == MODE_TEST) {
|
||||
auto test_cases = make_test_cases_eval();
|
||||
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
||||
ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
|
||||
if (backend_cpu == NULL) {
|
||||
printf(" Failed to initialize CPU backend\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t n_ok = 0;
|
||||
for (auto & test : test_cases) {
|
||||
|
@ -3904,7 +3987,9 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
// enumerate backends
|
||||
// load and enumerate backends
|
||||
ggml_backend_load_all();
|
||||
|
||||
printf("Testing %zu devices\n\n", ggml_backend_dev_count());
|
||||
|
||||
size_t n_ok = 0;
|
||||
|
@ -3920,16 +4005,15 @@ int main(int argc, char ** argv) {
|
|||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
|
||||
GGML_ASSERT(backend != NULL);
|
||||
|
||||
if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
|
||||
if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
|
||||
printf(" Skipping CPU backend\n");
|
||||
ggml_backend_free(backend);
|
||||
n_ok++;
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
|
||||
GGML_ASSERT(backend != NULL);
|
||||
|
||||
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
||||
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
|
||||
if (ggml_backend_set_n_threads_fn) {
|
||||
|
@ -3958,6 +4042,8 @@ int main(int argc, char ** argv) {
|
|||
ggml_backend_free(backend);
|
||||
}
|
||||
|
||||
ggml_quantize_free();
|
||||
|
||||
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
|
||||
|
||||
if (n_ok != ggml_backend_dev_count()) {
|
||||
|
@ -3965,8 +4051,6 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
ggml_quantize_free();
|
||||
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
return 0;
|
||||
}
|
||||
|
|
1684
tests/test-grad0.cpp
1684
tests/test-grad0.cpp
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -45,22 +45,23 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
|
|||
}
|
||||
|
||||
// Total quantization error on test data
|
||||
static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
|
||||
static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
|
||||
std::vector<uint8_t> tmp_q(2*test_size);
|
||||
std::vector<float> tmp_out(test_size);
|
||||
|
||||
qfns->from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||
return array_rmse(test_data, tmp_out.data(), test_size);
|
||||
}
|
||||
|
||||
// Total quantization error on test data
|
||||
static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
|
||||
static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
|
||||
std::vector<uint8_t> tmp_q(2*test_size);
|
||||
std::vector<float> tmp_out(test_size);
|
||||
std::vector<float> tmp_out_ref(test_size);
|
||||
|
||||
qfns->from_float(test_data, tmp_q.data(), test_size);
|
||||
// FIXME: why is done twice?
|
||||
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||
|
||||
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
|
||||
|
@ -78,15 +79,15 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
|||
}
|
||||
|
||||
// Total dot product error
|
||||
static float dot_product_error(
|
||||
const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2
|
||||
) {
|
||||
static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
|
||||
GGML_UNUSED(qfns);
|
||||
|
||||
std::vector<uint8_t> tmp_q1(2*test_size);
|
||||
std::vector<uint8_t> tmp_q2(2*test_size);
|
||||
|
||||
const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
|
||||
const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
|
||||
|
||||
qfns->from_float(test_data1, tmp_q1.data(), test_size);
|
||||
qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
|
||||
vdot->from_float(test_data2, tmp_q2.data(), test_size);
|
||||
|
||||
float result = INFINITY;
|
||||
|
@ -145,8 +146,8 @@ int main(int argc, char * argv[]) {
|
|||
printf("Testing %s\n", ggml_type_name((ggml_type) i));
|
||||
ggml_quantize_init(ei);
|
||||
|
||||
if (qfns->from_float && qfns->to_float) {
|
||||
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
||||
if (qfns_cpu->from_float && qfns->to_float) {
|
||||
const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
|
||||
const float max_quantization_error =
|
||||
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
||||
type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
||||
|
@ -161,7 +162,7 @@ int main(int argc, char * argv[]) {
|
|||
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
|
||||
}
|
||||
|
||||
const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
|
||||
const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
|
||||
failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
|
|
|
@ -7,7 +7,6 @@
|
|||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <functional>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <memory>
|
||||
#include <stdio.h>
|
||||
|
@ -123,9 +122,10 @@ static void usage(char * argv[]) {
|
|||
printf(" --type TYPE set test type as");
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
||||
if (ggml_type_name(type) != NULL) {
|
||||
if (qfns->from_float && qfns->to_float) {
|
||||
if (qfns_cpu->from_float && qfns->to_float) {
|
||||
printf(" %s", ggml_type_name(type));
|
||||
}
|
||||
}
|
||||
|
@ -277,7 +277,7 @@ int main(int argc, char * argv[]) {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (qfns->from_float && qfns->to_float) {
|
||||
if (qfns_cpu->from_float && qfns->to_float) {
|
||||
printf("%s\n", ggml_type_name(type));
|
||||
|
||||
ggml_quantize_init(type);
|
||||
|
@ -301,7 +301,7 @@ int main(int argc, char * argv[]) {
|
|||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
qfns->from_float(test_data1, test_q1, size);
|
||||
qfns_cpu->from_float(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
@ -312,7 +312,7 @@ int main(int argc, char * argv[]) {
|
|||
|
||||
if (params.op_dequantize_row_q) {
|
||||
printf(" dequantize_row_q\n");
|
||||
qfns->from_float(test_data1, test_q1, largest);
|
||||
qfns_cpu->from_float(test_data1, test_q1, largest);
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
|
@ -330,7 +330,7 @@ int main(int argc, char * argv[]) {
|
|||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
|
||||
const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
|
||||
vdot->from_float(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
|
@ -342,8 +342,8 @@ int main(int argc, char * argv[]) {
|
|||
|
||||
if (params.op_vec_dot_q) {
|
||||
printf(" vec_dot_q\n");
|
||||
qfns->from_float(test_data1, test_q1, largest);
|
||||
qfns->from_float(test_data2, test_q2, largest);
|
||||
qfns_cpu->from_float(test_data1, test_q1, largest);
|
||||
qfns_cpu->from_float(test_data2, test_q2, largest);
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue