CUDA: faster softmax via shared memory + fp16 math (#4742)
This commit is contained in:
parent
1fc2f265ff
commit
8f900abfc0
2 changed files with 321 additions and 29 deletions
|
@ -450,7 +450,7 @@ struct test_case {
|
|||
|
||||
double err = nmse(f1.data(), f2.data(), f1.size());
|
||||
if (err > ud->max_err) {
|
||||
printf("[%s] NMSE = %f ", ggml_op_desc(t1), err);
|
||||
printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
|
||||
//for (int i = 0; i < (int) f1.size(); i++) {
|
||||
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
|
||||
//}
|
||||
|
@ -1449,6 +1449,7 @@ struct test_moe : public test_case {
|
|||
|
||||
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
|
||||
std::vector<std::unique_ptr<test_case>> test_cases;
|
||||
std::default_random_engine rng(0);
|
||||
|
||||
const ggml_type all_types[] = {
|
||||
GGML_TYPE_F32, GGML_TYPE_F16,
|
||||
|
@ -1583,7 +1584,19 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 1}, 5));
|
||||
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
|
||||
|
||||
test_cases.emplace_back(new test_soft_max());
|
||||
std::uniform_int_distribution<> dist_ne1(1, 50);
|
||||
int exponent = 1;
|
||||
while (exponent < (1 << 17)) {
|
||||
std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);
|
||||
|
||||
for (int n = 0; n < 10; ++n) {
|
||||
int64_t ne0 = dist_ne0(rng);
|
||||
int64_t ne1 = dist_ne1(rng);
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}));
|
||||
}
|
||||
|
||||
exponent <<= 1;
|
||||
}
|
||||
|
||||
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512)); // llama 7B
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue