Merge branch 'ggerganov:master' into server-chat-templates
This commit is contained in:
commit
a1ee42d521
13 changed files with 144 additions and 27 deletions
|
@ -196,6 +196,7 @@ struct vk_device_struct {
|
|||
vk_pipeline pipeline_pad_f32;
|
||||
vk_pipeline pipeline_repeat_f32;
|
||||
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
|
||||
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
|
||||
vk_pipeline pipeline_norm_f32;
|
||||
vk_pipeline pipeline_group_norm_f32;
|
||||
vk_pipeline pipeline_rms_norm_f32;
|
||||
|
@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
|||
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
||||
assert(compile_count > 0);
|
||||
compile_count--;
|
||||
|
||||
// "Progress bar" for shader compiles
|
||||
static uint32_t total_compile_count = 0;
|
||||
if ((total_compile_count++ % 10) == 0) {
|
||||
std::cerr << ".";
|
||||
}
|
||||
}
|
||||
compile_count_cond.notify_all();
|
||||
}
|
||||
|
@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
|
|||
static void ggml_vk_load_shaders(vk_device& device) {
|
||||
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
|
||||
|
||||
std::cerr << "ggml_vulkan: Compiling shaders";
|
||||
|
||||
// mulmat
|
||||
std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
|
||||
|
@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
|
@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
}
|
||||
std::cerr << "Done!" << std::endl;
|
||||
}
|
||||
|
||||
static vk_device ggml_vk_get_device(size_t idx) {
|
||||
|
@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
|
|||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||
}
|
||||
|
||||
static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
|
||||
if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_cpy_f32_f32;
|
||||
static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
|
||||
|
||||
// Choose "contiguous copy" shader if src/dst are contiguous
|
||||
bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));
|
||||
|
||||
if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
|
||||
if (contig) {
|
||||
return ctx->device->pipeline_contig_cpy_f32_f32;
|
||||
} else {
|
||||
return ctx->device->pipeline_cpy_f32_f32;
|
||||
}
|
||||
}
|
||||
if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
|
||||
return ctx->device->pipeline_cpy_f32_f16;
|
||||
if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
|
||||
if (contig) {
|
||||
return ctx->device->pipeline_contig_cpy_f32_f16;
|
||||
} else {
|
||||
return ctx->device->pipeline_cpy_f32_f16;
|
||||
}
|
||||
}
|
||||
if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
|
||||
return ctx->device->pipeline_cpy_f16_f16;
|
||||
if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
|
||||
if (contig) {
|
||||
return ctx->device->pipeline_contig_cpy_f16_f16;
|
||||
} else {
|
||||
return ctx->device->pipeline_cpy_f16_f16;
|
||||
}
|
||||
}
|
||||
|
||||
std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
|
||||
std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
|
@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
|
|||
const int tensor_type_size = ggml_type_size(tensor->type);
|
||||
|
||||
const uint32_t ne = ggml_nelements(tensor);
|
||||
std::array<uint32_t, 3> elements;
|
||||
|
||||
if (ne > 262144) {
|
||||
elements = { 512, 512, CEIL_DIV(ne, 262144) };
|
||||
} else if (ne > 512) {
|
||||
elements = { 512, CEIL_DIV(ne, 512), 1 };
|
||||
} else {
|
||||
elements = { ne, 1, 1 };
|
||||
}
|
||||
|
||||
const vk_op_unary_push_constants pc = {
|
||||
(uint32_t)ne,
|
||||
|
@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
|
|||
0.0f, 0.0f,
|
||||
};
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
|
||||
}
|
||||
|
||||
static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
|
@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|||
vk_pipeline to_fp16_vk_1 = nullptr;
|
||||
|
||||
if (x_non_contig) {
|
||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
|
||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
|
||||
} else {
|
||||
to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
|
||||
}
|
||||
if (y_non_contig) {
|
||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
|
||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
|
||||
} else {
|
||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||
}
|
||||
|
@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||
vk_pipeline to_fp16_vk_0 = nullptr;
|
||||
vk_pipeline to_fp16_vk_1 = nullptr;
|
||||
if (x_non_contig) {
|
||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
|
||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
|
||||
}
|
||||
if (y_non_contig) {
|
||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
|
||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
|
||||
} else {
|
||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||
}
|
||||
|
@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||
vk_pipeline to_fp16_vk_1 = nullptr;
|
||||
|
||||
if (x_non_contig) {
|
||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
|
||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
|
||||
} else {
|
||||
to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
|
||||
}
|
||||
if (y_non_contig) {
|
||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
|
||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
|
||||
} else {
|
||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||
}
|
||||
|
@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|||
vk_pipeline to_fp16_vk_0 = nullptr;
|
||||
vk_pipeline to_fp16_vk_1 = nullptr;
|
||||
if (x_non_contig) {
|
||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
|
||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
|
||||
}
|
||||
if (y_non_contig) {
|
||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
|
||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
|
||||
} else {
|
||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||
}
|
||||
|
@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||
case GGML_OP_CPY:
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_DUP:
|
||||
return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
|
||||
return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
|
||||
case GGML_OP_NORM:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_norm_f32;
|
||||
|
@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
|||
case GGML_OP_DIV:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
|
|
42
ggml/src/vulkan-shaders/contig_copy.comp
Normal file
42
ggml/src/vulkan-shaders/contig_copy.comp
Normal file
|
@ -0,0 +1,42 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
const uint num_threads = 128;
|
||||
|
||||
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
uint idx = get_idx();
|
||||
|
||||
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
||||
const uint num_iter = 4;
|
||||
|
||||
// fast path for when all four iterations are in-bounds
|
||||
if (idx + (num_iter-1)*num_threads < p.ne) {
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
|
||||
#else
|
||||
data_d[p.d_offset + idx] = data_a[idx];
|
||||
#endif
|
||||
idx += num_threads;
|
||||
}
|
||||
} else {
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
if (idx >= p.ne) {
|
||||
continue;
|
||||
}
|
||||
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
|
||||
#else
|
||||
data_d[p.d_offset + idx] = data_a[idx];
|
||||
#endif
|
||||
idx += num_threads;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -3,6 +3,8 @@
|
|||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
|
@ -9,8 +10,6 @@ layout (push_constant) uniform parameter
|
|||
float param1; float param2;
|
||||
} p;
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
uint src0_idx_mod(uint idx) {
|
||||
const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
|
||||
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
|
||||
|
|
|
@ -3,12 +3,22 @@
|
|||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
const uint num_threads = 128;
|
||||
|
||||
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
||||
const uint num_iter = 4;
|
||||
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
if (idx >= p.ne) {
|
||||
continue;
|
||||
}
|
||||
|
||||
data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
|
||||
idx += num_threads;
|
||||
}
|
||||
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
|
||||
}
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
|
|
|
@ -350,6 +350,9 @@ void process_shaders() {
|
|||
string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
|
||||
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
||||
|
|
|
@ -681,6 +681,7 @@ struct test_case {
|
|||
|
||||
// run
|
||||
int64_t total_time_us = 0;
|
||||
int64_t total_mem = 0;
|
||||
int total_runs = 0;
|
||||
do {
|
||||
int64_t start_time = ggml_time_us();
|
||||
|
@ -688,6 +689,7 @@ struct test_case {
|
|||
int64_t end_time = ggml_time_us();
|
||||
|
||||
total_time_us += end_time - start_time;
|
||||
total_mem += mem;
|
||||
total_runs += n_runs;
|
||||
} while (total_time_us < 1000*1000); // run for at least 1 second
|
||||
|
||||
|
@ -717,7 +719,7 @@ struct test_case {
|
|||
} else {
|
||||
printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
|
||||
op_size(out) / 1024,
|
||||
mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
|
||||
total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
|
@ -2740,6 +2742,13 @@ struct test_flash_attn_ext : public test_case {
|
|||
return 5e-4;
|
||||
}
|
||||
|
||||
uint64_t op_flops(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
// Just counting matmul costs:
|
||||
// Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head
|
||||
return 2 * 2 * nh * nb * hs * kv;
|
||||
}
|
||||
|
||||
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
|
||||
bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
|
||||
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
|
||||
|
@ -3779,6 +3788,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
|||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
|
||||
|
||||
for (int bs : {1, 512}) {
|
||||
for (ggml_type type_a : all_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue