Delete the depthwise_conv_2d and permute_cpy related code, replacing both with existing functions; optimize the ldp definition; and add a LLAMA_PERF option to CMake

This commit is contained in:
Chenxiaotao03 2024-01-19 12:50:01 +08:00
parent ea6cdccea1
commit 9303bbf1b1
5 changed files with 46 additions and 246 deletions

View file

@ -107,6 +107,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STA
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
# add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF)
if (LLAMA_PERF)
add_definitions(-DGGML_PERF)
endif()
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)

View file

@ -3,6 +3,6 @@ cmake ../../ \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_PLATFORM=android-23
-DANDROID_PLATFORM=android-23 $1
make -j4

View file

@ -583,25 +583,24 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
mlp_1 = ggml_gelu(ctx0, mlp_1);
struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
// transpose from [1, 576, 2048] --> [1, 24, 24, 2048] --> [1, 2048, 24, 24]
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, mlp_3->ne[0], n_patch, n_patch, mlp_3->ne[3]);
// permute logic is src idxs 0,1,2,3 perm to dst idxs
mlp_3 = ggml_permute_cpy(ctx0, mlp_3, 2, 0, 1, 3);
// mlp_3 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
// block 1
struct ggml_tensor * block_1 = nullptr;
{
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
// stride = 1, padding = 1, bias is nullptr
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1);
// layer norm
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// hardswish
@ -621,17 +620,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_1_block_2_0_w, 1, 1,
model.mm_model_block_1_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]);
block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// layernorm
block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// residual
block_1 = ggml_add(ctx0, mlp_3, block_1);
@ -644,11 +644,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// layer norm
block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
@ -668,12 +668,15 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_2_block_2_0_w, 1, 1,
model.mm_model_block_2_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]);
block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1);
// layernorm
block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);

222
ggml.c
View file

@ -1650,7 +1650,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CLAMP",
"CONV_TRANSPOSE_1D",
"IM2COL",
"CONV_DEPTHWISE_2D",
"CONV_TRANSPOSE_2D",
"POOL_1D",
"POOL_2D",
@ -1684,7 +1683,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
};
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@ -1738,7 +1737,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"conv_transpose_1d(x)",
"im2col(x)",
"conv_transpose_2d(x)",
"conv_depthwise_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
"upscale(x)",
@ -1771,7 +1769,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)",
};
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -4780,24 +4778,6 @@ struct ggml_tensor * ggml_permute(
return result;
}
// ggml_permute_cpy
// Permute the axes of `a` (same axis semantics as ggml_permute), then copy the
// result into a freshly allocated tensor. Some operations don't support
// permuted (non-contiguous) tensors, so this materializes the permuted data.
// Returns the copy node, whose ne[] reflect the permuted extents.
struct ggml_tensor * ggml_permute_cpy(
struct ggml_context * ctx,
struct ggml_tensor * a,
int axis0,
int axis1,
int axis2,
int axis3) {
// build the permuted view first; its ne[] carry the new axis order
struct ggml_tensor * result = ggml_permute(ctx, a, axis0, axis1, axis2, axis3);
// new 4d tensor: a destination with the permuted extents and a's element type
struct ggml_tensor* tensor = ggml_new_tensor_4d(ctx, a->type, result->ne[0], result->ne[1], result->ne[2], result->ne[3]);
// the cpy op materializes the permuted data into `tensor` when evaluated
struct ggml_tensor* cpy = ggml_cpy(ctx, result, tensor);
return cpy;
}
// ggml_transpose
struct ggml_tensor * ggml_transpose(
@ -5402,37 +5382,18 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
int d0,
int d1) {
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
const int64_t ne[4] = {
OW,
OH,
b->ne[2],
b->ne[3],
};
// GGML_ASSERT(a->ne[3] == b->ne[2]);
// GGML_ASSERT(a->ne[2] == 1);
struct ggml_tensor * result =
ggml_mul_mat(ctx,
ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC1, KH, KW] => [1, OC, 1, KH * KW]
ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
// weight ne: [KW, KH, OC, 1]
GGML_ASSERT(a->ne[2] == b->ne[2]);
GGML_ASSERT(a->ne[3] == 1);
bool is_node = false;
/*
if (a->grad || b->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
*/
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
int32_t params[] = { s0, s1, p0, p1, d0, d1 };
ggml_set_op_params(result, params, sizeof(params));
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
result->op = GGML_OP_CONV_DEPTHWISE_2D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;
return result;
}
// ggml_conv_2d
@ -12529,159 +12490,6 @@ static void ggml_compute_forward_im2col(
}
}
// ggml_compute_forward_conv_depthwise_2d_f32
// Forward pass of a depthwise 2D convolution with F32 weights and F32 input.
// src0 = weights (one KxK filter per channel), src1 = input, src2 = bias
// (bias application is commented out below, so src2 is effectively unused),
// dst = output. Threads split the work by output channel (ne2).
static void ggml_compute_forward_conv_depthwise_2d_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * src2,
struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
// total patches in dst
const int np = ne2;
// patches per thread
const int dp = (np + nth - 1)/nth;
// patch range for this thread
const int ip0 = dp*ith;
const int ip1 = MIN(ip0 + dp, np);
// conv hyper-parameters packed into dst's op params by the builder as
// { s0, s1, p0, p1, d0, d1 }.
// NOTE(review): index 0 is read as stride_h here while the builder packs s0
// (conventionally the width stride) first — verify the h/w ordering; the
// visible call site uses symmetric values (1,1,1,1,1,1), which masks any swap.
const int32_t stride_h = ggml_get_op_params_i32(dst, 0);
const int32_t stride_w = ggml_get_op_params_i32(dst, 1);
const int32_t pad_h = ggml_get_op_params_i32(dst, 2);
const int32_t pad_w = ggml_get_op_params_i32(dst, 3);
const int32_t dilation_h = ggml_get_op_params_i32(dst, 4);
const int32_t dilation_w = ggml_get_op_params_i32(dst, 5);
float* weight = (float*)(src0->data);
float* input = (float*)(src1->data);
// float* bias = (float*)(src2->data);
float* output = (float*)(dst->data);
// batch (b) -> output channel (o_c, this thread's slice) -> output row/col
for (int b = 0; b < ne13; ++b) {
for (int o_c = ip0; o_c < ip1; ++o_c) {
for (int o_h = 0; o_h < ne1; ++o_h) {
for (int o_w = 0; o_w < ne0; ++o_w) {
float result_data = 0;
// depthwise: group == output channel == input channel (one filter per channel)
int g = o_c;
int i_c = g;
// accumulate over the KH x KW filter taps
for (int k_h = 0; k_h < ne01; ++k_h) {
for (int k_w = 0; k_w < ne00; ++k_w) {
int i_h = o_h * stride_h - pad_h + k_h * dilation_h;
int i_w = o_w * stride_w - pad_w + k_w * dilation_w;
// zero padding: taps that fall outside the input contribute nothing
if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) {
continue;
}
float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w];
float weight_data = weight[(g * ne01 + k_h) * ne00 + k_w];
result_data += input_data * weight_data;
}
}
// output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c];
output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data;
}
}
}
}
}
// ggml_compute_forward_conv_depthwise_2d_f16_f32
// Forward pass of a depthwise 2D convolution with F16 weights and F32 input;
// each weight tap is converted to F32 on the fly via GGML_FP16_TO_FP32.
// src0 = weights, src1 = input, src2 = bias (bias application is commented
// out below, so src2 is effectively unused), dst = F32 output.
// Threads split the work by output channel (ne2). Mirrors the F32 variant.
static void ggml_compute_forward_conv_depthwise_2d_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * src2,
struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
// total patches in dst
const int np = ne2;
// patches per thread
const int dp = (np + nth - 1)/nth;
// patch range for this thread
const int ip0 = dp*ith;
const int ip1 = MIN(ip0 + dp, np);
// conv hyper-parameters packed into dst's op params by the builder as
// { s0, s1, p0, p1, d0, d1 } — same h/w ordering caveat as the F32 variant.
const int32_t stride_h = ggml_get_op_params_i32(dst, 0);
const int32_t stride_w = ggml_get_op_params_i32(dst, 1);
const int32_t pad_h = ggml_get_op_params_i32(dst, 2);
const int32_t pad_w = ggml_get_op_params_i32(dst, 3);
const int32_t dilation_h = ggml_get_op_params_i32(dst, 4);
const int32_t dilation_w = ggml_get_op_params_i32(dst, 5);
ggml_fp16_t* weight = (ggml_fp16_t*)(src0->data);
float* input = (float*)(src1->data);
// float* bias = (float*)(src2->data);
float* output = (float*)(dst->data);
// batch (b) -> output channel (o_c, this thread's slice) -> output row/col
for (int b = 0; b < ne13; ++b) {
for (int o_c = ip0; o_c < ip1; ++o_c) {
for (int o_h = 0; o_h < ne1; ++o_h) {
for (int o_w = 0; o_w < ne0; ++o_w) {
float result_data = 0;
// depthwise: group == output channel == input channel (one filter per channel)
int g = o_c;
int i_c = g;
// accumulate over the KH x KW filter taps
for (int k_h = 0; k_h < ne01; ++k_h) {
for (int k_w = 0; k_w < ne00; ++k_w) {
int i_h = o_h * stride_h - pad_h + k_h * dilation_h;
int i_w = o_w * stride_w - pad_w + k_w * dilation_w;
// zero padding: taps that fall outside the input contribute nothing
if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) {
continue;
}
float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w];
// widen the F16 weight to F32 for the accumulation
float weight_data = GGML_FP16_TO_FP32(weight[(g * ne01 + k_h) * ne00 + k_w]);
result_data += input_data * weight_data;
}
}
// output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c];
output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data;
}
}
}
}
}
// ggml_compute_forward_conv_depthwise_2d
// Type dispatcher for the depthwise 2D convolution forward pass.
// Selects the kernel by weight (src0) type: F32 weights, or F16 weights with
// an F32 input. Any other type combination aborts via GGML_ASSERT(false).
static void ggml_compute_forward_conv_depthwise_2d(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * src2,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_conv_depthwise_2d_f32(params, src0, src1, src2, dst);
} break;
case GGML_TYPE_F16:
{
// F16 weights require an F32 input; no other input type is implemented
if (src1->type == GGML_TYPE_F32) {
ggml_compute_forward_conv_depthwise_2d_f16_f32(params, src0, src1, src2, dst);
} else {
GGML_ASSERT(false);
}
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_conv_transpose_2d
@ -15024,10 +14832,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_CONV_DEPTHWISE_2D:
{
ggml_compute_forward_conv_depthwise_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
@ -16764,10 +16568,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = n_threads;
} break;
case GGML_OP_CONV_DEPTHWISE_2D:
{
n_tasks = n_threads;
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
n_tasks = n_threads;

10
ggml.h
View file

@ -433,7 +433,6 @@ extern "C" {
GGML_OP_CLAMP,
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_CONV_DEPTHWISE_2D,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
@ -1297,15 +1296,6 @@ extern "C" {
int axis2,
int axis3);
// some operations don't support permuted tensor, so we need to copy it, to avoid this case
GGML_API struct ggml_tensor * ggml_permute_cpy(
struct ggml_context * ctx,
struct ggml_tensor * a,
int axis0,
int axis1,
int axis2,
int axis3);
// alias for ggml_permute(ctx, a, 1, 0, 2, 3)
GGML_API struct ggml_tensor * ggml_transpose(
struct ggml_context * ctx,