Delete the depthwise_conv_2d and permute_cpy related code, replacing both with existing functions; optimize the ldp definition; and support the LLAMA_PERF option in CMake
This commit is contained in:
parent
ea6cdccea1
commit
9303bbf1b1
5 changed files with 46 additions and 246 deletions
|
@ -107,6 +107,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STA
|
||||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||||
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
||||||
|
|
||||||
|
|
||||||
|
# add perf arguments
|
||||||
|
option(LLAMA_PERF "llama: enable perf" OFF)
|
||||||
|
if (LLAMA_PERF)
|
||||||
|
add_definitions(-DGGML_PERF)
|
||||||
|
endif()
|
||||||
|
|
||||||
# Required for relocatable CMake package
|
# Required for relocatable CMake package
|
||||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,6 @@ cmake ../../ \
|
||||||
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
||||||
-DCMAKE_BUILD_TYPE=Release \
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
-DANDROID_ABI="arm64-v8a" \
|
-DANDROID_ABI="arm64-v8a" \
|
||||||
-DANDROID_PLATFORM=android-23
|
-DANDROID_PLATFORM=android-23 $1
|
||||||
|
|
||||||
make -j4
|
make -j4
|
||||||
|
|
|
@ -583,25 +583,24 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
mlp_1 = ggml_gelu(ctx0, mlp_1);
|
mlp_1 = ggml_gelu(ctx0, mlp_1);
|
||||||
struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
|
struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
|
||||||
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
|
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
|
||||||
// transpose from [1, 576, 2048] --> [1, 24, 24, 2048] --> [1, 2048, 24, 24]
|
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
|
||||||
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, mlp_3->ne[0], n_patch, n_patch, mlp_3->ne[3]);
|
|
||||||
// permute logic is src idxs 0,1,2,3 perm to dst idxs
|
|
||||||
mlp_3 = ggml_permute_cpy(ctx0, mlp_3, 2, 0, 1, 3);
|
|
||||||
// mlp_3 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
|
||||||
|
|
||||||
// block 1
|
// block 1
|
||||||
struct ggml_tensor * block_1 = nullptr;
|
struct ggml_tensor * block_1 = nullptr;
|
||||||
{
|
{
|
||||||
|
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
|
||||||
|
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
|
||||||
|
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
|
||||||
// stride = 1, padding = 1, bias is nullptr
|
// stride = 1, padding = 1, bias is nullptr
|
||||||
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1);
|
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1);
|
||||||
|
|
||||||
// layer norm
|
// layer norm
|
||||||
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||||
block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
|
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
|
||||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
|
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
|
||||||
block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3);
|
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||||
|
|
||||||
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||||
// hardswish
|
// hardswish
|
||||||
|
@ -621,17 +620,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||||
|
|
||||||
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
int w = block_1->ne[0], h = block_1->ne[1];
|
||||||
struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_1_block_2_0_w, 1, 1,
|
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||||
model.mm_model_block_1_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]);
|
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||||
block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1);
|
|
||||||
|
// block_1 shape = [1, 24*24, 2048], ne = [2048, 24*24, 1]
|
||||||
|
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
|
||||||
|
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||||
|
|
||||||
// layernorm
|
|
||||||
block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
|
|
||||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
|
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
|
||||||
block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3);
|
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||||
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||||
// residual
|
// residual
|
||||||
block_1 = ggml_add(ctx0, mlp_3, block_1);
|
block_1 = ggml_add(ctx0, mlp_3, block_1);
|
||||||
|
@ -644,11 +644,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
|
|
||||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
||||||
// layer norm
|
// layer norm
|
||||||
block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
|
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
|
||||||
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
||||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
|
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
|
||||||
block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3);
|
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
||||||
// hardswish
|
// hardswish
|
||||||
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||||
|
@ -664,22 +664,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
|
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
|
||||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
|
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
|
||||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||||
|
|
||||||
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
|
||||||
struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_2_block_2_0_w, 1, 1,
|
int w = block_1->ne[0], h = block_1->ne[1];
|
||||||
model.mm_model_block_2_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]);
|
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||||
block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1);
|
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||||
// layernorm
|
// block_1 shape = [1, 12*12, 2048], ne = [2048, 12*12, 1]
|
||||||
block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
|
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
|
||||||
|
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||||
|
|
||||||
|
|
||||||
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
||||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
|
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
|
||||||
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
|
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
|
||||||
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
|
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
|
||||||
}
|
}
|
||||||
embeddings = block_1;
|
embeddings = block_1;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
222
ggml.c
222
ggml.c
|
@ -1650,7 +1650,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"CLAMP",
|
"CLAMP",
|
||||||
"CONV_TRANSPOSE_1D",
|
"CONV_TRANSPOSE_1D",
|
||||||
"IM2COL",
|
"IM2COL",
|
||||||
"CONV_DEPTHWISE_2D",
|
|
||||||
"CONV_TRANSPOSE_2D",
|
"CONV_TRANSPOSE_2D",
|
||||||
"POOL_1D",
|
"POOL_1D",
|
||||||
"POOL_2D",
|
"POOL_2D",
|
||||||
|
@ -1684,7 +1683,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"CROSS_ENTROPY_LOSS_BACK",
|
"CROSS_ENTROPY_LOSS_BACK",
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
|
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
|
||||||
|
|
||||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"none",
|
"none",
|
||||||
|
@ -1738,7 +1737,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"conv_transpose_1d(x)",
|
"conv_transpose_1d(x)",
|
||||||
"im2col(x)",
|
"im2col(x)",
|
||||||
"conv_transpose_2d(x)",
|
"conv_transpose_2d(x)",
|
||||||
"conv_depthwise_2d(x)",
|
|
||||||
"pool_1d(x)",
|
"pool_1d(x)",
|
||||||
"pool_2d(x)",
|
"pool_2d(x)",
|
||||||
"upscale(x)",
|
"upscale(x)",
|
||||||
|
@ -1771,7 +1769,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"cross_entropy_loss_back(x,y)",
|
"cross_entropy_loss_back(x,y)",
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
|
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
|
||||||
|
|
||||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||||
|
|
||||||
|
@ -4780,24 +4778,6 @@ struct ggml_tensor * ggml_permute(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// some operations don't support permuted tensor, so we need to copy it, to avoid this case
|
|
||||||
struct ggml_tensor * ggml_permute_cpy(
|
|
||||||
struct ggml_context * ctx,
|
|
||||||
struct ggml_tensor * a,
|
|
||||||
int axis0,
|
|
||||||
int axis1,
|
|
||||||
int axis2,
|
|
||||||
int axis3) {
|
|
||||||
struct ggml_tensor * result = ggml_permute(ctx, a, axis0, axis1, axis2, axis3);
|
|
||||||
// new 4d tensor
|
|
||||||
struct ggml_tensor* tensor = ggml_new_tensor_4d(ctx, a->type, result->ne[0], result->ne[1], result->ne[2], result->ne[3]);
|
|
||||||
|
|
||||||
struct ggml_tensor* cpy = ggml_cpy(ctx, result, tensor);
|
|
||||||
|
|
||||||
return cpy;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// ggml_transpose
|
// ggml_transpose
|
||||||
|
|
||||||
struct ggml_tensor * ggml_transpose(
|
struct ggml_tensor * ggml_transpose(
|
||||||
|
@ -5402,37 +5382,18 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||||
int d0,
|
int d0,
|
||||||
int d1) {
|
int d1) {
|
||||||
|
|
||||||
|
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
|
||||||
|
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
|
||||||
|
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
|
||||||
|
s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
|
||||||
|
|
||||||
const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
|
struct ggml_tensor * result =
|
||||||
const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
|
ggml_mul_mat(ctx,
|
||||||
const int64_t ne[4] = {
|
ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
||||||
OW,
|
ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
||||||
OH,
|
|
||||||
b->ne[2],
|
|
||||||
b->ne[3],
|
|
||||||
};
|
|
||||||
// GGML_ASSERT(a->ne[3] == b->ne[2]);
|
|
||||||
// GGML_ASSERT(a->ne[2] == 1);
|
|
||||||
|
|
||||||
// weight ne: [KW, KH, OC, 1]
|
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
|
||||||
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
|
||||||
GGML_ASSERT(a->ne[3] == 1);
|
|
||||||
bool is_node = false;
|
|
||||||
/*
|
|
||||||
if (a->grad || b->grad) {
|
|
||||||
GGML_ASSERT(false); // TODO: implement backward
|
|
||||||
is_node = true;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
|
||||||
int32_t params[] = { s0, s1, p0, p1, d0, d1 };
|
|
||||||
ggml_set_op_params(result, params, sizeof(params));
|
|
||||||
|
|
||||||
result->op = GGML_OP_CONV_DEPTHWISE_2D;
|
|
||||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
||||||
result->src[0] = a;
|
|
||||||
result->src[1] = b;
|
|
||||||
result->src[2] = c;
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
// ggml_conv_2d
|
// ggml_conv_2d
|
||||||
|
@ -12529,159 +12490,6 @@ static void ggml_compute_forward_im2col(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_compute_forward_conv_depthwise_2d_f32(
|
|
||||||
const struct ggml_compute_params * params,
|
|
||||||
const struct ggml_tensor * src0,
|
|
||||||
const struct ggml_tensor * src1,
|
|
||||||
const struct ggml_tensor * src2,
|
|
||||||
struct ggml_tensor * dst) {
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
||||||
|
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
|
||||||
|
|
||||||
const int ith = params->ith;
|
|
||||||
const int nth = params->nth;
|
|
||||||
|
|
||||||
// total patches in dst
|
|
||||||
const int np = ne2;
|
|
||||||
|
|
||||||
// patches per thread
|
|
||||||
const int dp = (np + nth - 1)/nth;
|
|
||||||
|
|
||||||
// patch range for this thread
|
|
||||||
const int ip0 = dp*ith;
|
|
||||||
const int ip1 = MIN(ip0 + dp, np);
|
|
||||||
|
|
||||||
const int32_t stride_h = ggml_get_op_params_i32(dst, 0);
|
|
||||||
const int32_t stride_w = ggml_get_op_params_i32(dst, 1);
|
|
||||||
const int32_t pad_h = ggml_get_op_params_i32(dst, 2);
|
|
||||||
const int32_t pad_w = ggml_get_op_params_i32(dst, 3);
|
|
||||||
const int32_t dilation_h = ggml_get_op_params_i32(dst, 4);
|
|
||||||
const int32_t dilation_w = ggml_get_op_params_i32(dst, 5);
|
|
||||||
|
|
||||||
float* weight = (float*)(src0->data);
|
|
||||||
float* input = (float*)(src1->data);
|
|
||||||
// float* bias = (float*)(src2->data);
|
|
||||||
float* output = (float*)(dst->data);
|
|
||||||
for (int b = 0; b < ne13; ++b) {
|
|
||||||
for (int o_c = ip0; o_c < ip1; ++o_c) {
|
|
||||||
for (int o_h = 0; o_h < ne1; ++o_h) {
|
|
||||||
for (int o_w = 0; o_w < ne0; ++o_w) {
|
|
||||||
float result_data = 0;
|
|
||||||
int g = o_c;
|
|
||||||
int i_c = g;
|
|
||||||
for (int k_h = 0; k_h < ne01; ++k_h) {
|
|
||||||
for (int k_w = 0; k_w < ne00; ++k_w) {
|
|
||||||
int i_h = o_h * stride_h - pad_h + k_h * dilation_h;
|
|
||||||
int i_w = o_w * stride_w - pad_w + k_w * dilation_w;
|
|
||||||
if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w];
|
|
||||||
float weight_data = weight[(g * ne01 + k_h) * ne00 + k_w];
|
|
||||||
result_data += input_data * weight_data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c];
|
|
||||||
output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_compute_forward_conv_depthwise_2d_f16_f32(
|
|
||||||
const struct ggml_compute_params * params,
|
|
||||||
const struct ggml_tensor * src0,
|
|
||||||
const struct ggml_tensor * src1,
|
|
||||||
const struct ggml_tensor * src2,
|
|
||||||
struct ggml_tensor * dst) {
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
||||||
|
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
|
||||||
|
|
||||||
const int ith = params->ith;
|
|
||||||
const int nth = params->nth;
|
|
||||||
|
|
||||||
// total patches in dst
|
|
||||||
const int np = ne2;
|
|
||||||
|
|
||||||
// patches per thread
|
|
||||||
const int dp = (np + nth - 1)/nth;
|
|
||||||
|
|
||||||
// patch range for this thread
|
|
||||||
const int ip0 = dp*ith;
|
|
||||||
const int ip1 = MIN(ip0 + dp, np);
|
|
||||||
|
|
||||||
const int32_t stride_h = ggml_get_op_params_i32(dst, 0);
|
|
||||||
const int32_t stride_w = ggml_get_op_params_i32(dst, 1);
|
|
||||||
const int32_t pad_h = ggml_get_op_params_i32(dst, 2);
|
|
||||||
const int32_t pad_w = ggml_get_op_params_i32(dst, 3);
|
|
||||||
const int32_t dilation_h = ggml_get_op_params_i32(dst, 4);
|
|
||||||
const int32_t dilation_w = ggml_get_op_params_i32(dst, 5);
|
|
||||||
|
|
||||||
ggml_fp16_t* weight = (ggml_fp16_t*)(src0->data);
|
|
||||||
float* input = (float*)(src1->data);
|
|
||||||
// float* bias = (float*)(src2->data);
|
|
||||||
float* output = (float*)(dst->data);
|
|
||||||
for (int b = 0; b < ne13; ++b) {
|
|
||||||
for (int o_c = ip0; o_c < ip1; ++o_c) {
|
|
||||||
for (int o_h = 0; o_h < ne1; ++o_h) {
|
|
||||||
for (int o_w = 0; o_w < ne0; ++o_w) {
|
|
||||||
float result_data = 0;
|
|
||||||
int g = o_c;
|
|
||||||
int i_c = g;
|
|
||||||
for (int k_h = 0; k_h < ne01; ++k_h) {
|
|
||||||
for (int k_w = 0; k_w < ne00; ++k_w) {
|
|
||||||
int i_h = o_h * stride_h - pad_h + k_h * dilation_h;
|
|
||||||
int i_w = o_w * stride_w - pad_w + k_w * dilation_w;
|
|
||||||
if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w];
|
|
||||||
float weight_data = GGML_FP16_TO_FP32(weight[(g * ne01 + k_h) * ne00 + k_w]);
|
|
||||||
result_data += input_data * weight_data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c];
|
|
||||||
output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_compute_forward_conv_depthwise_2d(
|
|
||||||
const struct ggml_compute_params * params,
|
|
||||||
const struct ggml_tensor * src0,
|
|
||||||
const struct ggml_tensor * src1,
|
|
||||||
const struct ggml_tensor * src2,
|
|
||||||
struct ggml_tensor * dst) {
|
|
||||||
switch (src0->type) {
|
|
||||||
case GGML_TYPE_F32:
|
|
||||||
{
|
|
||||||
ggml_compute_forward_conv_depthwise_2d_f32(params, src0, src1, src2, dst);
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_F16:
|
|
||||||
{
|
|
||||||
if (src1->type == GGML_TYPE_F32) {
|
|
||||||
ggml_compute_forward_conv_depthwise_2d_f16_f32(params, src0, src1, src2, dst);
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
}
|
|
||||||
} break;
|
|
||||||
default:
|
|
||||||
{
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
} break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ggml_compute_forward_conv_transpose_2d
|
// ggml_compute_forward_conv_transpose_2d
|
||||||
|
|
||||||
|
@ -15024,10 +14832,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||||
{
|
{
|
||||||
ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
|
ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_CONV_DEPTHWISE_2D:
|
|
||||||
{
|
|
||||||
ggml_compute_forward_conv_depthwise_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
|
|
||||||
} break;
|
|
||||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
|
ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
|
||||||
|
@ -16764,10 +16568,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
n_tasks = n_threads;
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_CONV_DEPTHWISE_2D:
|
|
||||||
{
|
|
||||||
n_tasks = n_threads;
|
|
||||||
} break;
|
|
||||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
n_tasks = n_threads;
|
||||||
|
|
10
ggml.h
10
ggml.h
|
@ -433,7 +433,6 @@ extern "C" {
|
||||||
GGML_OP_CLAMP,
|
GGML_OP_CLAMP,
|
||||||
GGML_OP_CONV_TRANSPOSE_1D,
|
GGML_OP_CONV_TRANSPOSE_1D,
|
||||||
GGML_OP_IM2COL,
|
GGML_OP_IM2COL,
|
||||||
GGML_OP_CONV_DEPTHWISE_2D,
|
|
||||||
GGML_OP_CONV_TRANSPOSE_2D,
|
GGML_OP_CONV_TRANSPOSE_2D,
|
||||||
GGML_OP_POOL_1D,
|
GGML_OP_POOL_1D,
|
||||||
GGML_OP_POOL_2D,
|
GGML_OP_POOL_2D,
|
||||||
|
@ -1297,15 +1296,6 @@ extern "C" {
|
||||||
int axis2,
|
int axis2,
|
||||||
int axis3);
|
int axis3);
|
||||||
|
|
||||||
// some operations don't support permuted tensor, so we need to copy it, to avoid this case
|
|
||||||
GGML_API struct ggml_tensor * ggml_permute_cpy(
|
|
||||||
struct ggml_context * ctx,
|
|
||||||
struct ggml_tensor * a,
|
|
||||||
int axis0,
|
|
||||||
int axis1,
|
|
||||||
int axis2,
|
|
||||||
int axis3);
|
|
||||||
|
|
||||||
// alias for ggml_permute(ctx, a, 1, 0, 2, 3)
|
// alias for ggml_permute(ctx, a, 1, 0, 2, 3)
|
||||||
GGML_API struct ggml_tensor * ggml_transpose(
|
GGML_API struct ggml_tensor * ggml_transpose(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue