From 9303bbf1b15baf30d857e4aff1aa84de152a549a Mon Sep 17 00:00:00 2001 From: Chenxiaotao03 Date: Fri, 19 Jan 2024 12:50:01 +0800 Subject: [PATCH] delete depthwise_conv_2d and permute_cpy relative code, replace the two by the existed functions, and opt ldp definition, support LLAMA_PERF option for CMake --- CMakeLists.txt | 7 ++ android/build_64.sh | 2 +- examples/llava/clip.cpp | 51 ++++----- ggml.c | 222 ++-------------------------------------- ggml.h | 10 -- 5 files changed, 46 insertions(+), 246 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2741568ed..390bccc23 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,6 +107,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STA option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ON) + +# add perf arguments +option(LLAMA_PERF "llama: enable perf" OFF) +if (LLAMA_PERF) + add_definitions(-DGGML_PERF) +endif() + # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) diff --git a/android/build_64.sh b/android/build_64.sh index 529fb291e..3982854e2 100755 --- a/android/build_64.sh +++ b/android/build_64.sh @@ -3,6 +3,6 @@ cmake ../../ \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DCMAKE_BUILD_TYPE=Release \ -DANDROID_ABI="arm64-v8a" \ --DANDROID_PLATFORM=android-23 +-DANDROID_PLATFORM=android-23 $1 make -j4 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index c900f5a2b..34f36b4a0 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -583,25 +583,24 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_1 = ggml_gelu(ctx0, mlp_1); struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); - // transpose from [1, 576, 2048] --> [1, 24, 24, 2048] --> [1, 2048, 24, 24] - mlp_3 = ggml_reshape_4d(ctx0, 
mlp_3, mlp_3->ne[0], n_patch, n_patch, mlp_3->ne[3]); - // permute logic is src idxs 0,1,2,3 perm to dst idxs - mlp_3 = ggml_permute_cpy(ctx0, mlp_3, 2, 0, 1, 3); - // mlp_3 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] // block 1 struct ggml_tensor * block_1 = nullptr; { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); + mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1); // layer norm // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); - block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] // hardswish @@ -621,17 +620,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); block_1 = ggml_mul(ctx0, block_1_hw, block_1); - // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_1_block_2_0_w, 1, 1, - model.mm_model_block_1_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]); - block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1); + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, 
w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - // layernorm - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); - block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] // residual block_1 = ggml_add(ctx0, mlp_3, block_1); @@ -644,11 +644,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // layer norm - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); - block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // hardswish struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); @@ -664,22 +664,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); block_1 = ggml_hardsigmoid(ctx0, block_1); - + // block_1_hw shape = [1, 2048, 12, 
12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); block_1 = ggml_mul(ctx0, block_1_hw, block_1); - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_2_block_2_0_w, 1, 1, - model.mm_model_block_2_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]); - block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1); - // layernorm - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 12*12, 2048], ne = [12*12, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } + } embeddings = block_1; } else { diff --git a/ggml.c b/ggml.c index 3befb4efe..2e3849210 100644 --- a/ggml.c +++ b/ggml.c @@ -1650,7 +1650,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CLAMP", "CONV_TRANSPOSE_1D", "IM2COL", - "CONV_DEPTHWISE_2D", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", @@ -1684,7 +1683,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); +static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); static const char * 
GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1738,7 +1737,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "conv_transpose_2d(x)", - "conv_depthwise_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", @@ -1771,7 +1769,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); +static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4780,24 +4778,6 @@ struct ggml_tensor * ggml_permute( return result; } -// some operations don't support permuted tensor, so we need to copy it, to avoid this case -struct ggml_tensor * ggml_permute_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - int axis0, - int axis1, - int axis2, - int axis3) { - struct ggml_tensor * result = ggml_permute(ctx, a, axis0, axis1, axis2, axis3); - // new 4d tensor - struct ggml_tensor* tensor = ggml_new_tensor_4d(ctx, a->type, result->ne[0], result->ne[1], result->ne[2], result->ne[3]); - - struct ggml_tensor* cpy = ggml_cpy(ctx, result, tensor); - - return cpy; -} - - // ggml_transpose struct ggml_tensor * ggml_transpose( @@ -5402,37 +5382,18 @@ struct ggml_tensor * ggml_conv_depthwise_2d( int d0, int d1) { + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, + ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), + s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW] - const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); - const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); - const int64_t ne[4] = { - OW, - OH, - b->ne[2], - b->ne[3], - }; - // GGML_ASSERT(a->ne[3] == b->ne[2]); - // GGML_ASSERT(a->ne[2] == 1); + struct ggml_tensor * result = + ggml_mul_mat(ctx, + ggml_reshape_4d(ctx, new_a, 
(new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW] + ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] - // weight ne: [KW, KH, OC, 1] - GGML_ASSERT(a->ne[2] == b->ne[2]); - GGML_ASSERT(a->ne[3] == 1); - bool is_node = false; - /* - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - */ - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1 }; - ggml_set_op_params(result, params, sizeof(params)); + result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] - result->op = GGML_OP_CONV_DEPTHWISE_2D; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - result->src[2] = c; return result; } // ggml_conv_2d @@ -12529,159 +12490,6 @@ static void ggml_compute_forward_im2col( } } -static void ggml_compute_forward_conv_depthwise_2d_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - - const int32_t stride_h = ggml_get_op_params_i32(dst, 0); - const int32_t stride_w = ggml_get_op_params_i32(dst, 1); - const int32_t pad_h = ggml_get_op_params_i32(dst, 2); - const int32_t pad_w = ggml_get_op_params_i32(dst, 3); - const int32_t dilation_h = 
ggml_get_op_params_i32(dst, 4); - const int32_t dilation_w = ggml_get_op_params_i32(dst, 5); - - float* weight = (float*)(src0->data); - float* input = (float*)(src1->data); - // float* bias = (float*)(src2->data); - float* output = (float*)(dst->data); - for (int b = 0; b < ne13; ++b) { - for (int o_c = ip0; o_c < ip1; ++o_c) { - for (int o_h = 0; o_h < ne1; ++o_h) { - for (int o_w = 0; o_w < ne0; ++o_w) { - float result_data = 0; - int g = o_c; - int i_c = g; - for (int k_h = 0; k_h < ne01; ++k_h) { - for (int k_w = 0; k_w < ne00; ++k_w) { - int i_h = o_h * stride_h - pad_h + k_h * dilation_h; - int i_w = o_w * stride_w - pad_w + k_w * dilation_w; - if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) { - continue; - } - float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w]; - float weight_data = weight[(g * ne01 + k_h) * ne00 + k_w]; - result_data += input_data * weight_data; - } - } - // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c]; - output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data; - } - } - } - } - -} - -static void ggml_compute_forward_conv_depthwise_2d_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - - const int32_t stride_h = ggml_get_op_params_i32(dst, 0); - const int32_t stride_w = ggml_get_op_params_i32(dst, 1); - const int32_t pad_h = ggml_get_op_params_i32(dst, 2); - const int32_t pad_w = ggml_get_op_params_i32(dst, 
3); - const int32_t dilation_h = ggml_get_op_params_i32(dst, 4); - const int32_t dilation_w = ggml_get_op_params_i32(dst, 5); - - ggml_fp16_t* weight = (ggml_fp16_t*)(src0->data); - float* input = (float*)(src1->data); - // float* bias = (float*)(src2->data); - float* output = (float*)(dst->data); - for (int b = 0; b < ne13; ++b) { - for (int o_c = ip0; o_c < ip1; ++o_c) { - for (int o_h = 0; o_h < ne1; ++o_h) { - for (int o_w = 0; o_w < ne0; ++o_w) { - float result_data = 0; - int g = o_c; - int i_c = g; - for (int k_h = 0; k_h < ne01; ++k_h) { - for (int k_w = 0; k_w < ne00; ++k_w) { - int i_h = o_h * stride_h - pad_h + k_h * dilation_h; - int i_w = o_w * stride_w - pad_w + k_w * dilation_w; - if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) { - continue; - } - float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w]; - float weight_data = GGML_FP16_TO_FP32(weight[(g * ne01 + k_h) * ne00 + k_w]); - result_data += input_data * weight_data; - } - } - // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c]; - output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data; - } - } - } - } - -} - -static void ggml_compute_forward_conv_depthwise_2d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_conv_depthwise_2d_f32(params, src0, src1, src2, dst); - } break; - case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_conv_depthwise_2d_f16_f32(params, src0, src1, src2, dst); - } else { - GGML_ASSERT(false); - } - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} // ggml_compute_forward_conv_transpose_2d @@ -15024,10 +14832,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); 
} break; - case GGML_OP_CONV_DEPTHWISE_2D: - { - ggml_compute_forward_conv_depthwise_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); - } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -16764,10 +16568,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; - case GGML_OP_CONV_DEPTHWISE_2D: - { - n_tasks = n_threads; - } break; case GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; diff --git a/ggml.h b/ggml.h index 1ca68c2f4..50e3882c8 100644 --- a/ggml.h +++ b/ggml.h @@ -433,7 +433,6 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, - GGML_OP_CONV_DEPTHWISE_2D, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -1297,15 +1296,6 @@ extern "C" { int axis2, int axis3); - // some operations don't support permuted tensor, so we need to copy it, to avoid this case - GGML_API struct ggml_tensor * ggml_permute_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - int axis0, - int axis1, - int axis2, - int axis3); - // alias for ggml_permute(ctx, a, 1, 0, 2, 3) GGML_API struct ggml_tensor * ggml_transpose( struct ggml_context * ctx,