delete depthwise_conv_2d and permute_cpy relative code, replace the two by the existed functions, and opt ldp definition, support LLAMA_PERF option for CMake

2024-01-19 12:50:01 +08:00 · 2024-01-19 12:50:01 +08:00 · 9303bbf1b1
commit 9303bbf1b1
parent ea6cdccea1
5 changed files with 46 additions and 246 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -107,6 +107,13 @@ option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STA
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
 # add perf arguments
 option(LLAMA_PERF                            "llama: enable perf"                               OFF)
 if (LLAMA_PERF)
    add_definitions(-DGGML_PERF)
 endif()
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
--- a/android/build_64.sh
+++ b/android/build_64.sh
@ -3,6 +3,6 @@ cmake ../../ \
 -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
 -DCMAKE_BUILD_TYPE=Release \
 -DANDROID_ABI="arm64-v8a" \
-DANDROID_PLATFORM=android-23
+-DANDROID_PLATFORM=android-23 $1
 make -j4
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -583,25 +583,24 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            mlp_1 = ggml_gelu(ctx0, mlp_1);
            struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
-            // transpose from [1, 576, 2048] --> [1, 24, 24, 2048] --> [1, 2048, 24, 24]
+            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
            mlp_3 = ggml_reshape_4d(ctx0, mlp_3, mlp_3->ne[0], n_patch, n_patch, mlp_3->ne[3]);
            // permute logic is src idxs 0,1,2,3 perm to dst idxs
            mlp_3 = ggml_permute_cpy(ctx0, mlp_3, 2, 0, 1, 3);
            // mlp_3 shape = [1, 2048, 24, 24],  ne = [24, 24, 2048, 1]
            // block 1
            struct ggml_tensor * block_1 = nullptr;
            {
                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
                mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
                mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                // stride = 1, padding = 1, bias is nullptr
                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1);
                // layer norm
                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-                block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
-                block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // hardswish
@ -621,17 +620,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
-                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                int w = block_1->ne[0], h = block_1->ne[1];
-                struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_1_block_2_0_w, 1, 1, 
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
-                        model.mm_model_block_1_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
-                block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1);
+
                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
                // layernorm
                block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
-                block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // residual
                block_1 = ggml_add(ctx0, mlp_3, block_1);
@ -644,11 +644,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // layer norm
-                block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
-                block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // hardswish
                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
@ -664,22 +664,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
                block_1 = ggml_hardsigmoid(ctx0, block_1);
-                
+
                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
-                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+
-                struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_2_block_2_0_w, 1, 1, 
+                int w = block_1->ne[0], h = block_1->ne[1];
-                        model.mm_model_block_2_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]);
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
-                block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
-                // layernorm
+                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
-                block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);                
                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
-            }
+            }            
            embeddings = block_1;
        }
        else {
--- a/ggml.c
+++ b/ggml.c
@ -1650,7 +1650,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CLAMP",
    "CONV_TRANSPOSE_1D",
    "IM2COL",
    "CONV_DEPTHWISE_2D",
    "CONV_TRANSPOSE_2D",
    "POOL_1D",
    "POOL_2D",
@ -1684,7 +1683,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CROSS_ENTROPY_LOSS_BACK",
 };
-static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@ -1738,7 +1737,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "conv_transpose_1d(x)",
    "im2col(x)",
    "conv_transpose_2d(x)",
    "conv_depthwise_2d(x)",
    "pool_1d(x)",
    "pool_2d(x)",
    "upscale(x)",
@ -1771,7 +1769,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "cross_entropy_loss_back(x,y)",
 };
-static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -4780,24 +4778,6 @@ struct ggml_tensor * ggml_permute(
    return result;
 }
 // some operations don't support permuted tensor, so we need to copy it, to avoid this case
 struct ggml_tensor * ggml_permute_cpy(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   axis0,
        int                   axis1,
        int                   axis2,
        int                   axis3) {
    struct ggml_tensor * result = ggml_permute(ctx, a, axis0, axis1, axis2, axis3);
    // new 4d tensor
    struct ggml_tensor* tensor = ggml_new_tensor_4d(ctx, a->type, result->ne[0], result->ne[1], result->ne[2], result->ne[3]);
    struct ggml_tensor* cpy = ggml_cpy(ctx, result, tensor);
    return cpy;
 }
 // ggml_transpose
 struct ggml_tensor * ggml_transpose(
@ -5402,37 +5382,18 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
    int                  d0,
    int                  d1) {
    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
                                        s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
-    const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
+    struct ggml_tensor * result =
-    const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+        ggml_mul_mat(ctx,
-    const int64_t ne[4] = {
+                ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1),                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
-        OW,
+                ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
        OH,
        b->ne[2],
        b->ne[3],
    };
    // GGML_ASSERT(a->ne[3] == b->ne[2]);
    // GGML_ASSERT(a->ne[2] == 1);
-    // weight ne: [KW, KH, OC, 1]
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
    GGML_ASSERT(a->ne[2] == b->ne[2]);
    GGML_ASSERT(a->ne[3] == 1);
    bool is_node = false;
    /*
    if (a->grad || b->grad) {
        GGML_ASSERT(false); // TODO: implement backward
        is_node = true;
    }
    */
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
    int32_t params[] = { s0, s1, p0, p1, d0, d1 };
    ggml_set_op_params(result, params, sizeof(params));
    result->op = GGML_OP_CONV_DEPTHWISE_2D;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
    result->src[1] = b;
    result->src[2] = c;
    return result;
 }
 // ggml_conv_2d
@ -12529,159 +12490,6 @@ static void ggml_compute_forward_im2col(
    }
 }
 static void ggml_compute_forward_conv_depthwise_2d_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        const struct ggml_tensor * src2,
              struct ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_TENSOR_BINARY_OP_LOCALS
    const int ith = params->ith;
    const int nth = params->nth;
    // total patches in dst
    const int np = ne2;
    // patches per thread
    const int dp = (np + nth - 1)/nth;
    // patch range for this thread
    const int ip0 = dp*ith;
    const int ip1 = MIN(ip0 + dp, np);
    const int32_t stride_h = ggml_get_op_params_i32(dst, 0);
    const int32_t stride_w = ggml_get_op_params_i32(dst, 1);
    const int32_t pad_h = ggml_get_op_params_i32(dst, 2);
    const int32_t pad_w = ggml_get_op_params_i32(dst, 3);
    const int32_t dilation_h = ggml_get_op_params_i32(dst, 4);
    const int32_t dilation_w = ggml_get_op_params_i32(dst, 5);
    float* weight = (float*)(src0->data);
    float* input = (float*)(src1->data);
    // float* bias = (float*)(src2->data);
    float* output = (float*)(dst->data);
    for (int b = 0; b < ne13; ++b) {
        for (int o_c = ip0; o_c < ip1; ++o_c) {
            for (int o_h = 0; o_h < ne1; ++o_h) {
                for (int o_w = 0; o_w < ne0; ++o_w) {
                    float result_data = 0;
                    int g = o_c;
                    int i_c = g;
                    for (int k_h = 0; k_h < ne01; ++k_h) {
                        for (int k_w = 0; k_w < ne00; ++k_w) {
                            int i_h = o_h * stride_h - pad_h + k_h * dilation_h;
                            int i_w = o_w * stride_w - pad_w + k_w * dilation_w;
                            if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) {
                                continue;
                            }
                            float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w];
                            float weight_data = weight[(g * ne01 + k_h) * ne00 + k_w];
                            result_data += input_data * weight_data;
                        }
                    }
                    // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c];
                    output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data;
                }
            }
        }
    }
 }
 static void ggml_compute_forward_conv_depthwise_2d_f16_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        const struct ggml_tensor * src2,
              struct ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_TENSOR_BINARY_OP_LOCALS
    const int ith = params->ith;
    const int nth = params->nth;
    // total patches in dst
    const int np = ne2;
    // patches per thread
    const int dp = (np + nth - 1)/nth;
    // patch range for this thread
    const int ip0 = dp*ith;
    const int ip1 = MIN(ip0 + dp, np);
    const int32_t stride_h = ggml_get_op_params_i32(dst, 0);
    const int32_t stride_w = ggml_get_op_params_i32(dst, 1);
    const int32_t pad_h = ggml_get_op_params_i32(dst, 2);
    const int32_t pad_w = ggml_get_op_params_i32(dst, 3);
    const int32_t dilation_h = ggml_get_op_params_i32(dst, 4);
    const int32_t dilation_w = ggml_get_op_params_i32(dst, 5);
    ggml_fp16_t* weight = (ggml_fp16_t*)(src0->data);
    float* input = (float*)(src1->data);
    // float* bias = (float*)(src2->data);
    float* output = (float*)(dst->data);
    for (int b = 0; b < ne13; ++b) {
        for (int o_c = ip0; o_c < ip1; ++o_c) {
            for (int o_h = 0; o_h < ne1; ++o_h) {
                for (int o_w = 0; o_w < ne0; ++o_w) {
                    float result_data = 0;
                    int g = o_c;
                    int i_c = g;
                    for (int k_h = 0; k_h < ne01; ++k_h) {
                        for (int k_w = 0; k_w < ne00; ++k_w) {
                            int i_h = o_h * stride_h - pad_h + k_h * dilation_h;
                            int i_w = o_w * stride_w - pad_w + k_w * dilation_w;
                            if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) {
                                continue;
                            }
                            float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w];
                            float weight_data = GGML_FP16_TO_FP32(weight[(g * ne01 + k_h) * ne00 + k_w]);
                            result_data += input_data * weight_data;
                        }
                    }
                    // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c];
                    output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data;
                }
            }
        }
    }
 }
 static void ggml_compute_forward_conv_depthwise_2d(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        const struct ggml_tensor * src2,
              struct ggml_tensor * dst) {
    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_conv_depthwise_2d_f32(params, src0, src1, src2, dst);
            } break;
        case GGML_TYPE_F16:
            {
                if (src1->type == GGML_TYPE_F32) {
                    ggml_compute_forward_conv_depthwise_2d_f16_f32(params, src0, src1, src2, dst);
                } else {
                    GGML_ASSERT(false);
                }
            } break;
        default:
            {
                GGML_ASSERT(false);
            } break;
    }
 }
 // ggml_compute_forward_conv_transpose_2d
@ -15024,10 +14832,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_CONV_DEPTHWISE_2D:
            {
                ggml_compute_forward_conv_depthwise_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
            } break;
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
                ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
@ -16764,10 +16568,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            {
                n_tasks = n_threads;
            } break;
        case GGML_OP_CONV_DEPTHWISE_2D:
            {
                n_tasks = n_threads;
            } break;
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
                n_tasks = n_threads;
--- a/ggml.h
+++ b/ggml.h
@ -433,7 +433,6 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_IM2COL,
        GGML_OP_CONV_DEPTHWISE_2D,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
@ -1297,15 +1296,6 @@ extern "C" {
            int                   axis2,
            int                   axis3);
    // some operations don't support permuted tensor, so we need to copy it, to avoid this case
    GGML_API struct ggml_tensor * ggml_permute_cpy(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   axis0,
            int                   axis1,
            int                   axis2,
            int                   axis3);
    // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
    GGML_API struct ggml_tensor * ggml_transpose(
            struct ggml_context * ctx,