Removes unnecessary autoreleasing under arc + allows Package.swift to be consumed

2023-09-04 17:41:38 -05:00 · 2023-09-04 17:41:38 -05:00 · 9f49b2d520
commit 9f49b2d520
parent 2ba85c8609
3 changed files with 71 additions and 51 deletions
--- a/Package.swift
+++ b/Package.swift
@ -12,12 +12,7 @@ let package = Package(
            name: "llama",
            path: ".",
            exclude: ["ggml-metal.metal"],
-            sources: [
-                "ggml.c",
-                "llama.cpp",
-                "ggml-alloc.c",
-                "k_quants.c"
-            ],
+            sources: ["ggml.c", "llama.cpp", "ggml-alloc.c", "k_quants.c", "ggml-metal.m"],
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32"]),
@ -25,9 +20,12 @@ let package = Package(
                .define("GGML_USE_ACCELERATE")
            ],
            linkerSettings: [
-                .linkedFramework("Accelerate")
+                .linkedFramework("Accelerate"),
+                .linkedFramework("Metal"),
+                .linkedFramework("MetalKit")
            ]
        ),
    ],
+    cLanguageStandard: .c11,
    cxxLanguageStandard: .cxx11
 )
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -76,7 +76,6 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(rms_norm);
    GGML_METAL_DECL_KERNEL(norm);
    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
    GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@ -117,24 +116,10 @@ static NSString * const msl_library_source = @"see metal.metal";
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
    metal_printf("%s: allocating\n", __func__);

-    // Show all the Metal device instances in the system
-    NSArray * devices = MTLCopyAllDevices();
-    id <MTLDevice> device;
-    NSString * s;
-    for (device in devices) {
-        s = [device name];
-        metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
-    }
-
-    // Pick and show default Metal device
-    device = MTLCreateSystemDefaultDevice();
-    s = [device name];
-    metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
-
-    // Configure context
    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
-    ctx->device = device;
+
    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
+    ctx->device = MTLCreateSystemDefaultDevice();
    ctx->queue  = [ctx->device newCommandQueue];
    ctx->n_buffers = 0;
    ctx->concur_list_len = 0;
@ -190,14 +175,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #define GGML_METAL_ADD_KERNEL(name) \
        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
-        metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
-                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
-                (int) ctx->pipeline_##name.threadExecutionWidth); \
        if (error) { \
            metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
            return NULL; \
        }
-
+        
        GGML_METAL_ADD_KERNEL(add);
        GGML_METAL_ADD_KERNEL(add_row);
        GGML_METAL_ADD_KERNEL(mul);
@ -220,7 +202,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(rms_norm);
        GGML_METAL_ADD_KERNEL(norm);
        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
        GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@ -261,7 +242,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 void ggml_metal_free(struct ggml_metal_context * ctx) {
    metal_printf("%s: deallocating\n", __func__);
 #define GGML_METAL_DEL_KERNEL(name) \
-    [ctx->function_##name release]; \
+//    [ctx->function_##name release]; \
    [ctx->pipeline_##name release];

    GGML_METAL_DEL_KERNEL(add);
@ -286,7 +267,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(rms_norm);
    GGML_METAL_DEL_KERNEL(norm);
    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
    GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
    GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@ -312,15 +292,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {

 #undef GGML_METAL_DEL_KERNEL

-    for (int i = 0; i < ctx->n_buffers; ++i) {
-        [ctx->buffers[i].metal release];
-    }
+//    for (int i = 0; i < ctx->n_buffers; ++i) {
+//        [ctx->buffers[i].metal release];
+//    }

-    [ctx->library release];
-    [ctx->queue release];
-    [ctx->device release];
+//    [ctx->library release];
+//    [ctx->queue release];
+//    [ctx->device release];

-    dispatch_release(ctx->d_queue);
+//    dispatch_release(ctx->d_queue);

    free(ctx);
 }
@ -871,11 +851,7 @@ void ggml_metal_graph_compute(
                                        {
                                            nth0 = 32;
                                            nth1 = 1;
-                                            if (ne11 * ne12 < 4) {
-                                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
-                                            } else {
-                                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
-                                            }
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
                                        } break;
                                    case GGML_TYPE_Q4_0:
                                        {
@ -927,8 +903,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4; //1;
-                                            nth1 = 8; //32;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                                        } break;
                                    case GGML_TYPE_Q5_K:
@ -976,12 +952,9 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:17];

                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
-                                    src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
+                                    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
-                                else if (src0t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
                                else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -995,8 +968,8 @@ void ggml_metal_graph_compute(
                                else if (src0t == GGML_TYPE_Q6_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                } else {
-                                    int64_t ny = (ne11 + 3)/4;
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                            }
                        } break;
--- a/ggml.c
+++ b/ggml.c
@ -56,6 +56,7 @@
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;

+
 static void atomic_store(atomic_int * ptr, LONG val) {
    InterlockedExchange(ptr, val);
 }
@ -817,6 +818,46 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128

 #if !defined(__aarch64__)

+inline static uint16_t vaddvq_u8(uint8x16_t v) {
+    return
+        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
+        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
+        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
+        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
+        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
+        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
+        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
+        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
+}
+
+inline static int16_t vaddvq_s8(int8x16_t v) {
+    return
+        (int16_t)vgetq_lane_s8(v, 0)  + (int16_t)vgetq_lane_s8(v, 1)  +
+        (int16_t)vgetq_lane_s8(v, 2)  + (int16_t)vgetq_lane_s8(v, 3)  +
+        (int16_t)vgetq_lane_s8(v, 4)  + (int16_t)vgetq_lane_s8(v, 5)  +
+        (int16_t)vgetq_lane_s8(v, 6)  + (int16_t)vgetq_lane_s8(v, 7)  +
+        (int16_t)vgetq_lane_s8(v, 8)  + (int16_t)vgetq_lane_s8(v, 9)  +
+        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
+        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
+        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
+}
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static uint32_t vaddvq_u16(uint16x8_t v) {
+    return
+        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
+        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
+        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
+        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
+}
+
 inline static int32_t vaddvq_s32(int32x4_t v) {
    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@ -825,6 +866,12 @@ inline static float vaddvq_f32(float32x4_t v) {
    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }

+inline static float vminvq_f32(float32x4_t v) {
+    return
+        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
 inline static float vmaxvq_f32(float32x4_t v) {
    return
        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@ -19919,6 +19966,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                (int64_t) info->ne[2] *
                (int64_t) info->ne[3];

+            printf("Info Type: %u\n", info->type);
            if (ne % ggml_blck_size(info->type) != 0) {
                fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
                        __func__, info->name.data, ne, ggml_blck_size(info->type));
@ -19929,6 +19977,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

            const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);

+            printf("Size of data in bytes: %zu\n", ctx->size);
            ctx->size += GGML_PAD(size_cur, ctx->alignment);
        }
    }