diff --git a/Package.swift b/Package.swift index 96f52c4f0..3efea63bf 100644 --- a/Package.swift +++ b/Package.swift @@ -12,12 +12,7 @@ let package = Package( name: "llama", path: ".", exclude: ["ggml-metal.metal"], - sources: [ - "ggml.c", - "llama.cpp", - "ggml-alloc.c", - "k_quants.c" - ], + sources: ["ggml.c", "llama.cpp", "ggml-alloc.c", "k_quants.c", "ggml-metal.m"], publicHeadersPath: "spm-headers", cSettings: [ .unsafeFlags(["-Wno-shorten-64-to-32"]), @@ -25,9 +20,12 @@ let package = Package( .define("GGML_USE_ACCELERATE") ], linkerSettings: [ - .linkedFramework("Accelerate") + .linkedFramework("Accelerate"), + .linkedFramework("Metal"), + .linkedFramework("MetalKit") ] ), ], + cLanguageStandard: .c11, cxxLanguageStandard: .cxx11 ) diff --git a/ggml-metal.m b/ggml-metal.m index d0d23442e..c924e30ae 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -76,7 +76,6 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); - GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32); @@ -117,24 +116,10 @@ static NSString * const msl_library_source = @"see metal.metal"; struct ggml_metal_context * ggml_metal_init(int n_cb) { metal_printf("%s: allocating\n", __func__); - // Show all the Metal device instances in the system - NSArray * devices = MTLCopyAllDevices(); - id device; - NSString * s; - for (device in devices) { - s = [device name]; - metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); - } - - // Pick and show default Metal device - device = MTLCreateSystemDefaultDevice(); - s = [device name]; - metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]); - - // Configure context struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); - ctx->device = device; + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); + ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; ctx->concur_list_len = 0; @@ -190,14 +175,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ - (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ - (int) ctx->pipeline_##name.threadExecutionWidth); \ if (error) { \ metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } - + GGML_METAL_ADD_KERNEL(add); GGML_METAL_ADD_KERNEL(add_row); GGML_METAL_ADD_KERNEL(mul); @@ -220,7 +202,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); - GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32); @@ -261,7 +242,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { void ggml_metal_free(struct ggml_metal_context * ctx) { metal_printf("%s: deallocating\n", __func__); #define GGML_METAL_DEL_KERNEL(name) \ - [ctx->function_##name release]; \ +// [ctx->function_##name release]; \ [ctx->pipeline_##name release]; GGML_METAL_DEL_KERNEL(add); @@ -286,7 +267,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(norm); GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); - GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row); GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); @@ -312,15 +292,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { #undef GGML_METAL_DEL_KERNEL - for (int i = 0; i < ctx->n_buffers; ++i) { - [ctx->buffers[i].metal release]; - } +// for (int i = 0; i < ctx->n_buffers; ++i) { +// [ctx->buffers[i].metal release]; +// } - [ctx->library release]; - [ctx->queue release]; - [ctx->device release]; +// [ctx->library release]; +// [ctx->queue release]; +// [ctx->device release]; - dispatch_release(ctx->d_queue); +// dispatch_release(ctx->d_queue); free(ctx); } @@ -871,11 +851,7 @@ void ggml_metal_graph_compute( { nth0 = 32; nth1 = 1; - if (ne11 * ne12 < 4) { - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row]; - } else { - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; - } + [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; } break; case GGML_TYPE_Q4_0: { @@ -927,8 +903,8 @@ void ggml_metal_graph_compute( GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne12 == 1); - nth0 = 4; //1; - nth1 = 8; //32; + nth0 = 2; + nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; } break; case GGML_TYPE_Q5_K: @@ -976,12 +952,9 @@ void ggml_metal_graph_compute( [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 || - src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) { + src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } else if (src0t == GGML_TYPE_Q3_K) { #ifdef GGML_QKK_64 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; @@ -995,8 +968,8 @@ void ggml_metal_graph_compute( else if (src0t == GGML_TYPE_Q6_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { - int64_t ny = (ne11 + 3)/4; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } } } break; diff --git a/ggml.c b/ggml.c index 38b1155c1..69eaea4b7 100644 --- a/ggml.c +++ b/ggml.c @@ -56,6 +56,7 @@ typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; + static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } @@ -817,6 +818,46 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #if !defined(__aarch64__) +inline static uint16_t vaddvq_u8(uint8x16_t v) { + return + (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) + + (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) + + (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) + + (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) + + (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) + + (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) + + (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) + + (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); +} + +inline static int16_t vaddvq_s8(int8x16_t v) { + return + (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) + + (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) + + (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) + + (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) + + (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) + + (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) + + (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) + + (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15); +} + +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static uint32_t vaddvq_u16(uint16x8_t v) { + return + (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) + + (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) + + (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) + + (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7); +} + inline static int32_t vaddvq_s32(int32x4_t v) { return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); } @@ -825,6 +866,12 @@ inline static float vaddvq_f32(float32x4_t v) { return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); } +inline static float vminvq_f32(float32x4_t v) { + return + MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + inline static float vmaxvq_f32(float32x4_t v) { return MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), @@ -19919,6 +19966,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p (int64_t) info->ne[2] * (int64_t) info->ne[3]; + printf("Info Type: %u\n", info->type); if (ne % ggml_blck_size(info->type) != 0) { fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", __func__, info->name.data, ne, ggml_blck_size(info->type)); @@ -19929,6 +19977,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + printf("Size of data in bytes: %zu\n", ctx->size); ctx->size += GGML_PAD(size_cur, ctx->alignment); } }