Removes unnecessary autoreleasing under arc + allows Package.swift to be consumed
This commit is contained in:
parent
2ba85c8609
commit
9f49b2d520
3 changed files with 71 additions and 51 deletions
|
@ -12,12 +12,7 @@ let package = Package(
|
|||
name: "llama",
|
||||
path: ".",
|
||||
exclude: ["ggml-metal.metal"],
|
||||
sources: [
|
||||
"ggml.c",
|
||||
"llama.cpp",
|
||||
"ggml-alloc.c",
|
||||
"k_quants.c"
|
||||
],
|
||||
sources: ["ggml.c", "llama.cpp", "ggml-alloc.c", "k_quants.c", "ggml-metal.m"],
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: [
|
||||
.unsafeFlags(["-Wno-shorten-64-to-32"]),
|
||||
|
@ -25,9 +20,12 @@ let package = Package(
|
|||
.define("GGML_USE_ACCELERATE")
|
||||
],
|
||||
linkerSettings: [
|
||||
.linkedFramework("Accelerate")
|
||||
.linkedFramework("Accelerate"),
|
||||
.linkedFramework("Metal"),
|
||||
.linkedFramework("MetalKit")
|
||||
]
|
||||
),
|
||||
],
|
||||
cLanguageStandard: .c11,
|
||||
cxxLanguageStandard: .cxx11
|
||||
)
|
||||
|
|
61
ggml-metal.m
61
ggml-metal.m
|
@ -76,7 +76,6 @@ struct ggml_metal_context {
|
|||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
|
||||
|
@ -117,24 +116,10 @@ static NSString * const msl_library_source = @"see metal.metal";
|
|||
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
metal_printf("%s: allocating\n", __func__);
|
||||
|
||||
// Show all the Metal device instances in the system
|
||||
NSArray * devices = MTLCopyAllDevices();
|
||||
id <MTLDevice> device;
|
||||
NSString * s;
|
||||
for (device in devices) {
|
||||
s = [device name];
|
||||
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
|
||||
}
|
||||
|
||||
// Pick and show default Metal device
|
||||
device = MTLCreateSystemDefaultDevice();
|
||||
s = [device name];
|
||||
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
||||
|
||||
// Configure context
|
||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||
ctx->device = device;
|
||||
|
||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||
ctx->device = MTLCreateSystemDefaultDevice();
|
||||
ctx->queue = [ctx->device newCommandQueue];
|
||||
ctx->n_buffers = 0;
|
||||
ctx->concur_list_len = 0;
|
||||
|
@ -190,14 +175,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
#define GGML_METAL_ADD_KERNEL(name) \
|
||||
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
|
||||
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
|
||||
metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
|
||||
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
|
||||
(int) ctx->pipeline_##name.threadExecutionWidth); \
|
||||
if (error) { \
|
||||
metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
return NULL; \
|
||||
}
|
||||
|
||||
|
||||
GGML_METAL_ADD_KERNEL(add);
|
||||
GGML_METAL_ADD_KERNEL(add_row);
|
||||
GGML_METAL_ADD_KERNEL(mul);
|
||||
|
@ -220,7 +202,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
|
||||
|
@ -261,7 +242,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
metal_printf("%s: deallocating\n", __func__);
|
||||
#define GGML_METAL_DEL_KERNEL(name) \
|
||||
[ctx->function_##name release]; \
|
||||
// [ctx->function_##name release]; \
|
||||
[ctx->pipeline_##name release];
|
||||
|
||||
GGML_METAL_DEL_KERNEL(add);
|
||||
|
@ -286,7 +267,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||
GGML_METAL_DEL_KERNEL(norm);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
|
||||
|
@ -312,15 +292,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
|
||||
#undef GGML_METAL_DEL_KERNEL
|
||||
|
||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||
[ctx->buffers[i].metal release];
|
||||
}
|
||||
// for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||
// [ctx->buffers[i].metal release];
|
||||
// }
|
||||
|
||||
[ctx->library release];
|
||||
[ctx->queue release];
|
||||
[ctx->device release];
|
||||
// [ctx->library release];
|
||||
// [ctx->queue release];
|
||||
// [ctx->device release];
|
||||
|
||||
dispatch_release(ctx->d_queue);
|
||||
// dispatch_release(ctx->d_queue);
|
||||
|
||||
free(ctx);
|
||||
}
|
||||
|
@ -871,11 +851,7 @@ void ggml_metal_graph_compute(
|
|||
{
|
||||
nth0 = 32;
|
||||
nth1 = 1;
|
||||
if (ne11 * ne12 < 4) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||
}
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
|
@ -927,8 +903,8 @@ void ggml_metal_graph_compute(
|
|||
GGML_ASSERT(ne02 == 1);
|
||||
GGML_ASSERT(ne12 == 1);
|
||||
|
||||
nth0 = 4; //1;
|
||||
nth1 = 8; //32;
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
|
@ -976,12 +952,9 @@ void ggml_metal_graph_compute(
|
|||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
|
||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q3_K) {
|
||||
#ifdef GGML_QKK_64
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
|
@ -995,8 +968,8 @@ void ggml_metal_graph_compute(
|
|||
else if (src0t == GGML_TYPE_Q6_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else {
|
||||
int64_t ny = (ne11 + 3)/4;
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
}
|
||||
} break;
|
||||
|
|
49
ggml.c
49
ggml.c
|
@ -56,6 +56,7 @@
|
|||
typedef volatile LONG atomic_int;
|
||||
typedef atomic_int atomic_bool;
|
||||
|
||||
|
||||
static void atomic_store(atomic_int * ptr, LONG val) {
|
||||
InterlockedExchange(ptr, val);
|
||||
}
|
||||
|
@ -817,6 +818,46 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
|||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
inline static uint16_t vaddvq_u8(uint8x16_t v) {
|
||||
return
|
||||
(uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
|
||||
(uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
|
||||
(uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
|
||||
(uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
|
||||
(uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
|
||||
(uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
|
||||
(uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
|
||||
(uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
|
||||
}
|
||||
|
||||
inline static int16_t vaddvq_s8(int8x16_t v) {
|
||||
return
|
||||
(int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
|
||||
(int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
|
||||
(int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
|
||||
(int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
|
||||
(int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
|
||||
(int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
|
||||
(int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
|
||||
(int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s16(int16x8_t v) {
|
||||
return
|
||||
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
||||
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
||||
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
||||
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
||||
}
|
||||
|
||||
inline static uint32_t vaddvq_u16(uint16x8_t v) {
|
||||
return
|
||||
(uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
|
||||
(uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
|
||||
(uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
|
||||
(uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
|
@ -825,6 +866,12 @@ inline static float vaddvq_f32(float32x4_t v) {
|
|||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vminvq_f32(float32x4_t v) {
|
||||
return
|
||||
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline static float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
|
@ -19919,6 +19966,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
(int64_t) info->ne[2] *
|
||||
(int64_t) info->ne[3];
|
||||
|
||||
printf("Info Type: %u\n", info->type);
|
||||
if (ne % ggml_blck_size(info->type) != 0) {
|
||||
fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
|
||||
__func__, info->name.data, ne, ggml_blck_size(info->type));
|
||||
|
@ -19929,6 +19977,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
|
||||
const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
|
||||
|
||||
printf("Size of data in bytes: %zu\n", ctx->size);
|
||||
ctx->size += GGML_PAD(size_cur, ctx->alignment);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue