diff --git a/common/log.h b/common/log.h index bf9fafd68..0b9b01052 100644 --- a/common/log.h +++ b/common/log.h @@ -341,14 +341,14 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri } } + if (_disabled) + { + // Log is disabled + return nullptr; + } + if (_initialized) { - if (_disabled) - { - // Log is disabled - return nullptr; - } - // with fallback in case something went wrong return logfile ? logfile : stderr; } diff --git a/ggml-alloc.c b/ggml-alloc.c index 95d672b45..57ac67c5b 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -284,7 +284,14 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) // address and size of the buffer when measuring // it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers static void * const MEASURE_BASE_ADDR = (void *) 0x1000; +#if defined(__ARM_NEON) && !defined(__aarch64__) +// 32-bit +// TODO: Use for 32-bit x86 as well +static const size_t MEASURE_MAX_SIZE = (1ULL<<32) - 1; // 4 GB +#else +// 64-bit static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB +#endif struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); diff --git a/ggml-metal.m b/ggml-metal.m index 4267db9be..88e7e1356 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -116,10 +116,24 @@ static NSString * const msl_library_source = @"see metal.metal"; struct ggml_metal_context * ggml_metal_init(int n_cb) { metal_printf("%s: allocating\n", __func__); - struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + // Show all the Metal device instances in the system + NSArray * devices = MTLCopyAllDevices(); + id device; + NSString * s; + for (device in devices) { + s = [device name]; + metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); + } + // Pick and show default Metal device + device = MTLCreateSystemDefaultDevice(); + s = [device name]; + metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]); + + // Configure context + struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + ctx->device = device; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); - ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; ctx->concur_list_len = 0; diff --git a/ggml.c b/ggml.c index b2427562e..1dc493783 100644 --- a/ggml.c +++ b/ggml.c @@ -818,46 +818,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #if !defined(__aarch64__) -inline static uint16_t vaddvq_u8(uint8x16_t v) { - return - (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) + - (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) + - (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) + - (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) + - (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) + - (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) + - (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) + - (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); -} - -inline static int16_t vaddvq_s8(int8x16_t v) { - return - (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) + - (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) + - (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) + - (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) + - (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) + - (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) + - (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) + - (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15); -} - -inline static int32_t vaddvq_s16(int16x8_t v) { - return - (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + - (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + - (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + - (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); -} - -inline static uint32_t vaddvq_u16(uint16x8_t v) { - return - (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) + - (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) + - (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) + - (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7); -} - inline static int32_t vaddvq_s32(int32x4_t v) { return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); } @@ -866,12 +826,6 @@ inline static float vaddvq_f32(float32x4_t v) { return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); } -inline static float vminvq_f32(float32x4_t v) { - return - MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - inline static float vmaxvq_f32(float32x4_t v) { return MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), diff --git a/k_quants.c b/k_quants.c index 3deeaedf7..4accd2480 100644 --- a/k_quants.c +++ b/k_quants.c @@ -13,6 +13,26 @@ // #include +#if !defined(__aarch64__) +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} +#endif + #else #ifdef __wasm_simd128__ @@ -1302,7 +1322,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const uint8x16_t m3 = vdupq_n_u8(0x3); const uint8x16_t m4 = vdupq_n_u8(0xF); +#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t vzero = vdupq_n_s32(0); +#endif int8x16x2_t q2bytes; uint8_t aux[16]; @@ -1608,7 +1630,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #ifdef __ARM_NEON const uint8x16_t m3 = vdupq_n_u8(0x3); +#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t vzero = vdupq_n_s32(0); +#endif int8x16x4_t q2bytes; @@ -2592,8 +2616,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * restrict q4 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - //int32x4_t isum = mzero; - int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -3092,9 +3114,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #ifdef __ARM_NEON const uint8x16_t m4b = vdupq_n_u8(0xf); - const int32x4_t mzero = vdupq_n_s32(0); const uint8x16_t mone = vdupq_n_u8(1); const uint8x16_t mtwo = vdupq_n_u8(2); +#if defined(__ARM_FEATURE_DOTPROD) + const int32x4_t mzero = vdupq_n_s32(0); +#endif int8x16x4_t q5bytes; @@ -3437,8 +3461,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #ifdef __ARM_NEON const uint8x16_t m4b = vdupq_n_u8(0xf); - const int32x4_t mzero = vdupq_n_s32(0); const uint8x16_t mh = vdupq_n_u8(16); +#if defined(__ARM_FEATURE_DOTPROD) + const int32x4_t mzero = vdupq_n_s32(0); +#endif int8x16x4_t q5bytes; uint8x16x4_t q5h; @@ -3656,7 +3682,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sum = 0; const uint8x16_t m4b = vdupq_n_u8(0xF); +#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t vzero = vdupq_n_s32(0); +#endif //const int8x16_t m32s = vdupq_n_s8(32); const uint8x16_t mone = vdupq_n_u8(3); @@ -4045,8 +4073,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sum = 0; const uint8x16_t m4b = vdupq_n_u8(0xF); - const int32x4_t vzero = vdupq_n_s32(0); const int8x16_t m32s = vdupq_n_s8(32); +#if defined(__ARM_FEATURE_DOTPROD) + const int32x4_t vzero = vdupq_n_s32(0); +#endif const uint8x16_t mone = vdupq_n_u8(3);