Merge branch 'master' into concedo_experimental
# Conflicts: # README.md
This commit is contained in:
commit
b1c66e4ad7
5 changed files with 64 additions and 59 deletions
12
common/log.h
12
common/log.h
|
@ -341,14 +341,14 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (_disabled)
|
||||||
|
{
|
||||||
|
// Log is disabled
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
if (_initialized)
|
if (_initialized)
|
||||||
{
|
{
|
||||||
if (_disabled)
|
|
||||||
{
|
|
||||||
// Log is disabled
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// with fallback in case something went wrong
|
// with fallback in case something went wrong
|
||||||
return logfile ? logfile : stderr;
|
return logfile ? logfile : stderr;
|
||||||
}
|
}
|
||||||
|
|
|
@ -284,7 +284,14 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
||||||
// address and size of the buffer when measuring
|
// address and size of the buffer when measuring
|
||||||
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
|
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
|
||||||
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
|
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
|
||||||
|
#if defined(__ARM_NEON) && !defined(__aarch64__)
|
||||||
|
// 32-bit
|
||||||
|
// TODO: Use for 32-bit x86 as well
|
||||||
|
static const size_t MEASURE_MAX_SIZE = (1ULL<<32) - 1; // 4 GB
|
||||||
|
#else
|
||||||
|
// 64-bit
|
||||||
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
|
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
|
||||||
|
#endif
|
||||||
|
|
||||||
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||||
|
|
18
ggml-metal.m
18
ggml-metal.m
|
@ -116,10 +116,24 @@ static NSString * const msl_library_source = @"see metal.metal";
|
||||||
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
metal_printf("%s: allocating\n", __func__);
|
metal_printf("%s: allocating\n", __func__);
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
// Show all the Metal device instances in the system
|
||||||
|
NSArray * devices = MTLCopyAllDevices();
|
||||||
|
id <MTLDevice> device;
|
||||||
|
NSString * s;
|
||||||
|
for (device in devices) {
|
||||||
|
s = [device name];
|
||||||
|
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pick and show default Metal device
|
||||||
|
device = MTLCreateSystemDefaultDevice();
|
||||||
|
s = [device name];
|
||||||
|
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
||||||
|
|
||||||
|
// Configure context
|
||||||
|
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||||
|
ctx->device = device;
|
||||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||||
ctx->device = MTLCreateSystemDefaultDevice();
|
|
||||||
ctx->queue = [ctx->device newCommandQueue];
|
ctx->queue = [ctx->device newCommandQueue];
|
||||||
ctx->n_buffers = 0;
|
ctx->n_buffers = 0;
|
||||||
ctx->concur_list_len = 0;
|
ctx->concur_list_len = 0;
|
||||||
|
|
46
ggml.c
46
ggml.c
|
@ -818,46 +818,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
||||||
|
|
||||||
#if !defined(__aarch64__)
|
#if !defined(__aarch64__)
|
||||||
|
|
||||||
inline static uint16_t vaddvq_u8(uint8x16_t v) {
|
|
||||||
return
|
|
||||||
(uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
|
|
||||||
(uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
|
|
||||||
(uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
|
|
||||||
(uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
|
|
||||||
(uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
|
|
||||||
(uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
|
|
||||||
(uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
|
|
||||||
(uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int16_t vaddvq_s8(int8x16_t v) {
|
|
||||||
return
|
|
||||||
(int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
|
|
||||||
(int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
|
|
||||||
(int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
|
|
||||||
(int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
|
|
||||||
(int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
|
|
||||||
(int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
|
|
||||||
(int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
|
|
||||||
(int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int32_t vaddvq_s16(int16x8_t v) {
|
|
||||||
return
|
|
||||||
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
|
||||||
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
|
||||||
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
|
||||||
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static uint32_t vaddvq_u16(uint16x8_t v) {
|
|
||||||
return
|
|
||||||
(uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
|
|
||||||
(uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
|
|
||||||
(uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
|
|
||||||
(uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||||
}
|
}
|
||||||
|
@ -866,12 +826,6 @@ inline static float vaddvq_f32(float32x4_t v) {
|
||||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static float vminvq_f32(float32x4_t v) {
|
|
||||||
return
|
|
||||||
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
|
||||||
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static float vmaxvq_f32(float32x4_t v) {
|
inline static float vmaxvq_f32(float32x4_t v) {
|
||||||
return
|
return
|
||||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||||
|
|
40
k_quants.c
40
k_quants.c
|
@ -13,6 +13,26 @@
|
||||||
//
|
//
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#if !defined(__aarch64__)
|
||||||
|
inline static int32_t vaddvq_s16(int16x8_t v) {
|
||||||
|
return
|
||||||
|
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
||||||
|
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
||||||
|
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
||||||
|
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||||
|
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||||
|
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||||
|
return vcombine_s16(a0, b0);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||||
|
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef __wasm_simd128__
|
#ifdef __wasm_simd128__
|
||||||
|
@ -1302,7 +1322,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
||||||
const uint8x16_t m4 = vdupq_n_u8(0xF);
|
const uint8x16_t m4 = vdupq_n_u8(0xF);
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
const int32x4_t vzero = vdupq_n_s32(0);
|
const int32x4_t vzero = vdupq_n_s32(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
int8x16x2_t q2bytes;
|
int8x16x2_t q2bytes;
|
||||||
uint8_t aux[16];
|
uint8_t aux[16];
|
||||||
|
@ -1608,7 +1630,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
|
|
||||||
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
const int32x4_t vzero = vdupq_n_s32(0);
|
const int32x4_t vzero = vdupq_n_s32(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
int8x16x4_t q2bytes;
|
int8x16x4_t q2bytes;
|
||||||
|
|
||||||
|
@ -2592,8 +2616,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
const uint8_t * restrict q4 = x[i].qs;
|
const uint8_t * restrict q4 = x[i].qs;
|
||||||
const int8_t * restrict q8 = y[i].qs;
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
//int32x4_t isum = mzero;
|
|
||||||
|
|
||||||
int32_t sumi1 = 0;
|
int32_t sumi1 = 0;
|
||||||
int32_t sumi2 = 0;
|
int32_t sumi2 = 0;
|
||||||
|
|
||||||
|
@ -3092,9 +3114,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
|
|
||||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||||
const int32x4_t mzero = vdupq_n_s32(0);
|
|
||||||
const uint8x16_t mone = vdupq_n_u8(1);
|
const uint8x16_t mone = vdupq_n_u8(1);
|
||||||
const uint8x16_t mtwo = vdupq_n_u8(2);
|
const uint8x16_t mtwo = vdupq_n_u8(2);
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
|
const int32x4_t mzero = vdupq_n_s32(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
int8x16x4_t q5bytes;
|
int8x16x4_t q5bytes;
|
||||||
|
|
||||||
|
@ -3437,8 +3461,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
|
|
||||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||||
const int32x4_t mzero = vdupq_n_s32(0);
|
|
||||||
const uint8x16_t mh = vdupq_n_u8(16);
|
const uint8x16_t mh = vdupq_n_u8(16);
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
|
const int32x4_t mzero = vdupq_n_s32(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
int8x16x4_t q5bytes;
|
int8x16x4_t q5bytes;
|
||||||
uint8x16x4_t q5h;
|
uint8x16x4_t q5h;
|
||||||
|
@ -3656,7 +3682,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
|
|
||||||
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
const int32x4_t vzero = vdupq_n_s32(0);
|
const int32x4_t vzero = vdupq_n_s32(0);
|
||||||
|
#endif
|
||||||
//const int8x16_t m32s = vdupq_n_s8(32);
|
//const int8x16_t m32s = vdupq_n_s8(32);
|
||||||
|
|
||||||
const uint8x16_t mone = vdupq_n_u8(3);
|
const uint8x16_t mone = vdupq_n_u8(3);
|
||||||
|
@ -4045,8 +4073,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
|
|
||||||
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
||||||
const int32x4_t vzero = vdupq_n_s32(0);
|
|
||||||
const int8x16_t m32s = vdupq_n_s8(32);
|
const int8x16_t m32s = vdupq_n_s8(32);
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
|
const int32x4_t vzero = vdupq_n_s32(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
const uint8x16_t mone = vdupq_n_u8(3);
|
const uint8x16_t mone = vdupq_n_u8(3);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue