Merge 787bc4af60 into da0400344b

commit 159bc313b0

9 changed files with 1628 additions and 8 deletions
Makefile (11 changes)

@@ -300,6 +300,12 @@ ifdef LLAMA_QKK_64
 endif
 endif
+
+ifndef LLAMA_NO_SQLLM
+	MK_CPPFLAGS += -DGGML_USE_SQLLM
+	OBJS += sqllm.o
+endif
 
 ifndef LLAMA_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -449,6 +455,11 @@ k_quants.o: k_quants.c k_quants.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_NO_K_QUANTS
 
+ifndef LLAMA_NO_SQLLM
+sqllm.o: sqllm.c sqllm.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_NO_SQLLM
+
 # combine build flags with cmdline overrides
 override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
 override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
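Both hunks mirror the existing LLAMA_NO_K_QUANTS convention: SQLLM support is on by default, defining GGML_USE_SQLLM and adding sqllm.o to the link. A usage sketch, assuming the stock llama.cpp Makefile flow:

    make                     # default: GGML_USE_SQLLM defined, sqllm.o built and linked
    make LLAMA_NO_SQLLM=1    # opt out of the SQLLM path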
convert-sqllm-to-gguf.py (new file, 1433 lines)

(File diff suppressed because it is too large.)
ggml.c (54 changes)

@@ -6,6 +6,10 @@
 #include "k_quants.h"
 #endif
 
+#ifdef GGML_USE_SQLLM
+#include "sqllm.h"
+#endif
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -1777,6 +1781,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size    = sizeof(block_q8_K),
         .is_quantized = true,
         .from_float   = quantize_row_q8_K,
+    },
+#endif
+#ifdef GGML_USE_SQLLM
+    [GGML_TYPE_Q4_SQ] = {
+        .type_name    = "q4_sq",
+        .blck_size    = 1,
+        .type_size    = sizeof(int32_t),
+        .is_quantized = true,
+        .to_float     = NULL,
+        .from_float   = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float_reference = NULL,
+        .vec_dot      = (ggml_vec_dot_t) ggml_vec_dot_q4_sq_fp16,
+        .vec_dot_type = GGML_TYPE_F16,
     }
 #endif
 };
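Read together with sqllm.c below, the type_traits entry pins down the Q4_SQ row layout: blck_size is 1 with type_size sizeof(int32_t), and each row starts with a 16-entry fp16 lookup table (eight int32 words) followed by 4-bit LUT indices packed eight per int32 word. A minimal decode sketch of that layout; the helper name and signature are illustrative, not part of the diff:

    // per-row layout implied by the diff:
    //   [ 16 x ggml_fp16_t LUT ][ ne0/8 x int32_t packed 4-bit indices ]
    static inline float q4_sq_decode_one(const void * row, int k) {
        const ggml_fp16_t * lut     = (const ggml_fp16_t *) row;  // 16 entries
        const int32_t     * qweight = (const int32_t *) row + 8;  // past the LUT
        const int idx = (qweight[k/8] >> (4*(k%8))) & 0x0F;       // k-th 4-bit index
        return ggml_fp16_to_fp32(lut[idx]);                       // dequantized weight
    }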
@@ -4414,6 +4431,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
         case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
         case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
+        case GGML_FTYPE_MOSTLY_Q4_SQ: wtype = GGML_TYPE_Q4_SQ; break;
         case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -4788,7 +4806,13 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         view_src = view_src->view_src;
     }
 
-    size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+    size_t data_size = 0;
+    if (type == GGML_TYPE_Q4_SQ) { // SQLLM
+        data_size += 16*2 + (ne[0]/2);
+    } else {
+        data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+    }
+
     for (int i = 1; i < n_dims; i++) {
         data_size *= ne[i];
     }
@@ -4856,8 +4880,13 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         result->ne[i] = ne[i];
     }
 
+    if (type == GGML_TYPE_Q4_SQ) { // SQLLM
+        result->nb[0] = ggml_type_size(type);
+        result->nb[1] = result->nb[0]*(16/2 + result->ne[0]/8);
+    } else {
     result->nb[0] = ggml_type_size(type);
     result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
+    }
     for (int i = 2; i < GGML_MAX_DIMS; i++) {
         result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
     }
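The two Q4_SQ branches in ggml_new_tensor_impl are consistent with each other: data_size counts 32 bytes of LUT (16 fp16 entries) plus half a byte per weight, and the nb[1] row stride reproduces the same number. A worked check, assuming ne[0] is a multiple of 8 as the packing requires:

    // ne[0] = 4096 weights per row:
    //   data_size = 16*2 + 4096/2       = 32 + 2048 = 2080 bytes
    //   nb[1]     = 4 * (16/2 + 4096/8) = 4 * 520   = 2080 bytes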
@@ -9039,6 +9068,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_Q4_SQ:
             {
                 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
@@ -9303,6 +9333,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_Q4_SQ:
             {
                 ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
             } break;
@@ -9418,6 +9449,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_Q4_SQ:
         default:
             {
                 GGML_ASSERT(false);
@@ -11330,8 +11362,9 @@ static void ggml_compute_forward_mul_mat(
     }
 #endif
 
-    if (params->type == GGML_TASK_INIT) {
+    if (params->type == GGML_TASK_INIT){
         if (src1->type != vec_dot_type) {
+
             char * wdata = params->wdata;
             const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
@@ -11352,8 +11385,15 @@ static void ggml_compute_forward_mul_mat(
         return;
     }
 
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
+    void * wdata;
+    size_t row_size;
+    if (src0->type != GGML_TYPE_Q4_SQ) {
+        row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
+        wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    } else {
+        row_size = ne10*sizeof(int16_t); // for fp16 row
+        wdata = params->wdata;
+    }
 
     const int64_t nr0 = ne01;           // src0 rows
     const int64_t nr1 = ne11*ne12*ne13; // src1 rows
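Since Q4_SQ declares vec_dot_type = GGML_TYPE_F16, its activations are always converted into the fp16 staging buffer during GGML_TASK_INIT, and the else-branch above picks them up from params->wdata unconditionally; sizeof(int16_t) is a stand-in for the 2-byte ggml_fp16_t. Condensed, the Q4_SQ case reduces to:

    // Q4_SQ path (condensed): activation rows are fp16, staged in wdata
    row_size = ne10*sizeof(int16_t); // == ne10*sizeof(ggml_fp16_t)
    wdata    = params->wdata;        // never src1->data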
@@ -11417,7 +11457,7 @@ static void ggml_compute_forward_mul_mat(
             // the original src1 data pointer, so we should index using the indices directly
             // TODO: this is a bit of a hack, we should probably have a better way to handle this
             const char * src1_col = (const char *) wdata +
-                (src1_cont || src1->type != vec_dot_type
+                (src1_cont || src1->type != vec_dot_type || src0->type == GGML_TYPE_Q4_SQ
                  ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
                  : (i11*nb11 + i12*nb12 + i13*nb13));
@@ -11735,6 +11775,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_Q4_SQ:
         default:
             {
                 GGML_ASSERT(false);
@@ -11905,6 +11946,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_Q4_SQ:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
             } break;
@@ -12534,6 +12576,7 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_Q8_K:
+        case GGML_TYPE_Q4_SQ:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -12608,6 +12651,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_Q8_K:
+        case GGML_TYPE_Q4_SQ:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
ggml.h (2 changes)

@@ -312,6 +312,7 @@ extern "C" {
     GGML_TYPE_Q5_K = 13,
     GGML_TYPE_Q6_K = 14,
     GGML_TYPE_Q8_K = 15,
+    GGML_TYPE_Q4_SQ = 19,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
@@ -340,6 +341,7 @@ extern "C" {
     GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
     GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
     GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q4_SQ = 19, // except 1d tensors
 };
 
 // available tensor operations:
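One side effect worth flagging: GGML_TYPE_Q4_SQ introduces an explicit = 19 ahead of the uninitialized members, so C enum auto-increment renumbers everything after it relative to upstream:

    // upstream:        GGML_TYPE_I8 = 16, I16 = 17, I32 = 18, GGML_TYPE_COUNT = 19
    // with this patch: GGML_TYPE_I8 = 20, I16 = 21, I32 = 22, GGML_TYPE_COUNT = 23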
gguf.py

@@ -427,6 +427,7 @@ class GGMLQuantizationType(IntEnum):
     Q5_K = 13
     Q6_K = 14
     Q8_K = 15
+    Q4_SQ = 19


 class GGUFValueType(IntEnum):
llama.cpp

@@ -1354,6 +1354,7 @@ struct llama_model_loader {
         case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
         case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
         case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+        case GGML_TYPE_Q4_SQ: ftype = LLAMA_FTYPE_MOSTLY_Q4_SQ; break;
         default:
             {
                 LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -1619,6 +1620,9 @@ static std::string llama_model_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
+
+        // SQLLM
+        case LLAMA_FTYPE_MOSTLY_Q4_SQ: return "mostly Q4_SQ";
 
         default: return "unknown, may not work";
     }
 }
@@ -3761,7 +3765,7 @@ static bool llama_eval_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
-    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() && !(model.ftype == LLAMA_FTYPE_MOSTLY_Q4_SQ)) {
         n_threads = std::min(4, n_threads);
     }
@@ -5653,6 +5657,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+#endif
+#ifdef GGML_USE_SQLLM
+        case LLAMA_FTYPE_MOSTLY_Q4_SQ: quantized_type = GGML_TYPE_Q4_SQ; break;
 #endif
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
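The llama_eval_internal tweak exempts Q4_SQ models from the 4-thread cap normally applied when BLAS handles the large mat-muls; presumably the Q4_SQ path never dispatches to BLAS (it always runs ggml_vec_dot_q4_sq_fp16 on the CPU threads), so the cap would only starve it. That reading is an inference from the surrounding code, not stated in the diff.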
llama.h (1 change)

@@ -98,6 +98,7 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q4_SQ = 19, // except 1d tensors
 
     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 };
sqllm.c (new file, 108 lines)

@@ -0,0 +1,108 @@
+#include "sqllm.h"
+#include "ggml.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+void ggml_vec_dot_q4_sq_fp16(const int n, float * restrict s, void * restrict v, ggml_fp16_t * restrict y) {
+
+    const int nb = n / 8;
+
+    // #ifdef __ARM_NEON
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+    // pointer initialization
+    int32_t * baselut = v;
+    int32_t * qweight = baselut + 8; // get start of row
+    float   * yvector = (void *) y;
+
+    // initialize sums
+    float16x8_t sumf1 = vdupq_n_f16(0);
+    float16x8_t sumf2 = vdupq_n_f16(0);
+    float16x8_t sumf3 = vdupq_n_f16(0);
+    float16x8_t sumf4 = vdupq_n_f16(0);
+
+    // initialize lookup table
+    uint8x16_t lut1 = vld1q_u8((void *) baselut);
+    uint8x16_t lut2 = vld1q_u8((void *) (baselut+4));
+    uint8x16_t lutl = vuzp1q_u8(lut1, lut2);
+    uint8x16_t luth = vuzp2q_u8(lut1, lut2);
+
+    for (int i = 0; i < nb; i += 4) {
+        // get packed vector
+        uint8x16_t m4b           = vdupq_n_u8(0x0F);
+        uint8x16_t packed_vector = vld1q_u8((void *) &qweight[i]);
+
+        // 4-bit -> 2 8-bit vectors
+        uint8x16_t packed_vector_lb = vandq_u8 (packed_vector, m4b);
+        uint8x16_t packed_vector_hb = vshrq_n_u8(packed_vector, 4);
+
+        // get separate 8-bit indices (split across two vectors) by interleaving
+        uint8x16_t packed_vector_0 = vzip1q_u8(packed_vector_lb, packed_vector_hb);
+        uint8x16_t packed_vector_1 = vzip2q_u8(packed_vector_lb, packed_vector_hb);
+
+        // perform table lookups
+        uint8x16_t lookup_0l = vqtbl1q_u8(lutl, packed_vector_0);
+        uint8x16_t lookup_0h = vqtbl1q_u8(luth, packed_vector_0);
+        uint8x16_t lookup_1l = vqtbl1q_u8(lutl, packed_vector_1);
+        uint8x16_t lookup_1h = vqtbl1q_u8(luth, packed_vector_1);
+
+        // interleave lookup values
+        float16x8_t lookup_0_z1 = (float16x8_t) vzip1q_u8(lookup_0l, lookup_0h);
+        float16x8_t lookup_0_z2 = (float16x8_t) vzip2q_u8(lookup_0l, lookup_0h);
+        float16x8_t lookup_1_z1 = (float16x8_t) vzip1q_u8(lookup_1l, lookup_1h);
+        float16x8_t lookup_1_z2 = (float16x8_t) vzip2q_u8(lookup_1l, lookup_1h);
+
+        // load fp16 activation values
+        float16x8_t tmp1 = vld1q_f16(((void *) &yvector[4*i]));
+        float16x8_t tmp2 = vld1q_f16(((void *) &yvector[4*i+4]));
+        float16x8_t tmp3 = vld1q_f16(((void *) &yvector[4*i+8]));
+        float16x8_t tmp4 = vld1q_f16(((void *) &yvector[4*i+12]));
+
+        // fp16 multiply-accumulate
+        sumf1 = vfmaq_f16(sumf1, lookup_0_z1, tmp1);
+        sumf2 = vfmaq_f16(sumf2, lookup_0_z2, tmp2);
+        sumf3 = vfmaq_f16(sumf3, lookup_1_z1, tmp3);
+        sumf4 = vfmaq_f16(sumf4, lookup_1_z2, tmp4);
+    }
+
+    float16x8_t sumf5 = vaddq_f16(sumf1, sumf2);
+    float16x8_t sumf6 = vaddq_f16(sumf3, sumf4);
+    float16x8_t sumf7 = vaddq_f16(sumf5, sumf6);
+
+    float res = 0.0;
+    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (sumf7));
+    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(sumf7));
+    res = (float) vaddvq_f32(vaddq_f32(t0, t1));
+
+    *s = res;
+
+#else
+
+    int32_t * baseptr = v;
+    int32_t * qweight = baseptr + 8; // get start of row
+
+    // scalar
+    float sumf = 0.0;
+
+    ggml_fp16_t * lut = v;
+    for (int i = 0; i < nb; i++) {
+        int32_t packed = qweight[i];
+
+        for (int j = 0; j < 8; ++j) {
+            const int idx = (packed >> j*4) & 0x0F;
+            const ggml_fp16_t val  = lut[idx];
+            const ggml_fp16_t val2 = y[8*i+j];
+
+            sumf += ggml_fp16_to_fp32(val) * ggml_fp16_to_fp32(val2);
+        }
+    }
+
+    *s = sumf;
+
+#endif
+}
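The scalar #else branch doubles as a readable specification of the operand layout. A minimal self-contained call exercising it (hypothetical values; the NEON branch consumes 32 weights per iteration, so real rows use n in multiples of 32, while n = 8 here keeps the example to one packed word):

    // one row = 16 fp16 LUT entries (32 bytes) + one int32 of eight 4-bit indices
    ggml_fp16_t row[16 + 2];                       // trailing 2 halves = 1 int32
    for (int i = 0; i < 16; ++i) row[i] = ggml_fp32_to_fp16((float) i);
    ((int32_t *) row)[8] = 0x76543210;             // indices 0,1,...,7
    ggml_fp16_t y[8];
    for (int j = 0; j < 8; ++j) y[j] = ggml_fp32_to_fp16(1.0f);
    float s = 0.0f;
    ggml_vec_dot_q4_sq_fp16(8, &s, row, y);        // s == 0+1+...+7 == 28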
sqllm.h (new file, 13 lines)

@@ -0,0 +1,13 @@
+#pragma once
+
+#include "ggml.h"
+
+#include <stdint.h>
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+void ggml_vec_dot_q4_sq_fp16(const int n, float * restrict s, void * restrict v, ggml_fp16_t * restrict y);