Merge branch 'master' into vulkan-build-integration
This commit is contained in:
commit
24fd7d3d52
49 changed files with 3835 additions and 1081 deletions
|
@ -8,7 +8,7 @@ arg1="$1"
|
|||
shift
|
||||
|
||||
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
|
||||
python3 ./convert-hf-to-gguf.py "$@"
|
||||
python3 ./convert_hf_to_gguf.py "$@"
|
||||
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||
./llama-quantize "$@"
|
||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||
|
|
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -61,6 +61,11 @@ llama-batched-swift
|
|||
out/
|
||||
tmp/
|
||||
|
||||
# Deprecated
|
||||
|
||||
/main
|
||||
/server
|
||||
|
||||
# CI
|
||||
|
||||
!.github/workflows/*.yml
|
||||
|
|
50
Makefile
50
Makefile
|
@ -551,14 +551,20 @@ ifdef GGML_OPENBLAS64
|
|||
endif # GGML_OPENBLAS64
|
||||
|
||||
ifdef GGML_BLIS
|
||||
MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
|
||||
MK_LDFLAGS += -lblis -L/usr/local/lib
|
||||
OBJ_GGML += ggml/src/ggml-blas.o
|
||||
endif # GGML_BLIS
|
||||
|
||||
ifdef GGML_NVPL
|
||||
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
|
||||
MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
|
||||
OBJ_GGML += ggml/src/ggml-blas.o
|
||||
endif # GGML_NVPL
|
||||
|
||||
ifndef GGML_NO_LLAMAFILE
|
||||
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
||||
OBJ_GGML += ggml/src/sgemm.o
|
||||
OBJ_GGML += ggml/src/llamafile/sgemm.o
|
||||
endif
|
||||
|
||||
ifdef GGML_RPC
|
||||
|
@ -857,7 +863,8 @@ OBJ_GGML += \
|
|||
ggml/src/ggml.o \
|
||||
ggml/src/ggml-alloc.o \
|
||||
ggml/src/ggml-backend.o \
|
||||
ggml/src/ggml-quants.o
|
||||
ggml/src/ggml-quants.o \
|
||||
ggml/src/ggml-aarch64.o
|
||||
|
||||
OBJ_LLAMA = \
|
||||
src/llama.o \
|
||||
|
@ -991,15 +998,22 @@ ggml/src/ggml-quants.o: \
|
|||
ggml/src/ggml-common.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml/src/ggml-aarch64.o: \
|
||||
ggml/src/ggml-aarch64.c \
|
||||
ggml/include/ggml.h \
|
||||
ggml/src/ggml-aarch64.h \
|
||||
ggml/src/ggml-common.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml/src/ggml-blas.o: \
|
||||
ggml/src/ggml-blas.cpp \
|
||||
ggml/include/ggml-blas.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
ifndef GGML_NO_LLAMAFILE
|
||||
ggml/src/sgemm.o: \
|
||||
ggml/src/sgemm.cpp \
|
||||
ggml/src/sgemm.h \
|
||||
ggml/src/llamafile/sgemm.o: \
|
||||
ggml/src/llamafile/sgemm.cpp \
|
||||
ggml/src/llamafile/sgemm.h \
|
||||
ggml/include/ggml.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # GGML_NO_LLAMAFILE
|
||||
|
@ -1528,15 +1542,17 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
|
|||
# Mark legacy binary targets as .PHONY so that they are always checked.
|
||||
.PHONY: main quantize perplexity embedding server finetune
|
||||
|
||||
# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
|
||||
# Eventually we will want to remove these target from building all the time.
|
||||
main: examples/deprecation-warning/deprecation-warning.cpp
|
||||
ifneq (,$(wildcard main))
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'main' binary is deprecated. Please use 'llama-cli' instead."
|
||||
@echo " Remove the 'main' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
|
||||
|
||||
server: examples/deprecation-warning/deprecation-warning.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
|
||||
|
||||
quantize: examples/deprecation-warning/deprecation-warning.cpp
|
||||
ifneq (,$(wildcard quantize))
|
||||
|
@ -1568,16 +1584,6 @@ ifneq (,$(wildcard embedding))
|
|||
@echo "#########"
|
||||
endif
|
||||
|
||||
server: examples/deprecation-warning/deprecation-warning.cpp
|
||||
ifneq (,$(wildcard server))
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'server' binary is deprecated. Please use 'llama-server' instead."
|
||||
@echo " Remove the 'server' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
||||
finetune: examples/deprecation-warning/deprecation-warning.cpp
|
||||
ifneq (,$(wildcard finetune))
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
|
|
|
@ -10,6 +10,7 @@ var sources = [
|
|||
"ggml/src/ggml-alloc.c",
|
||||
"ggml/src/ggml-backend.c",
|
||||
"ggml/src/ggml-quants.c",
|
||||
"ggml/src/ggml-aarch64.c",
|
||||
]
|
||||
|
||||
var resources: [Resource] = []
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
#if defined(_MSC_VER)
|
||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#include "common.h"
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
|
|
|
@ -1203,11 +1203,10 @@ class RefactModel(Model):
|
|||
|
||||
# TODO: how to determine special FIM tokens automatically?
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
|
||||
special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
|
||||
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
|
||||
special_vocab._set_special_token("prefix", 1)
|
||||
special_vocab._set_special_token("suffix", 3)
|
||||
special_vocab._set_special_token("middle", 2)
|
||||
special_vocab._set_special_token("fsep", 4) # is this correct?
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
|
@ -2144,6 +2143,9 @@ class InternLM2Model(Model):
|
|||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
elif tokenizer.IsByte(token_id):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
# take care of ununsed raw token
|
||||
if piece.startswith('[UNUSED'):
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
|
@ -2159,6 +2161,47 @@ class InternLM2Model(Model):
|
|||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
||||
|
||||
chat_eos_token = '<|im_end|>'
|
||||
chat_eos_token_id = None
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
|
||||
for token_id, foken_data in added_tokens_decoder.items():
|
||||
token_id = int(token_id)
|
||||
token = foken_data["content"]
|
||||
if token == chat_eos_token:
|
||||
chat_eos_token_id = token_id
|
||||
token = token.encode("utf-8")
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||
assert(tokens[token_id] == token)
|
||||
tokens[token_id] = token
|
||||
scores[token_id] = -1000.0
|
||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||
if foken_data.get("special"):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
tokenizer_file = self.dir_model / 'tokenizer.json'
|
||||
if tokenizer_file.is_file():
|
||||
with open(tokenizer_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
added_tokens = tokenizer_json.get("added_tokens", [])
|
||||
for foken_data in added_tokens:
|
||||
token_id = int(foken_data["id"])
|
||||
token = foken_data["content"]
|
||||
if token == chat_eos_token:
|
||||
chat_eos_token_id = token_id
|
||||
token = token.encode("utf-8")
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||
assert(tokens[token_id] == token)
|
||||
tokens[token_id] = token
|
||||
scores[token_id] = -1000.0
|
||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||
if foken_data.get("special"):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
|
@ -2168,28 +2211,16 @@ class InternLM2Model(Model):
|
|||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
old_eos = special_vocab.special_token_ids["eos"]
|
||||
if "chat" in os.path.basename(self.dir_model.absolute()):
|
||||
if chat_eos_token_id is not None:
|
||||
# For the chat model, we replace the eos with '<|im_end|>'.
|
||||
# TODO: this is a hack, should be fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
|
||||
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
|
||||
logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
|
||||
in chat mode so that the conversation can end normally.")
|
||||
special_vocab.special_token_ids["eos"] = chat_eos_token_id
|
||||
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
|
||||
" in chat mode so that the conversation can end normally.")
|
||||
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _try_get_sft_eos(self, tokenizer):
|
||||
unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
|
||||
im_end_list = tokenizer.Encode('<|im_end|>')
|
||||
eos_token = None
|
||||
assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
|
||||
if len(unused_145_list) == 1:
|
||||
eos_token = unused_145_list[0]
|
||||
if len(im_end_list) == 1:
|
||||
eos_token = im_end_list[0]
|
||||
assert eos_token
|
||||
return eos_token
|
||||
|
||||
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
|
@ -2208,6 +2239,10 @@ in chat mode so that the conversation can end normally.")
|
|||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_heads = self.hparams["num_attention_heads"]
|
||||
|
|
|
@ -28,6 +28,7 @@ In order to build llama.cpp you have four different options.
|
|||
```
|
||||
|
||||
- Notes:
|
||||
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
|
||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
|
||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||
- For debug builds, run `make LLAMA_DEBUG=1`
|
||||
|
@ -41,6 +42,7 @@ In order to build llama.cpp you have four different options.
|
|||
|
||||
**Notes**:
|
||||
|
||||
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
|
||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
|
||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||
- For debug builds, there are two cases:
|
||||
|
|
|
@ -99,7 +99,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
|||
|
||||
char src1_str[128] = {0};
|
||||
if (src1) {
|
||||
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||
}
|
||||
|
||||
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||
|
|
|
@ -347,7 +347,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
|||
char hex_result[17];
|
||||
for (int offset = 0; offset < 8; offset++) {
|
||||
unsigned int shift_bits_by = (8 * (8 - offset - 1));
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
|
@ -384,7 +384,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
|||
|
||||
char hex_result[41] = {0};
|
||||
for (int offset = 0; offset < 20; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
|
@ -421,7 +421,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
|||
|
||||
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
|
||||
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
|
@ -460,7 +460,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
|||
char hex_result[17];
|
||||
for (int offset = 0; offset < 8; offset++) {
|
||||
unsigned int shift_bits_by = (8 * (8 - offset - 1));
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
|
@ -490,7 +490,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
|||
|
||||
char hex_result[41];
|
||||
for (int offset = 0; offset < 20; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
|
@ -520,7 +520,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
|||
|
||||
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
|
||||
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
|
@ -552,7 +552,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
|||
generate_uuidv5(result, uuid);
|
||||
|
||||
char string_buffer[37] = {0};
|
||||
sprintf(string_buffer, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
uuid[0], uuid[1], uuid[2], uuid[3],
|
||||
uuid[4], uuid[5], uuid[6], uuid[7],
|
||||
uuid[8], uuid[9], uuid[10], uuid[11],
|
||||
|
|
|
@ -289,8 +289,13 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
if (add_bos) {
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
} else {
|
||||
LOG_TEE("error: input is empty\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
|
|
|
@ -154,7 +154,7 @@ static void test_roundtrip_on_chunk(
|
|||
}
|
||||
|
||||
if (use_reference) {
|
||||
qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
|
||||
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
|
||||
} else {
|
||||
qfns.from_float(input_scratch, quantized_scratch, chunk_size);
|
||||
}
|
||||
|
|
|
@ -46,6 +46,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
|
||||
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
|
|
|
@ -737,6 +737,8 @@ struct server_context {
|
|||
slot.ga_n = ga_n;
|
||||
slot.ga_w = ga_w;
|
||||
|
||||
slot.sparams = params.sparams;
|
||||
|
||||
slot.reset();
|
||||
|
||||
slots.push_back(slot);
|
||||
|
@ -2003,6 +2005,11 @@ struct server_context {
|
|||
int32_t n_batch = llama_n_batch(ctx);
|
||||
int32_t n_ubatch = llama_n_ubatch(ctx);
|
||||
|
||||
// track if this is an embedding or non-embedding batch
|
||||
// if we've added sampled tokens above, we are in non-embedding mode
|
||||
// -1: none, 0: non-embedding, 1: embedding
|
||||
int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
|
||||
|
||||
// next, batch any pending prompts without exceeding n_batch
|
||||
if (params.cont_batching || batch.n_tokens == 0) {
|
||||
for (auto & slot : slots) {
|
||||
|
@ -2173,6 +2180,14 @@ struct server_context {
|
|||
}
|
||||
}
|
||||
|
||||
// check that we are in the right batch_type, if not defer the slot
|
||||
bool slot_type = slot.embedding ? 1 : 0;
|
||||
if (batch_type == -1) {
|
||||
batch_type = slot_type;
|
||||
} else if (batch_type != slot_type) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// keep only the common part
|
||||
int p0 = (int) system_tokens.size() + slot.n_past;
|
||||
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
|
||||
|
@ -2274,6 +2289,9 @@ struct server_context {
|
|||
{"n_tokens", batch.n_tokens},
|
||||
});
|
||||
|
||||
// make sure we're in the right embedding mode
|
||||
llama_set_embeddings(ctx, batch_type == 1);
|
||||
|
||||
// process the created batch of tokens
|
||||
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
||||
|
@ -2988,6 +3006,11 @@ int main(int argc, char ** argv) {
|
|||
};
|
||||
|
||||
const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
|
||||
json data = json::parse(req.body);
|
||||
|
@ -3083,6 +3106,11 @@ int main(int argc, char ** argv) {
|
|||
};
|
||||
|
||||
const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
|
||||
|
||||
|
@ -3155,6 +3183,11 @@ int main(int argc, char ** argv) {
|
|||
};
|
||||
|
||||
const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support infill. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
|
||||
json data = json::parse(req.body);
|
||||
|
@ -3241,13 +3274,8 @@ int main(int argc, char ** argv) {
|
|||
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
||||
};
|
||||
|
||||
const auto handle_embeddings = [¶ms, &ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
if (!params.embedding) {
|
||||
res.status = 501;
|
||||
res.set_content("This server does not support embeddings. Start it with `--embeddings`", "text/plain; charset=utf-8");
|
||||
return;
|
||||
}
|
||||
|
||||
const json body = json::parse(req.body);
|
||||
bool is_openai = false;
|
||||
|
|
|
@ -122,8 +122,26 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
|
|||
|
||||
for (size_t i = 0; i < messages.size(); ++i) {
|
||||
const auto & curr_msg = messages[i];
|
||||
|
||||
std::string role = json_value(curr_msg, "role", std::string(""));
|
||||
std::string content = json_value(curr_msg, "content", std::string(""));
|
||||
|
||||
std::string content;
|
||||
if (curr_msg.contains("content")) {
|
||||
if (curr_msg["content"].is_string()) {
|
||||
content = curr_msg["content"].get<std::string>();
|
||||
} else if (curr_msg["content"].is_array()) {
|
||||
for (const auto & part : curr_msg["content"]) {
|
||||
if (part.contains("text")) {
|
||||
content += "\n" + part["text"].get<std::string>();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
|
||||
}
|
||||
|
||||
chat.push_back({role, content});
|
||||
}
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
|
|||
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
||||
fprintf(stream, " --stdin read prompt from standard input.\n");
|
||||
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
|
||||
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||
fprintf(stream, " --show-count print the total number of tokens.\n");
|
||||
}
|
||||
|
@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
|
|||
// variables where to put any arguments we see.
|
||||
bool printing_ids = false;
|
||||
bool no_bos = false;
|
||||
bool no_parse_special = false;
|
||||
bool disable_logging = false;
|
||||
bool show_token_count = false;
|
||||
const char * model_path = NULL;
|
||||
|
@ -229,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
|
|||
else if (arg == "--no-bos") {
|
||||
no_bos = true;
|
||||
}
|
||||
else if (arg == "--no-parse-special") {
|
||||
no_parse_special = true;
|
||||
}
|
||||
else if (arg == "-p" || arg == "--prompt") {
|
||||
if (prompt_set) {
|
||||
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
|
||||
|
@ -359,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {
|
|||
|
||||
const bool model_wants_add_bos = llama_should_add_bos_token(model);
|
||||
const bool add_bos = model_wants_add_bos && !no_bos;
|
||||
const bool parse_special = !no_parse_special;
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
||||
tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
|
||||
|
||||
if (printing_ids) {
|
||||
printf("[");
|
||||
|
|
|
@ -104,7 +104,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
|
|||
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
|
||||
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
|
||||
"ggml: BLAS library vendor")
|
||||
option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF)
|
||||
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
|
||||
|
||||
option(GGML_CUDA "ggml: use CUDA" OFF)
|
||||
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||
|
|
|
@ -383,6 +383,9 @@ extern "C" {
|
|||
GGML_TYPE_F64 = 28,
|
||||
GGML_TYPE_IQ1_M = 29,
|
||||
GGML_TYPE_BF16 = 30,
|
||||
GGML_TYPE_Q4_0_4_4 = 31,
|
||||
GGML_TYPE_Q4_0_4_8 = 32,
|
||||
GGML_TYPE_Q4_0_8_8 = 33,
|
||||
GGML_TYPE_COUNT,
|
||||
};
|
||||
|
||||
|
@ -424,6 +427,9 @@ extern "C" {
|
|||
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
@ -708,7 +714,7 @@ extern "C" {
|
|||
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||
|
||||
GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
|
||||
GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
|
||||
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||
|
||||
|
@ -2404,20 +2410,31 @@ extern "C" {
|
|||
#endif
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_to_mat_t)
|
||||
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
typedef struct {
|
||||
const char * type_name;
|
||||
int blck_size;
|
||||
int64_t blck_size;
|
||||
int64_t blck_size_interleave; // interleave elements in blocks
|
||||
size_t type_size;
|
||||
bool is_quantized;
|
||||
ggml_to_float_t to_float;
|
||||
ggml_from_float_t from_float;
|
||||
ggml_from_float_t from_float_reference;
|
||||
ggml_from_float_t from_float_ref;
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously;
|
||||
int64_t nrows; // number of rows to process simultaneously
|
||||
int64_t ncols; // number of columns to process simultaneously
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
} ggml_type_traits_t;
|
||||
|
||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||
|
|
|
@ -238,12 +238,12 @@ if (GGML_BLAS)
|
|||
endif()
|
||||
|
||||
if (GGML_LLAMAFILE)
|
||||
message(STATUS "Using ggml SGEMM")
|
||||
message(STATUS "Using llamafile")
|
||||
|
||||
add_compile_definitions(GGML_USE_LLAMAFILE)
|
||||
|
||||
set(GGML_HEADERS_LLAMAFILE sgemm.h)
|
||||
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
|
||||
set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h)
|
||||
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA)
|
||||
|
@ -1180,6 +1180,7 @@ add_library(ggml
|
|||
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
||||
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
|
||||
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
||||
ggml-aarch64.c ggml-aarch64.h
|
||||
)
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
|
|
2191
ggml/src/ggml-aarch64.c
Normal file
2191
ggml/src/ggml-aarch64.c
Normal file
File diff suppressed because it is too large
Load diff
39
ggml/src/ggml-aarch64.h
Normal file
39
ggml/src/ggml-aarch64.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
|
||||
#pragma once
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
||||
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// GEMV
|
||||
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
// GEMM
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -394,7 +394,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
|
|||
|
||||
// backend registry
|
||||
|
||||
#define GGML_REG_MAX_BACKENDS 16
|
||||
#define GGML_REG_MAX_BACKENDS 64
|
||||
|
||||
struct ggml_backend_reg {
|
||||
char name[128];
|
||||
|
|
|
@ -8,11 +8,12 @@
|
|||
# include <Accelerate/Accelerate.h>
|
||||
#elif defined(GGML_BLAS_USE_MKL)
|
||||
# include <mkl.h>
|
||||
#elif defined(GGML_BLAS_USE_BLIS)
|
||||
# include <blis.h>
|
||||
#elif defined(GGML_BLAS_USE_NVPL)
|
||||
# include <nvpl_blas.h>
|
||||
#else
|
||||
# include <cblas.h>
|
||||
# ifdef BLIS_ENABLE_CBLAS
|
||||
# include <blis.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
struct ggml_backend_blas_context {
|
||||
|
@ -140,10 +141,14 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
|
|||
openblas_set_num_threads(ctx->n_threads);
|
||||
#endif
|
||||
|
||||
#if defined(BLIS_ENABLE_CBLAS)
|
||||
#if defined(GGML_BLAS_USE_BLIS)
|
||||
bli_thread_set_num_threads(ctx->n_threads);
|
||||
#endif
|
||||
|
||||
#if defined(GGML_BLAS_USE_NVPL)
|
||||
nvpl_blas_set_num_threads(ctx->n_threads);
|
||||
#endif
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
const int64_t i03 = i13/r3;
|
||||
|
|
|
@ -199,6 +199,30 @@ typedef struct {
|
|||
} block_q8_1;
|
||||
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 q4_0 blocks
|
||||
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
|
||||
} block_q4_0x4;
|
||||
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[8]; // deltas for 8 q4_0 blocks
|
||||
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
|
||||
} block_q4_0x8;
|
||||
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 q8_0 blocks
|
||||
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
|
||||
} block_q8_0x4;
|
||||
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[8]; // deltas for 8 q8_0 blocks
|
||||
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
|
||||
} block_q8_0x8;
|
||||
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
|
|
|
@ -104,7 +104,7 @@
|
|||
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
||||
#define cudaStream_t hipStream_t
|
||||
#define cudaSuccess hipSuccess
|
||||
#define __trap abort
|
||||
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
|
||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
||||
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
||||
|
|
|
@ -70,6 +70,10 @@ struct mma_int_A_I16K8 {
|
|||
}
|
||||
#endif // defined(INT8_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void load_low(const int * __restrict__ xs0, const int & stride) {
|
||||
((mma_int_A_I16K4 *) x)[0].load(xs0, stride);
|
||||
}
|
||||
};
|
||||
|
||||
struct mma_int_B_J8K4 {
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -37,47 +37,92 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
|||
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
||||
}
|
||||
|
||||
template <bool need_sum>
|
||||
template <mmq_q8_1_ds_layout ds_layout>
|
||||
static __global__ void quantize_mmq_q8_1(
|
||||
const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
|
||||
|
||||
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
|
||||
constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;
|
||||
|
||||
const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
|
||||
|
||||
if (ix0 >= kx0_padded) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float4 * x4 = (const float4 *) x;
|
||||
|
||||
const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
|
||||
|
||||
block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
|
||||
|
||||
const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel
|
||||
const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
|
||||
const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
|
||||
const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
|
||||
|
||||
const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f;
|
||||
float amax = fabsf(xi);
|
||||
// Load 4 floats per thread and calculate max. abs. value between them:
|
||||
const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
float amax = fabsf(xi.x);
|
||||
amax = fmaxf(amax, fabsf(xi.y));
|
||||
amax = fmaxf(amax, fabsf(xi.z));
|
||||
amax = fmaxf(amax, fabsf(xi.w));
|
||||
|
||||
amax = warp_reduce_max(amax);
|
||||
|
||||
float sum;
|
||||
if (need_sum) {
|
||||
sum = warp_reduce_sum(xi);
|
||||
// Exchange max. abs. value between vals_per_scale/4 threads.
|
||||
#pragma unroll
|
||||
for (int mask = vals_per_scale/8; mask > 0; mask >>= 1) {
|
||||
amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE));
|
||||
}
|
||||
|
||||
const float d = amax / 127;
|
||||
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
|
||||
float sum;
|
||||
if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) {
|
||||
sum = xi.x + xi.y + xi.z + xi.w;
|
||||
|
||||
y[ib].qs[iqs] = q;
|
||||
// Exchange calculate sum across vals_per_sum/4 threads.
|
||||
#pragma unroll
|
||||
for (int mask = vals_per_sum/8; mask > 0; mask >>= 1) {
|
||||
sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, WARP_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
if (iqs % QK8_1 != 0) {
|
||||
const float d_inv = 127.0f / amax;
|
||||
char4 q;
|
||||
q.x = roundf(xi.x*d_inv);
|
||||
q.y = roundf(xi.y*d_inv);
|
||||
q.z = roundf(xi.z*d_inv);
|
||||
q.w = roundf(xi.w*d_inv);
|
||||
|
||||
// Write back 4 int8 values as a single 32 bit value for better memroy bandwidth:
|
||||
char4 * yqs4 = (char4 *) y[ib].qs;
|
||||
yqs4[iqs/4] = q;
|
||||
|
||||
if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6) {
|
||||
if (iqs % 16 != 0 || iqs >= 96) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (need_sum) {
|
||||
y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
|
||||
y[ib].d2s6[2 + iqs/16] = sum;
|
||||
|
||||
if (iqs % 64 != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float d = 1.0f / d_inv;
|
||||
|
||||
y[ib].d2s6[iqs/64] = d;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (iqs % 32 != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float d = 1.0f / d_inv;
|
||||
|
||||
if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
|
||||
y[ib].ds4[iqs/32] = make_half2(d, sum);
|
||||
} else {
|
||||
((float *) y[ib].ds)[iqs/QK8_1] = d;
|
||||
y[ib].d4[iqs/32] = d;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -101,12 +146,24 @@ void quantize_mmq_q8_1_cuda(
|
|||
|
||||
GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
|
||||
|
||||
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
||||
const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
|
||||
const dim3 num_blocks(block_num_x, kx1, channels);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
|
||||
if (mmq_need_sum(type_x)) {
|
||||
quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
} else {
|
||||
quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
|
||||
switch (mmq_get_q8_1_ds_layout(type_x)) {
|
||||
case MMQ_Q8_1_DS_LAYOUT_D4:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
break;
|
||||
case MMQ_Q8_1_DS_LAYOUT_DS4:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_DS4>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
break;
|
||||
case MMQ_Q8_1_DS_LAYOUT_D2S6:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D2S6>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,6 +6,10 @@
|
|||
#include <cstdint>
|
||||
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
|
||||
|
||||
static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
|
||||
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
|
||||
|
||||
typedef void (*quantize_cuda_t)(
|
||||
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
|
||||
|
|
|
@ -189,7 +189,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
|||
}
|
||||
|
||||
#define VDR_Q2_K_Q8_1_MMVQ 1
|
||||
#define VDR_Q2_K_Q8_1_MMQ 2
|
||||
#define VDR_Q2_K_Q8_1_MMQ 4
|
||||
|
||||
// contiguous v/x values
|
||||
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
||||
|
@ -219,32 +219,56 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
|||
return dm2f.x*sumf_d - dm2f.y*sumf_m;
|
||||
}
|
||||
|
||||
// contiguous u/y values
|
||||
// contiguous v/x + u/y values
|
||||
template <int ns8>
|
||||
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
||||
const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8) {
|
||||
const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8, const half2 * s8) {
|
||||
|
||||
float sumf_d = 0.0f;
|
||||
float sumf_m = 0.0f;
|
||||
float sumf = 0.0f;
|
||||
float sumf_d8 = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
|
||||
const float2 dm2f = __half22float2(dm2[i0/(QI8_1/2)]);
|
||||
int sumi_d = 0;
|
||||
int sumi_m = 0;
|
||||
for (int i0 = 0; i0 < QR2_K*VDR_Q2_K_Q8_1_MMQ; i0 += QI8_1) {
|
||||
const float2 dm2f0 = __half22float2(dm2[i0/(QI8_1/2) + 0]);
|
||||
int sumi_d0 = 0;
|
||||
|
||||
const float2 dm2f1 = __half22float2(dm2[i0/(QI8_1/2) + 1]);
|
||||
int sumi_d1 = 0;
|
||||
|
||||
const int vi0 = v[i0/(QI8_1/2)];
|
||||
#pragma unroll
|
||||
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
||||
const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303;
|
||||
sumi_d = ggml_cuda_dp4a(vi, u[i], sumi_d); // SIMD dot product
|
||||
sumi_m = ggml_cuda_dp4a(0x01010101, u[i], sumi_m);
|
||||
sumi_d0 = ggml_cuda_dp4a(v[i], u[i], sumi_d0);
|
||||
}
|
||||
sumf_d8 += dm2f0.x * sumi_d0;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = i0 + QI8_1/2; i < i0 + QI8_1; ++i) {
|
||||
sumi_d1 = ggml_cuda_dp4a(v[i], u[i], sumi_d1);
|
||||
}
|
||||
sumf_d8 += dm2f1.x * sumi_d1;
|
||||
|
||||
if (i0/QI8_1 < ns8) {
|
||||
const float2 s8f = __half22float2(s8[i0/QI8_1]);
|
||||
sumf -= dm2f0.y*s8f.x;
|
||||
sumf -= dm2f1.y*s8f.y;
|
||||
} else {
|
||||
int sumi_m0 = 0;
|
||||
#pragma unroll
|
||||
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
||||
sumi_m0 = ggml_cuda_dp4a(0x01010101, u[i], sumi_m0);
|
||||
}
|
||||
sumf_d8 -= dm2f0.y * sumi_m0;
|
||||
|
||||
int sumi_m1 = 0;
|
||||
#pragma unroll
|
||||
for (int i = i0 + QI8_1/2; i < i0 + QI8_1; ++i) {
|
||||
sumi_m1 = ggml_cuda_dp4a(0x01010101, u[i], sumi_m1);
|
||||
}
|
||||
sumf_d8 -= dm2f1.y * sumi_m1;
|
||||
}
|
||||
}
|
||||
|
||||
sumf_d += dm2f.x * sumi_d;
|
||||
sumf_m += dm2f.y * sumi_m;
|
||||
}
|
||||
|
||||
return d8*(sumf_d - sumf_m);
|
||||
return sumf + d8*sumf_d8;
|
||||
}
|
||||
|
||||
#define VDR_Q3_K_Q8_1_MMVQ 1
|
||||
|
@ -283,7 +307,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
|||
return d3 * sumf;
|
||||
}
|
||||
|
||||
// contiguous u/y values
|
||||
// contiguous v/x + u/y values
|
||||
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
||||
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
||||
const float & d3, const float & d8) {
|
||||
|
@ -296,8 +320,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
|||
|
||||
#pragma unroll
|
||||
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
||||
const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404);
|
||||
sumi_sc = ggml_cuda_dp4a(vi, u[i], sumi_sc); // SIMD dot product
|
||||
sumi_sc = ggml_cuda_dp4a(v[i], u[i], sumi_sc); // SIMD dot product
|
||||
}
|
||||
|
||||
sumi += sumi_sc * scales[i0 / (QI8_1/2)];
|
||||
|
@ -334,7 +357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
|||
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
||||
}
|
||||
|
||||
// contiguous u/y values
|
||||
// contiguous v/x + u/y values
|
||||
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
||||
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
||||
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
|
||||
|
@ -397,7 +420,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
|
|||
return dm5f.x*sumf_d - dm5f.y*sumf_m;
|
||||
}
|
||||
|
||||
// contiguous u/y values
|
||||
// contiguous v/x + u/y values
|
||||
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
|
||||
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
||||
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
|
||||
|
@ -451,13 +474,16 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
|||
return d*sumf;
|
||||
}
|
||||
|
||||
// contiguous u/y values
|
||||
// contiguous v/x + u/y values
|
||||
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
||||
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
|
||||
const float & d6, const float * __restrict__ d8) {
|
||||
|
||||
float sumf_d = 0.0f;
|
||||
|
||||
const int sc_packed = get_int_b4(sc, 0);
|
||||
const int8_t * sc_reg = (const int8_t *) &sc_packed;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
|
||||
int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
|
||||
|
@ -471,7 +497,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
|||
sumi_d.y = ggml_cuda_dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
|
||||
}
|
||||
|
||||
sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
|
||||
sumf_d += d8[i0/4] * (sc_reg[i0/2+0]*sumi_d.x + sc_reg[i0/2+1]*sumi_d.y);
|
||||
}
|
||||
|
||||
return d6 * sumf_d;
|
||||
|
|
|
@ -609,6 +609,10 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|||
|
||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
// defined in ggml.c, initialized in ggml_init()
|
||||
extern float ggml_table_f32_f16[1 << 16];
|
||||
|
|
|
@ -658,7 +658,7 @@ static inline __m128i packNibbles( __m256i bytes ) {
|
|||
#endif //__loongarch_asx
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
|
||||
void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
|
||||
static const int qk = QK4_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
@ -696,11 +696,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
|
|||
}
|
||||
|
||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q4_0_reference(x, y, k);
|
||||
quantize_row_q4_0_ref(x, y, k);
|
||||
}
|
||||
|
||||
|
||||
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
|
||||
void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
|
||||
const int qk = QK4_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
@ -738,10 +738,10 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
|
|||
}
|
||||
|
||||
void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q4_1_reference(x, y, k);
|
||||
quantize_row_q4_1_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
|
||||
void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
|
||||
static const int qk = QK5_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
@ -786,10 +786,10 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
|
|||
}
|
||||
|
||||
void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q5_0_reference(x, y, k);
|
||||
quantize_row_q5_0_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
|
||||
void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
|
||||
const int qk = QK5_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
@ -834,11 +834,11 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
|
|||
}
|
||||
|
||||
void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q5_1_reference(x, y, k);
|
||||
quantize_row_q5_1_ref(x, y, k);
|
||||
}
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
|
||||
void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
|
||||
assert(k % QK8_0 == 0);
|
||||
const int nb = k / QK8_0;
|
||||
|
||||
|
@ -1144,12 +1144,12 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
|
|||
#else
|
||||
GGML_UNUSED(nb);
|
||||
// scalar
|
||||
quantize_row_q8_0_reference(x, y, k);
|
||||
quantize_row_q8_0_ref(x, y, k);
|
||||
#endif
|
||||
}
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
|
||||
void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
|
||||
assert(QK8_1 == 32);
|
||||
assert(k % QK8_1 == 0);
|
||||
const int nb = k / QK8_1;
|
||||
|
@ -1508,7 +1508,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
|
|||
#else
|
||||
GGML_UNUSED(nb);
|
||||
// scalar
|
||||
quantize_row_q8_1_reference(x, y, k);
|
||||
quantize_row_q8_1_ref(x, y, k);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -1899,7 +1899,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
|||
|
||||
//========================- 2-bit (de)-quantization
|
||||
|
||||
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
|
||||
void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -2002,7 +2002,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
|
|||
}
|
||||
|
||||
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
quantize_row_q2_K_reference(x, vy, k);
|
||||
quantize_row_q2_K_ref(x, vy, k);
|
||||
}
|
||||
|
||||
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
||||
|
@ -2226,7 +2226,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|||
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
}
|
||||
else {
|
||||
char * qrow = (char *)dst;
|
||||
|
@ -2241,7 +2241,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
|
|||
|
||||
//========================= 3-bit (de)-quantization
|
||||
|
||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
|
||||
void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -2368,7 +2368,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
|
|||
}
|
||||
|
||||
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
quantize_row_q3_K_reference(x, vy, k);
|
||||
quantize_row_q3_K_ref(x, vy, k);
|
||||
}
|
||||
|
||||
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
|
||||
|
@ -2458,7 +2458,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|||
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
}
|
||||
else {
|
||||
char * qrow = (char *)dst;
|
||||
|
@ -2473,7 +2473,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
|
|||
|
||||
// ====================== 4-bit (de)-quantization
|
||||
|
||||
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
|
||||
void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -2572,7 +2572,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
|
|||
void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_q4_K * restrict y = vy;
|
||||
quantize_row_q4_K_reference(x, y, k);
|
||||
quantize_row_q4_K_ref(x, y, k);
|
||||
}
|
||||
|
||||
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
|
@ -2651,7 +2651,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|||
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
}
|
||||
else {
|
||||
char * qrow = (char *)dst;
|
||||
|
@ -2666,7 +2666,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
|
|||
|
||||
// ====================== 5-bit (de)-quantization
|
||||
|
||||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
|
||||
void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
|
@ -2783,7 +2783,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
|
|||
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_q5_K * restrict y = vy;
|
||||
quantize_row_q5_K_reference(x, y, k);
|
||||
quantize_row_q5_K_ref(x, y, k);
|
||||
}
|
||||
|
||||
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
|
@ -2882,7 +2882,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|||
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
}
|
||||
else {
|
||||
char * qrow = (char *)dst;
|
||||
|
@ -2897,7 +2897,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
|
|||
|
||||
// ====================== 6-bit (de)-quantization
|
||||
|
||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
|
||||
void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
|
@ -3001,7 +3001,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
|
|||
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_q6_K * restrict y = vy;
|
||||
quantize_row_q6_K_reference(x, y, k);
|
||||
quantize_row_q6_K_ref(x, y, k);
|
||||
}
|
||||
|
||||
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
|
@ -3091,7 +3091,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|||
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
}
|
||||
else {
|
||||
char * qrow = (char *)dst;
|
||||
|
@ -3108,7 +3108,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|||
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_0_reference(x, y, n_per_row);
|
||||
quantize_row_q4_0_ref(x, y, n_per_row);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -3134,7 +3134,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|||
|
||||
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||
|
@ -3151,7 +3151,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|||
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_1_reference(x, y, n_per_row);
|
||||
quantize_row_q4_1_ref(x, y, n_per_row);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -3179,7 +3179,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|||
|
||||
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||
|
@ -3196,7 +3196,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|||
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_0_reference(x, y, n_per_row);
|
||||
quantize_row_q5_0_ref(x, y, n_per_row);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -3233,7 +3233,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|||
|
||||
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||
|
@ -3250,7 +3250,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|||
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_1_reference(x, y, n_per_row);
|
||||
quantize_row_q5_1_ref(x, y, n_per_row);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -3286,7 +3286,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|||
|
||||
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||
|
@ -3302,7 +3302,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
|
|||
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
(void)quant_weights; // not used
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
||||
quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
|
@ -3590,7 +3590,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
|||
|
||||
//===================================== Q8_K ==============================================
|
||||
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
|
||||
void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
|
@ -3641,7 +3641,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int6
|
|||
}
|
||||
|
||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q8_K_reference(x, y, k);
|
||||
quantize_row_q8_K_ref(x, y, k);
|
||||
}
|
||||
|
||||
//===================================== Dot ptoducts =================================
|
||||
|
@ -3814,6 +3814,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
}
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
if (svcntb() == QK8_0) {
|
||||
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
||||
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
||||
|
||||
|
@ -3850,7 +3851,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
}
|
||||
|
||||
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
#elif defined(__ARM_NEON)
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
|
@ -5422,6 +5426,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
}
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
if (svcntb() == QK8_0) {
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
|
||||
|
@ -5446,7 +5451,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
}
|
||||
|
||||
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
#elif defined(__ARM_NEON)
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
|
@ -13522,10 +13530,10 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
|
|||
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq3_xxs * restrict y = vy;
|
||||
quantize_row_iq3_xxs_reference(x, y, k);
|
||||
quantize_row_iq3_xxs_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
|
||||
void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
||||
}
|
||||
|
@ -13738,10 +13746,10 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
|
|||
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq3_s * restrict y = vy;
|
||||
quantize_row_iq3_s_reference(x, y, k);
|
||||
quantize_row_iq3_s_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
|
||||
void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_iq3_s(x, y, 1, k, NULL);
|
||||
}
|
||||
|
@ -14479,7 +14487,7 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k
|
|||
}
|
||||
}
|
||||
|
||||
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
|
||||
void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
|
||||
assert(k % QK4_NL == 0);
|
||||
quantize_row_iq4_nl(x, y, k);
|
||||
}
|
||||
|
@ -14507,10 +14515,10 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
|
|||
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq4_xs * restrict y = vy;
|
||||
quantize_row_iq4_xs_reference(x, y, k);
|
||||
quantize_row_iq4_xs_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
|
||||
void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_iq4_xs(x, y, 1, k, NULL);
|
||||
}
|
||||
|
@ -14697,7 +14705,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
|
|||
return nrow * nblock * sizeof(block_iq2_s);
|
||||
}
|
||||
|
||||
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
|
||||
void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_iq2_s(x, y, 1, k, NULL);
|
||||
}
|
||||
|
@ -14705,7 +14713,7 @@ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restri
|
|||
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq2_s * restrict y = vy;
|
||||
quantize_row_iq2_s_reference(x, y, k);
|
||||
quantize_row_iq2_s_ref(x, y, k);
|
||||
}
|
||||
|
||||
static bool validate_float(float f, size_t i) {
|
||||
|
@ -14760,6 +14768,16 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) {
|
|||
} \
|
||||
}
|
||||
|
||||
#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \
|
||||
const type * q = (const type *) (data); \
|
||||
for (size_t i = 0; i < (nb); ++i) { \
|
||||
for (size_t j = 0; j < (nr); ++j) { \
|
||||
if (!validate_fp16(q[i].d[j], i)) { \
|
||||
return false; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
|
||||
if (type < 0 || type >= GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "%s: invalid type %d\n", __func__, type);
|
||||
|
@ -14977,6 +14995,16 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
|||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
{
|
||||
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
|
||||
} break;
|
||||
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
|
|
|
@ -12,25 +12,25 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
|
|
@ -3768,37 +3768,13 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
|
||||
|
||||
const ggml_tensor_extra_gpu *src0_extra =
|
||||
(const ggml_tensor_extra_gpu *)src0->extra;
|
||||
const ggml_tensor_extra_gpu *src1_extra =
|
||||
(const ggml_tensor_extra_gpu *)src1->extra;
|
||||
const ggml_tensor_extra_gpu *dst_extra =
|
||||
(const ggml_tensor_extra_gpu *)dst->extra;
|
||||
|
||||
ggml_tensor_extra_gpu src0_row_extra;
|
||||
ggml_tensor_extra_gpu src1_row_extra;
|
||||
ggml_tensor_extra_gpu dst_row_extra;
|
||||
|
||||
ggml_tensor src0_row = *src0;
|
||||
ggml_tensor src1_row = *src1;
|
||||
ggml_tensor dst_row = *dst;
|
||||
|
||||
src1_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||
dst_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
src0_row.extra = &src0_row_extra;
|
||||
src1_row.extra = &src1_row_extra;
|
||||
dst_row.extra = &dst_row_extra;
|
||||
|
||||
char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
|
||||
? (char *)src0->data
|
||||
: (char *)src0_extra->data_device[ctx.device];
|
||||
char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
|
||||
? (char *)src1->data
|
||||
: (char *)src1_extra->data_device[ctx.device];
|
||||
char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
|
||||
? (char *)dst->data
|
||||
: (char *)dst_extra->data_device[ctx.device];
|
||||
char *src0_original = (char *)src0->data;
|
||||
char *src1_original = (char *)src1->data;
|
||||
char *dst_original = (char *)dst->data;
|
||||
|
||||
src0_row.ne[2] = 1;
|
||||
src0_row.ne[3] = 1;
|
||||
|
@ -3827,12 +3803,9 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
const int64_t i1 = id;
|
||||
const int64_t i2 = i12;
|
||||
|
||||
src0_row_extra.data_device[ctx.device] =
|
||||
src0_original + i02*nb02;
|
||||
src1_row_extra.data_device[ctx.device] =
|
||||
src1_original + + i11*nb11 + i12*nb12;
|
||||
dst_row_extra.data_device[ctx.device] =
|
||||
dst_original + i1*nb1 + i2*nb2;
|
||||
src0_row.data = src0_original + i02*nb02;
|
||||
src1_row.data = src1_original + + i11*nb11 + i12*nb12;
|
||||
dst_row.data = dst_original + i1*nb1 + i2*nb2;
|
||||
|
||||
ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
||||
}
|
||||
|
@ -3841,8 +3814,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
|
||||
ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
||||
|
||||
src1_row_extra.data_device[ctx.device] = src1_contiguous.get();
|
||||
dst_row_extra.data_device[ctx.device] = dst_contiguous.get();
|
||||
src1_row.data = src1_contiguous.get();
|
||||
dst_row.data = dst_contiguous.get();
|
||||
|
||||
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
||||
int64_t num_src1_rows = 0;
|
||||
|
@ -3898,7 +3871,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
});
|
||||
}
|
||||
|
||||
src0_row_extra.data_device[ctx.device] = src0_original + i02*nb02;
|
||||
src0_row.data = src0_original + i02*nb02;
|
||||
|
||||
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
||||
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
||||
|
@ -5221,6 +5194,10 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|||
return false;
|
||||
}
|
||||
}
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
if (src0_type == GGML_TYPE_BF16) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
} break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
|
|
|
@ -346,4 +346,10 @@ inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
|
|||
return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
|
||||
}
|
||||
|
||||
// Helper for accessing pointers with no warnings
|
||||
template <typename Tp, int dim>
|
||||
static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
|
||||
return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
|
||||
}
|
||||
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
|
|
@ -158,7 +158,7 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
|
|||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q4_K(vx, y, scale_local_acc.get_pointer(), item_ct1);
|
||||
dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
|
|
@ -1835,10 +1835,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q4_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q4_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q4_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q4_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q4_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -1870,10 +1870,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q4_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q4_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q4_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q4_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q4_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -1950,10 +1950,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q4_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q4_1_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q4_1_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q4_1_acc_ct1),
|
||||
get_pointer(tile_x_dm_q4_1_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -1985,10 +1985,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q4_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q4_1_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q4_1_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q4_1_acc_ct1),
|
||||
get_pointer(tile_x_dm_q4_1_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2065,10 +2065,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q5_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q5_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q5_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2100,10 +2100,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q5_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q5_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q5_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2180,10 +2180,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q5_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_1_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q5_1_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_1_acc_ct1),
|
||||
get_pointer(tile_x_dm_q5_1_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2215,10 +2215,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q5_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_1_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q5_1_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_1_acc_ct1),
|
||||
get_pointer(tile_x_dm_q5_1_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2295,10 +2295,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q8_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q8_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q8_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q8_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q8_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2330,10 +2330,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q8_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q8_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q8_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q8_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q8_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2412,11 +2412,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q2_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q2_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q2_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q2_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q2_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q2_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q2_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2450,11 +2450,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q2_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q2_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q2_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q2_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q2_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q2_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q2_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2537,12 +2537,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q3_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_qh_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q3_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_qh_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q3_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2578,12 +2578,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q3_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_qh_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q3_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_qh_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q3_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2663,11 +2663,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q4_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q4_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q4_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q4_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q4_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q4_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q4_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2701,11 +2701,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q4_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q4_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q4_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q4_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q4_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q4_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q4_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2784,11 +2784,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q5_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q5_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q5_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q5_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q5_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2822,11 +2822,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q5_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q5_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q5_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q5_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q5_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2905,11 +2905,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q6_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_acc_ct1.get_pointer(),
|
||||
tile_x_dm_acc_ct1.get_pointer(),
|
||||
tile_x_sc_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_acc_ct1),
|
||||
get_pointer(tile_x_dm_acc_ct1),
|
||||
get_pointer(tile_x_sc_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -2943,11 +2943,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
mul_mat_q6_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_acc_ct1.get_pointer(),
|
||||
tile_x_dm_acc_ct1.get_pointer(),
|
||||
tile_x_sc_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_acc_ct1),
|
||||
get_pointer(tile_x_dm_acc_ct1),
|
||||
get_pointer(tile_x_sc_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
|
|
@ -218,7 +218,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -265,7 +265,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
|||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
group_norm_f32(x, dst, group_size, ne_elements,
|
||||
eps_ct4, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -306,7 +306,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
|
|
@ -136,7 +136,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *
|
|||
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
||||
nrows_y, scale, max_bias, m0,
|
||||
m1, n_head_log2, item_ct1,
|
||||
local_buf_acc.get_pointer());
|
||||
get_pointer(local_buf_acc));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
|
212
ggml/src/ggml.c
212
ggml/src/ggml.c
|
@ -4,7 +4,7 @@
|
|||
#include "ggml-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include "ggml-aarch64.h"
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
|
@ -37,12 +37,12 @@
|
|||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#ifdef __ARM_FEATURE_MATMUL_INT8
|
||||
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#undef GGML_USE_LLAMAFILE
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_LLAMAFILE
|
||||
#include "sgemm.h"
|
||||
#include <llamafile/sgemm.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
@ -592,7 +592,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = false,
|
||||
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||
.vec_dot_type = GGML_TYPE_F16,
|
||||
.nrows = 1,
|
||||
|
@ -604,7 +604,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
|
||||
.from_float = quantize_row_q4_0,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref,
|
||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
|
@ -620,7 +620,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
|
||||
.from_float = quantize_row_q4_1,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
|
||||
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
|
@ -636,7 +636,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = false,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_COUNT,
|
||||
.nrows = 1,
|
||||
|
@ -648,7 +648,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = false,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_COUNT,
|
||||
.nrows = 1,
|
||||
|
@ -660,7 +660,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
|
||||
.from_float = quantize_row_q5_0,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref,
|
||||
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
|
@ -672,7 +672,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
|
||||
.from_float = quantize_row_q5_1,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref,
|
||||
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
.nrows = 1,
|
||||
|
@ -684,7 +684,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q8_0,
|
||||
.from_float = quantize_row_q8_0,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref,
|
||||
.from_float_to_mat = quantize_mat_q8_0,
|
||||
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
|
@ -699,7 +700,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.type_size = sizeof(block_q8_1),
|
||||
.is_quantized = true,
|
||||
.from_float = quantize_row_q8_1,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
.nrows = 1,
|
||||
},
|
||||
|
@ -710,7 +711,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
|
||||
.from_float = quantize_row_q2_K,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref,
|
||||
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -722,7 +723,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
|
||||
.from_float = quantize_row_q3_K,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref,
|
||||
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -734,7 +735,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
|
||||
.from_float = quantize_row_q4_K,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref,
|
||||
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -746,7 +747,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
|
||||
.from_float = quantize_row_q5_K,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref,
|
||||
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -758,7 +759,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
|
||||
.from_float = quantize_row_q6_K,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref,
|
||||
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -770,7 +771,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -782,7 +783,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -794,7 +795,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
|
||||
.from_float = quantize_row_iq3_xxs,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
|
||||
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -806,7 +807,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq3_s,
|
||||
.from_float = quantize_row_iq3_s,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_s_reference,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref,
|
||||
.vec_dot = ggml_vec_dot_iq3_s_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -818,7 +819,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_s,
|
||||
.from_float = quantize_row_iq2_s,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref,
|
||||
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -830,7 +831,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq1_s,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -842,7 +843,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq1_m,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -854,7 +855,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
|
||||
.from_float = quantize_row_iq4_nl,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
|
||||
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
|
@ -866,7 +867,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
||||
.from_float = quantize_row_iq4_xs,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref,
|
||||
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
|
@ -885,10 +886,58 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = false,
|
||||
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
||||
.vec_dot_type = GGML_TYPE_BF16,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_4_4] = {
|
||||
.type_name = "q4_0_4x4",
|
||||
.blck_size = QK4_0,
|
||||
.blck_size_interleave = 4,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 4,
|
||||
.gemv = ggml_gemv_q4_0_4x4_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_4_8] = {
|
||||
.type_name = "q4_0_4x8",
|
||||
.blck_size = QK4_0,
|
||||
.blck_size_interleave = 8,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 4,
|
||||
.gemv = ggml_gemv_q4_0_4x8_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_8_8] = {
|
||||
.type_name = "q4_0_8x8",
|
||||
.blck_size = QK4_0,
|
||||
.blck_size_interleave = 8,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 8,
|
||||
.gemv = ggml_gemv_q4_0_8x8_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -3066,7 +3115,7 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
|||
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
||||
}
|
||||
|
||||
GGML_CALL int ggml_blck_size(enum ggml_type type) {
|
||||
GGML_CALL int64_t ggml_blck_size(enum ggml_type type) {
|
||||
return type_traits[type].blck_size;
|
||||
}
|
||||
|
||||
|
@ -3188,6 +3237,9 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|||
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
|
||||
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||
}
|
||||
|
@ -9432,6 +9484,9 @@ static void ggml_compute_forward_add(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_add_q_f32(params, dst);
|
||||
} break;
|
||||
|
@ -9807,6 +9862,9 @@ static void ggml_compute_forward_add1(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_add1_q_f32(params, dst);
|
||||
} break;
|
||||
|
@ -9932,6 +9990,9 @@ static void ggml_compute_forward_acc(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
|
@ -12132,8 +12193,13 @@ static void ggml_compute_forward_mul_mat(
|
|||
const enum ggml_type type = src0->type;
|
||||
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float;
|
||||
ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat;
|
||||
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
||||
int64_t const matmul_num_cols = type_traits[type].ncols;
|
||||
int64_t const blck_size_interleave = type_traits[type].blck_size_interleave;
|
||||
ggml_gemv_t const gemv = type_traits[type].gemv;
|
||||
ggml_gemm_t const gemm = type_traits[type].gemm;
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
|
@ -12192,8 +12258,17 @@ UseGgmlGemm1:;
|
|||
|
||||
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
|
||||
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
int64_t i11_processed = 0;
|
||||
if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
|
||||
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
|
||||
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
||||
4, ne10, blck_size_interleave);
|
||||
}
|
||||
i11_processed = ne11 - ne11 % 4;
|
||||
}
|
||||
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
|
||||
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
||||
ne10);
|
||||
}
|
||||
|
@ -12273,6 +12348,28 @@ UseGgmlGemm2:;
|
|||
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
||||
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
|
||||
|
||||
if ((ggml_n_dims(src0) == 2) && gemv) {
|
||||
const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
|
||||
int64_t src0_start = (ith * ne01) / nth;
|
||||
int64_t src0_end = ((ith + 1) * ne01) / nth;
|
||||
src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
|
||||
src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
|
||||
if (src0_start >= src0_end) return;
|
||||
|
||||
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
|
||||
if (gemm && (ne11 > 3)) {
|
||||
gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
|
||||
(const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
|
||||
}
|
||||
for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
|
||||
gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
|
||||
(const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
|
||||
src0_end - src0_start);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// The first chunk comes from our thread_id, the rest will get auto-assigned.
|
||||
int current_chunk = ith;
|
||||
|
||||
|
@ -12317,7 +12414,9 @@ static void ggml_compute_forward_mul_mat_id(
|
|||
|
||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float;
|
||||
int64_t const matmul_num_cols = type_traits[type].ncols;
|
||||
ggml_gemv_t const gemv = type_traits[type].gemv;
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == ggml_type_size(type));
|
||||
|
@ -12358,7 +12457,7 @@ static void ggml_compute_forward_mul_mat_id(
|
|||
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
|
||||
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
||||
ne10);
|
||||
}
|
||||
|
@ -12403,6 +12502,34 @@ static void ggml_compute_forward_mul_mat_id(
|
|||
const int64_t nr0 = ne01; // src0 rows
|
||||
const int64_t nr1 = cne1; // src1 rows
|
||||
|
||||
if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
|
||||
int64_t src0_cur_start = (ith * ne01) / nth;
|
||||
int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
|
||||
src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
|
||||
src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
|
||||
if (src0_cur_start >= src0_cur_end) return;
|
||||
|
||||
for (int ir1 = 0; ir1 < nr1; ir1++) {
|
||||
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
|
||||
const int id = row_mapping.i1; // selected expert index
|
||||
|
||||
const int64_t i11 = id % ne11;
|
||||
const int64_t i12 = row_mapping.i2; // row index in src1
|
||||
|
||||
const int64_t i1 = id; // selected expert index
|
||||
const int64_t i2 = i12; // row
|
||||
|
||||
const char * src1_col = (const char *) wdata +
|
||||
(src1_cont || src1->type != vec_dot_type
|
||||
? (i11 + i12 * ne11) * row_size
|
||||
: (i11 * nb11 + i12 * nb12));
|
||||
|
||||
gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
|
||||
(const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// distribute the thread work across the inner or outer loop based on which one is larger
|
||||
|
||||
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
||||
|
@ -12704,6 +12831,9 @@ static void ggml_compute_forward_out_prod(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_out_prod_q_f32(params, dst);
|
||||
} break;
|
||||
|
@ -12889,6 +13019,9 @@ static void ggml_compute_forward_set(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
|
@ -13148,6 +13281,9 @@ static void ggml_compute_forward_get_rows(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_get_rows_q(params, dst);
|
||||
} break;
|
||||
|
@ -13734,6 +13870,9 @@ static void ggml_compute_forward_clamp(
|
|||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
|
@ -20457,6 +20596,9 @@ size_t ggml_quantize_chunk(
|
|||
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
|
@ -20920,7 +21062,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
(int64_t) info->ne[3];
|
||||
|
||||
if (ne % ggml_blck_size(info->type) != 0) {
|
||||
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
|
||||
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
|
||||
__func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
|
@ -21759,8 +21901,6 @@ int ggml_cpu_has_neon(void) {
|
|||
|
||||
int ggml_cpu_has_sve(void) {
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
// TODO: Currently, SVE 256 bit is only supported.
|
||||
GGML_ASSERT(svcntb() == QK8_0);
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
|
|
|
@ -79,5 +79,4 @@ python -m twine upload dist/*
|
|||
```
|
||||
|
||||
## TODO
|
||||
- [ ] Add tests
|
||||
- [ ] Include conversion scripts as command line entry points in this package.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.9.0"
|
||||
version = "0.9.1"
|
||||
description = "Read and write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
|
|
|
@ -162,6 +162,9 @@ extern "C" {
|
|||
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
|
|
@ -57,6 +57,12 @@
|
|||
#include <io.h>
|
||||
#endif
|
||||
|
||||
#if __cplusplus >= 202000L
|
||||
#define LU8(x) (const char*)(u8##x)
|
||||
#else
|
||||
#define LU8(x) u8##x
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
|
@ -3782,6 +3788,9 @@ struct llama_model_loader {
|
|||
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
||||
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
||||
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
||||
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
|
||||
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
|
||||
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
|
@ -4475,6 +4484,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
|
@ -5871,13 +5883,6 @@ static bool llm_load_tensors(
|
|||
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
// disable MoE with SYCL until mul_mat_id is updated
|
||||
if (hparams.n_expert > 0) {
|
||||
n_gpu_layers = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
model.split_mode = split_mode;
|
||||
model.main_gpu = main_gpu;
|
||||
model.n_gpu_layers = n_gpu_layers;
|
||||
|
@ -8122,7 +8127,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
||||
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
|
||||
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
||||
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
|
@ -13200,6 +13205,8 @@ struct llm_build_context {
|
|||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
} else {
|
||||
GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
|
||||
|
||||
struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
|
||||
struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
|
||||
|
||||
|
@ -17760,6 +17767,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
|
||||
new_type == GGML_TYPE_Q4_0_8_8) {
|
||||
new_type = GGML_TYPE_Q4_0;
|
||||
}
|
||||
}
|
||||
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
||||
|
@ -18072,6 +18083,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
|
@ -18382,6 +18396,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
int chunk_size_multiplier = 1;
|
||||
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
|
||||
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
|
||||
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
|
||||
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
|
||||
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
|
||||
|
@ -18394,7 +18416,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
|
||||
chunk_size_multiplier;
|
||||
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
|
@ -21121,7 +21144,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|||
size--;
|
||||
}
|
||||
if (length < (int32_t)size) {
|
||||
return (int32_t) -size;
|
||||
return -(int32_t) size;
|
||||
}
|
||||
memcpy(buf, token, size);
|
||||
return (int32_t) size;
|
||||
|
@ -21509,12 +21532,12 @@ static int32_t llama_chat_apply_template_internal(
|
|||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) {
|
||||
} else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
if (role == "user") {
|
||||
ss << u8"<用户>";
|
||||
ss << LU8("<用户>");
|
||||
ss << trim(message->content);
|
||||
ss << "<AI>";
|
||||
} else {
|
||||
|
@ -21530,7 +21553,7 @@ static int32_t llama_chat_apply_template_internal(
|
|||
} else if (role == "user") {
|
||||
ss << "User: " << message->content << "\n\n";
|
||||
} else if (role == "assistant") {
|
||||
ss << "Assistant: " << message->content << u8"<|end▁of▁sentence|>";
|
||||
ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>");
|
||||
}
|
||||
}
|
||||
if (add_ass) {
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
#if defined(_MSC_VER)
|
||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#include "unicode.h"
|
||||
#include "unicode-data.h"
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wdouble-promotion"
|
||||
|
||||
// ggml.c::quantize_row_q4_0_reference
|
||||
// ggml.c::quantize_row_q4_0_ref
|
||||
inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
|
||||
|
||||
// ggml.c::ggml_silu_f32
|
||||
|
@ -24,7 +24,7 @@ inline static float silu_orig(float x) {
|
|||
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
// ggml.c::quantize_row_q4_0_reference
|
||||
// ggml.c::quantize_row_q4_0_ref
|
||||
inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
|
||||
|
||||
// ggml.c::ggml_silu_f32
|
||||
|
|
|
@ -60,7 +60,7 @@ static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test
|
|||
qfns.from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||
|
||||
qfns.from_float_reference(test_data, tmp_q.data(), test_size);
|
||||
qfns.from_float_ref(test_data, tmp_q.data(), test_size);
|
||||
qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
||||
|
||||
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
||||
|
|
|
@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
|
|||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
qfns.from_float_reference(test_data1, test_q1, size);
|
||||
qfns.from_float_ref(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue