diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 49e9b7528..00be596ce 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -89,6 +89,22 @@ let
           ps.tiktoken
           ps.torchWithoutCuda
           ps.transformers
+
+          # server bench
+          ps.matplotlib
+
+          # server tests
+          ps.openai
+          ps.behave
+          ps.prometheus-client
+
+          # for examples/pydantic-models-to-grammar-examples.py
+          ps.docstring-parser
+          ps.pydantic
+
+          # for scripts/compare-llama-bench.py
+          ps.gitpython
+          ps.tabulate
         ]
       );
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 9c0397d16..89436740d 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -16,7 +16,9 @@ SYCL:
       - any-glob-to-any-file:
          - ggml/include/ggml-sycl.h
          - ggml/src/ggml-sycl.cpp
-         - README-sycl.md
+         - ggml/src/ggml-sycl/**
+         - docs/backend/SYCL.md
+         - examples/sycl/**
 Nvidia GPU:
   - changed-files:
       - any-glob-to-any-file:
diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml
new file mode 100644
index 000000000..e5ff5e6d7
--- /dev/null
+++ b/.github/workflows/python-type-check.yml
@@ -0,0 +1,38 @@
+name: Python Type-Check
+
+on:
+  push:
+    paths:
+      - '.github/workflows/python-type-check.yml'
+      - '**.py'
+      - '**/requirements*.txt'
+  pull_request:
+    paths:
+      - '.github/workflows/python-type-check.yml'
+      - '**.py'
+      - '**/requirements*.txt'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  python-type-check:
+    runs-on: ubuntu-latest
+    name: pyright type-check
+    steps:
+      - name: Check out source repository
+        uses: actions/checkout@v4
+      - name: Set up Python environment
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Python dependencies
+        # TODO: use a venv
+        run: pip install -r requirements/requirements-all.txt
+      - name: Type-check with Pyright
+        uses: jakebailey/pyright-action@v2
+        with:
+          version: 1.1.370
+          level: warning
+          warnings: true
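The workflow above wires Pyright into CI; many of the Python hunks further down exist only to satisfy it. For reference, a minimal sketch of the two suppression forms this diff leans on (the function body is a made-up illustration, not code from this PR):

```python
# File-level directive: tune one diagnostic category for the whole module.
# pyright: reportRedeclaration=false

from typing import Any


def tensor_count(checkpoint: Any) -> int:
    # Line-level directive: silence a single rule on a single line, e.g. for
    # attributes that only exist on dynamically-typed objects.
    return len(checkpoint.tensors)  # pyright: ignore[reportAttributeAccessIssue]
```
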
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4f6cd6872..befc4433c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,9 +50,6 @@ endif()
 # option list
 #
 
-# general
-option(LLAMA_CCACHE "llama: use ccache if available" ON)
-
 # debug
 option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
 option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
@@ -77,7 +74,6 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 
 # override ggml options
-set(GGML_CCACHE ${LLAMA_CCACHE})
 set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
 set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
 set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
@@ -115,7 +111,10 @@ llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
 # build the library
 #
 
-add_subdirectory(ggml)
+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
 add_subdirectory(src)
 
 #
diff --git a/Makefile b/Makefile
index bb6e2f968..668b38b99 100644
--- a/Makefile
+++ b/Makefile
@@ -64,10 +64,14 @@ TEST_TARGETS = \
    tests/test-tokenizer-1-spm
 
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
-LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
    simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
    retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
 
+# Legacy build targets that were renamed in #7809, but we want to build binaries for them that output a deprecation warning if people try to use them.
+# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
+
 # Deprecation aliases
 ifdef LLAMA_CUBLAS
 $(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
@@ -193,7 +197,7 @@ ifdef GGML_RPC
    BUILD_TARGETS += rpc-server
 endif
 
-default: $(BUILD_TARGETS)
+default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
 
 test: $(TEST_TARGETS)
    @failures=0; \
@@ -228,7 +232,7 @@ test: $(TEST_TARGETS)
    fi
    @echo 'All tests passed.'
 
-all: $(BUILD_TARGETS) $(TEST_TARGETS)
+all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)
 
 ifdef RISCV_CROSS_COMPILE
 CC := riscv64-unknown-linux-gnu-gcc
@@ -245,17 +249,22 @@ MK_CFLAGS = -std=c11 -fPIC
 MK_CXXFLAGS = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
 
-ifndef LLAMA_NO_CCACHE
+ifdef LLAMA_NO_CCACHE
+GGML_NO_CCACHE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifndef GGML_NO_CCACHE
 CCACHE := $(shell which ccache)
 ifdef CCACHE
 export CCACHE_SLOPPINESS = time_macros
-$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
+$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
 CC    := $(CCACHE) $(CC)
 CXX   := $(CCACHE) $(CXX)
 else
 $(info I ccache not found. Consider installing it for faster compilation.)
 endif # CCACHE
-endif # LLAMA_NO_CCACHE
+endif # GGML_NO_CCACHE
 
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@@ -545,7 +554,7 @@ endif # GGML_BLIS
 
 ifndef GGML_NO_LLAMAFILE
    MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-   OBJ_GGML += ggml/src/sgemm.o
+   OBJ_GGML += ggml/src/llamafile/sgemm.o
 endif
 
 ifdef GGML_RPC
@@ -826,7 +835,8 @@ OBJ_GGML += \
    ggml/src/ggml.o \
    ggml/src/ggml-alloc.o \
    ggml/src/ggml-backend.o \
-   ggml/src/ggml-quants.o
+   ggml/src/ggml-quants.o \
+   ggml/src/ggml-aarch64.o
 
 OBJ_LLAMA = \
    src/llama.o \
@@ -926,6 +936,7 @@ $(info - LLAMA_NO_LLAMAFILE)
 $(info - LLAMA_NO_ACCELERATE)
 $(info - LLAMA_NO_OPENMP)
 $(info - LLAMA_NO_METAL)
+$(info - LLAMA_NO_CCACHE)
 $(info )
 endif
 
@@ -959,15 +970,22 @@ ggml/src/ggml-quants.o: \
    ggml/src/ggml-common.h
    $(CC) $(CFLAGS) -c $< -o $@
 
+ggml/src/ggml-aarch64.o: \
+   ggml/src/ggml-aarch64.c \
+   ggml/include/ggml.h \
+   ggml/src/ggml-aarch64.h \
+   ggml/src/ggml-common.h
+   $(CC) $(CFLAGS) -c $< -o $@
+
 ggml/src/ggml-blas.o: \
    ggml/src/ggml-blas.cpp \
    ggml/include/ggml-blas.h
    $(CXX) $(CXXFLAGS) -c $< -o $@
 
 ifndef GGML_NO_LLAMAFILE
-ggml/src/sgemm.o: \
-   ggml/src/sgemm.cpp \
-   ggml/src/sgemm.h \
+ggml/src/llamafile/sgemm.o: \
+   ggml/src/llamafile/sgemm.cpp \
+   ggml/src/llamafile/sgemm.h \
    ggml/include/ggml.h
    $(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE
@@ -1092,7 +1110,7 @@ clean:
    rm -vrf ggml/src/ggml-cuda/template-instances/*.o
    rm -rvf $(BUILD_TARGETS)
    rm -rvf $(TEST_TARGETS)
-   rm -rvf $(LEGACY_TARGETS)
+   rm -rvf $(LEGACY_TARGETS_CLEAN)
    find examples pocs -type f -name "*.o" -delete
 
 #
@@ -1488,3 +1506,61 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
    $(OBJ_GGML)
    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
    $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
+#
+# Mark legacy binary targets as .PHONY so that they are always checked.
+.PHONY: main quantize perplexity embedding server finetune
+
+# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
+#       Eventually we will want to remove these targets from building all the time.
+main: examples/deprecation-warning/deprecation-warning.cpp
+   $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+   $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+   @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
+
+server: examples/deprecation-warning/deprecation-warning.cpp
+   $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+   $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+   @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
+
+quantize: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard quantize))
+   $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+   $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+   @echo "#########"
+   @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
+   @echo "         Remove the 'quantize' binary to remove this warning."
+ @echo "#########" +endif + +perplexity: examples/deprecation-warning/deprecation-warning.cpp +ifneq (,$(wildcard perplexity)) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + @echo "#########" + @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." + @echo " Remove the 'perplexity' binary to remove this warning." + @echo "#########" +endif + +embedding: examples/deprecation-warning/deprecation-warning.cpp +ifneq (,$(wildcard embedding)) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + @echo "#########" + @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." + @echo " Remove the 'embedding' binary to remove this warning." + @echo "#########" +endif + +finetune: examples/deprecation-warning/deprecation-warning.cpp +ifneq (,$(wildcard finetune)) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + @echo "#########" + @echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead." + @echo " Remove the 'finetune' binary to remove this warning." + @echo "#########" +endif diff --git a/Package.swift b/Package.swift index 77fed86df..d40a48385 100644 --- a/Package.swift +++ b/Package.swift @@ -10,6 +10,7 @@ var sources = [ "ggml/src/ggml-alloc.c", "ggml/src/ggml-backend.c", "ggml/src/ggml-quants.c", + "ggml/src/ggml-aarch64.c", ] var resources: [Resource] = [] diff --git a/README.md b/README.md index aba6a4c59..058919068 100644 --- a/README.md +++ b/README.md @@ -96,8 +96,9 @@ Typically finetunes of the base models below are supported as well. 
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
+- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
 
-(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
 
 **Multimodal models:**
 
@@ -452,7 +453,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 
 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
 - [Build on Android](./docs/android.md)
-- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
+- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
 
 **Seminal papers and background on the models**
diff --git a/common/common.cpp b/common/common.cpp
index ddb1e79ae..dac152c4f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "common.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
diff --git a/common/log.h b/common/log.h
index 09fa63c26..1bc5328ce 100644
--- a/common/log.h
+++ b/common/log.h
@@ -630,7 +630,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
     buf << "[ ";
 
     bool first = true;
-    for (const auto &token : tokens)
+    for (const auto & token : tokens)
     {
         if (!first) {
             buf << ", ";
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 9f332fe57..6a483c815 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -282,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
         GGML_ASSERT(!original_logits.empty());
     }
     llama_token id = 0;
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
 
     if (temp < 0.0) {
         // greedy sampling, with probs
@@ -324,6 +322,9 @@ static llama_token llama_sampling_sample_impl(
     }
 
     if (ctx_sampling->grammar != NULL && !is_resampling) {
+        // Get a pointer to the logits
+        float * logits = llama_get_logits_ith(ctx_main, idx);
+
         // Create an array with a single token data element for the sampled id
         llama_token_data single_token_data = {id, logits[id], 0.0f};
         llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
@@ -377,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     if (ctx_sampling->grammar != NULL && !apply_grammar) {
         GGML_ASSERT(original_logits != NULL);
         // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+        *original_logits = {logits, logits + n_vocab};
     }
 
     // apply params.logit_bias map
@@ -390,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
         llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
     }
 
-    cur.clear();
+    cur.resize(n_vocab);
 
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
     }
 
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 109135b68..61f8e370c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -265,7 +265,7 @@ class Model:
                     break
 
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-                data: np.ndarray = data  # type hint
+                data: np.ndarray  # type hint
                 n_dims = len(data.shape)
                 data_dtype = data.dtype
                 data_qtype: gguf.GGMLQuantizationType | None = None
@@ -602,10 +602,6 @@ class Model:
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
 
-        tokens: list[bytes] = []
-        scores: list[float] = []
-        toktypes: list[int] = []
-
        if not tokenizer_path.is_file():
            raise FileNotFoundError(f"File not found: {tokenizer_path}")
 
@@ -1363,7 +1359,7 @@ class LlamaModel(Model):
 
     def set_vocab(self):
         try:
-            self. _set_vocab_sentencepiece()
+            self._set_vocab_sentencepiece()
         except FileNotFoundError:
             try:
                 self._set_vocab_llama_hf()
@@ -2127,7 +2123,7 @@ class InternLM2Model(Model):
             logger.error(f'Error: Missing {tokenizer_path}')
             sys.exit(1)
 
-        sentencepiece_model = model.ModelProto()
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
         sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
         add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
 
@@ -2155,6 +2151,9 @@ class InternLM2Model(Model):
                 toktype = SentencePieceTokenTypes.UNUSED
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
+            # take care of unused raw token
+            if piece.startswith('[UNUSED'):
+                toktype = SentencePieceTokenTypes.UNKNOWN
 
             tokens.append(text)
             scores.append(score)
@@ -2170,6 +2169,47 @@ class InternLM2Model(Model):
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 
+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+ token = foken_data["content"] + if token == chat_eos_token: + chat_eos_token_id = token_id + token = token.encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: + assert(tokens[token_id] == token) + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) @@ -2179,28 +2219,16 @@ class InternLM2Model(Model): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) old_eos = special_vocab.special_token_ids["eos"] - if "chat" in os.path.basename(self.dir_model.absolute()): + if chat_eos_token_id is not None: # For the chat model, we replace the eos with '<|im_end|>'. # TODO: this is a hack, should be fixed # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 - special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) - logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ -in chat mode so that the conversation can end normally.") + special_vocab.special_token_ids["eos"] = chat_eos_token_id + logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" + " in chat mode so that the conversation can end normally.") special_vocab.add_to_gguf(self.gguf_writer) - def _try_get_sft_eos(self, tokenizer): - unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]') - im_end_list = tokenizer.Encode('<|im_end|>') - eos_token = None - assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) - if len(unused_145_list) == 1: - eos_token = unused_145_list[0] - if len(im_end_list) == 1: - eos_token = im_end_list[0] - assert eos_token - return eos_token - def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv @@ -2219,6 +2247,10 @@ in chat mode so that the conversation can end normally.") self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_file_type(self.ftype) + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] @@ -2979,16 +3011,16 @@ class T5Model(Model): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE + if sentencepiece_model.trainer_spec.model_type == 2: # BPE # assure the tokenizer model file name is correct assert tokenizer_path.name == 'tokenizer.model' return self._set_vocab_sentencepiece() else: - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + assert sentencepiece_model.trainer_spec.model_type == 1 # 
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
 
         add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
         remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
@@ -3159,7 +3191,7 @@ class JaisModel(Model):
             # but Jais's PyTorch model simply precalculates the slope values and places them
             # in relative_pes.slopes
             n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
-            first_val = float(data_torch._data[0])
+            first_val = float(data_torch[0].item())
             self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
 
         return tensors
@@ -3193,7 +3225,7 @@ class ChatGLMModel(Model):
     def set_vocab_chatglm3(self):
         dir_model = self.dir_model
         hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[bytes] = []
        toktypes: list[int] = []
        scores: list[float] = []
 
@@ -3342,7 +3374,7 @@ class ChatGLMModel(Model):
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.hparams.get("_name_or_path").split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
+        self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1])  # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
         n_head_kv = self.hparams.get("multi_query_group_num", n_head)
diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py
index 9349de3b3..95ea831a5 100755
--- a/convert_llama_ggml_to_gguf.py
+++ b/convert_llama_ggml_to_gguf.py
@@ -354,7 +354,8 @@ class GGMLToGGUF:
 
 
 def handle_metadata(cfg, hp):
-    import convert
+    import examples.convert_legacy_llama as convert
+
     assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
     hf_config_path = cfg.model_metadata_dir / "config.json"
     orig_config_path = cfg.model_metadata_dir / "params.json"
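The InternLM2 hunks above replace the old directory-name heuristic with a scan of `tokenizer_config.json` for the chat EOS token. A standalone sketch of that lookup (`find_chat_eos_id` is an illustrative helper, not a function in this PR):

```python
from __future__ import annotations

import json
from pathlib import Path


def find_chat_eos_id(dir_model: Path, chat_eos_token: str = '<|im_end|>') -> int | None:
    # Mirror the converter: look for the chat EOS token among the added tokens
    # declared in tokenizer_config.json instead of guessing from the model path.
    tokenizer_config_file = dir_model / 'tokenizer_config.json'
    if not tokenizer_config_file.is_file():
        return None
    with open(tokenizer_config_file, "r", encoding="utf-8") as f:
        added_tokens_decoder = json.load(f).get("added_tokens_decoder", {})
    for token_id, token_data in added_tokens_decoder.items():
        if token_data.get("content") == chat_eos_token:
            return int(token_id)
    return None
```
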
diff --git a/docs/build.md b/docs/build.md
index bf41bfdf9..d70f72f4c 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -28,6 +28,7 @@ In order to build llama.cpp you have four different options.
    ```
 
    - Notes:
+      - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
      - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
      - For faster repeated compilation, install [ccache](https://ccache.dev/).
      - For debug builds, run `make LLAMA_DEBUG=1`
@@ -41,6 +42,7 @@ In order to build llama.cpp you have four different options.
 
    **Notes**:
 
+    - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
    - For faster repeated compilation, install [ccache](https://ccache.dev/).
    - For debug builds, there are two cases:
diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py
index 721a57c00..c2c73e8ad 100755
--- a/examples/convert_legacy_llama.py
+++ b/examples/convert_legacy_llama.py
@@ -353,7 +353,7 @@ class Metadata:
     version: Optional[str] = None
     url: Optional[str] = None
     description: Optional[str] = None
-    licence: Optional[str] = None
+    license: Optional[str] = None
     source_url: Optional[str] = None
     source_hf_repo: Optional[str] = None
 
@@ -492,12 +492,13 @@ class LazyTensor:
 
 LazyModel: TypeAlias = 'dict[str, LazyTensor]'
 
+ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']
 
 @dataclass
 class ModelPlus:
     model: LazyModel
     paths: list[Path]  # Where this was read from.
-    format: Literal['ggml', 'torch', 'safetensors', 'none']
+    format: ModelFormat
     vocab: BaseVocab | None  # For GGML models (which have vocab built in), the vocab.
 
@@ -536,7 +537,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
 
 
 def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
-    formats = set(mp.format for mp in models_plus)
+    formats: set[ModelFormat] = set(mp.format for mp in models_plus)
     assert len(formats) == 1, "different formats?"
     format = formats.pop()
     paths = [path for mp in models_plus for path in mp.paths]
@@ -555,7 +556,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
     else:
         model = merge_sharded([mp.model for mp in models_plus])
 
-    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types
+    return ModelPlus(model, paths, format, vocab)
 
 
 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@@ -805,7 +806,7 @@ class OutputFile:
     def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
 
-    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
+    def add_meta_model(self, params: Params, metadata: Metadata | None) -> None:
         # Metadata About The Model And Its Provenence
         name = "LLaMA"
         if metadata is not None and metadata.name is not None:
@@ -827,8 +828,8 @@ class OutputFile:
             self.gguf.add_url(metadata.url)
         if metadata.description is not None:
             self.gguf.add_description(metadata.description)
-        if metadata.licence is not None:
-            self.gguf.add_licence(metadata.licence)
+        if metadata.license is not None:
+            self.gguf.add_licence(metadata.license)
         if metadata.source_url is not None:
             self.gguf.add_source_url(metadata.source_url)
         if metadata.source_hf_repo is not None:
@@ -943,7 +944,7 @@ class OutputFile:
     @staticmethod
     def write_vocab_only(
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
@@ -977,7 +978,7 @@ class OutputFile:
         fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
         concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
-        metadata: Metadata = None,
+        metadata: Metadata | None = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
@@ -1396,6 +1397,8 @@ def main(args_in: list[str] | None = None) -> None:
         if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
             vocab = model_plus.vocab
 
+    assert params is not None
+
     logger.info(f"Vocab info: {vocab}")
     logger.info(f"Special vocab info: {special_vocab}")
     model = model_plus.model
diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md
new file mode 100644
index 000000000..1e20feb4a
--- /dev/null
+++ b/examples/deprecation-warning/README.md
@@ -0,0 +1,51 @@
+# Migration notice for binary filenames
+
+> [!IMPORTANT]
+> [2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
+
+This migration was important, but it is a breaking change that may not always be immediately obvious to users.
+
+Please update all scripts and workflows to use the new binary names.
+
+| Old Filename | New Filename |
+| ---- | ---- |
+| main | llama-cli |
+| server | llama-server |
+| llama-bench | llama-bench |
+| embedding | llama-embedding |
+| finetune | llama-finetune |
+| quantize | llama-quantize |
+| tokenize | llama-tokenize |
+| export-lora | llama-export-lora |
+| libllava.a | libllava.a |
+| baby-llama | llama-baby-llama |
+| batched | llama-batched |
+| batched-bench | llama-batched-bench |
+| benchmark-matmult | llama-benchmark-matmult |
+| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
+| eval-callback | llama-eval-callback |
+| gbnf-validator | llama-gbnf-validator |
+| gguf | llama-gguf |
+| gguf-split | llama-gguf-split |
+| gritlm | llama-gritlm |
+| imatrix | llama-imatrix |
+| infill | llama-infill |
+| llava-cli | llama-llava-cli |
+| lookahead | llama-lookahead |
+| lookup | llama-lookup |
+| lookup-create | llama-lookup-create |
+| lookup-merge | llama-lookup-merge |
+| lookup-stats | llama-lookup-stats |
+| parallel | llama-parallel |
+| passkey | llama-passkey |
+| perplexity | llama-perplexity |
+| q8dot | llama-q8dot |
+| quantize-stats | llama-quantize-stats |
+| retrieval | llama-retrieval |
+| save-load-state | llama-save-load-state |
+| simple | llama-simple |
+| speculative | llama-speculative |
+| train-text-from-scratch | llama-train-text-from-scratch |
+| vdot | llama-vdot |
+| tests/test-c.o | tests/test-c.o |
+
diff --git a/examples/deprecation-warning/deprecation-warning.cpp b/examples/deprecation-warning/deprecation-warning.cpp
new file mode 100644
index 000000000..11b35d2c2
--- /dev/null
+++ b/examples/deprecation-warning/deprecation-warning.cpp
@@ -0,0 +1,35 @@
+// Warns users that this filename was deprecated, and provides a link for more information.
+
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+
+// Main
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    auto pos = filename.find_last_of('/');
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    // Append "llama-" to the beginning of filename to get the replacement filename
+    auto replacement_filename = "llama-" + filename;
+
+    // The exception is if the filename is "main", then our replacement filename is "llama-cli"
+    if (filename == "main") {
+        replacement_filename = "llama-cli";
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, "         Please use '%s' instead.\n", replacement_filename.c_str());
+    fprintf(stdout, "         See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
diff --git a/examples/finetune/convert_finetune_checkpoint_to_gguf.py b/examples/finetune/convert_finetune_checkpoint_to_gguf.py
index c89090918..1b79d6995 100644
--- a/examples/finetune/convert_finetune_checkpoint_to_gguf.py
+++ b/examples/finetune/convert_finetune_checkpoint_to_gguf.py
@@ -74,7 +74,7 @@ class Tensor:
             if len(self.ne) == 0:
                 self.nbytes = 0
             else:
-                self.nbytes = int(np.product(self.ne)) * 4
+                self.nbytes = int(np.prod(self.ne)) * 4
         else:
             raise ValueError(f"Unhandled data type '{self.dtype}'")
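`np.product` was a legacy alias that NumPy deprecated and then removed in NumPy 2.0, so the two checkpoint converters in this diff switch to `np.prod`. A quick sketch of the call being fixed (the shape and 4-byte f32 element size are illustrative):

```python
import numpy as np

ne = [4096, 32]                # tensor dimensions
nbytes = int(np.prod(ne)) * 4  # 4096 * 32 elements * 4 bytes = 524288
```
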
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 0e682154d..dc93d2301 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -204,21 +204,17 @@ int main(int argc, char ** argv) {
     GGML_ASSERT(llama_add_eos_token(model) != 1);
     LOG("add_bos: %d\n", add_bos);
 
-    bool suff_rm_leading_spc = params.escape;
-    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-        params.input_suffix.erase(0, 1);
-        suff_rm_leading_spc = false;
-    }
     std::vector<llama_token> embd_inp;
     std::vector<llama_token> embd_end;
     std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
     std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-    const int space_token = 29871;
-    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
-        inp_sfx.erase(inp_sfx.begin());
-    }
+
+    GGML_ASSERT(llama_token_prefix(model) >= 0);
+    GGML_ASSERT(llama_token_suffix(model) >= 0);
+
     inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
     inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+
     embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
     embd_end = params.spm_infill ? inp_pfx : inp_sfx;
     if (add_bos) {
@@ -516,19 +512,14 @@ int main(int argc, char ** argv) {
                 string_process_escapes(params.input_prefix);
                 string_process_escapes(params.input_suffix);
             }
-            suff_rm_leading_spc = params.escape;
-            if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                params.input_suffix.erase(0, 1);
-                suff_rm_leading_spc = false;
-            }
+
            // tokenize new prefix and suffix
            std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
            std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-            if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
-                inp_sfx.erase(inp_sfx.begin());
-            }
+
            inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
            inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+
            embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
            embd_end = params.spm_infill ? inp_pfx : inp_sfx;
            if (add_bos) {
diff --git a/examples/json_schema_pydantic_example.py b/examples/json_schema_pydantic_example.py
index c7ca7b8d9..19c0bdb5b 100644
--- a/examples/json_schema_pydantic_example.py
+++ b/examples/json_schema_pydantic_example.py
@@ -3,7 +3,7 @@
 #! pip install pydantic
 #! python json_schema_pydantic_example.py
 
-from pydantic import BaseModel, Extra, TypeAdapter
+from pydantic import BaseModel, Field, TypeAdapter
 from annotated_types import MinLen
 from typing import Annotated, List, Optional
 import json, requests
@@ -17,6 +17,9 @@ if True:
       The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
     '''
 
+    response_format = None
+    type_adapter = None
+
     if response_model:
         type_adapter = TypeAdapter(response_model)
         schema = type_adapter.json_schema()
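Several files in this diff (json_schema_to_grammar.py below, bench.py and the pydantic examples) add `from __future__ import annotations`. Under the future import, annotations are stored as strings rather than evaluated, so PEP 585/604 syntax parses even on Python 3.8. A minimal sketch:

```python
from __future__ import annotations


def to_rule(s: tuple[str, bool]) -> str:           # builtin generic (PEP 585), no typing.Tuple
    txt, is_literal = s
    return f'"{txt}"' if is_literal else txt


def make_rule(deps: list | None = None) -> list:   # union syntax (PEP 604) in an annotation
    return deps or []
```
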
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index 072a230f7..a8779bf3b 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+from __future__ import annotations
+
 import argparse
 import itertools
 import json
@@ -188,7 +190,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
         raise RuntimeError("At least one of min_value or max_value must be set")
 
 class BuiltinRule:
-    def __init__(self, content: str, deps: list = None):
+    def __init__(self, content: str, deps: list | None = None):
         self.content = content
         self.deps = deps or []
 
@@ -248,7 +250,7 @@ class SchemaConverter:
 
     def _format_literal(self, literal):
         escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
-            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
+            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal
         )
         return f'"{escaped}"'
 
@@ -403,11 +405,11 @@ class SchemaConverter:
         i = 0
         length = len(pattern)
 
-        def to_rule(s: Tuple[str, bool]) -> str:
+        def to_rule(s: tuple[str, bool]) -> str:
             (txt, is_literal) = s
             return "\"" + txt + "\"" if is_literal else txt
 
-        def transform() -> Tuple[str, bool]:
+        def transform() -> tuple[str, bool]:
             '''
                 Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
             '''
@@ -420,7 +422,7 @@ class SchemaConverter:
             # We only need a flat structure here to apply repetition operators to the last item, and
             # to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
             # (GBNF's syntax is luckily very close to regular expressions!)
-            seq: list[Tuple[str, bool]] = []
+            seq: list[tuple[str, bool]] = []
 
             def get_dot():
                 if self._dotall:
diff --git a/examples/llava/convert_image_encoder_to_gguf.py b/examples/llava/convert_image_encoder_to_gguf.py
index b00bf7c6d..36f6b92fb 100644
--- a/examples/llava/convert_image_encoder_to_gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
@@ -185,6 +185,8 @@ else:
     fout.add_description("two-tower CLIP model")
 
 if has_text_encoder:
+    assert t_hparams is not None
+    assert tokens is not None
     # text_model hparams
     fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
     fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
@@ -259,8 +261,8 @@ if has_vision_encoder:
 
 if processor is not None:
-    image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
-    image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean  # pyright: ignore[reportAttributeAccessIssue]
+    image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std  # pyright: ignore[reportAttributeAccessIssue]
 else:
     image_mean = args.image_mean if args.image_mean is not None else default_image_mean
     image_std = args.image_std if args.image_std is not None else default_image_std
@@ -272,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)
 
 if has_llava_projector:
-    model.vision_model.encoder.layers.pop(-1)
+    model.vision_model.encoder.layers.pop(-1)  # pyright: ignore[reportAttributeAccessIssue]
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
         name = get_tensor_name(name)
@@ -286,7 +288,7 @@ if has_llava_projector:
 
     print("Projector tensors added\n")
 
-state_dict = model.state_dict()
+state_dict = model.state_dict()  # pyright: ignore[reportAttributeAccessIssue]
 for name, data in state_dict.items():
     if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
         # we don't need this
diff --git a/examples/llava/llava_surgery_v2.py b/examples/llava/llava_surgery_v2.py
index eb56d6988..2d5b32fe6 100644
--- a/examples/llava/llava_surgery_v2.py
+++ b/examples/llava/llava_surgery_v2.py
@@ -2,7 +2,9 @@ import argparse
 import glob
 import os
 import torch
-from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
+from safetensors import safe_open
+from safetensors.torch import save_file
+from typing import Any, ContextManager, cast
 
 # Function to determine if file is a SafeTensor file
 def is_safetensor_file(file_path):
@@ -13,7 +15,7 @@ def is_safetensor_file(file_path):
 def load_model(file_path):
     if is_safetensor_file(file_path):
         tensors = {}
-        with safe_open(file_path, framework="pt", device="cpu") as f:
+        with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
             for key in f.keys():
                 tensors[key] = f.get_tensor(key).clone()
                 # output shape
@@ -134,7 +136,7 @@ if len(mm_tensors) == 0:
     if last_checkpoint is not None:
         for k, v in last_checkpoint.items():
             print(k)
-    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
+    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
     print("No tensors found. Is this a LLaVA model?")
     exit()
 
@@ -143,8 +145,10 @@ print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
 # projector = {name: checkpoint.[name].float() for name in mm_tensors}
 projector = {}
 for name in mm_tensors:
+    assert last_checkpoint is not None
     projector[name] = last_checkpoint[name].float()
 for name in first_mm_tensors:
+    assert first_checkpoint is not None
     projector[name] = first_checkpoint[name].float()
 
 if len(projector) > 0:
diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py
index f029c73a2..d8145710c 100644
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
@@ -6,10 +6,10 @@ import re
 from copy import copy
 from enum import Enum
 from inspect import getdoc, isclass
-from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin
 
 from docstring_parser import parse
-from pydantic import BaseModel, Field, create_model
+from pydantic import BaseModel, create_model
 
 if TYPE_CHECKING:
     from types import GenericAlias
@@ -17,6 +17,9 @@ else:
     # python 3.8 compat
     from typing import _GenericAlias as GenericAlias
 
+# TODO: fix this
+# pyright: reportAttributeAccessIssue=information
+
 
 class PydanticDataType(Enum):
     """
@@ -234,8 +237,9 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
 
     # Define the integer part rule
     integer_part_rule = (
-        "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + (
-            f"-min{min_digit}" if min_digit is not None else "")
+        "integer-part"
+        + (f"-max{max_digit}" if max_digit is not None else "")
+        + (f"-min{min_digit}" if min_digit is not None else "")
     )
 
     # Define the fractional part rule based on precision constraints
@@ -458,7 +462,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
     if not issubclass(model, BaseModel):
         # For non-Pydantic classes, generate model_fields from __annotations__ or __init__
         if hasattr(model, "__annotations__") and model.__annotations__:
-            model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()}
+            model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()}  # pyright: ignore[reportGeneralTypeIssues]
         else:
             init_signature = inspect.signature(model.__init__)
             parameters = init_signature.parameters
@@ -680,7 +684,7 @@ def generate_markdown_documentation(
         str: Generated text documentation.
""" documentation = "" - pyd_models = [(model, True) for model in pydantic_models] + pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models] for model, add_prefix in pyd_models: if add_prefix: documentation += f"{model_prefix}: {model.__name__}\n" @@ -700,7 +704,7 @@ def generate_markdown_documentation( # Indenting the fields section documentation += f" {fields_prefix}:\n" else: - documentation += f" Fields:\n" + documentation += f" Fields:\n" # noqa: F541 if isclass(model) and issubclass(model, BaseModel): for name, field_type in model.__annotations__.items(): # if name == "markdown_code_block": @@ -778,7 +782,7 @@ def generate_field_markdown( return field_text if field_description != "": - field_text += f" Description: " + field_description + "\n" + field_text += f" Description: {field_description}\n" # Check for and include field-specific examples if available if hasattr(model, "Config") and hasattr(model.Config, @@ -833,7 +837,7 @@ def generate_text_documentation( str: Generated text documentation. """ documentation = "" - pyd_models = [(model, True) for model in pydantic_models] + pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models] for model, add_prefix in pyd_models: if add_prefix: documentation += f"{model_prefix}: {model.__name__}\n" @@ -1164,7 +1168,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): dynamic_fields[param.name] = ( param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) # Creating the dynamic model - dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload] + dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) for name, param_doc in param_docs: dynamic_model.model_fields[name].description = param_doc.description @@ -1228,9 +1232,6 @@ def map_grammar_names_to_pydantic_model_class(pydantic_model_list): return output -from enum import Enum - - def json_schema_to_python_types(schema): type_map = { "any": Any, @@ -1275,7 +1276,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: if items != {}: array = {"properties": items} array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") - fields[field_name] = (List[array_type], ...) # type: ignore[valid-type] + fields[field_name] = (List[array_type], ...) else: fields[field_name] = (list, ...) elif field_type == "object": @@ -1285,7 +1286,8 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: required = field_data.get("enum", []) for key, field in fields.items(): if key not in required: - fields[key] = (Optional[fields[key][0]], ...) + optional_type = fields[key][0] + fields[key] = (Optional[optional_type], ...) else: field_type = json_schema_to_python_types(field_type) fields[field_name] = (field_type, ...) @@ -1305,6 +1307,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: required = dictionary.get("required", []) for key, field in fields.items(): if key not in required: - fields[key] = (Optional[fields[key][0]], ...) + optional_type = fields[key][0] + fields[key] = (Optional[optional_type], ...) 
diff --git a/examples/pydantic_models_to_grammar_examples.py b/examples/pydantic_models_to_grammar_examples.py
index 160966649..8e7f46cf9 100644
--- a/examples/pydantic_models_to_grammar_examples.py
+++ b/examples/pydantic_models_to_grammar_examples.py
@@ -1,6 +1,7 @@
 # Function calling example using pydantic models.
+from __future__ import annotations
+
 import datetime
-import importlib
 import json
 from enum import Enum
 from typing import Optional, Union
@@ -215,9 +216,9 @@ for call in json_data:
     if call["function"] == "Calculator":
         print(Calculator(**call["params"]).run())
     elif call["function"] == "get_current_datetime":
-        print(current_datetime_model(**call["params"]).run())
+        print(current_datetime_model(**call["params"]).run())  # pyright: ignore[reportAttributeAccessIssue]
     elif call["function"] == "get_current_weather":
-        print(current_weather_tool_model(**call["params"]).run())
+        print(current_weather_tool_model(**call["params"]).run())  # pyright: ignore[reportAttributeAccessIssue]
 # Should output something like this:
 # 2024-01-14 13:36:06
 # {"location": "London", "temperature": "42", "unit": "celsius"}
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 76e2052d5..1578c4afb 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -46,6 +46,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 5.33G, +0.0569 ppl @ Llama-3-8B",  },
     { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 6.14G, +0.0217 ppl @ Llama-3-8B",  },
     { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
+    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
+    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
+    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
     { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, +0.0020 ppl @ Mistral-7B",  },
     { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B",  },
     { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G              @ 7B",          },
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index 4fbbb2032..2daac0884 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import argparse
 import json
 import os
@@ -59,10 +61,11 @@ def main(args_in: list[str] | None = None) -> None:
             sys.exit(1)
 
     # start the benchmark
+    iterations = 0
+    data = {}
     try:
         start_benchmark(args)
 
-        iterations = 0
         with open("results.github.env", 'w') as github_env:
             # parse output
             with open('k6-results.json', 'r') as bench_results:
@@ -129,7 +132,7 @@ def main(args_in: list[str] | None = None) -> None:
                 timestamps, metric_values = zip(*values)
                 metric_values = [float(value) for value in metric_values]
                 prometheus_metrics[metric] = metric_values
-                timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
+                timestamps_dt = [str(datetime.fromtimestamp(int(ts))) for ts in timestamps]
                 plt.figure(figsize=(16, 10), dpi=80)
                 plt.plot(timestamps_dt, metric_values, label=metric)
                 plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
@@ -156,7 +159,7 @@ def main(args_in: list[str] | None = None) -> None:
                 plt.close()
 
                 # Mermaid format in case images upload failed
-                with (open(f"{metric}.mermaid", 'w') as mermaid_f):
+                with open(f"{metric}.mermaid", 'w') as mermaid_f:
                     mermaid = (
                         f"""---
 config:
@@ -278,7 +281,7 @@ def start_server_background(args):
     }
     server_process = subprocess.Popen(
         args,
-        **pkwargs)
+        **pkwargs)  # pyright: ignore[reportArgumentType, reportCallIssue]
 
     def server_log(in_stream, out_stream):
         for line in iter(in_stream.readline, b''):
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 47bea1591..8feff6702 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -884,7 +884,8 @@ struct server_context {
 
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot_params default_params;
-        llama_sampling_params default_sparams;
+        // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
+        llama_sampling_params default_sparams = params.sparams;
         auto & data = task.data;
 
         if (data.count("__oaicompat") != 0) {
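The steps.py changes below are the openai-python 0.x to 1.x migration forced by the requirements bump (`openai~=1.30.3`): `api_base` becomes `base_url`, `openai.Completion`/`openai.Model`/`openai.Embedding` become `openai.chat.completions`/`openai.models`/`openai.embeddings`, and responses are typed objects instead of dicts. A minimal sketch against a local llama-server (the URL and model name are placeholders):

```python
import openai

openai.api_key = "nope"                        # llama-server ignores the key by default
openai.base_url = "http://localhost:8080/v1/"  # 1.x uses base_url, with a trailing slash

completion = openai.chat.completions.create(
    model="test",                              # passed through to the server
    messages=[{"role": "user", "content": "Say hello"}],
    max_tokens=16,
)
print(completion.choices[0].message.content)   # attribute access, not dict indexing
```
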
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 7b5dabb01..df0814cc9 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1,5 +1,4 @@
 import asyncio
-import collections
 import json
 import os
 import re
@@ -8,19 +7,23 @@ import subprocess
 import sys
 import threading
 import time
+from collections.abc import Sequence
 from contextlib import closing
 from re import RegexFlag
+from typing import Any, Literal, cast
 
 import aiohttp
 import numpy as np
 import openai
-from behave import step
+from openai.types.chat import ChatCompletionChunk
+from behave import step  # pyright: ignore[reportAttributeAccessIssue]
 from behave.api.async_step import async_run_until_complete
 from prometheus_client import parser
 
+# pyright: reportRedeclaration=false
 
 @step("a server listening on {server_fqdn}:{server_port}")
-def step_server_config(context, server_fqdn, server_port):
+def step_server_config(context, server_fqdn: str, server_port: str):
     context.server_fqdn = server_fqdn
     context.server_port = int(server_port)
     context.n_threads = None
@@ -74,34 +77,34 @@ def step_server_config(context, server_fqdn, server_port):
 
 
 @step('a model file {hf_file} from HF repo {hf_repo}')
-def step_download_hf_model(context, hf_file, hf_repo):
+def step_download_hf_model(context, hf_file: str, hf_repo: str):
     context.model_hf_repo = hf_repo
     context.model_hf_file = hf_file
     context.model_file = os.path.basename(hf_file)
 
 
 @step('a model file {model_file}')
-def step_model_file(context, model_file):
+def step_model_file(context, model_file: str):
     context.model_file = model_file
 
 
 @step('a model url {model_url}')
-def step_model_url(context, model_url):
+def step_model_url(context, model_url: str):
     context.model_url = model_url
 
 
 @step('a model alias {model_alias}')
-def step_model_alias(context, model_alias):
+def step_model_alias(context, model_alias: str):
     context.model_alias = model_alias
 
 
 @step('{seed:d} as server seed')
-def step_seed(context, seed):
+def step_seed(context, seed: int):
     context.server_seed = seed
 
 
 @step('{ngl:d} GPU offloaded layers')
-def step_n_gpu_layer(context, ngl):
+def step_n_gpu_layer(context, ngl: int):
     if 'N_GPU_LAYERS' in os.environ:
         new_ngl = int(os.environ['N_GPU_LAYERS'])
         if context.debug:
@@ -111,37 +114,37 @@ def step_n_gpu_layer(context, ngl):
 
 
 @step('{n_threads:d} threads')
-def step_n_threads(context, n_threads):
+def step_n_threads(context, n_threads: int):
     context.n_thread = n_threads
 
 
 @step('{draft:d} as draft')
-def step_draft(context, draft):
+def step_draft(context, draft: int):
     context.draft = draft
 
 
 @step('{n_ctx:d} KV cache size')
-def step_n_ctx(context, n_ctx):
+def step_n_ctx(context, n_ctx: int):
     context.n_ctx = n_ctx
 
 
 @step('{n_slots:d} slots')
-def step_n_slots(context, n_slots):
+def step_n_slots(context, n_slots: int):
     context.n_slots = n_slots
 
 
 @step('{n_predict:d} server max tokens to predict')
-def step_server_n_predict(context, n_predict):
+def step_server_n_predict(context, n_predict: int):
     context.n_server_predict = n_predict
 
 
 @step('{slot_save_path} as slot save path')
-def step_slot_save_path(context, slot_save_path):
+def step_slot_save_path(context, slot_save_path: str):
     context.slot_save_path = slot_save_path
 
 
 @step('using slot id {id_slot:d}')
-def step_id_slot(context, id_slot):
+def step_id_slot(context, id_slot: int):
     context.id_slot = id_slot
 
 
@@ -191,7 +194,7 @@ def step_start_server(context):
 
 @step("the server is {expecting_status}")
 @async_run_until_complete
-async def step_wait_for_the_server_to_be_started(context, expecting_status):
+async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
     match expecting_status:
         case 'healthy':
             await wait_for_health_status(context, context.base_url, 200, 'ok',
@@ -221,7 +224,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
 
 @step('all slots are {expected_slot_status_string}')
 @async_run_until_complete
-async def step_all_slots_status(context, expected_slot_status_string):
+async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
     match expected_slot_status_string:
         case 'idle':
             expected_slot_status = 0
@@ -237,7 +240,7 @@ async def step_all_slots_status(context, expected_slot_status_string):
 
 @step('a completion request with {api_error} api error')
 @async_run_until_complete
-async def step_request_completion(context, api_error):
+async def step_request_completion(context, api_error: Literal['raised'] | str):
     expect_api_error = api_error == 'raised'
     seeds = await completions_seed(context, num_seeds=1)
     completion = await request_completion(context.prompts.pop(),
@@ -777,8 +780,8 @@ def step_assert_metric_value(context, metric_name, metric_value):
 def step_available_models(context):
     # openai client always expects an api_key
     openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
-    openai.api_base = f'{context.base_url}/v1'
-    context.models = openai.Model.list().data
+    openai.base_url = f'{context.base_url}/v1/'
+    context.models = openai.models.list().data
 
 
 @step('{n_model:d} models are supported')
@@ -789,7 +792,7 @@ def step_supported_models(context, n_model):
 
 
 @step('model {i_model:d} is {param} {preposition} {param_value}')
-def step_supported_models(context, i_model, param, preposition, param_value):
+def step_supported_models(context, i_model: int, param: Literal['identified', 'trained'] | str, preposition: str, param_value: str):
     assert i_model < len(context.models)
     model = context.models[i_model]
 
@@ -798,7 +801,7 @@ def step_supported_models(context, i_model, param, preposition, param_value):
         case 'identified':
             value = model.id
         case 'trained':
-            value = str(model.meta.n_ctx_train)
+            value = str(model.meta["n_ctx_train"])
         case _:
             assert False, "param {param} not supported"
 
     assert param_value == value, f"model param {param} {value} != {param_value}"
@@ -810,6 +813,7 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
         print(f"starting {context.n_prompts} concurrent completion requests...")
     assert context.n_prompts > 0
     seeds = await completions_seed(context)
+    assert seeds is not None
     for prompt_no in range(context.n_prompts):
         shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
         context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
@@ -861,7 +865,7 @@ async def request_completion(prompt,
                              id_slot=None,
                              expect_api_error=None,
                              user_api_key=None,
-                             temperature=None):
+                             temperature=None) -> int | dict[str, Any]:
     if debug:
         print(f"Sending completion request: {prompt}")
     origin = "my.super.domain"
@@ -899,8 +903,8 @@ async def request_completion(prompt,
 
 async def oai_chat_completions(user_prompt,
                                seed,
                                system_prompt,
-                               base_url,
-                               base_path,
+                               base_url: str,
+                               base_path: str,
                                async_client,
                                debug=False,
                                temperature=None,
@@ -909,7 +913,7 @@ async def oai_chat_completions(user_prompt,
                                enable_streaming=None,
                                response_format=None,
                                user_api_key=None,
-                               expect_api_error=None):
+                               expect_api_error=None) -> int | dict[str, Any]:
     if debug:
         print(f"Sending OAI Chat completions request: {user_prompt}")
     # openai client always expects an api key
@@ -989,32 +993,35 @@ async def oai_chat_completions(user_prompt,
     else:
         try:
             openai.api_key = user_api_key
-            openai.api_base = f'{base_url}{base_path}'
-            chat_completion = openai.Completion.create(
+            openai.base_url = f'{base_url}{base_path.removesuffix("chat")}'
+            assert model is not None
+            chat_completion = openai.chat.completions.create(
                 messages=payload['messages'],
                 model=model,
                 max_tokens=n_predict,
                 stream=enable_streaming,
-                response_format=payload.get('response_format'),
+                response_format=payload.get('response_format') or openai.NOT_GIVEN,
                 seed=seed,
                 temperature=payload['temperature']
             )
-        except openai.error.AuthenticationError as e:
+        except openai.AuthenticationError as e:
             if expect_api_error is not None and expect_api_error:
                 return 401
             else:
                 assert False, f'error raised: {e}'
 
         if enable_streaming:
+            chat_completion = cast(openai.Stream[ChatCompletionChunk], chat_completion)
             for chunk in chat_completion:
                 assert len(chunk.choices) == 1
                 delta = chunk.choices[0].delta
-                if 'content' in delta:
-                    completion_response['content'] += delta['content']
+                if delta.content is not None:
+                    completion_response['content'] += delta.content
                     completion_response['timings']['predicted_n'] += 1
                 completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
         else:
             assert len(chat_completion.choices) == 1
+            assert chat_completion.usage is not None
             completion_response = {
                 'content': chat_completion.choices[0].message.content,
                 'timings': {
@@ -1028,7 +1035,7 @@ async def oai_chat_completions(user_prompt,
     return completion_response
 
 
-async def request_embedding(content, seed, base_url=None):
+async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
     async with aiohttp.ClientSession() as session:
         async with session.post(f'{base_url}/embedding',
                                 json={
@@ -1041,7 +1048,7 @@ async def request_embedding(content, seed, base_url=None):
 
 
 async def request_oai_embeddings(input, seed,
                                  base_url=None, user_api_key=None,
-                                 model=None, async_client=False):
+                                 model=None, async_client=False) -> list[list[float]]:
     # openai client always expects an api_key
     user_api_key = user_api_key if user_api_key is not None else 'nope'
     if async_client:
@@ -1063,7 +1070,7 @@ async def request_oai_embeddings(input, seed,
             response_json = await response.json()
             assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
             assert response_json['object'] == 'list'
-            if isinstance(input, collections.abc.Sequence):
+            if isinstance(input, Sequence):
                 embeddings = []
                 for an_oai_embeddings in response_json['data']:
embeddings.append(an_oai_embeddings['embedding']) @@ -1072,19 +1079,14 @@ async def request_oai_embeddings(input, seed, return embeddings else: openai.api_key = user_api_key - openai.api_base = f'{base_url}/v1' - oai_embeddings = openai.Embedding.create( + openai.base_url = f'{base_url}/v1/' + assert model is not None + oai_embeddings = openai.embeddings.create( model=model, input=input, ) - if isinstance(input, collections.abc.Sequence): - embeddings = [] - for an_oai_embeddings in oai_embeddings.data: - embeddings.append(an_oai_embeddings.embedding) - else: - embeddings = [oai_embeddings.data.embedding] - return embeddings + return [e.embedding for e in oai_embeddings.data] def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): @@ -1122,7 +1124,7 @@ def assert_all_predictions_equal(completion_responses): if i == j: continue content_j = response_j['content'] - assert content_i == content_j, "contents not equal" + assert content_i == content_j, "contents not equal" def assert_all_predictions_different(completion_responses): @@ -1136,7 +1138,7 @@ def assert_all_predictions_different(completion_responses): if i == j: continue content_j = response_j['content'] - assert content_i != content_j, "contents not different" + assert content_i != content_j, "contents not different" def assert_all_token_probabilities_equal(completion_responses): @@ -1153,7 +1155,7 @@ def assert_all_token_probabilities_equal(completion_responses): if i == j: continue probs_j = response_j['completion_probabilities'][pos]['probs'] - assert probs_i == probs_j, "contents not equal" + assert probs_i == probs_j, "contents not equal" async def gather_tasks_results(context): @@ -1343,7 +1345,7 @@ def start_server_background(context): } context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], - **pkwargs) + **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] def server_log(in_stream, out_stream): for line in iter(in_stream.readline, b''): diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 2e4f42ad2..2c741ea10 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -1,6 +1,6 @@ aiohttp~=3.9.3 behave~=1.2.6 huggingface_hub~=0.20.3 -numpy~=1.24.4 -openai~=0.25.0 +numpy~=1.26.4 +openai~=1.30.3 prometheus-client~=0.20.0 diff --git a/examples/server_embd.py b/examples/server_embd.py index a9a36a44c..0e34c6cea 100644 --- a/examples/server_embd.py +++ b/examples/server_embd.py @@ -1,13 +1,15 @@ import asyncio +import asyncio.threads import requests import numpy as np + n = 8 result = [] async def requests_post_async(*args, **kwargs): - return await asyncio.to_thread(requests.post, *args, **kwargs) + return await asyncio.threads.to_thread(requests.post, *args, **kwargs) async def main(): model_url = "http://127.0.0.1:6900" diff --git a/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py b/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py index ed93673bc..e045beb72 100644 --- a/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py +++ b/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py @@ -66,7 +66,7 @@ class Tensor: if len(self.ne) == 0: self.nbytes = 0 else: - self.nbytes = int(np.product(self.ne)) * 4 + self.nbytes = int(np.prod(self.ne)) * 4 else: raise ValueError(f"Unhandled data type '{self.dtype}'") diff --git a/flake.lock b/flake.lock index 973ff4e56..5001fbf91 100644 --- 
a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1717285511, - "narHash": "sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw=", + "lastModified": 1719994518, + "narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8", + "rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1719506693, - "narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=", + "lastModified": 1720031269, + "narHash": "sha256-rwz8NJZV+387rnWpTYcXaRNvzUSnnF9aHONoJIYmiUQ=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a", + "rev": "9f4128e00b0ae8ec65918efeba59db998750ead6", "type": "github" }, "original": { @@ -36,14 +36,14 @@ }, "nixpkgs-lib": { "locked": { - "lastModified": 1717284937, - "narHash": "sha256-lIbdfCsf8LMFloheeE6N31+BMIeixqyQWbSr2vk79EQ=", + "lastModified": 1719876945, + "narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=", "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz" }, "original": { "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz" } }, "root": { diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 0d0d52d57..649ac3dcc 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -104,7 +104,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework" option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING "ggml: BLAS library vendor") -option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF) +option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF) option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) diff --git a/ggml/ggml_vk_generate_shaders.py b/ggml/ggml_vk_generate_shaders.py index 38914eedb..41d5d9b8c 100644 --- a/ggml/ggml_vk_generate_shaders.py +++ b/ggml/ggml_vk_generate_shaders.py @@ -99,6 +99,8 @@ async def main(): tasks = [] + base_dict = {"FLOAT_TYPE": "float"} + for fp16 in (False, True): # MUL_MAT matmul_shaders(tasks, fp16, False) @@ -106,8 +108,6 @@ async def main(): matmul_shaders(tasks, fp16, True) for tname in type_names: - base_dict = {"FLOAT_TYPE": "float"} - # mul mat vec data_a_key = f"DATA_A_{tname.upper()}" shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp" diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d895c9acd..1e3677537 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -383,6 +383,9 @@ extern "C" { GGML_TYPE_F64 = 28, GGML_TYPE_IQ1_M = 29, GGML_TYPE_BF16 = 30, + GGML_TYPE_Q4_0_4_4 = 31, + GGML_TYPE_Q4_0_4_8 = 32, + GGML_TYPE_Q4_0_8_8 = 33, GGML_TYPE_COUNT, }; @@ -424,6 +427,9 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors }; // available tensor operations: 
@@ -2406,6 +2412,12 @@ extern "C" { typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); + typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, + int64_t k, int64_t bx); + typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, + const void * GGML_RESTRICT y, int nr, int nc); + typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, + const void * GGML_RESTRICT y, int nr, int nc); typedef struct { const char * type_name; @@ -2418,6 +2430,11 @@ extern "C" { ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; int64_t nrows; // number of rows to process simultaneously; + int64_t ncols; // number of columns to process simultaneously; + int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize; + ggml_from_float_to_mat_t from_float_to_mat; + ggml_gemv_t gemv; + ggml_gemm_t gemm; } ggml_type_traits_t; GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index c6694df67..c5ee7e425 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -238,12 +238,12 @@ if (GGML_BLAS) endif() if (GGML_LLAMAFILE) - message(STATUS "Using ggml SGEMM") + message(STATUS "Using llamafile") add_compile_definitions(GGML_USE_LLAMAFILE) - set(GGML_HEADERS_LLAMAFILE sgemm.h) - set(GGML_SOURCES_LLAMAFILE sgemm.cpp) + set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h) + set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp) endif() if (GGML_CUDA) @@ -1153,6 +1153,7 @@ add_library(ggml ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} + ggml-aarch64.c ggml-aarch64.h ) if (EMSCRIPTEN) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c new file mode 100644 index 000000000..008718634 --- /dev/null +++ b/ggml/src/ggml-aarch64.c @@ -0,0 +1,2187 @@ +// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. 
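+//
+// This file adds the interleaved Q4_0 layouts (block_q4_0x4, block_q4_0x8)
+// plus quantization helpers and optimized GEMV/GEMM kernels for AArch64
+// (plain NEON, NEON+i8mm and SVE paths).
+//
+// As a worked example of the layout produced by make_block_q4_0x4 below,
+// with interleave_blcksize = 4 the 64 quant bytes of four source blocks
+// B0..B3 are emitted as:
+//
+//   B0[0..3] B1[0..3] B2[0..3] B3[0..3] B0[4..7] B1[4..7] ... B3[12..15]
+//
+// i.e. out.qs[i] is taken from block (i % 16) / 4 at byte offset
+// (i / 16) * 4 + (i % 4). The four deltas d[0..3] are stored first, and
+// every quant byte is XORed with 0x88 so each nibble moves from bias-offset
+// form to signed form before the kernels unpack it.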
+#define GGML_COMMON_IMPL_C +#include "ggml-common.h" + +#include "ggml-quants.h" +#include "ggml-impl.h" + +#include <math.h> +#include <string.h> +#include <assert.h> +#include <float.h> +#include <stdlib.h> // for qsort +#include <stdio.h> // for GGML_ASSERT + +#include "ggml-aarch64.h" + +#pragma GCC diagnostic ignored "-Woverlength-strings" + +#define UNUSED GGML_UNUSED + +// Functions to create the interleaved data layout formats + +// interleave 4 block_q4_0s in blocks of interleave_blcksize +// returns an interleaved block_q4_0x4 +// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks +// first, then interleave quants from 4 block_q4_0s in blocks of interleave_blcksize +// +// - in : an array of block_q4_0 pointers +// - interleave_blcksize : the block_q4_0 quants bytes are interleaved in blocks of +// interleave_blcksize bytes +// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes +// from bias offset form to pure sign form (this saves subtract +// operations during unpacking) +// +static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) { + block_q4_0x4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i].d; + } + + for (int i = 0; i < QK4_0 * 2; i++) { + int src_offset = (i / (4 * interleave_blcksize)) * interleave_blcksize; + int src_id = (i % (4 * interleave_blcksize)) / interleave_blcksize; + src_offset += (i % interleave_blcksize); + + out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; + } + + return out; +} + +// interleave 8 block_q4_0s in blocks of interleave_blcksize +// returns an interleaved block_q4_0x8 +// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks +// first, then interleave quants from 8 block_q4_0s in blocks of interleave_blcksize +static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) { + block_q4_0x8 out; + + for (int i = 0; i < 8; i++) { + out.d[i] = in[i].d; + } + + for (int i = 0; i < QK4_0 * 4; i++) { + int src_offset = (i / (8 * interleave_blcksize)) * interleave_blcksize; + int src_id = (i % (8 * interleave_blcksize)) / interleave_blcksize; + src_offset += (i % interleave_blcksize); + + out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; + } + + return out; +} + +void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0x4 * restrict y = (block_q8_0x4 *) vy; + +#if defined(__ARM_NEON) + float32x4_t srcv[4][8]; + float id[4]; + + for (int i = 0; i < nb; i++) { + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int row_iter = 0; row_iter < 4; row_iter++) { + for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); + + for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); + for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); + for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ?
1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + for (int j = 0; j < 8; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); + } + } +#else + // scalar + const int interleave_blcksize = 4; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize; + int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize; + src_offset += (j % interleave_blcksize); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0); + } + } +#endif +} + +void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0x4 * restrict y = (block_q8_0x4 *) vy; + +#if defined(__ARM_NEON) + float32x4_t srcv[4][8]; + float id[4]; + + for (int i = 0; i < nb; i++) { + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int row_iter = 0; row_iter < 4; row_iter++) { + for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); + + for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); + for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); + for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ?
1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + for (int j = 0; j < 4; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][2 * j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][2 * j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][2 * j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); + } + } +#else + // scalar + const int interleave_blcksize = 8; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 
1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize; + int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize; + src_offset += (j % interleave_blcksize); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0); + } + } +#endif +} + +void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t interleave_blcksize) { + assert(nrow == 4); + UNUSED(nrow); + if (interleave_blcksize == 4) quantize_q8_0_4x4(x, vy, n_per_row); + else if (interleave_blcksize == 8) quantize_q8_0_4x8(x, vy, n_per_row); + else assert(false); +} + +static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int interleave_blcksize) { + assert(n_per_row % QK4_0 == 0); + const int nb = n_per_row / QK4_0; + + void * out_ptr = NULL; + if (nrows_interleaved == 8) { + out_ptr = (block_q4_0x8 *) dst; + } + else if (nrows_interleaved == 4) { + out_ptr = (block_q4_0x4 *) dst; + } + assert(nrows_interleaved <= 8); + block_q4_0 dst_tmp[8]; + + for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { + + for (int64_t x = 0; x < nb; x++) { + + for (int i = 0; i < nrows_interleaved; i++ ) { + quantize_row_q4_0_reference(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0); + } + + if (nrows_interleaved == 8) { + *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, interleave_blcksize, 0x88); + out_ptr = (block_q4_0x8 *) out_ptr + 1; + } + else if (nrows_interleaved == 4) { + *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, interleave_blcksize, 0x88); + out_ptr = (block_q4_0x4 *) out_ptr + 1; + } + } + } + + return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0)); +} + +size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4); + } + else { + assert(false); + return 0; + } +} + +size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8); + } + else { + assert(false); + return 0; + } +} + +size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8); + } + else { + assert(false); + return 0; + } +} + +void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined(__ARM_FEATURE_SVE) + if (svcntw() == 8) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); + } +#endif +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && + "__ARM_NEON
and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); +#elif defined(__ARM_NEON) && defined(__aarch64__) + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + + __asm__ __volatile__( + "movi v31.16b, #0x4\n" + "movi v30.16b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x8\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "movi v29.16b, #0x0\n" + "mov x21, %x[nb]\n" + "2:" // Block loop + "ldr q28, [%x[b_ptr], #0x0]\n" + "ldr q27, [x22, #0x0]\n" + "movi v26.4s, #0x0\n" + "sub x20, x22, #0x2\n" + "ldr q25, [x22, #0x10]\n" + "ldr q24, [%x[b_ptr], #0x10]\n" + "sub x21, x21, #0x1\n" + "add x22, x22, #0x22\n" + "ldr q23, [%x[b_ptr], #0x20]\n" + "ldr q22, [%x[b_ptr], #0x30]\n" + "ld1r { v21.8h }, [x20]\n" + "ldr q20, [%x[b_ptr], #-0x8]\n" + "sshl v16.16b, v28.16b, v31.16b\n" + "and v28.16b, v28.16b, v30.16b\n" + "sshl v19.16b, v24.16b, v31.16b\n" + "and v24.16b, v24.16b, v30.16b\n" + "add %x[b_ptr], %x[b_ptr], #0x48\n" + "sshl v18.16b, v23.16b, v31.16b\n" + "and v23.16b, v23.16b, v30.16b\n" + ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n" + "sshl v17.16b, v22.16b, v31.16b\n" + "and v22.16b, v22.16b, v30.16b\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v16.4s, v20.4h\n" + ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n" + "fmul v16.4s, v16.4s, v21.4s\n" + ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n" + ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n" + ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n" + ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n" + ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n" + ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v29.4s, v26.4s, v16.4s\n" + "cbnz x21, 2b\n" + "sub %x[nc], %x[nc], #0x4\n" + "str q29, [%x[res_ptr], #0x0]\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) + : [a_ptr] "r" (a_ptr), [nb] "r" (nb) + : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22" + ); +#else + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +#endif +} + +void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + 
UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined(__ARM_FEATURE_SVE) + if (svcntw() == 8) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); + } +#endif +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + + __asm__ __volatile__( + "movi v2.16b, #0x4\n" + "movi v1.16b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x8\n" + "1:" // Column loop + "add x23, %x[a_ptr], #0x2\n" + "movi v0.16b, #0x0\n" + "mov x22, %x[nb]\n" + "2:" // Block loop + "ldr q31, [%x[b_ptr], #0x0]\n" + "ldr q30, [%x[b_ptr], #0x10]\n" + "mov x21, x23\n" + "movi v29.4s, #0x0\n" + "ldr q28, [%x[b_ptr], #0x20]\n" + "ldr q27, [%x[b_ptr], #0x30]\n" + "movi v26.4s, #0x0\n" + "sub x20, x23, #0x2\n" + "ld1r { v25.8h }, [x20]\n" + "ldr q24, [%x[b_ptr], #-0x8]\n" + "sub x22, x22, #0x1\n" + "add x23, x23, #0x22\n" + "ld1r { v23.2d }, [x21], #0x8\n" + "sshl v22.16b, v31.16b, v2.16b\n" + "sshl v16.16b, v30.16b, v2.16b\n" + "add %x[b_ptr], %x[b_ptr], #0x48\n" + "ld1r { v21.2d }, [x21], #0x8\n" + "sshl v20.16b, v28.16b, v2.16b\n" + "sshl v19.16b, v27.16b, v2.16b\n" + "ld1r { v18.2d }, [x21], #0x8\n" + "ld1r { v17.2d }, [x21], #0x8\n" + "and v31.16b, v31.16b, v1.16b\n" + "and v30.16b, v30.16b, v1.16b\n" + ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n" + ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n" + "and v28.16b, v28.16b, v1.16b\n" + "and v27.16b, v27.16b, v1.16b\n" + "fcvtl v25.4s, v25.4h\n" + "fcvtl v16.4s, v24.4h\n" + ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n" + ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n" + "fmul v16.4s, v16.4s, v25.4s\n" + ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n" + ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n" + ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n" + ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n" + "addp v29.4s, v29.4s, v26.4s\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v0.4s, v29.4s, v16.4s\n" + "cbnz x22, 2b\n" + "sub %x[nc], %x[nc], #0x4\n" + "str q0, [%x[res_ptr], #0x0]\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) + : [a_ptr] "r" (a_ptr), [nb] "r" (nb) + : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" + ); +#elif defined(__ARM_NEON) && defined(__aarch64__) + GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " + "performance"); +#else + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + 
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +#endif +} + +void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined(__ARM_FEATURE_SVE) + if (svcntw() == 8) { + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + + __asm__ __volatile__( + "ptrue p0.b\n" + "add %x[b_ptr], %x[b_ptr], #0x10\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "mov z31.b, #0x0\n" + "mov x21, %x[nb]\n" + "2:" // Block loop + "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" + "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" + "mov z28.s, #0x0\n" + "mov z27.s, #0x0\n" + "ld1rd { z26.d }, p0/Z, [x22]\n" + "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" + "sub x20, x22, #0x2\n" + "sub x21, x21, #0x1\n" + "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" + "ld1rd { z23.d }, p0/Z, [x22, #8]\n" + "lsl z22.b, z30.b, #0x4\n" + "lsl z16.b, z29.b, #0x4\n" + "and z30.b, z30.b, #0xf0\n" + "and z29.b, z29.b, #0xf0\n" + "ld1rd { z21.d }, p0/Z, [x22, #16]\n" + "ld1rd { z20.d }, p0/Z, [x22, #24]\n" + "lsl z19.b, z25.b, #0x4\n" + "and z25.b, z25.b, #0xf0\n" + "ld1rh { z17.h }, p0/Z, [x20]\n" + "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" + "sdot z28.s, z22.b, z26.b\n" + "sdot z27.s, z16.b, z26.b\n" + "lsl z16.b, z24.b, #0x4\n" + "add x22, x22, #0x22\n" + "and z24.b, z24.b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x90\n" + "fcvt z17.s, p0/m, z17.h\n" + "fcvt z18.s, p0/m, z18.h\n" + "sdot z28.s, z19.b, z23.b\n" + "sdot z27.s, z16.b, z23.b\n" + "fmul z18.s, z18.s, z17.s\n" + "sdot z28.s, z30.b, z21.b\n" + "sdot z27.s, z29.b, z21.b\n" + "sdot z28.s, z25.b, z20.b\n" + "sdot z27.s, z24.b, z20.b\n" + "uzp1 z17.s, z28.s, z27.s\n" + "uzp2 z16.s, z28.s, z27.s\n" + "add z17.s, z17.s, z16.s\n" + "asr z17.s, z17.s, #0x4\n" + "scvtf z17.s, p0/m, z17.s\n" + "fmla z31.s, p0/M, z17.s, z18.s\n" + "cbnz x21, 2b\n" + "sub %x[nc], %x[nc], #0x8\n" + "st1w { z31.s }, p0, [%x[res_ptr]]\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) + : [a_ptr] "r" (a_ptr), [nb] "r" (nb) + : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); + return; + } + else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " + "performance"); + } + else if (ggml_cpu_has_neon()) { + GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " + "quantization format for optimal performance"); + } +#endif +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + GGML_ASSERT(ggml_cpu_has_sve() && + "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance"); +#elif 
defined(__ARM_NEON) && defined(__aarch64__) + GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " + "performance"); +#else + float sumf[8]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +#endif +} + +void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (svcntw() == 8) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); + } +#endif +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && + "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); +#elif defined(__ARM_NEON) && defined(__aarch64__) + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); + + __asm__ __volatile__( + "mov x10, %x[nr]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[nb], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[nc]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "mov x24, %x[nb]\n" + "add x23, x25, x9\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v23.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v0.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v8.16b, #0x0\n" + "movi v1.16b, #0x0\n" + "3:" // Block loop + "ldr q3, [x28, #0x0]\n" + "ldr q31, [x25, #0x0]\n" + "movi v28.16b, #0x4\n" + "movi v10.4s, #0x0\n" + "ldr q22, [x28, #0x10]\n" + "ldr q6, [x25, #0x10]\n" + "movi v29.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "ldr q27, [x28, #0x20]\n" + "ldr q30, [x28, #0x30]\n" + "movi v20.4s, #0x0\n" + "movi v24.16b, #0xf0\n" + "ldr d2, [x25, #-0x8]\n" + "ldr d26, [x23, #-0x8]\n" + 
"sshl v12.16b, v3.16b, v28.16b\n" + "sub x20, x28, #0x8\n" + "ldr d17, [x20, #0x0]\n" + "and v3.16b, v3.16b, v24.16b\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" + ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" + ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n" + ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" + "sshl v31.16b, v22.16b, v28.16b\n" + "and v22.16b, v22.16b, v24.16b\n" + "fcvtl v17.4s, v17.4h\n" + "fcvtl v2.4s, v2.4h\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" + ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" + ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" + ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" + "sshl v6.16b, v27.16b, v28.16b\n" + "sshl v28.16b, v30.16b, v28.16b\n" + "and v27.16b, v27.16b, v24.16b\n" + "and v30.16b, v30.16b, v24.16b\n" + "ldr q24, [x25, #0x20]\n" + ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x30]\n" + ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" + ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" + ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" + ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x40]\n" + ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x50]\n" + ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" + ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" + ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" + ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x60]\n" + ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" + ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" + ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" + ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" + "fmul v24.4s, v17.4s, v2.s[0]\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v15.4s, v10.4s, v24.4s\n" + "ldr q24, [x23, #0x0]\n" + "fmul v10.4s, v17.4s, v2.s[1]\n" + "fmla v19.4s, v29.4s, v10.4s\n" + "ldr q10, [x23, #0x10]\n" + "fmul v29.4s, v17.4s, v2.s[2]\n" + "fmul v2.4s, v17.4s, v2.s[3]\n" + "fmla v18.4s, v9.4s, v29.4s\n" + "movi v9.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n" + "fmla v14.4s, v20.4s, v2.4s\n" + "movi v20.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x20]\n" + ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n" + ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n" + ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n" + ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n" + "ldr q10, 
[x23, #0x30]\n" + ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x40]\n" + ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n" + ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n" + ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n" + ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x50]\n" + ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x60]\n" + ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n" + ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n" + ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x0]\n" + ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n" + ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n" + ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n" + ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n" + "fmul v10.4s, v17.4s, v26.s[0]\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v11.4s, v9.4s, v10.4s\n" + "ldr q9, [x22, #0x10]\n" + "fmul v10.4s, v17.4s, v26.s[1]\n" + "fmla v13.4s, v29.4s, v10.4s\n" + "ldr d29, [x22, #-0x8]\n" + "fmul v10.4s, v17.4s, v26.s[2]\n" + "fmul v26.4s, v17.4s, v26.s[3]\n" + "fcvtl v29.4s, v29.4h\n" + "fmla v23.4s, v20.4s, v10.4s\n" + "movi v20.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v16.4s, v2.4s, v26.4s\n" + "movi v26.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x20]\n" + ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x30]\n" + ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x40]\n" + ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x50]\n" + ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x60]\n" + ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eada // sdot 
v26.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x21, #0x0]\n" + ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n" + ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n" + ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n" + "fmul v9.4s, v17.4s, v29.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v25.4s, v20.4s, v9.4s\n" + "ldr q9, [x21, #0x10]\n" + "fmul v20.4s, v17.4s, v29.s[1]\n" + "fmla v7.4s, v10.4s, v20.4s\n" + "ldr d20, [x21, #-0x8]\n" + "fmul v10.4s, v17.4s, v29.s[2]\n" + "fmul v29.4s, v17.4s, v29.s[3]\n" + "fcvtl v20.4s, v20.4h\n" + "fmla v0.4s, v26.4s, v10.4s\n" + "movi v26.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v4.4s, v2.4s, v29.4s\n" + "movi v2.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n" + "ldr q12, [x21, #0x20]\n" + "fmul v24.4s, v17.4s, v20.s[0]\n" + ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x30]\n" + "fmul v31.4s, v17.4s, v20.s[1]\n" + ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n" + ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n" + ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n" + ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x40]\n" + "fmul v6.4s, v17.4s, v20.s[2]\n" + "fmul v20.4s, v17.4s, v20.s[3]\n" + ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x50]\n" + ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n" + ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n" + ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n" + ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x60]\n" + ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n" + "ldr q17, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n" + ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n" + ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n" + ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n" + ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n" + ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n" + ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n" + ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "fmla v5.4s, v26.4s, v24.4s\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v21.4s, v10.4s, 
v31.4s\n" + "fmla v8.4s, v2.4s, v6.4s\n" + "fmla v1.4s, v29.4s, v20.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q16, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q0, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q21, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q8, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q1, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[nc]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[nb]\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q7, [x24, #0x0]\n" + "ldr q5, [x25, #0x0]\n" + "movi v9.16b, #0x4\n" + "movi v4.4s, #0x0\n" + "ldr q3, [x24, #0x10]\n" + "ldr q2, [x25, #0x10]\n" + "movi v1.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q13, [x24, #0x20]\n" + "ldr q31, [x25, #0x20]\n" + "movi v30.4s, #0x0\n" + "movi v29.16b, #0xf0\n" + "ldr q28, [x24, #0x30]\n" + "ldr q27, [x25, #0x30]\n" + "sshl v20.16b, v7.16b, v9.16b\n" + "sub x20, x24, #0x8\n" + "ldr q26, [x25, #0x40]\n" + "ldr q25, [x25, #0x50]\n" + "sshl v17.16b, v3.16b, v9.16b\n" + "and v7.16b, v7.16b, v29.16b\n" + "ldr q24, [x25, #0x60]\n" + "ldr q16, [x25, #0x70]\n" + "sshl v22.16b, v13.16b, v9.16b\n" + "and v3.16b, v3.16b, v29.16b\n" + "ldr d21, [x20, #0x0]\n" + "ldr d12, [x25, #-0x8]\n" + ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n" + ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n" + ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n" + "sshl v9.16b, v28.16b, v9.16b\n" + "subs x21, x21, #0x1\n" + "and v13.16b, v13.16b, v29.16b\n" + "and v28.16b, v28.16b, v29.16b\n" + "add x25, x25, #0x88\n" + "add x24, x24, #0x48\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v12.4s, v12.4h\n" + ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n" + ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n" + ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n" + "fmul v11.4s, v21.4s, v12.s[0]\n" + "fmul v23.4s, v21.4s, v12.s[1]\n" + "fmul v17.4s, v21.4s, v12.s[2]\n" + ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n" + "fmul v6.4s, v21.4s, v12.s[3]\n" + ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n" + ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n" + ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n" + ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n" + ".inst 0x4fbbe121 // 
sdot v1.4s, v9.16b, v27.4b[1]\n" + ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n" + ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n" + ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n" + ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n" + ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n" + ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n" + ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n" + ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n" + ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n" + ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n" + ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n" + ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n" + ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n" + ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n" + ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n" + ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n" + ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n" + ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n" + "scvtf v4.4s, v4.4s, #0x4\n" + "scvtf v1.4s, v1.4s, #0x4\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "fmla v15.4s, v4.4s, v11.4s\n" + "scvtf v30.4s, v30.4s, #0x4\n" + "fmla v19.4s, v1.4s, v23.4s\n" + "fmla v18.4s, v0.4s, v17.4s\n" + "fmla v14.4s, v30.4s, v6.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q14, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +#else + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * 
ncols_interleaved + j] = sumf[m][j]; + } + } + } +#endif +} + +void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (svcntw() == 8) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); + } +#endif +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); + + __asm__ __volatile__( + "mov x10, %x[nr]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[nb], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[nc]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "mov x24, %x[nb]\n" + "add x23, x25, x9\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v6.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "3:" // Block loop + "ldr q21, [x28, #0x0]\n" + "ldr q16, [x28, #0x10]\n" + "movi v1.16b, #0x4\n" + "movi v19.4s, #0x0\n" + "ldr q27, [x25, #0x0]\n" + "ldr q15, [x25, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "ldr q29, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" + "movi v17.4s, #0x0\n" + "movi v0.16b, #0xf0\n" + "ldr d20, [x25, #-0x8]\n" + "ldr d9, [x23, #-0x8]\n" + "sshl v8.16b, v21.16b, v1.16b\n" + "sshl v31.16b, v16.16b, v1.16b\n" + "and v21.16b, v21.16b, v0.16b\n" + "and v16.16b, v16.16b, v0.16b\n" + "sub x20, x28, #0x8\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" + ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" + "ldr q27, [x25, #0x20]\n" + ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" + ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" + "sshl v15.16b, v29.16b, v1.16b\n" + "sshl v1.16b, v3.16b, v1.16b\n" + "and v29.16b, v29.16b, v0.16b\n" + "and v3.16b, v3.16b, v0.16b\n" + "ldr q0, [x25, #0x30]\n" + "fcvtl v20.4s, v20.4h\n" + ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n" + "fcvtl v9.4s, v9.4h\n" + ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" + "ldr q27, [x25, #0x40]\n" + ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n" + ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" + "ldr q0, [x25, #0x50]\n" + ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" + ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" + "ldr q27, [x25, #0x60]\n" + ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" + ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" + "ldr q0, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" + ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" + 
"ldr d27, [x20, #0x0]\n" + ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" + ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" + "fcvtl v27.4s, v27.4h\n" + "uzp1 v0.2d, v19.2d, v26.2d\n" + "uzp2 v26.2d, v19.2d, v26.2d\n" + "fmul v19.4s, v27.4s, v20.s[0]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v2.4s, v0.4s, v19.4s\n" + "ldr q19, [x23, #0x0]\n" + "uzp1 v0.2d, v18.2d, v17.2d\n" + "uzp2 v18.2d, v18.2d, v17.2d\n" + "fmul v17.4s, v27.4s, v20.s[1]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v10.4s, v26.4s, v17.4s\n" + "ldr q17, [x23, #0x10]\n" + "fmul v26.4s, v27.4s, v20.s[2]\n" + "fmul v20.4s, v27.4s, v20.s[3]\n" + "fmla v12.4s, v0.4s, v26.4s\n" + "ldr d0, [x22, #-0x8]\n" + "ldr d26, [x21, #-0x8]\n" + "fcvtl v0.4s, v0.4h\n" + "fmla v28.4s, v18.4s, v20.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x23, #0x20]\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x23, #0x40]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q19, [x23, #0x60]\n" + ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" + ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" + "uzp1 v19.2d, v20.2d, v18.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp2 v20.2d, v20.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v9.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v11.4s, v19.4s, v18.4s\n" + "ldr q18, [x22, #0x0]\n" + "fmul v19.4s, v27.4s, v9.s[1]\n" + "fmla v13.4s, v20.4s, v19.4s\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" + ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" + "ldr q17, [x23, #0x30]\n" + ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" + "ldr q17, [x23, #0x50]\n" + ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" + "ldr q17, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v9.s[2]\n" + "fmul v9.4s, v27.4s, v9.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v22.4s, v17.4s, v19.4s\n" + "ldr q17, [x22, #0x10]\n" + "movi v19.4s, #0x0\n" + ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" + "fmla v23.4s, v20.4s, v9.4s\n" + "movi v20.4s, #0x0\n" + "movi v9.4s, #0x0\n" + ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" + "ldr q18, [x22, #0x20]\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" + ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n" + "ldr q18, [x22, #0x40]\n" + ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" + ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" + "ldr q18, [x22, #0x60]\n" + ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" + ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" + "ldr q17, [x22, #0x30]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" + "ldr q17, 
[x22, #0x50]\n" + ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" + "ldr q17, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v0.s[0]\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v25.4s, v17.4s, v19.4s\n" + "ldr q19, [x21, #0x0]\n" + "fmul v17.4s, v27.4s, v0.s[1]\n" + "fmla v5.4s, v20.4s, v17.4s\n" + "ldr q17, [x21, #0x10]\n" + "uzp1 v20.2d, v9.2d, v18.2d\n" + "uzp2 v9.2d, v9.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v0.s[2]\n" + "fmul v0.4s, v27.4s, v0.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "fmla v7.4s, v20.4s, v18.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x21, #0x20]\n" + "fmla v4.4s, v9.4s, v0.4s\n" + "movi v9.4s, #0x0\n" + "movi v0.4s, #0x0\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + "fmul v8.4s, v27.4s, v26.s[0]\n" + ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" + "ldr q17, [x21, #0x30]\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + "fmul v31.4s, v27.4s, v26.s[1]\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x21, #0x40]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + "fmul v15.4s, v27.4s, v26.s[2]\n" + "fmul v27.4s, v27.4s, v26.s[3]\n" + ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" + "ldr q1, [x21, #0x50]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q26, [x21, #0x60]\n" + ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" + ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" + "ldr q21, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" + ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" + ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" + ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" + "uzp1 v29.2d, v20.2d, v18.2d\n" + "uzp2 v21.2d, v20.2d, v18.2d\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "uzp1 v18.2d, v9.2d, v0.2d\n" + "uzp2 v16.2d, v9.2d, v0.2d\n" + "scvtf v21.4s, v21.4s, #0x4\n" + "fmla v6.4s, v29.4s, v8.4s\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v30.4s, v21.4s, v31.4s\n" + "fmla v24.4s, v18.4s, v15.4s\n" + "fmla v14.4s, v16.4s, v27.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q28, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q22, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q6, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q30, [x20, #0x0]\n" + "add x20, 
x20, %x[res_stride]\n" + "str q24, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[nc]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[nb]\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q6, [x24, #0x0]\n" + "ldr q5, [x24, #0x10]\n" + "movi v17.16b, #0x4\n" + "movi v8.4s, #0x0\n" + "ldr q4, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "movi v27.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q31, [x24, #0x20]\n" + "ldr q14, [x24, #0x30]\n" + "movi v29.4s, #0x0\n" + "movi v22.16b, #0xf0\n" + "ldr q11, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "sshl v21.16b, v6.16b, v17.16b\n" + "sshl v16.16b, v5.16b, v17.16b\n" + "ldr q20, [x25, #0x40]\n" + "ldr q26, [x25, #0x50]\n" + "and v6.16b, v6.16b, v22.16b\n" + "and v5.16b, v5.16b, v22.16b\n" + "ldr q25, [x25, #0x60]\n" + "ldr q3, [x25, #0x70]\n" + "sshl v19.16b, v31.16b, v17.16b\n" + "sshl v18.16b, v14.16b, v17.16b\n" + "ldr d17, [x25, #-0x8]\n" + ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" + ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" + "and v31.16b, v31.16b, v22.16b\n" + ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" + ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" + "and v14.16b, v14.16b, v22.16b\n" + "sub x20, x24, #0x8\n" + "ldr d16, [x20, #0x0]\n" + "subs x21, x21, #0x1\n" + "add x25, x25, #0x88\n" + "fcvtl v17.4s, v17.4h\n" + "add x24, x24, #0x48\n" + ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" + ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" + ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" + ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" + "fcvtl v16.4s, v16.4h\n" + ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" + ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" + "fmul v23.4s, v16.4s, v17.s[0]\n" + "fmul v21.4s, v16.4s, v17.s[1]\n" + "fmul v1.4s, v16.4s, v17.s[2]\n" + "fmul v20.4s, v16.4s, v17.s[3]\n" + ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" + ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" + ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" + ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" + ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" + ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" + "uzp1 v19.2d, v8.2d, v27.2d\n" + "uzp2 v18.2d, v8.2d, v27.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp1 v17.2d, v0.2d, v29.2d\n" + "uzp2 v16.2d, v0.2d, v29.2d\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v2.4s, v19.4s, v23.4s\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v10.4s, v18.4s, v21.4s\n" + "fmla v12.4s, v17.4s, v1.4s\n" + "fmla v28.4s, v16.4s, v20.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q28, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + 
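// Row tail (labels 5-8): once fewer than 16 rows remain they are handled
+ // four at a time; the "cmp x10, #0x1/#0x2/#0x3" guards above store only as
+ // many result rows as are actually left, and the code below advances a_ptr
+ // by one 4-row strip per pass (x9 = nb * 0x88 bytes, 0x88 being
+ // sizeof(block_q8_0x4)).
+ 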
"subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +#elif defined(__ARM_NEON) && defined(__aarch64__) + GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " + "performance"); +#else + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } +#endif +} + +void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (svcntw() == 8) { + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); + + __asm__ __volatile__( + "mov x20, #0x4\n" + "mov x13, %x[nr]\n" + "mov z28.s, #-0x4\n" + "mov x12, #0x88\n" + "ptrue p1.b\n" + "whilelt p0.s, XZR, x20\n" + "cmp x13, #0x10\n" + "mul x12, %x[nb], x12\n" + "blt 4f\n" + "1:" // Row loop + "add x11, %x[b_ptr], #0x10\n" + "mov x10, %x[nc]\n" + "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x28, %x[a_ptr], #0x8\n" + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov x27, %x[nb]\n" + "add x26, x28, x12\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "add x25, x26, x12\n" + "mov z13.b, #0x0\n" + "mov z1.b, #0x0\n" + "add x24, x25, x12\n" + "mov z20.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z8.b, #0x0\n" + "mov z29.b, 
#0x0\n" + "mov z27.b, #0x0\n" + "mov z10.b, #0x0\n" + "3:" // Block loop + "ld1b { z30.b }, p1/Z, [x11]\n" + "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" + "mov z18.s, #0x0\n" + "mov z7.s, #0x0\n" + "ld1rqb { z3.b }, p1/Z, [x28]\n" + "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" + "mov z9.s, #0x0\n" + "mov z22.s, #0x0\n" + "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" + "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" + "sub x20, x11, #0x10\n" + "sub x23, x28, #0x8\n" + "lsl z31.b, z30.b, #0x4\n" + "lsl z6.b, z21.b, #0x4\n" + "ld1h { z23.s }, p1/Z, [x20]\n" + "sub x22, x26, #0x8\n" + "and z30.b, z30.b, #0xf0\n" + "and z21.b, z21.b, #0xf0\n" + "sub x21, x25, #0x8\n" + "sub x20, x24, #0x8\n" + "lsl z14.b, z4.b, #0x4\n" + "lsl z2.b, z17.b, #0x4\n" + "subs x27, x27, #0x1\n" + "add x11, x11, #0x90\n" + ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" + ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" + "and z4.b, z4.b, #0xf0\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" + "and z17.b, z17.b, #0xf0\n" + "fcvt z23.s, p1/m, z23.h\n" + ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" + ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" + "fscale z23.s, p1/m, z23.s, z28.s\n" + ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" + ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" + "add x28, x28, #0x88\n" + ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" + ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" + "ld1h { z3.s }, p0/Z, [x23]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "fcvt z3.s, p1/m, z3.h\n" + "uzp1 z5.d, z18.d, z7.d\n" + "uzp2 z18.d, z18.d, z7.d\n" + "mov z3.q, z3.q[0]\n" + "uzp1 z7.d, z9.d, z22.d\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z3.s[0]\n" + "scvtf z5.s, p1/m, z5.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "scvtf z7.s, p1/m, z7.s\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z24.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z5.b }, p1/Z, [x26]\n" + "fmul z9.s, z23.s, z3.s[1]\n" + "fmla z15.s, p1/M, z18.s, z9.s\n" + "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" + "fmul z9.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "fmla z12.s, p1/M, z7.s, z9.s\n" + "mov z9.s, #0x0\n" + "ld1h { z7.s }, p0/Z, [x22]\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + "fmla z0.s, p1/M, z22.s, z3.s\n" + "mov z22.s, #0x0\n" + "ld1h { z3.s }, p0/Z, [x21]\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" + "fcvt z7.s, p1/m, z7.h\n" + "fcvt z3.s, p1/m, z3.h\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" + "mov z7.q, z7.q[0]\n" + "mov z3.q, z3.q[0]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "uzp1 z5.d, z9.d, z22.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z7.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z13.s, 
p1/M, z5.s, z9.s\n" + "ld1rqb { z9.b }, p1/Z, [x25]\n" + "fmul z5.s, z23.s, z7.s[1]\n" + "fmla z1.s, p1/M, z22.s, z5.s\n" + "mov z5.s, #0x0\n" + "mov z22.s, #0x0\n" + ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" + ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" + ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" + ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" + ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" + ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" + "add x26, x26, #0x88\n" + ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" + ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" + "uzp1 z18.d, z5.d, z22.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z22.d, z5.d, z22.d\n" + "fmul z5.s, z23.s, z7.s[2]\n" + "fmul z7.s, z23.s, z7.s[3]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z20.s, p1/M, z18.s, z5.s\n" + "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" + "ld1h { z5.s }, p0/Z, [x20]\n" + "fcvt z5.s, p1/m, z5.h\n" + "fmla z25.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" + "mov z5.q, z5.q[0]\n" + ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" + ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" + ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" + ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" + ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" + "uzp1 z9.d, z22.d, z7.d\n" + "scvtf z9.s, p1/m, z9.s\n" + "uzp2 z22.d, z22.d, z7.d\n" + "fmul z7.s, z23.s, z3.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z11.s, p1/M, z9.s, z7.s\n" + "ld1rqb { z9.b }, p1/Z, [x24]\n" + "fmul z7.s, z23.s, z3.s[1]\n" + "fmla z16.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" + ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" + ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" + ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" + ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" + "add x25, x25, #0x88\n" + ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" + ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" + "uzp1 z18.d, z22.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z7.d, z22.d, z7.d\n" + "fmul z22.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "scvtf z7.s, p1/m, z7.s\n" + "fmla z19.s, p1/M, z18.s, z22.s\n" + "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" + "fmul z22.s, z23.s, z5.s[0]\n" + "fmla z26.s, p1/M, z7.s, z3.s\n" + "mov z3.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" + ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "mov z9.s, #0x0\n" + ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" + "mov z31.s, #0x0\n" + ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" + "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" + ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" + "fmul z14.s, z23.s, z5.s[1]\n" + ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" + "ld1rqb { z6.b }, 
p1/Z, [x24, #80]\n" + "fmul z2.s, z23.s, z5.s[2]\n" + "fmul z23.s, z23.s, z5.s[3]\n" + ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" + ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" + ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" + "add x24, x24, #0x88\n" + ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" + ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" + ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" + ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" + "uzp1 z18.d, z3.d, z7.d\n" + "uzp2 z5.d, z3.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp1 z6.d, z9.d, z31.d\n" + "uzp2 z9.d, z9.d, z31.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "fmla z8.s, p1/M, z18.s, z22.s\n" + "scvtf z6.s, p1/m, z6.s\n" + "scvtf z9.s, p1/m, z9.s\n" + "fmla z29.s, p1/M, z5.s, z14.s\n" + "fmla z27.s, p1/M, z6.s, z2.s\n" + "fmla z10.s, p1/M, z9.s, z23.s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x10, x10, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z0.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z13.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z1.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z20.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z25.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z11.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z16.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z19.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z26.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z8.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z29.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z27.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z10.s }, p1, [x20]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x13, x13, #0x10\n" + "cmp x13, #0x10\n" + "mov %x[res_ptr], x9\n" + "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x13, 9f\n" + "5:" // Row tail: Row loop + "add x25, %x[b_ptr], #0x10\n" + "mov x24, %x[nc]\n" + "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "add x28, %x[a_ptr], #0x8\n" + "mov x22, %x[nb]\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "7:" // Row tail: Block loop + "ld1b { z3.b }, p1/Z, [x25]\n" + "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" + "mov z2.s, #0x0\n" + "mov z25.s, #0x0\n" + "ld1rqb { z26.b }, p1/Z, [x28]\n" + "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" + "mov z27.s, #0x0\n" + "mov z19.s, #0x0\n" + "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" + "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" + "sub x21, x25, #0x10\n" + "sub x20, x28, #0x8\n" + "lsl z20.b, z3.b, #0x4\n" + "lsl z4.b, z6.b, #0x4\n" + "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" + "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" + "and z3.b, z3.b, #0xf0\n" + "and z6.b, z6.b, #0xf0\n" + "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" + "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" + "lsl z8.b, z29.b, #0x4\n" + "lsl z14.b, z16.b, #0x4\n" + "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" + "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" + ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" + ".inst 0x45049b59 // smmla z25.s, z26.b, 
z4.b\n" + "and z29.b, z29.b, #0xf0\n" + "ld1h { z17.s }, p1/Z, [x21]\n" + ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" + ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" + "and z16.b, z16.b, #0xf0\n" + "ld1h { z4.s }, p0/Z, [x20]\n" + "subs x22, x22, #0x1\n" + "add x28, x28, #0x88\n" + "fcvt z17.s, p1/m, z17.h\n" + "add x25, x25, #0x90\n" + ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" + ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" + "fcvt z4.s, p1/m, z4.h\n" + ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" + ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" + "fscale z17.s, p1/m, z17.s, z28.s\n" + "mov z4.q, z4.q[0]\n" + ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" + ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" + "fmul z23.s, z17.s, z4.s[0]\n" + "fmul z9.s, z17.s, z4.s[1]\n" + "fmul z21.s, z17.s, z4.s[2]\n" + "fmul z4.s, z17.s, z4.s[3]\n" + ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" + ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" + ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" + ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" + ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" + ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" + "uzp1 z31.d, z2.d, z25.d\n" + "uzp2 z13.d, z2.d, z25.d\n" + "scvtf z31.s, p1/m, z31.s\n" + "uzp1 z17.d, z27.d, z19.d\n" + "uzp2 z18.d, z27.d, z19.d\n" + "scvtf z13.s, p1/m, z13.s\n" + "fmla z24.s, p1/M, z31.s, z23.s\n" + "scvtf z17.s, p1/m, z17.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "fmla z15.s, p1/M, z13.s, z9.s\n" + "fmla z12.s, p1/M, z17.s, z21.s\n" + "fmla z0.s, p1/M, z18.s, z4.s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x13, #0x1\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x2\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x3\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "st1w { z0.s }, p1, [x20]\n" + "8:" // Row tail: Accumulator store skip + "subs x24, x24, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "bne 6b\n" + "subs x13, x13, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x12\n" + "mov %x[res_ptr], x23\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) + : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); + return; + } + else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " + "performance"); + } + else if (ggml_cpu_has_neon()) { + GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " + "quantization format for optimal performance"); + } +#endif +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + GGML_ASSERT(ggml_cpu_has_sve() && + "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance"); +#elif defined(__ARM_NEON) && defined(__aarch64__) + GGML_ASSERT((ggml_cpu_has_sve() || 
ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " + "performance"); +#else + float sumf[4][8]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } +#endif +} diff --git a/ggml/src/ggml-aarch64.h b/ggml/src/ggml-aarch64.h new file mode 100644 index 000000000..65ead1efe --- /dev/null +++ b/ggml/src/ggml-aarch64.h @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. +#pragma once + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" + +#include "ggml.h" + +// GGML internal header + +#ifdef __cplusplus +extern "C" { +#endif + +// Quantization +void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize); + +// Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") +size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// GEMV +void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); + +// GEMM +void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); + +#ifdef __cplusplus +} +#endif + diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index c74060cc4..fafd5fa7a 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -199,6 +199,30 @@ typedef struct { } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); +typedef struct { + ggml_half d[4]; // deltas for 4 q4_0 blocks + uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks +} block_q4_0x4; +static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding"); + +typedef struct { + ggml_half d[8]; // deltas for 8 q4_0 blocks + uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks +} block_q4_0x8; +static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding"); + +typedef struct { + ggml_half d[4]; // deltas for 4 q8_0 blocks + int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks +} block_q8_0x4; +static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding"); + +typedef struct { + ggml_half d[8]; // deltas for 8 q8_0 blocks + int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks +} block_q8_0x8; +static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding"); + // // Super-block quantization structures // diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index dfd75e0e7..39e345b66 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -29,6 +29,7 @@ #include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/unary.cuh" #include "ggml-cuda/upscale.cuh" +#include "ggml-cuda/conv-transpose-1d.cuh" #include #include @@ -2262,6 +2263,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_IM2COL: ggml_cuda_op_im2col(ctx, dst); break; + case GGML_OP_CONV_TRANSPOSE_1D: + ggml_cuda_op_conv_transpose_1d(ctx,dst); + break; case GGML_OP_POOL_2D: ggml_cuda_op_pool2d(ctx, dst); break; @@ -2805,6 +2809,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons ggml_type src0_type = 
op->src[0]->type; return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1]->type; + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return true; + } + return false; + } break; case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: diff --git a/ggml/src/ggml-cuda/conv-transpose-1d.cu b/ggml/src/ggml-cuda/conv-transpose-1d.cu new file mode 100644 index 000000000..b1e94d6f7 --- /dev/null +++ b/ggml/src/ggml-cuda/conv-transpose-1d.cu @@ -0,0 +1,87 @@ +#include "conv-transpose-1d.cuh" + +static __global__ void conv_transpose_1d_kernel( + const int s0, const int p0, const int d0, const int output_size, + const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3, + const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3, + const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3, + const float * src0, const float * src1, float * dst) { + int global_index = threadIdx.x + blockIdx.x * blockDim.x; + if (global_index >= output_size) { + return; + } + + int out_index = global_index / dst_ne0; + + float accumulator = 0; + + for (int c = 0; c < src0_ne2; c++) { + int idx = global_index % dst_ne0; + + int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0); + int input_offset = src1_ne0 * c; + + for (int i = 0; i < src1_ne0; i++) { + if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) { + continue; + } + int weight_idx = idx - i*s0; + + float kernel_weight = src0[kernel_offset + weight_idx]; + float input_value = src1[input_offset+i]; + + accumulator += kernel_weight * input_value; + } + } + dst[global_index] = accumulator; +} + +static void conv_transpose_1d_f32_f32_cuda( + const int s0, const int p0, const int d0, const int output_size, + const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3, + const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3, + const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3, + const float * src0, const float * src1, float * dst, + cudaStream_t stream) { + + const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE; + conv_transpose_1d_kernel<<<num_blocks, CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>( + s0,p0,d0,output_size, + src0_ne0, src0_ne1, src0_ne2, src0_ne3, + src1_ne0, src1_ne1, src1_ne2, src1_ne3, + dst_ne0, dst_ne1, dst_ne2, dst_ne3, + src0,src1, dst); +} + +void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + + const ggml_tensor * src1 = dst->src[1]; + const float * src1_d = (const float *)src1->data; + + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + const int32_t * opts = (const int32_t *)dst->op_params; + + const int s0 = opts[0]; + const int p0 = 0;//opts[3]; + const int d0 = 1;//opts[4]; + + const int64_t kernel_size = ggml_nelements(src0); + const int64_t input_size = ggml_nelements(src1); + const int64_t output_size = ggml_nelements(dst); + + conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + dst->ne[0], dst->ne[1], 
dst->ne[2], dst->ne[3], + src0_d, src1_d, dst_d, stream); +} diff --git a/ggml/src/ggml-cuda/conv-transpose-1d.cuh b/ggml/src/ggml-cuda/conv-transpose-1d.cuh new file mode 100644 index 000000000..6c2cf666b --- /dev/null +++ b/ggml/src/ggml-cuda/conv-transpose-1d.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256 + +void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 1d2336190..a2c8dbec0 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -609,6 +609,10 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { #endif // defined(__ARM_NEON) && (!defined(__MSC_VER) +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#endif // __ARM_FEATURE_SVE + // precomputed f32 table for f16 (256 KB) // defined in ggml.c, initialized in ggml_init() extern float ggml_table_f32_f16[1 << 16]; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 0eb52e485..cbe377cf5 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3814,43 +3814,47 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r } #endif #if defined(__ARM_FEATURE_SVE) - const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); - const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); + if (svcntb() == QK8_0) { + const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); + const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - assert(nb % 2 == 0); // TODO: handle odd nb + assert(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; - // load x - const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); - const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); - // 4-bit -> 8-bit - const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04)); - const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04)); + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04)); - // sub 8 - const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); - const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); - // dot product - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), 
GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + // dot product + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + return; } - - *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); -#elif defined(__ARM_NEON) +#endif +#if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -5422,31 +5426,35 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r } #endif #if defined(__ARM_FEATURE_SVE) - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + if (svcntb() == QK8_0) { + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - assert(nb % 2 == 0); // TODO: handle odd nb + assert(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; - // load x - const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); - const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + // load x + const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); + const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + return; } - - *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); -#elif defined(__ARM_NEON) +#endif +#if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -14760,6 +14768,16 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) { } \ } +#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \ + const type * q = (const type *) (data); \ + for (size_t i = 0; i < (nb); ++i) { \ + for (size_t j = 0; j < (nr); ++j) { \ + if (!validate_fp16(q[i].d[j], i)) { \ + return false; \ + } \ + 
} \ + } + bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) { if (type < 0 || type >= GGML_TYPE_COUNT) { fprintf(stderr, "%s: invalid type %d\n", __func__, type); return false; } @@ -14977,6 +14995,16 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + { + VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4); + } break; + case GGML_TYPE_Q4_0_8_8: + { + VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8); + } break; + case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 21006cd7b..9c419ba89 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -3658,6 +3658,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE); #endif // SYCL_USE_XMX + // mmvq path is faster in the CUDA backend. + if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda) + use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; + if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 9a1c161b6..68d41411b 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -346,4 +346,10 @@ inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) { return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr); } +// Helper for accessing pointers with no warnings +template <typename Tp, int dim> +static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) { + return acc.template get_multi_ptr<sycl::access::decorated::no>().get(); +} + #endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index a15271b51..39c28753c 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -158,7 +158,7 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k, sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_K(vx, y, scale_local_acc.get_pointer(), item_ct1); + dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1); }); }); } diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp index b514f0040..3107ba919 100644 --- a/ggml/src/ggml-sycl/mmq.cpp +++ b/ggml/src/ggml-sycl/mmq.cpp @@ -1835,10 +1835,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, mul_mat_q4_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_qs_q4_0_acc_ct1.get_pointer(), - tile_x_d_q4_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_qs_q4_0_acc_ct1), + get_pointer(tile_x_d_q4_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -1870,10 +1870,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, mul_mat_q4_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_qs_q4_0_acc_ct1.get_pointer(), - tile_x_d_q4_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_qs_q4_0_acc_ct1), + get_pointer(tile_x_d_q4_0_acc_ct1), + 
get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -1950,10 +1950,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, mul_mat_q4_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_qs_q4_1_acc_ct1.get_pointer(), - tile_x_dm_q4_1_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_qs_q4_1_acc_ct1), + get_pointer(tile_x_dm_q4_1_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -1985,10 +1985,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, mul_mat_q4_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_qs_q4_1_acc_ct1.get_pointer(), - tile_x_dm_q4_1_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_qs_q4_1_acc_ct1), + get_pointer(tile_x_dm_q4_1_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2065,10 +2065,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, mul_mat_q5_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q5_0_acc_ct1.get_pointer(), - tile_x_d_q5_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q5_0_acc_ct1), + get_pointer(tile_x_d_q5_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2100,10 +2100,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, mul_mat_q5_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q5_0_acc_ct1.get_pointer(), - tile_x_d_q5_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q5_0_acc_ct1), + get_pointer(tile_x_d_q5_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2180,10 +2180,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, mul_mat_q5_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q5_1_acc_ct1.get_pointer(), - tile_x_dm_q5_1_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q5_1_acc_ct1), + get_pointer(tile_x_dm_q5_1_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2215,10 +2215,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, mul_mat_q5_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q5_1_acc_ct1.get_pointer(), - tile_x_dm_q5_1_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q5_1_acc_ct1), + get_pointer(tile_x_dm_q5_1_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2295,10 +2295,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, mul_mat_q8_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_qs_q8_0_acc_ct1.get_pointer(), - tile_x_d_q8_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_qs_q8_0_acc_ct1), + get_pointer(tile_x_d_q8_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2330,10 +2330,10 @@ static void 
ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, mul_mat_q8_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_qs_q8_0_acc_ct1.get_pointer(), - tile_x_d_q8_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_qs_q8_0_acc_ct1), + get_pointer(tile_x_d_q8_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2412,11 +2412,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, mul_mat_q2_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q2_K_acc_ct1.get_pointer(), - tile_x_dm_q2_K_acc_ct1.get_pointer(), - tile_x_sc_q2_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q2_K_acc_ct1), + get_pointer(tile_x_dm_q2_K_acc_ct1), + get_pointer(tile_x_sc_q2_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2450,11 +2450,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, mul_mat_q2_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q2_K_acc_ct1.get_pointer(), - tile_x_dm_q2_K_acc_ct1.get_pointer(), - tile_x_sc_q2_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q2_K_acc_ct1), + get_pointer(tile_x_dm_q2_K_acc_ct1), + get_pointer(tile_x_sc_q2_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2537,12 +2537,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, mul_mat_q3_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q3_K_acc_ct1.get_pointer(), - tile_x_dm_q3_K_acc_ct1.get_pointer(), - tile_x_qh_q3_K_acc_ct1.get_pointer(), - tile_x_sc_q3_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q3_K_acc_ct1), + get_pointer(tile_x_dm_q3_K_acc_ct1), + get_pointer(tile_x_qh_q3_K_acc_ct1), + get_pointer(tile_x_sc_q3_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2578,12 +2578,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, mul_mat_q3_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q3_K_acc_ct1.get_pointer(), - tile_x_dm_q3_K_acc_ct1.get_pointer(), - tile_x_qh_q3_K_acc_ct1.get_pointer(), - tile_x_sc_q3_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q3_K_acc_ct1), + get_pointer(tile_x_dm_q3_K_acc_ct1), + get_pointer(tile_x_qh_q3_K_acc_ct1), + get_pointer(tile_x_sc_q3_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2663,11 +2663,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, mul_mat_q4_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q4_K_acc_ct1.get_pointer(), - tile_x_dm_q4_K_acc_ct1.get_pointer(), - tile_x_sc_q4_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q4_K_acc_ct1), + get_pointer(tile_x_dm_q4_K_acc_ct1), + get_pointer(tile_x_sc_q4_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2701,11 +2701,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, mul_mat_q4_K( 
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q4_K_acc_ct1.get_pointer(), - tile_x_dm_q4_K_acc_ct1.get_pointer(), - tile_x_sc_q4_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q4_K_acc_ct1), + get_pointer(tile_x_dm_q4_K_acc_ct1), + get_pointer(tile_x_sc_q4_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2784,11 +2784,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, mul_mat_q5_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q5_K_acc_ct1.get_pointer(), - tile_x_dm_q5_K_acc_ct1.get_pointer(), - tile_x_sc_q5_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q5_K_acc_ct1), + get_pointer(tile_x_dm_q5_K_acc_ct1), + get_pointer(tile_x_sc_q5_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2822,11 +2822,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, mul_mat_q5_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_q5_K_acc_ct1.get_pointer(), - tile_x_dm_q5_K_acc_ct1.get_pointer(), - tile_x_sc_q5_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_q5_K_acc_ct1), + get_pointer(tile_x_dm_q5_K_acc_ct1), + get_pointer(tile_x_sc_q5_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2905,11 +2905,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, mul_mat_q6_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_acc_ct1.get_pointer(), - tile_x_dm_acc_ct1.get_pointer(), - tile_x_sc_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_acc_ct1), + get_pointer(tile_x_dm_acc_ct1), + get_pointer(tile_x_sc_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } @@ -2943,11 +2943,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, mul_mat_q6_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, - tile_x_ql_acc_ct1.get_pointer(), - tile_x_dm_acc_ct1.get_pointer(), - tile_x_sc_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); + get_pointer(tile_x_ql_acc_ct1), + get_pointer(tile_x_dm_acc_ct1), + get_pointer(tile_x_sc_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); }); }); } diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index e0c5dfeca..cccf87d06 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -218,7 +218,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols, [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { norm_f32(x, dst, ncols, eps, item_ct1, - s_sum_acc_ct1.get_pointer(), work_group_size); + get_pointer(s_sum_acc_ct1), work_group_size); }); }); } @@ -265,7 +265,7 @@ static void group_norm_f32_sycl(const float* x, float* dst, [[intel::reqd_sub_group_size(WARP_SIZE)]] { group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, - s_sum_acc_ct1.get_pointer(), work_group_size); + get_pointer(s_sum_acc_ct1), work_group_size); }); }); } @@ -306,7 +306,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, 
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { rms_norm_f32(x, dst, ncols, eps, item_ct1, - s_sum_acc_ct1.get_pointer(), work_group_size); + get_pointer(s_sum_acc_ct1), work_group_size); }); }); } diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index eabf1693e..6f507941a 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -55,7 +55,7 @@ static void rope_norm( const int i = row*ne0 + i0; const int i2 = row/p_delta_rows; - const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); + const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f); const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; @@ -98,7 +98,7 @@ static void rope_neox( const int i = row*ne0 + i0/2; const int i2 = row/p_delta_rows; - const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); + const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f); const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index e624b6ba3..c5d9a837e 100644 --- a/ggml/src/ggml-sycl/softmax.cpp +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -136,7 +136,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float * soft_max_f32(x, mask, dst, ncols_par, nrows_y, scale, max_bias, m0, m1, n_head_log2, item_ct1, - local_buf_acc.get_pointer()); + get_pointer(local_buf_acc)); }); }); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2093be2a9..a9d5c7f4d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4,7 +4,7 @@ #include "ggml-impl.h" #include "ggml-quants.h" #include "ggml.h" - +#include "ggml-aarch64.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include <malloc.h> // using malloc.h with MSC/MINGW @@ -37,12 +37,12 @@ #include #endif -#ifdef __ARM_FEATURE_MATMUL_INT8 +#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif #ifdef GGML_USE_LLAMAFILE -#include "sgemm.h" +#include <llamafile/sgemm.h> #endif #if defined(_MSC_VER) @@ -692,6 +692,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { #else .nrows = 1, #endif + .from_float_to_mat = quantize_mat_q8_0, }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -889,6 +890,54 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, + }, + [GGML_TYPE_Q4_0_4_4] = { + .type_name = "q4_0_4x4", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + .ncols = 4, + .interleave_blcksize = 4, + .gemv = ggml_gemv_q4_0_4x4_q8_0, + .gemm = ggml_gemm_q4_0_4x4_q8_0, + }, + [GGML_TYPE_Q4_0_4_8] = { + .type_name = "q4_0_4x8", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + .ncols = 4, + .interleave_blcksize = 8, + .gemv = ggml_gemv_q4_0_4x8_q8_0, + .gemm = ggml_gemm_q4_0_4x8_q8_0, + }, + [GGML_TYPE_Q4_0_8_8] = { + .type_name = "q4_0_8x8", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + .ncols = 8, + .interleave_blcksize = 8, + .gemv = ggml_gemv_q4_0_8x8_q8_0, +
.gemm = ggml_gemm_q4_0_8x8_q8_0, } }; @@ -3188,6 +3237,9 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; + case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break; + case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break; + case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -9432,6 +9484,9 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -9807,6 +9862,9 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -9932,6 +9990,9 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: default: { GGML_ASSERT(false); @@ -12134,6 +12195,12 @@ static void ggml_compute_forward_mul_mat( enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; int64_t const vec_dot_num_rows = type_traits[type].nrows; + int64_t const matmul_num_cols = type_traits[type].ncols; + int64_t const interleave_blcksize = type_traits[type].interleave_blcksize; + ggml_from_float_to_mat_t const from_float_to_mat + = type_traits[vec_dot_type].from_float_to_mat; + ggml_gemv_t const gemv = type_traits[type].gemv; + ggml_gemm_t const gemm = type_traits[type].gemm; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -12192,7 +12259,16 @@ UseGgmlGemm1:; for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = ith; i11 < ne11; i11 += nth) { + int64_t i11_processed = 0; + if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { + for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { + from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), + 4, ne10, interleave_blcksize); + } + i11_processed = ne11 - ne11 % 4; + } + for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), ne10); @@ -12273,6 +12349,28 @@ UseGgmlGemm2:; const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + if ((ggml_n_dims(src0) == 2) && gemv) { + const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; + int64_t src0_start = (ith * ne01) / nth; + int64_t src0_end = ((ith + 1) * ne01) / nth; + src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start; + src0_end = (src0_end % matmul_num_cols) ? 
src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; + if (src0_start >= src0_end) return; + + // If there are more than three rows in src1, use gemm; otherwise, use gemv. + if (gemm && (ne11 > 3)) { + gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); + } + for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) { + gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); + } + return; + } + // The first chunk comes from our thread_id, the rest will get auto-assigned. int current_chunk = ith; @@ -12318,6 +12416,8 @@ static void ggml_compute_forward_mul_mat_id( ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + int64_t const matmul_num_cols = type_traits[type].ncols; + ggml_gemv_t const gemv = type_traits[type].gemv; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); @@ -12403,6 +12503,34 @@ static void ggml_compute_forward_mul_mat_id( const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = cne1; // src1 rows + if (((ggml_n_dims(src0) - 1) == 2) && gemv) { + int64_t src0_cur_start = (ith * ne01) / nth; + int64_t src0_cur_end = ((ith + 1) * ne01) / nth; + src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start; + src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end; + if (src0_cur_start >= src0_cur_end) return; + + for (int ir1 = 0; ir1 < nr1; ir1++) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); + const int id = row_mapping.i1; // selected expert index + + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 + + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row + + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12)); + + gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, + (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start); + } + continue; + } + // distribute the thread work across the inner or outer loop based on which one is larger const int64_t nth0 = nr0 > nr1 ? 
nth : 1; // parallelize by src0 rows @@ -12704,6 +12832,9 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -12889,6 +13020,9 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: default: { GGML_ASSERT(false); @@ -13148,6 +13282,9 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -13734,6 +13871,9 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -20457,6 +20597,9 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); @@ -21759,8 +21902,6 @@ int ggml_cpu_has_neon(void) { int ggml_cpu_has_sve(void) { #if defined(__ARM_FEATURE_SVE) - // TODO: Currently, SVE 256 bit is only supported. - GGML_ASSERT(svcntb() == QK8_0); return 1; #else return 0; diff --git a/ggml/src/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp similarity index 100% rename from ggml/src/sgemm.cpp rename to ggml/src/llamafile/sgemm.cpp diff --git a/ggml/src/sgemm.h b/ggml/src/llamafile/sgemm.h similarity index 100% rename from ggml/src/sgemm.h rename to ggml/src/llamafile/sgemm.h diff --git a/gguf-py/README.md b/gguf-py/README.md index bc46d6e1d..9dd888f31 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -79,5 +79,4 @@ python -m twine upload dist/* ``` ## TODO -- [ ] Add tests - [ ] Include conversion scripts as command line entry points in this package. 
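Note: the new Q4_0_4_4 / Q4_0_4_8 / Q4_0_8_8 types introduced in the ggml.c hunks above store weights in 4- or 8-row interleaved blocks, so the gemv/gemm fast path in ggml_compute_forward_mul_mat cannot hand each thread an arbitrary row range. A minimal sketch of the src0_start/src0_end alignment arithmetic, in illustrative Python (not part of the patch; names mirror the C code):

```python
def thread_row_range(ith: int, nth: int, ne01: int, matmul_num_cols: int) -> tuple[int, int]:
    """Equal slice of src0 rows per thread, with both ends rounded up to the
    next multiple of matmul_num_cols so every slice stays aligned with the
    4- or 8-row interleaved block layout."""
    def round_up(x: int) -> int:
        return x + matmul_num_cols - x % matmul_num_cols if x % matmul_num_cols else x
    start = round_up((ith * ne01) // nth)
    end   = round_up(((ith + 1) * ne01) // nth)
    return start, end  # the C code returns early when start >= end

# 16 rows across 3 threads with 4-row blocks -> (0, 8), (8, 12), (12, 16)
```

In practice ne01 is a multiple of matmul_num_cols for these types, because the quantization code in llama.cpp further below falls back to plain Q4_0 whenever tensor->ne[1] is not divisible by 4 or 8.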
diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 20432bd25..e8e61abf8 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -67,7 +67,7 @@ class ReaderTensor(NamedTuple): class GGUFReader: # I - same as host, S - swapped - byte_order: Literal['I'] | Literal['S'] = 'I' + byte_order: Literal['I', 'S'] = 'I' alignment: int = GGUF_DEFAULT_ALIGNMENT data_offset: int @@ -86,7 +86,7 @@ class GGUFReader: GGUFValueType.BOOL: np.bool_, } - def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'): + def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = 'r'): self.data = np.memmap(path, mode = mode) offs = 0 @@ -140,7 +140,7 @@ class GGUFReader: return self.tensors[idx] def _get( - self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I'] | Literal['S'] | Literal['<'] = None, + self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None, ) -> npt.NDArray[Any]: count = int(count) itemsize = int(np.empty([], dtype = dtype).itemsize) diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index 1167335b8..6e266f34f 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -6,7 +6,6 @@ from typing import Any, Callable from collections import deque import numpy as np -from numpy._typing import _Shape from numpy.typing import DTypeLike @@ -16,16 +15,16 @@ logger = logging.getLogger(__name__) class LazyMeta(ABCMeta): def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs): - def __getattr__(self, __name: str) -> Any: - meta_attr = getattr(self._meta, __name) + def __getattr__(self, name: str) -> Any: + meta_attr = getattr(self._meta, name) if callable(meta_attr): return type(self)._wrap_fn( - (lambda s, *args, **kwargs: getattr(s, __name)(*args, **kwargs)), + (lambda s, *args, **kwargs: getattr(s, name)(*args, **kwargs)), use_self=self, ) elif isinstance(meta_attr, self._tensor_type): # e.g. 
self.T with torch.Tensor should still be wrapped - return type(self)._wrap_fn(lambda s: getattr(s, __name))(self) + return type(self)._wrap_fn(lambda s: getattr(s, name))(self) else: # no need to wrap non-tensor properties, # and they likely don't depend on the actual contents of the tensor @@ -141,19 +140,21 @@ class LazyBase(ABC, metaclass=LazyMeta): res = cls.meta_with_dtype_and_shape(meta_noop, res.shape) if isinstance(res, cls._tensor_type): - def collect_replace(t: LazyBase): - if collect_replace.shared_lazy is None: - collect_replace.shared_lazy = t._lazy - else: - collect_replace.shared_lazy.extend(t._lazy) - t._lazy = collect_replace.shared_lazy + class CollectSharedLazy: + # emulating a static variable + shared_lazy: None | deque[LazyBase] = None - # emulating a static variable - collect_replace.shared_lazy = None + @staticmethod + def collect_replace(t: LazyBase): + if CollectSharedLazy.shared_lazy is None: + CollectSharedLazy.shared_lazy = t._lazy + else: + CollectSharedLazy.shared_lazy.extend(t._lazy) + t._lazy = CollectSharedLazy.shared_lazy - LazyBase._recurse_apply(args, collect_replace) + LazyBase._recurse_apply(args, CollectSharedLazy.collect_replace) - shared_lazy = collect_replace.shared_lazy + shared_lazy = CollectSharedLazy.shared_lazy return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs)) else: @@ -184,6 +185,7 @@ class LazyBase(ABC, metaclass=LazyMeta): lt._args = cls._recurse_apply(lt._args, already_eager_to_eager) lt._data = lt._func(lt._args) # sanity check + assert lt._data is not None assert lt._data.dtype == lt._meta.dtype assert lt._data.shape == lt._meta.shape @@ -216,7 +218,7 @@ class LazyNumpyTensor(LazyBase): _tensor_type = np.ndarray @classmethod - def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]: + def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]: # The initial idea was to use np.nan as the fill value, # but non-float types like np.int16 can't use that. # So zero it is. 
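Note: the lazy.py rewrite above trades state stashed on a function object (collect_replace.shared_lazy) for a nested class with a class attribute. Both behave identically at runtime, but only the class form is visible to pyright, which this PR adds to CI. A standalone illustration of the two styles (hypothetical counter example, not taken from the patch):

```python
# Before: ad-hoc attribute on the function object; works at runtime, but
# static checkers cannot see or type the attribute.
def count_calls() -> int:
    count_calls.n = getattr(count_calls, "n", 0) + 1  # pyright flags this
    return count_calls.n

# After: the same shared mutable state as a declared class attribute,
# which the checker can verify (mirrors the CollectSharedLazy change above).
class CallCounter:
    n: int = 0

    @staticmethod
    def count() -> int:
        CallCounter.n += 1
        return CallCounter.n
```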
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 36e63ee3b..62129126b 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.9.0" +version = "0.9.1" description = "Read and write ML models in GGUF for GGML" authors = ["GGML <ggml@ggml.ai>"] packages = [ diff --git a/gguf-py/scripts/__init__.py b/gguf-py/scripts/__init__.py index f9d29cb69..e77f2e9c9 100644 --- a/gguf-py/scripts/__init__.py +++ b/gguf-py/scripts/__init__.py @@ -1,3 +1,5 @@ +# pyright: reportUnusedImport=false + from .gguf_convert_endian import main as gguf_convert_endian_entrypoint from .gguf_dump import main as gguf_dump_entrypoint from .gguf_set_metadata import main as gguf_set_metadata_entrypoint diff --git a/gguf-py/scripts/gguf_hash.py b/gguf-py/scripts/gguf_hash.py index 956775182..770b79a93 100755 --- a/gguf-py/scripts/gguf_hash.py +++ b/gguf-py/scripts/gguf_hash.py @@ -63,9 +63,9 @@ def gguf_hash(reader: GGUFReader, filename: str, disable_progress_bar) -> None: bar.update(sum_weights_in_tensor) sha1_layer = hashlib.sha1() - sha1_layer.update(tensor.data) - sha1.update(tensor.data) - uuidv5_sha1.update(tensor.data) + sha1_layer.update(tensor.data.data) + sha1.update(tensor.data.data) + uuidv5_sha1.update(tensor.data.data) print("sha1 {0} {1}:{2}".format(sha1_layer.hexdigest(), filename, tensor.name)) # noqa: NP100 # Flush Hash Progress Bar diff --git a/gguf-py/scripts/gguf_new_metadata.py b/gguf-py/scripts/gguf_new_metadata.py index c4b90d581..fce52a8c1 100755 --- a/gguf-py/scripts/gguf_new_metadata.py +++ b/gguf-py/scripts/gguf_new_metadata.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +from __future__ import annotations + import logging import argparse import os diff --git a/gguf-py/tests/test_gguf.py b/gguf-py/tests/test_gguf.py index 0adeb7d55..76b52181e 100644 --- a/gguf-py/tests/test_gguf.py +++ b/gguf-py/tests/test_gguf.py @@ -1,4 +1,4 @@ -import gguf # noqa: F401 +import gguf # noqa: F401 # pyright: ignore[reportUnusedImport] # TODO: add tests diff --git a/grammars/README.md b/grammars/README.md index 886023f77..01b02abb4 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -4,7 +4,7 @@ GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia. ## Background -[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features. +[Backus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
## Basics diff --git a/include/llama.h b/include/llama.h index 138a11e96..c57d21f0c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -162,6 +162,9 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/pyrightconfig.json b/pyrightconfig.json index 020a71a4e..6016f4b6d 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -1,3 +1,21 @@ { "extraPaths": ["gguf-py"], -} + "pythonVersion": "3.9", + "pythonPlatform": "All", + "reportUnusedImport": "warning", + "reportDuplicateImport": "error", + "reportDeprecated": "warning", + "reportUnnecessaryTypeIgnoreComment": "warning", + "executionEnvironments": [ + { + // TODO: make this version override work correctly + "root": "gguf-py", + "pythonVersion": "3.8", + }, + { + // uses match expressions in steps.py + "root": "examples/server/tests", + "pythonVersion": "3.10", + }, + ], + } diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt new file mode 100644 index 000000000..94de59d7e --- /dev/null +++ b/requirements/requirements-all.txt @@ -0,0 +1,12 @@ +-r ../examples/llava/requirements.txt +-r ../examples/server/bench/requirements.txt +-r ../examples/server/tests/requirements.txt + +-r ./requirements-compare-llama-bench.txt +-r ./requirements-pydantic.txt +-r ./requirements-test-tokenizer-random.txt + +-r ./requirements-convert_hf_to_gguf.txt +-r ./requirements-convert_hf_to_gguf_update.txt +-r ./requirements-convert_legacy_llama.txt +-r ./requirements-convert_llama_ggml_to_gguf.txt diff --git a/requirements/requirements-compare-llama-bench.txt b/requirements/requirements-compare-llama-bench.txt new file mode 100644 index 000000000..e0aaa3204 --- /dev/null +++ b/requirements/requirements-compare-llama-bench.txt @@ -0,0 +1,2 @@ +tabulate~=0.9.0 +GitPython~=3.1.43 diff --git a/requirements/requirements-pydantic.txt b/requirements/requirements-pydantic.txt new file mode 100644 index 000000000..2f9455b14 --- /dev/null +++ b/requirements/requirements-pydantic.txt @@ -0,0 +1,2 @@ +docstring_parser~=0.15 +pydantic~=2.6.3 diff --git a/requirements/requirements-test-tokenizer-random.txt b/requirements/requirements-test-tokenizer-random.txt new file mode 100644 index 000000000..2785e71a2 --- /dev/null +++ b/requirements/requirements-test-tokenizer-random.txt @@ -0,0 +1 @@ +cffi~=1.16.0 diff --git a/scripts/check-requirements.sh b/scripts/check-requirements.sh index 48f924c02..d3bbded13 100755 --- a/scripts/check-requirements.sh +++ b/scripts/check-requirements.sh @@ -108,6 +108,11 @@ check_convert_script() { fatal "$py missing requirements. Expected: $reqs" fi + # Check that all sub-requirements are added to top-level requirements.txt + if ! grep -qF "$reqs" requirements.txt; then + fatal "$reqs needs to be added to requirements.txt" + fi + local venv="$workdir/$pyname-venv" python3 -m venv "$venv" @@ -134,12 +139,7 @@ EOF readonly ignore_eq_eq='check_requirements: ignore "=="' -for req in "$reqs_dir"/*; do - # Check that all sub-requirements are added to top-level requirements.txt - if ! 
grep -qF "$req" requirements.txt; then - fatal "$req needs to be added to requirements.txt" - fi - +for req in */**/requirements*.txt; do # Make sure exact release versions aren't being pinned in the requirements # Filters out the ignore string if grep -vF "$ignore_eq_eq" "$req" | grep -q '=='; then diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 513dde5e1..92b9e682a 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -123,13 +123,13 @@ builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall() try: repo = git.Repo(".", search_parent_directories=True) -except git.exc.InvalidGitRepositoryError: +except git.InvalidGitRepositoryError: repo = None -def find_parent_in_data(commit): +def find_parent_in_data(commit: git.Commit): """Helper function to find the most recent parent measured in number of commits for which there is data.""" - heap = [(0, commit)] + heap: list[tuple[int, git.Commit]] = [(0, commit)] seen_hexsha8 = set() while heap: depth, current_commit = heapq.heappop(heap) @@ -144,7 +144,7 @@ def find_parent_in_data(commit): return None -def get_all_parent_hexsha8s(commit): +def get_all_parent_hexsha8s(commit: git.Commit): """Helper function to recursively get hexsha8 values for all parents of a commit.""" unvisited = [commit] visited = [] diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py index 890e4d7c2..2d9bde01c 100644 --- a/scripts/gen-unicode-data.py +++ b/scripts/gen-unicode-data.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import array import unicodedata import requests @@ -133,7 +135,7 @@ table_nfd.sort() # group ranges with same flags -ranges_flags = [(0, codepoint_flags[0])] # start, flags +ranges_flags: list[tuple[int, int]] = [(0, codepoint_flags[0])] # start, flags for codepoint, flags in enumerate(codepoint_flags): if flags != ranges_flags[-1][1]: ranges_flags.append((codepoint, flags)) @@ -141,11 +143,11 @@ ranges_flags.append((MAX_CODEPOINTS, 0x0000)) # group ranges with same nfd -ranges_nfd = [(0, 0, 0)] # start, last, nfd +ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)] # start, last, nfd for codepoint, norm in table_nfd: start = ranges_nfd[-1][0] if ranges_nfd[-1] != (start, codepoint - 1, norm): - ranges_nfd.append(None) + ranges_nfd.append(None) # type: ignore[arg-type] # dummy, will be replaced below start = codepoint ranges_nfd[-1] = (start, codepoint, norm) @@ -179,13 +181,13 @@ for codepoint in table_whitespace: out("};\n") out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {") -for tuple in table_lowercase: - out("{0x%06X, 0x%06X}," % tuple) +for tuple_lw in table_lowercase: + out("{0x%06X, 0x%06X}," % tuple_lw) out("};\n") out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {") -for tuple in table_uppercase: - out("{0x%06X, 0x%06X}," % tuple) +for tuple_up in table_uppercase: + out("{0x%06X, 0x%06X}," % tuple_up) out("};\n") out("const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd") diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index b05a33747..ba3bedf21 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -63,6 +63,7 @@ while read c; do src/ggml*.metal \ src/ggml*.cu \ src/ggml-cuda/* \ + src/ggml-sycl/* \ include/ggml*.h \ tests/test-opt.cpp \ tests/test-grad0.cpp \ @@ -113,6 +114,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-quants.c -> ggml/src/ggml-quants.c # src/ggml-quants.h -> ggml/src/ggml-quants.h # src/ggml-rpc.cpp -> ggml/src/ggml-rpc.cpp + # src/ggml-sycl/* ->
ggml/src/ggml-sycl/ # src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp # src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp # @@ -153,6 +155,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \ diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 2da33e913..80159b70b 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -5378ea0d3c2f25bcd330ecb226ad2db454be86d0 +e3b3846976c94163f2b3dd128cc959782653edbb diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 2f32c1ce8..402446ef9 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -18,6 +18,7 @@ cp -rpv ../ggml/src/ggml-metal.metal ./ggml/src/ggml-metal.metal cp -rpv ../ggml/src/ggml-quants.c ./ggml/src/ggml-quants.c cp -rpv ../ggml/src/ggml-quants.h ./ggml/src/ggml-quants.h cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp +cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/ cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp diff --git a/src/llama.cpp b/src/llama.cpp index fda48e822..7ed80fcaf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -57,6 +57,12 @@ #include #endif +#if __cplusplus >= 202000L + #define LU8(x) (const char*)(u8##x) +#else + #define LU8(x) u8##x +#endif + #include #include #include @@ -3836,6 +3842,9 @@ struct llama_model_loader { case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; + case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break; + case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break; + case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -4529,6 +4538,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; + case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; + case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; + case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; default: return "unknown, may not work"; } @@ -13282,6 +13294,8 @@ struct llm_build_context { LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); } else { + GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); @@ -17842,6 +17856,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } + else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || + new_type == GGML_TYPE_Q4_0_8_8) { + new_type = GGML_TYPE_Q4_0; + } } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == 
LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { @@ -18154,6 +18172,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break; + case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break; + case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -18464,6 +18485,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } + int chunk_size_multiplier = 1; + if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { + if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0; + else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0; + if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8; + else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4; + } + LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); @@ -18476,7 +18505,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const int64_t nrows = tensor->ne[1]; static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row); + const int64_t chunk_size = (n_per_row >= min_chunk_size ? 
n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * + chunk_size_multiplier; const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; @@ -21508,12 +21538,12 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) { + } else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF for (auto message : chat) { std::string role(message->role); if (role == "user") { - ss << u8"<用户>"; + ss << LU8("<用户>"); ss << trim(message->content); ss << "<AI>"; } else { @@ -21529,7 +21559,7 @@ static int32_t llama_chat_apply_template_internal( } else if (role == "user") { ss << "User: " << message->content << "\n\n"; } else if (role == "assistant") { - ss << "Assistant: " << message->content << u8"<|end▁of▁sentence|>"; + ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>"); } } if (add_ass) { diff --git a/src/unicode.cpp b/src/unicode.cpp index 51daa15af..e05fb9d17 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -1,3 +1,7 @@ +#if defined(_MSC_VER) +#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING +#endif + #include "unicode.h" #include "unicode-data.h" diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 2bb71ac03..a1bfc8fdd 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1266,6 +1266,32 @@ struct test_pool2d : public test_case { } }; +// GGML_OP_CONV_TRANSPOSE_1D +struct test_conv_transpose_1d : public test_case { + const std::array<int64_t, 4> ne_input; + const std::array<int64_t, 4> ne_kernel; + + const int s0; // stride + const int p0; // padding + const int d0; // dilation + + std::string vars() override { + return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0); + } + + test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1] + std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1] + int s0 = 1, int p0 = 0, int d0 = 1) + : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data()); + ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0); + return out; + } +}; + // GGML_OP_IM2COL struct test_im2col : public test_case { const ggml_type type_input; @@ -1279,7 +1305,7 @@ struct test_im2col : public test_case { // padding const int p0; const int p1; - // dilatation + // dilation const int d0; const int d1; // mode @@ -2098,6 +2124,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32)); test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16)); + test_cases.emplace_back(new test_conv_transpose_1d()); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1)); +
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); + + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1})); test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1})); test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1})); diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 48cab8a1e..c50a8ca32 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -6,6 +6,8 @@ # python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe # +from __future__ import annotations + import time import logging import argparse @@ -13,7 +15,9 @@ import subprocess import random import unicodedata -from typing import Iterator +from pathlib import Path +from typing import Any, Iterator, cast +from typing_extensions import Buffer import cffi from transformers import AutoTokenizer @@ -28,15 +32,15 @@ class LibLlama: DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"] DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so" # CMakeLists.txt: BUILD_SHARED_LIBS ON - def __init__(self, path_llama_h: str = None, path_includes: list[str] = [], path_libllama: str = None): + def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None): path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H path_includes = path_includes or self.DEFAULT_PATH_INCLUDES path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_includes, path_libllama) self.lib.llama_backend_init() - def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str): - cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="] + def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str) -> tuple[cffi.FFI, Any]: + cmd = ["gcc", "-O0", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="] cmd += ["-I" + path for path in path_includes] + [path_llama_h] res = subprocess.run(cmd, stdout=subprocess.PIPE) assert (res.returncode == 0) @@ -68,7 +72,7 @@ class LibLlama: class LibLlamaModel: def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}): - self.lib = libllama.lib + self.lib: Any = libllama.lib self.ffi = libllama.ffi if isinstance(mparams, dict): mparams = libllama.model_default_params(**mparams) @@ -94,11 +98,11 @@ class LibLlamaModel: self.lib = None def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]: - text = text.encode("utf-8") - num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, len(self.token_ids), add_special, parse_special) + encoded_text: bytes = text.encode("utf-8") + num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special) while num < 0 and len(self.token_ids) < (16 << 20): self.token_ids = self.ffi.new("llama_token[]", -2 * num) - num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, len(self.token_ids), add_special, parse_special) + num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special) return list(self.token_ids[0:num]) def 
detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str: @@ -110,7 +114,7 @@ class LibLlamaModel: while num < 0 and len(self.text_buff) < (16 << 20): self.text_buff = self.ffi.new("uint8_t[]", -2 * num) num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) - return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' + return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' class Tokenizer: @@ -152,7 +156,7 @@ class TokenizerGroundtruth (Tokenizer): class TokenizerLlamaCpp (Tokenizer): - libllama: LibLlama = None + libllama: LibLlama | None = None def __init__(self, vocab_file: str): if not self.libllama: @@ -404,7 +408,7 @@ def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100 def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]): - def find_first_mismatch(ids1: list[int], ids2: list[int]): + def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str): for i, (a, b) in enumerate(zip(ids1, ids2)): if a != b: return i @@ -433,7 +437,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl decode_errors = 0 MAX_ERRORS = 10 - logger.info("%s: %s" % (generator.__name__, "ini")) + logger.info("%s: %s" % (generator.__qualname__, "ini")) for text in generator: # print(repr(text), text.encode()) # print(repr(text), hex(ord(text[0])), text.encode()) @@ -472,13 +476,13 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl break t_total = time.perf_counter() - t_start - logger.info(f"{generator.__name__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") + logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") -def main(argv: list[str] = None): +def main(argv: list[str] | None = None): parser = argparse.ArgumentParser() - parser.add_argument("vocab_file", help="path to vocab 'gguf' file") - parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") + parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file") + parser.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file") parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args(argv) @@ -520,7 +524,7 @@ if __name__ == "__main__": format = "%(levelname)s %(message)s", ) - path_tokenizers = "./models/tokenizers/" + path_tokenizers = Path("./models/tokenizers/") path_vocab_format = "./models/ggml-vocab-%s.gguf" tokenizers = [ @@ -556,6 +560,6 @@ if __name__ == "__main__": for tokenizer in tokenizers: logger.info("-" * 50) logger.info(f"TOKENIZER: '{tokenizer}'") - vocab_file = path_vocab_format % tokenizer - dir_tokenizer = path_tokenizers + "/" + tokenizer - main([vocab_file, dir_tokenizer, "--verbose"]) + vocab_file = Path(path_vocab_format % tokenizer) + dir_tokenizer = path_tokenizers / tokenizer + main([str(vocab_file), str(dir_tokenizer), "--verbose"])
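Note: tokenize() and detokenize() above both follow the same grow-and-retry convention of the llama.cpp C API: a negative return value means the output buffer was too small, with its magnitude indicating the required size. A minimal sketch of the pattern (hypothetical helper, assuming `call` takes a buffer plus its capacity and returns the item count, negative on overflow):

```python
def call_with_growing_buffer(ffi, ctype: str, size: int, call, limit: int = 16 << 20):
    """Retry `call` with a progressively larger cffi buffer until it succeeds
    or the buffer length reaches `limit` (16 Mi entries, as in the tests)."""
    buf = ffi.new(ctype, size)
    num = call(buf, len(buf))
    while num < 0 and len(buf) < limit:
        buf = ffi.new(ctype, -2 * num)  # grow to twice the reported requirement
        num = call(buf, len(buf))
    return buf, num
```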