diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 88e70e495..179080576 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,17 +8,19 @@ on: required: true type: boolean push: + branches: + - master paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp'] pull_request: - types: [opened, synchronize, edited, reopened, review_requested, ready_for_review] + types: [opened, synchronize, reopened] paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp'] env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} jobs: - ubuntu-latest-make: - runs-on: ubuntu-latest + ubuntu-focal-make: + runs-on: ubuntu-20.04 steps: - name: Clone @@ -29,12 +31,12 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential + sudo apt-get install build-essential gcc-8 - name: Build id: make_build run: | - make + CC=gcc-8 make ubuntu-latest-cmake: runs-on: ubuntu-latest @@ -73,7 +75,6 @@ jobs: matrix: sanitizer: [ADDRESS, THREAD, UNDEFINED] build_type: [Debug, Release] - accelerate: [ON, OFF] steps: - name: Clone @@ -91,7 +92,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_ACCELERATE=${{ matrix.accelerate }} + cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} cmake --build . --config ${{ matrix.build_type }} - name: Test @@ -156,7 +157,7 @@ jobs: - build: 'avx' defines: '-DLLAMA_AVX2=OFF' - build: 'avx512' - defines: '-DLLAMA_AVX512=ON' + defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' steps: - name: Clone @@ -215,7 +216,7 @@ jobs: runs-on: ubuntu-latest needs: - - ubuntu-latest-make + - ubuntu-focal-make - ubuntu-latest-cmake - macOS-latest-make - macOS-latest-cmake diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 28402c933..379fbd7ad 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -18,6 +18,8 @@ on: jobs: push_to_registry: name: Push Docker image to Docker Hub + if: github.event.pull_request.draft == false + runs-on: ubuntu-latest env: COMMIT_SHA: ${{ github.sha }} diff --git a/.gitignore b/.gitignore index ba5cbf1ed..e52d479ee 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,15 @@ *.o *.a +.DS_Store +.build/ .cache/ +.direnv/ +.envrc +.swiftpm +.venv .vs/ .vscode/ -.DS_Store -.build/ build/ build-em/ build-debug/ @@ -24,17 +28,15 @@ models/* /perplexity /embedding /benchmark-q4_0-matmult +/vdot /Pipfile arm_neon.h compile_commands.json -.envrc -.direnv/ - -.venv __pycache__ -.swiftpm zig-out/ zig-cache/ + +ppl-*.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a20de3a2..11ebe9eb6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,8 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" option(LLAMA_AVX "llama: enable AVX" ON) option(LLAMA_AVX2 "llama: enable AVX2" ON) option(LLAMA_AVX512 "llama: enable AVX512" OFF) +option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) +option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) option(LLAMA_FMA "llama: enable FMA" ON) # in MSVC F16C is implied with AVX2/AVX512 if (NOT MSVC) @@ -64,6 +66,7 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF) +option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: 
build examples" ${LLAMA_STANDALONE}) @@ -107,6 +110,7 @@ if (APPLE AND LLAMA_ACCELERATE) message(WARNING "Accelerate framework not found") endif() endif() + if (LLAMA_OPENBLAS) if (LLAMA_STATIC) set(BLA_STATIC ON) @@ -140,6 +144,30 @@ if (LLAMA_OPENBLAS) endif() endif() +if (LLAMA_CUBLAS) + cmake_minimum_required(VERSION 3.17) + + find_package(CUDAToolkit) + if (CUDAToolkit_FOUND) + message(STATUS "cuBLAS found") + + enable_language(CUDA) + + set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h) + + add_compile_definitions(GGML_USE_CUBLAS) + + if (LLAMA_STATIC) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + else() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() + + else() + message(WARNING "cuBLAS not found") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags @@ -151,7 +179,6 @@ if (LLAMA_ALL_WARNINGS) -Wshadow -Wstrict-prototypes -Wpointer-arith - -Wno-unused-function ) set(cxx_flags -Wall @@ -174,6 +201,10 @@ endif() if (MSVC) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) + + if (BUILD_SHARED_LIBS) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + endif() endif() if (LLAMA_LTO) @@ -219,11 +250,26 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") message(STATUS "x86 detected") if (MSVC) if (LLAMA_AVX512) - add_compile_options(/arch:AVX512) + add_compile_options($<$:/arch:AVX512>) + add_compile_options($<$:/arch:AVX512>) + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + if (LLAMA_AVX512_VBMI) + add_compile_definitions($<$:__AVX512VBMI__>) + add_compile_definitions($<$:__AVX512VBMI__>) + endif() + if (LLAMA_AVX512_VNNI) + add_compile_definitions($<$:__AVX512VNNI__>) + add_compile_definitions($<$:__AVX512VNNI__>) + endif() elseif (LLAMA_AVX2) - add_compile_options(/arch:AVX2) + add_compile_options($<$:/arch:AVX2>) + add_compile_options($<$:/arch:AVX2>) elseif (LLAMA_AVX) - add_compile_options(/arch:AVX) + add_compile_options($<$:/arch:AVX>) + add_compile_options($<$:/arch:AVX>) endif() else() if (LLAMA_F16C) @@ -240,9 +286,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") endif() if (LLAMA_AVX512) add_compile_options(-mavx512f) - # add_compile_options(-mavx512cd) - # add_compile_options(-mavx512dq) - # add_compile_options(-mavx512bw) + add_compile_options(-mavx512bw) + endif() + if (LLAMA_AVX512_VBMI) + add_compile_options(-mavx512vbmi) + endif() + if (LLAMA_AVX512_VNNI) + add_compile_options(-mavx512vnni) endif() endif() else() @@ -256,11 +306,13 @@ endif() add_library(ggml OBJECT ggml.c - ggml.h) + ggml.h + ${GGML_CUDA_SOURCES}) target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump -target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS}) +target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) + if (BUILD_SHARED_LIBS) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() @@ -273,11 +325,20 @@ add_library(llama target_include_directories(llama PUBLIC .) 
target_compile_features(llama PUBLIC cxx_std_11) # don't bump target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS}) + if (BUILD_SHARED_LIBS) set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) endif() +if (GGML_CUDA_SOURCES) + message(STATUS "GGML CUDA sources found, configuring CUDA architecture") + set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF) + set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") + set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF) +endif() + + # # programs, examples and tests # @@ -289,4 +350,5 @@ endif () if (LLAMA_BUILD_EXAMPLES) add_subdirectory(examples) + add_subdirectory(pocs) endif() diff --git a/Makefile b/Makefile index 1c450f055..e971e066f 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,6 @@ +# Define the default target now so that it is always the first target +default: main quantize quantize-stats perplexity embedding vdot + ifndef UNAME_S UNAME_S := $(shell uname -s) endif @@ -36,7 +39,7 @@ CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC LDFLAGS = # warnings -CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function +CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar # OS specific @@ -71,13 +74,17 @@ endif # feel free to update the Makefile for your architecture and send a pull request or issue ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) # Use all CPU extensions that are available: - CFLAGS += -march=native -mtune=native + CFLAGS += -march=native -mtune=native CXXFLAGS += -march=native -mtune=native + + # Usage AVX-only + #CFLAGS += -mfma -mf16c -mavx + #CXXFLAGS += -mfma -mf16c -mavx endif ifneq ($(filter ppc64%,$(UNAME_M)),) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) ifneq (,$(findstring POWER9,$(POWER9_M))) - CFLAGS += -mcpu=power9 + CFLAGS += -mcpu=power9 CXXFLAGS += -mcpu=power9 endif # Require c++23's std::byteswap for big-endian support. @@ -97,12 +104,25 @@ ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas LDFLAGS += -lopenblas endif +ifdef LLAMA_CUBLAS + CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include + LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 + OBJS += ggml-cuda.o + NVCC = nvcc + NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h + $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ +endif ifdef LLAMA_GPROF CFLAGS += -pg CXXFLAGS += -pg endif +ifdef LLAMA_PERF + CFLAGS += -DGGML_PERF + CXXFLAGS += -DGGML_PERF +endif ifneq ($(filter aarch64%,$(UNAME_M)),) - CFLAGS += -mcpu=native + CFLAGS += -mcpu=native CXXFLAGS += -mcpu=native endif ifneq ($(filter armv6%,$(UNAME_M)),) @@ -133,8 +153,6 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize perplexity embedding - # # Build library # @@ -151,32 +169,35 @@ common.o: examples/common.cpp examples/common.h clean: rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult -main: examples/main/main.cpp ggml.o llama.o common.o +main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. 
====' @echo -quantize: examples/quantize/quantize.cpp ggml.o llama.o +quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o +quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o +embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -libllama.so: llama.o ggml.o +vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + +libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) # # Tests # -benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o llama.o common.o +benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS) ./benchmark-q4_0-matmult diff --git a/README.md b/README.md index 78215c9ce..44cf72124 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,19 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ +**Warnings** + +- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalized + **Hot topics:** +- [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820) - [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915) - [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784) ## Description -The main goal is to run the model using 4-bit quantization on a MacBook +The main goal of llama.cpp is to run the LLaMA model using 4-bit quantization on a MacBook. - Plain C/C++ implementation without dependencies - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework @@ -50,6 +55,7 @@ New features will probably be added mostly through community contributions. - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) +- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) **UI:** @@ -150,7 +156,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8 ## Usage -Here are the step for the LLaMA-7B model. +Here are the steps for the LLaMA-7B model. ### Get the Code @@ -208,8 +214,7 @@ When running the larger models, make sure you have enough disk space to store al ### Memory/Disk Requirements -As the models are currently fully loaded into memory, you will need adequate disk space to save them -and sufficient RAM to load them. At the moment, memory and disk requirements are the same. +As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. | model | original size | quantized size (4-bit) | |-------|---------------|------------------------| @@ -221,22 +226,22 @@ and sufficient RAM to load them. 
At the moment, memory and disk requirements are ### Interactive mode If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter. -In this mode, you can always interrupt generation by pressing Ctrl+C and enter one or more lines of text which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt which makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. +In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. -Here is an example few-shot interaction, invoked with the command +Here is an example of a few-shot interaction, invoked with the command ```bash -# default arguments using 7B model +# default arguments using a 7B model ./examples/chat.sh -# advanced chat with 13B model +# advanced chat with a 13B model ./examples/chat-13B.sh -# custom arguments using 13B model +# custom arguments using a 13B model ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt ``` -Note the use of `--color` to distinguish between user input and generated text. +Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program. ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) @@ -270,22 +275,23 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. ### Using [GPT4All](https://github.com/nomic-ai/gpt4all) -- Obtain the `gpt4all-lora-quantized.bin` model +- Obtain the `tokenizer.model` file from the LLaMA model and place it in `models` +- Obtain the `added_tokens.json` file from the Alpaca model and place it in `models` +- Obtain the `gpt4all-lora-quantized.bin` file from the GPT4All model and place it in `models/gpt4all-7B` - It is distributed in the old `ggml` format which is now obsolete -- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py).
You may also need to -convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py): +- You have to convert it to the new format using `convert.py`: - ```bash - python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model - python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin - ``` +```bash +python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin +``` -- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models -- The original model is saved in the same folder with a suffix `.orig` +- You can now use the newly generated `models/gpt4all-7B/ggml-model-q4_0.bin` model in exactly the same way as all other models + +- The newer GPT4All-J model is not yet supported! ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data -- **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.** +- **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.** - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data. - Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. @@ -297,29 +303,27 @@ convert the model from the old format to the new format with [./migrate-ggml-202 `shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS -- If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: - - LLaMA: - - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) - - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) - - GPT-3 - - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) - - GPT-3.5 / InstructGPT / ChatGPT: - - [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) +- If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. 
This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: +- LLaMA: +- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) +- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) +- GPT-3 +- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) +- GPT-3.5 / InstructGPT / ChatGPT: +- [Aligning language models to follow instructions](https://openai.com/research/instruction-following) +- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) -### Perplexity (Measuring model quality) +### Perplexity (measuring model quality) -You can use the `perplexity` example to measure perplexity over the given prompt. For more background, -see https://huggingface.co/docs/transformers/perplexity. However, in general, lower perplexity is better for LLMs. +You can use the `perplexity` example to measure perplexity over the given prompt. For more background, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). However, in general, lower perplexity is better for LLMs. #### Latest measurements -The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well -compared to the baseline implementations. Quantization has a small negative impact to quality, but, as you can see, running +The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well compared to the baseline implementations. Quantization has a small negative impact on quality, but, as you can see, running 13B at q4_0 beats the 7B f16 model by a significant amount. -All measurements are done against wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context). -Note that the changing the context length will have a significant impact on perplexity (longer context = better perplexity). +All measurements are done against the wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context). +Note that changing the context length will have a significant impact on perplexity (longer context = better perplexity). ``` Perplexity - model options 5.5985 - 13B, q4_0 @@ -361,7 +365,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0 #### Prerequisites * Docker must be installed and running on your system. -* Create a folder to store big models & intermediate files (in ex. im using /llama/models) +* Create a folder to store big models & intermediate files (ex. /llama/models) #### Images We have two Docker images available for this project: @@ -375,17 +379,17 @@ The easiest way to download the models, convert them to ggml and optimize them i Replace `/path/to/models` below with the actual path where you downloaded the models. - ```bash +```bash docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B ``` -On complete, you are ready to play! +On completion, you are ready to play! 
```bash docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` -or with light image: +or with a light image: ```bash docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 @@ -406,7 +410,7 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode - Always consider cross-compatibility with other operating systems and architectures - Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit -- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a` +- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions ### Docs diff --git a/SHA256SUMS b/SHA256SUMS index 63fac21ae..87faa7f1b 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,12 +1,27 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth +666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin +99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin +cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin +25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin +3429bf198ec771886cf81a574df45245f3ebf04f0ce0956b73ef5d0ab01ff48b models/7B/ggml-model-q4_3.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth +2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin +eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin +d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin +75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin +4208cdec9788ffa48dc1a17af2c36a0299f5bf3eb0e2b87889dda7fad591fca3 models/13B/ggml-model-q4_3.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth +7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin +517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin +7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin 
+aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin +a6188660199dbcb8d5658abe7d89169869e50423494385830d9e6b330ea7fc33 models/30B/ggml-model-q4_3.bin 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth @@ -16,5 +31,10 @@ e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/con a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth +60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin +01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin +4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin +1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin +305e91a4608b4f627b9b8ad5b4af75187d2684254bfd76dcb9db571618ef293c models/65B/ggml-model-q4_3.bin 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py new file mode 100644 index 000000000..9090e8d6d --- /dev/null +++ b/convert-lora-to-ggml.py @@ -0,0 +1,129 @@ +import json +import os +import re +import struct +import sys +from typing import Any, Dict, Sequence, TextIO + +import torch + +from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType + +HF_SUBLAYER_TO_GGML = { + "self_attn.q_proj": "attention.wq", + "self_attn.k_proj": "attention.wk", + "self_attn.v_proj": "attention.wv", + "self_attn.o_proj": "attention.wo", + "mlp.gate_proj": "feed_forward.w1", + "mlp.down_proj": "feed_forward.w2", + "mlp.up_proj": "feed_forward.w3", + "input_layernorm": "attention_norm", + "post_attention_layernorm": "ffn_norm", + # "norm": "norm", + # "embed_tokens": "tok_embeddings", + # "lm_head": "output", +} + + +def translate_tensor_name(t: str) -> str: + match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t) + if match: + nn = match.group(1) + sub_layer = match.group(2) + lora_type = match.group(3) + + sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer) + if sub_layer_renamed is None: + print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}") + sys.exit(1) + + output_string = ( + f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" + ) + return output_string + else: + print(f"Error: unrecognized tensor {t}") + sys.exit(1) + + +def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None: + fout.write(b"ggla"[::-1]) # magic (ggml lora) + fout.write(struct.pack("i", 1)) # file version + fout.write(struct.pack("i", params["r"])) + # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int + # but some models ship a float value instead + # let's convert to int, but fail if lossless conversion is not possible + assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly" + fout.write(struct.pack("i", int(params["lora_alpha"]))) + + +def write_tensor_header( + 
fout: TextIO, name: str, shape: Sequence[int], data_type: DataType +) -> None: + sname = name.encode("utf-8") + fout.write( + struct.pack( + "iii", + len(shape), + len(sname), + DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]], + ) + ) + fout.write(struct.pack("i" * len(shape), *shape[::-1])) + fout.write(sname) + fout.seek((fout.tell() + 31) & -32) + + +if len(sys.argv) != 2: + print(f"Usage: python {sys.argv[0]} <path>") + print( + "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" + ) + sys.exit(1) + +input_json = os.path.join(sys.argv[1], "adapter_config.json") +input_model = os.path.join(sys.argv[1], "adapter_model.bin") +output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") + +model = torch.load(input_model, map_location="cpu") + +with open(input_json, "r") as f: + params = json.load(f) + +if params["peft_type"] != "LORA": + print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") + sys.exit(1) + +if params["fan_in_fan_out"] is True: + print("Error: param fan_in_fan_out is not supported") + sys.exit(1) + +if params["bias"] is not None and params["bias"] != "none": + print("Error: param bias is not supported") + sys.exit(1) + +# TODO: these seem to be layers that have been trained but without lora. +# doesn't seem widely used but eventually should be supported +if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: + print("Error: param modules_to_save is not supported") + sys.exit(1) + +with open(output_path, "wb") as fout: + fout.truncate() + + write_file_header(fout, params) + for k, v in model.items(): + if k.endswith("lora_A.weight"): + if v.dtype != torch.float16 and v.dtype != torch.float32: + v = v.float() + v = v.T + else: + v = v.float() + + t = v.numpy() + tname = translate_tensor_name(k) + print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") + write_tensor_header(fout, tname, t.shape, t.dtype) + t.tofile(fout) + +print(f"Converted {input_json} and {input_model} to {output_path}") diff --git a/convert.py b/convert.py index 056dc618d..7f7ae05fa 100644 --- a/convert.py +++ b/convert.py @@ -735,7 +735,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size)) # Use mmap for the actual data to avoid race conditions with the file offset.
+ off = fp.raw.tell() mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) + fp.raw.seek(off) # needed on Windows def read_tensor() -> None: # this is a function so that variables captured in `load` don't change shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12)) @@ -949,8 +951,9 @@ class OutputFile: ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): - size = ' x '.join(map(str, lazy_tensor.shape)) - print(f"[{i+1}/{len(model)}] Writing tensor {name}, size {size}...") + size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) + padi = len(str(len(model))) + print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}") of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type) ndarray.tofile(of.fout) of.fout.close() @@ -1082,6 +1085,7 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", + GGMLFileType.MostlyQ4_0: "q4_0", GGMLFileType.MostlyQ4_1: "q4_1", GGMLFileType.PerLayerIsQ4_1: "q4_1", }[params.file_type] @@ -1105,7 +1109,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=["f32", "f16", "q4_1"], help="output format (default: based on input)") + parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 8f53244f6..0973a3fa1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -34,5 +34,6 @@ else() add_subdirectory(quantize-stats) add_subdirectory(perplexity) add_subdirectory(embedding) + add_subdirectory(save-load-state) add_subdirectory(benchmark) endif() diff --git a/examples/alpaca.sh b/examples/alpaca.sh index 8d6261730..aef207f36 100755 --- a/examples/alpaca.sh +++ b/examples/alpaca.sh @@ -7,4 +7,13 @@ cd `dirname $0` cd .. 
-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 +./main -m ./models/ggml-alpaca-7b-q4.bin \ + --color \ + -f ./prompts/alpaca.txt \ + --ctx_size 2048 \ + -n -1 \ + -ins -b 256 \ + --top_k 10000 \ + --temp 0.2 \ + --repeat_penalty 1.1 \ + -t 7 diff --git a/examples/common.cpp b/examples/common.cpp index 0772dbfe1..c0e87eb9f 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -139,14 +139,25 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.model = argv[i]; + } else if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter = argv[i]; + params.use_mmap = false; + } else if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; } else if (arg == "--embedding") { params.embedding = true; - } else if (arg == "--interactive-start") { - params.interactive = true; } else if (arg == "--interactive-first") { - params.interactive_start = true; + params.interactive_first = true; } else if (arg == "-ins" || arg == "--instruct") { params.instruct = true; } else if (arg == "--color") { @@ -242,6 +253,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { } fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, "\n"); diff --git a/examples/common.h b/examples/common.h index 1ea6f7445..6f26b514d 100644 --- a/examples/common.h +++ b/examples/common.h @@ -20,7 +20,7 @@ struct gpt_params { int32_t repeat_last_n = 64; // last n tokens to penalize int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) int32_t n_ctx = 512; // context size - int32_t n_batch = 8; // batch size for prompt processing + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt // sampling parameters @@ -31,18 +31,19 @@ struct gpt_params { std::string model = "models/lamma-7B/ggml-model.bin"; // model path std::string prompt = ""; - std::string input_prefix = ""; // string to prefix user inputs with - - + std::string input_prefix = ""; // string to prefix user inputs with std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted + std::string lora_adapter = ""; // lora adapter path + std::string lora_base = ""; // base model path for the lora adapter + bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode bool embedding = false; // get only sentence embedding - bool interactive_start = false; // wait for user input immediately + bool interactive_first = false; // wait for user input immediately bool instruct = false; // instruction mode (used for Alpaca models) bool ignore_eos = false; // do not stop generating after eos diff --git
a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 2eda3ac01..e10de619c 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,6 +1,8 @@ #include "common.h" #include "llama.h" +#include <ctime> + int main(int argc, char ** argv) { gpt_params params; params.model = "models/llama-7B/ggml-model.bin"; diff --git a/examples/main/README.md b/examples/main/README.md index f09e7ba97..234bf2eb5 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -1,3 +1,191 @@ -# main +# llama.cpp/example/main -TODO +This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. + +## Table of Contents + +1. [Quick Start](#quick-start) 2. [Common Options](#common-options) 3. [Input Prompts](#input-prompts) 4. [Interaction](#interaction) 5. [Context Management](#context-management) 6. [Generation Flags](#generation-flags) 7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options) 8. [Additional Options](#additional-options) + +## Quick Start + +To get started right away, run the following command, making sure to use the correct path for the model you have: + +```bash +./main -m models/7B/ggml-model.bin --prompt "Once upon a time" +``` + +The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): + +```bash +./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time" +``` + +For an interactive experience, try this command: + +```bash +./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:' +``` + +Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead. + +## Common Options + +In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: + +- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. +- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. +- `-t N, --threads N`: Set the number of threads to use during computation. It is recommended to set this to the number of physical cores your CPU has. +- `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. +- `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
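As a sketch of how these options combine in practice (the model path and the specific values below are illustrative placeholders, not recommendations):

```bash
# illustrative only: 8 threads, a 2048-token context, and up to 256 predicted tokens
./main -m models/7B/ggml-model.bin -t 8 -c 2048 -n 256 --prompt "Once upon a time"
```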
+ +## Input Prompts + +The `main` program provides several ways to interact with the LLaMA models using input prompts: + +- `--prompt PROMPT`: Provide a prompt directly as a command-line option. +- `--file FNAME`: Provide a file containing a prompt or multiple prompts. +- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) +- `--random-prompt`: Start with a randomized prompt. + +## Interaction + +The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`. + +In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing. + +### Interaction Options + +- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. +- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. +- `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions. +- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text. + +By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. + +### Reverse Prompts + +Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered: + +- `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space. + +To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt. + +### In-Prefix + +The `--in-prefix` flag is used to add a prefix to your input; primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag: + +```sh +./main -r "User:" --in-prefix " " +``` + +### Instruction Mode + +Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks: + +- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions. + +Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
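For illustration, a minimal instruct-mode invocation along these lines might look as follows (the model path is borrowed from `examples/alpaca.sh` elsewhere in this change and stands in for whichever Alpaca-style model you actually have):

```bash
# illustrative sketch: instruction mode, with the ### Instruction:/### Response:
# wrapping described above applied to each user input
./main -m ./models/ggml-alpaca-7b-q4.bin -ins --color
```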
+ +## Context Management + +During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations. + +### Context Size + +The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations. + +- `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results. + +### Keep Prompt + +The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained. + +- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. + +By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation. + +## Generation Flags + +The following options are related to controlling the text generation process, influencing the diversity, creativity, and quality of the generated text. Understanding these options will help you fine-tune the output according to your needs: + +### Number of Tokens to Predict + +- `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity). + +The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit. + +It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter. + +### RNG Seed + +- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1). + +The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings.
This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than or equal to 0, a random seed will be used, which will result in different outputs on each run. + +### Temperature + +- `--temp N`: Adjust the randomness of the generated text (default: 0.8). + +Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. + +Example usage: `--temp 0.8` + +### Repeat Penalty + +- `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1). + +Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1. + +Example usage: `--repeat_penalty 1.1` + +### Top-K Sampling + +- `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40). + +Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40. + +Example usage: `--top_k 40` + +### Top-P Sampling + +- `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). + +Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9. + +Example usage: `--top_p 0.9` + +By adjusting these options, you can control the diversity, quality, and creativity of the generated text to better suit your needs. You can experiment with different combinations of values to find the best settings for your specific use case. + +## Performance Tuning and Memory Options + +These options help improve the performance and memory usage of the LLaMA models: + +- `-t N, --threads N`: Set the number of threads to use during computation. Using the correct number of threads can greatly improve performance. It is recommended to set this value to the number of CPU cores. +- `--mlock`: Lock the model in memory, preventing it from being swapped out when mmaped. This can improve performance. +- `--no-mmap`: Do not memory-map the model. 
This results in a slower load time but may reduce pageouts if you're not using `mlock`. +- `--memory_f32`: Use 32 bit floats instead of 16 bit floats for memory key+value, allowing higher quality inference at the cost of memory. +- `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations. + +For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run). + +By understanding and using these performance tuning settings, you can optimize the LLaMA model's behavior to achieve the best performance for your specific needs. + +## Additional Options + +These options provide extra functionality and customization when running the LLaMA models: + +- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. +- `--verbose-prompt`: Print the prompt before generating text. +- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index ba153cb82..f9c9e9d98 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -11,6 +11,7 @@ #include <cmath> #include <cstdio> #include <cstring> +#include <ctime> #include <fstream> #include <iostream> #include <string> @@ -24,6 +25,7 @@ #endif static console_state con_st; +static llama_context ** g_ctx; static bool is_interacting = false; @@ -35,6 +37,7 @@ void sigint_handler(int signo) { if (!is_interacting) { is_interacting=true; } else { + llama_print_timings(*g_ctx); _exit(130); } } @@ -93,6 +96,7 @@ int main(int argc, char ** argv) { //bool is_prime(int n) {)"; llama_context * ctx; + g_ctx = &ctx; // load the model { @@ -113,6 +117,17 @@ int main(int argc, char ** argv) { } } + if (!params.lora_adapter.empty()) { + int err = llama_apply_lora_from_file(ctx, + params.lora_adapter.c_str(), + params.lora_base.empty() ? 
NULL : params.lora_base.c_str(), + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + return 1; + } + } + // print system information { fprintf(stderr, "\n"); @@ -163,12 +178,12 @@ int main(int argc, char ** argv) { // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { - params.interactive_start = true; + params.interactive_first = true; params.antiprompt.push_back("### Instruction:\n\n"); } // enable interactive mode if reverse prompt or interactive start is specified - if (params.antiprompt.size() != 0 || params.interactive_start) { + if (params.antiprompt.size() != 0 || params.interactive_first) { params.interactive = true; } @@ -231,7 +246,7 @@ int main(int argc, char ** argv) { #endif " - Press Return to return control to LLaMa.\n" " - If you want to submit another line, end your input in '\\'.\n\n"); - is_interacting = params.interactive_start; + is_interacting = params.interactive_first; } bool is_antiprompt = false; @@ -252,7 +267,7 @@ int main(int argc, char ** argv) { // infinite text generation via context swapping // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) - // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() > n_ctx) { const int n_left = n_past - params.n_keep; @@ -270,13 +285,21 @@ int main(int argc, char ** argv) { //printf("\n---\n"); } - if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { + int n_eval = (int) embd.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + n_past += n_eval; } } - n_past += embd.size(); embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 38e3643b1..615157e7b 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2,6 +2,7 @@ #include "llama.h" #include <cmath> +#include <ctime> std::vector<float> softmax(const std::vector<float>& logits) { std::vector<float> probs(logits.size()); @@ -52,7 +53,13 @@ void perplexity(llama_context * ctx, const gpt_params & params) { auto end_t = std::chrono::high_resolution_clock::now(); if (i == 0) { const float seconds = std::chrono::duration<float>(end_t - start_t).count(); - printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); + printf("%.2f seconds per pass - ETA ", seconds); + int total_seconds = (int)(seconds * seq_count); + if (total_seconds >= 60*60) { + printf("%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + printf("%d minutes\n", total_seconds / 60); } // We get the logits for all the tokens in the context window (params.n_ctx) // from llama_eval above. 
         }
         // We get the logits for all the tokens in the context window (params.n_ctx)
         // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
@@ -133,6 +140,17 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
     // print system information
     {
         fprintf(stderr, "\n");
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 050300931..4e6c2c831 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -15,6 +15,8 @@
 #include
 #include
 #include
+#include <thread>
+#include <mutex>
 
 struct quantize_stats_params {
     std::string model = "models/7B/ggml-model-f16.bin";
@@ -27,7 +29,6 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };
 
-const int64_t SCRATCH_ELEMENTS = 32*32;
 const size_t HISTOGRAM_BUCKETS = 150;
 const double HISTOGRAM_RANGE = 0.03;
 
@@ -90,6 +91,13 @@ void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
     stats.num_samples += nelements;
 }
 
+void combine_error_stats(error_stats & into, const error_stats & from) {
+    into.num_samples += from.num_samples;
+    into.total_error += from.total_error;
+    if (from.max_error > into.max_error) into.max_error = from.max_error;
+    for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
+}
+
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+void test_roundtrip_on_chunk(
+        const ggml_tensor * layer,
+        int64_t offset,
+        int64_t chunk_size,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        float * input_scratch,
+        char * quantized_scratch,
+        float * output_scratch,
+        error_stats & stats) {
+
+    if (layer->type == GGML_TYPE_F16) {
+        for (int i = 0; i < chunk_size; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+        }
+    } else {
+        input_scratch = ggml_get_data_f32(layer) + offset;
+    }
+
+    if (use_reference) {
+        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+    } else {
+        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+    }
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+
+    update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+
+
 // Run quantization function for a single layer and update error stats
 void test_roundtrip_on_layer(
         std::string & name,
@@ -137,40 +175,61 @@ void test_roundtrip_on_layer(
         const quantize_fns_t & qfns,
         bool use_reference,
         const ggml_tensor * layer,
-        float * input_scratch,
-        char *quantized_scratch,
-        float * output_scratch,
-        error_stats & total_error) {
+        std::vector<float> & input_scratch,
+        std::vector<char> & quantized_scratch,
+        std::vector<float> & output_scratch,
+        error_stats & total_error,
+        int max_thread = 0) {
 
     assert(tensor_is_contiguous(layer));
     error_stats layer_error {};
-    int64_t nelements = ggml_nelements(layer);
+    uint64_t nelements = ggml_nelements(layer);
 
-    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
-        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
-
-        if (layer->type == GGML_TYPE_F16) {
-            for (int i = 0; i < chunk_size; i++) {
-                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
-            }
-        } else {
-            input_scratch = ggml_get_data_f32(layer) + offset;
-        }
-
-        if (use_reference) {
-            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
-        } else {
-            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
-        }
-        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
-
-        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
-        if (print_layer_stats) {
-            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
-        }
-    }
+    float* input_scratch_ptr = nullptr;
+    if (layer->type == GGML_TYPE_F16) {
+        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+        input_scratch_ptr = input_scratch.data();
+    }
+    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+    if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+
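+    // Split the tensor into fixed-size chunks and round-trip them on up to
+    // max_thread workers: a shared counter hands out chunk offsets under the
+    // mutex, and each worker folds its local error_stats into the shared
+    // stats only once, when it runs out of work.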
+    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+    int chunk_size = 32*512;
+    int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+
+    if (num_chunks < 2 || max_thread < 2) {
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+                output_scratch.data(), print_layer_stats ? layer_error : total_error);
+    } else {
+        auto & stats = print_layer_stats ? layer_error : total_error;
+        std::mutex mutex;
+        uint64_t counter = 0;
+        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+                &quantized_scratch, &output_scratch, chunk_size] () {
+            error_stats local_stats {};
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                uint64_t offset = counter; counter += chunk_size;
+                if (offset >= nelements) {
+                    combine_error_stats(stats, local_stats);
+                    break;
+                }
+                lock.unlock();
+                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
+            }
+        };
+        int nthread = std::min(num_chunks, max_thread);
+        std::vector<std::thread> workers(nthread-1);
+        for (auto& w : workers) w = std::thread(compute);
+        compute();
+        for (auto& w : workers) w.join();
+    }
+
     if (print_layer_stats) {
         print_error_stats(name, layer_error, false);
+        combine_error_stats(total_error, layer_error);
     }
 }
 
@@ -181,6 +240,7 @@ int main(int argc, char ** argv) {
 
     // read command line
 
+    int max_thread = 0;
    bool invalid_param = false;
    std::string arg;
    for (int i = 1; i < argc; i++) {
@@ -221,7 +281,7 @@ int main(int argc, char ** argv) {
                break;
            }
            int j;
-            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) i)) != 0; j++) {
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
                // find match
            }
            if (j < GGML_TYPE_COUNT) {
@@ -230,6 +290,12 @@ int main(int argc, char ** argv) {
                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                invalid_param = true;
            }
+        } else if (arg == "-n" || arg == "--num-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            max_thread = atoi(argv[i]);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            quantize_stats_print_usage(argc, argv);
@@ -295,9 +361,9 @@ int main(int argc, char ** argv) {
    }
    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
    // allocate scratch space
-    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
-    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
-    std::vector<float> output_scratch(SCRATCH_ELEMENTS);
+    std::vector<float> input_scratch;
+    std::vector<char> quantized_scratch;
+    std::vector<float> output_scratch;
 
    // loop throught quantization types
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@@ -328,10 +394,11 @@ int main(int argc, char ** argv) {
                    qfns,
                    params.reference,
                    kv_tensor.second,
-                    input_scratch.data(),
-                    quantized_scratch.data(),
-                    output_scratch.data(),
-                    global_stats
+                    input_scratch,
+                    quantized_scratch,
+                    output_scratch,
+                    global_stats,
+                    max_thread
            );
        }
 
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 5c9e2ad94..ad39a805d 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -10,10 +10,13 @@
 int main(int argc, char ** argv) {
     ggml_time_init();
 
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+    if (argc < 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
         fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
         fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
+        fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
+        fprintf(stderr, "  type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
+        fprintf(stderr, "  type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);
         return 1;
     }
 
@@ -28,6 +31,7 @@ int main(int argc, char ** argv) {
     const std::string fname_out = argv[2];
 
     const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    int nthread = argc > 4 ? atoi(argv[4]) : 0;
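+    // e.g.: ./quantize ggml-model-f16.bin ggml-model-q4_0.bin 2 8
+    //       (type 2 = q4_0, quantized with 8 threads; file names illustrative)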
 
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -37,7 +41,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
new file mode 100644
index 000000000..cff79fa1f
--- /dev/null
+++ b/examples/save-load-state/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET save-load-state)
+add_executable(${TARGET} save-load-state.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
new file mode 100644
index 000000000..39aa7f82c
--- /dev/null
+++ b/examples/save-load-state/save-load-state.cpp
@@ -0,0 +1,128 @@
+#include <vector>
+#include <cstdio>
+#include <chrono>
+
+#include "common.h"
+#include "llama.h"
+#include "llama.cpp"
+
+using namespace std;
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+    params.seed = 42;
+    params.n_threads = 4;
+    params.repeat_last_n = 64;
+    params.prompt = "The quick brown fox";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    auto lparams = llama_context_default_params();
+
+    lparams.n_ctx     = params.n_ctx;
+    lparams.n_parts   = params.n_parts;
+    lparams.seed      = params.seed;
+    lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
+    lparams.use_mlock = params.use_mlock;
+
+    auto n_past = 0;
+    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
+
+    // init
+    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    auto tokens = vector<llama_token>(params.n_ctx);
+    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
+
+    if (n_prompt_tokens < 1) {
+        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        return 1;
+    }
+
+    // evaluate prompt
+
+    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
+
+    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
+    n_past += n_prompt_tokens;
+
+    // Save state (rng, logits, embedding and kv_cache) to file
+    FILE *fp_write = fopen("dump_state.bin", "wb");
+    auto state_size = llama_get_state_size(ctx);
+    auto state_mem = new uint8_t[state_size];
+    llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+    fwrite(state_mem, 1, state_size, fp_write);
+    fclose(fp_write);
+
+    // save state (last tokens)
+    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
+    auto n_past_saved = n_past;
+
+    // first run
+    printf("\n%s", params.prompt.c_str());
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx,
+            &last_n_tokens_data.back() - params.repeat_last_n,
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+
+    // free old model
+    llama_free(ctx);
+
+    // load new model
+
+    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+
+    // Load state (rng, logits, embedding and kv_cache) from file
+    FILE *fp_read = fopen("dump_state.bin", "rb");
+    auto state_size2 = llama_get_state_size(ctx2);
+    if (state_size != state_size2) {
+        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+    }
+    fread(state_mem, 1, state_size, fp_read);
+    llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
+    fclose(fp_read);
+
+    // restore state (last tokens)
+    last_n_tokens_data = last_n_tokens_data_saved;
+    n_past = n_past_saved;
+
+    // second run
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx2,
+            &last_n_tokens_data.back() - params.repeat_last_n,
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx2, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+    return 0;
+}
diff --git a/flake.nix b/flake.nix
index 5363052b1..2c9edbb6a 100644
--- a/flake.nix
+++ b/flake.nix
@@ -30,9 +30,9 @@
             mv bin/* $out/bin/
             mv $out/bin/main $out/bin/llama
 
-            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
-            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
-            chmod +x $out/bin/convert-pth-to-ggml
+            echo "#!${llama-python}/bin/python" > $out/bin/convert.py
+            cat ${./convert.py} >> $out/bin/convert.py
+            chmod +x $out/bin/convert.py
           '';
         meta.mainProgram = "llama";
       };
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
new file mode 100644
index 000000000..f104ed5ac
--- /dev/null
+++ b/ggml-cuda.cu
@@ -0,0 +1,256 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <atomic>
+#include "ggml-cuda.h"
+
+typedef uint16_t ggml_fp16_t;
+static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+#define QK4_0 32
+typedef struct {
+    float   d;              // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+typedef struct {
+    float   d;              // delta
+    float   m;              // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
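+
+// Quants are packed two per byte, low nibble first: byte b stores q0 = b & 0xf
+// and q1 = b >> 4. A q4_0 value decodes as (q - 8)*d; a q4_1 value as q*d + m.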
+
+#define QK4_2 16
+typedef struct {
+    __half  d;              // delta
+    uint8_t qs[QK4_2 / 2];  // nibbles / quants
+} block_q4_2;
+static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
+
+#define QK4_3 16
+typedef struct {
+    __half  d;              // delta
+    __half  m;              // min
+    uint8_t qs[QK4_3 / 2];  // nibbles / quants
+} block_q4_3;
+static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
+
+#define QK8_0 32
+typedef struct {
+    float  d;               // delta
+    int8_t qs[QK8_0];       // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+
+static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const int i = blockIdx.x;
+
+    const float d = x[i].d;
+
+    const uint8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK4_0; l += 2) {
+        const uint8_t vi = pp[l/2];
+
+        const int8_t vi0 = vi & 0xf;
+        const int8_t vi1 = vi >> 4;
+
+        const float v0 = (vi0 - 8)*d;
+        const float v1 = (vi1 - 8)*d;
+
+        y[i*QK4_0 + l + 0] = v0;
+        y[i*QK4_0 + l + 1] = v1;
+    }
+}
+
+static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const int i = blockIdx.x;
+
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    const uint8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK4_1; l += 2) {
+        const uint8_t vi = pp[l/2];
+
+        const int8_t vi0 = vi & 0xf;
+        const int8_t vi1 = vi >> 4;
+
+        const float v0 = vi0*d + m;
+        const float v1 = vi1*d + m;
+
+        y[i*QK4_1 + l + 0] = v0;
+        y[i*QK4_1 + l + 1] = v1;
+    }
+}
+
+static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
+    const block_q4_2 * x = (const block_q4_2 *) vx;
+
+    const int i = blockIdx.x;
+
+    const float d = x[i].d;
+
+    const uint8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK4_2; l += 2) {
+        const uint8_t vi = pp[l/2];
+
+        const int8_t vi0 = vi & 0xf;
+        const int8_t vi1 = vi >> 4;
+
+        const float v0 = (vi0 - 8)*d;
+        const float v1 = (vi1 - 8)*d;
+
+        y[i*QK4_2 + l + 0] = v0;
+        y[i*QK4_2 + l + 1] = v1;
+    }
+}
+
+static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
+    const block_q4_3 * x = (const block_q4_3 *) vx;
+
+    const int i = blockIdx.x;
+
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    const uint8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK4_3; l += 2) {
+        const uint8_t vi = pp[l/2];
+
+        const int8_t vi0 = vi & 0xf;
+        const int8_t vi1 = vi >> 4;
+
+        const float v0 = vi0*d + m;
+        const float v1 = vi1*d + m;
+
+        y[i*QK4_3 + l + 0] = v0;
+        y[i*QK4_3 + l + 1] = v1;
+    }
+}
+
+static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const int i = blockIdx.x;
+
+    const float d = x[i].d;
+
+    const int8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK8_0; l++) {
+        const int8_t vi = pp[l];
+
+        y[i*QK8_0 + l] = vi*d;
+    }
+}
+
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK4_0;
+    dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
+}
+
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK4_1;
+    dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
+}
+
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK4_2;
+    dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
+}
+
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK4_3;
+    dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
+}
+
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK8_0;
+    dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
+}
+
+// buffer pool for cuda
+#define MAX_CUDA_BUFFERS 16
+
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        while (lock.test_and_set(std::memory_order_acquire)) {
+            ; // spin
+        }
+    }
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+struct cuda_buffer {
+    void * ptr = nullptr;
+    size_t size = 0;
+};
+
+static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[i];
+        if (b.size >= size && b.ptr != nullptr) {
+            void * ptr = b.ptr;
+            *actual_size = b.size;
+            b.ptr = nullptr;
+            b.size = 0;
+            return ptr;
+        }
+    }
+    void * ptr;
+    CUDA_CHECK(cudaMalloc((void **) &ptr, size));
+    *actual_size = size;
+    return ptr;
+}
+
+void ggml_cuda_pool_free(void * ptr, size_t size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[i];
+        if (b.ptr == nullptr) {
+            b.ptr = ptr;
+            b.size = size;
+            return;
+        }
+    }
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    CUDA_CHECK(cudaFree(ptr));
+}
+
+cublasHandle_t g_cublasH = NULL;
+cudaStream_t g_cudaStream = NULL;
+
+void ggml_init_cublas(void) {
+    if (g_cublasH == NULL) {
+        // create cublas handle, bind a stream
+        CUBLAS_CHECK(cublasCreate(&g_cublasH));
+
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
+
+        CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
+
+        // configure logging to stdout
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
+    }
+}
diff --git a/ggml-cuda.h b/ggml-cuda.h
new file mode 100644
index 000000000..4048ea491
--- /dev/null
+++ b/ggml-cuda.h
@@ -0,0 +1,42 @@
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define CUDA_CHECK(err)                                                                 \
+    do {                                                                                \
+        cudaError_t err_ = (err);                                                       \
+        if (err_ != cudaSuccess) {                                                      \
+            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__,   \
+                cudaGetErrorString(err_));                                              \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+#define CUBLAS_CHECK(err)                                                               \
+    do {                                                                                \
+        cublasStatus_t err_ = (err);                                                    \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
+            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);    \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+extern cublasHandle_t g_cublasH;
+extern cudaStream_t   g_cudaStream;
+
+void   ggml_init_cublas(void);
+void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
+void   ggml_cuda_pool_free(void * ptr, size_t size);
+
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
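+// Each dequantize_row_*_cuda helper launches one CUDA thread block per
+// quantization block on the given stream; k is expected to be a multiple of
+// the corresponding QK constant, mirroring the CPU-side assertions.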
+#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 54722cd61..89af4eef5 100644 --- a/ggml.c +++ b/ggml.c @@ -19,6 +19,7 @@ #include #include #include +#include #if defined(__AVX2__) #include @@ -146,10 +147,12 @@ inline static void* ggml_aligned_malloc(size_t size) { } \ } while (0) -#ifdef GGML_USE_ACCELERATE +#if defined(GGML_USE_ACCELERATE) #include -#elif GGML_USE_OPENBLAS +#elif defined(GGML_USE_OPENBLAS) #include +#elif defined(GGML_USE_CUBLAS) +#include "ggml-cuda.h" #endif #undef MIN @@ -431,7 +434,51 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // quantization // -#define QK 32 +#if __AVX__ || __AVX2__ || __AVX512F__ +// Unpack 16 4-bit fields into 16 bytes +// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval +static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) +{ + // Load 8 bytes from memory + __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi ); + + // Expand bytes into uint16_t values + __m128i bytes = _mm_cvtepu8_epi16( tmp ); + + // Unpack values into individual bytes + const __m128i lowMask = _mm_set1_epi8( 0xF ); + __m128i high = _mm_andnot_si128( lowMask, bytes ); + __m128i low = _mm_and_si128( lowMask, bytes ); + high = _mm_slli_epi16( high, 4 ); + bytes = _mm_or_si128( low, high ); + return bytes; +} + +// horizontally add 8 floats +static inline float hsum_float_8(const __m256 x) { + __m128 res = _mm256_extractf128_ps(x, 1); + res = _mm_add_ps(res, _mm256_castps256_ps128(x)); + res = _mm_add_ps(res, _mm_movehl_ps(res, res)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + return _mm_cvtss_f32(res); +} + +// horizontally add 8 int32_t +static inline int hsum_i32_8(const __m256i a) { + const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); + const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); + const __m128i sum64 = _mm_add_epi32(hi64, sum128); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +// horizontally add 4 int32_t +static inline int hsum_i32_4(const __m128i a) { + const __m128i hi64 = _mm_unpackhi_epi64(a, a); + const __m128i sum64 = _mm_add_epi32(hi64, a); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} // AVX routine provided by GH user jon-chuang #if __AVX2__ || __AVX512F__ @@ -514,7 +561,7 @@ void ggml_mul_row_f32_tall_skinny(const float * A, const float * B, float * C, i #if __AVX2__ || __AVX512F__ // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval -static inline __m256i bytesFromNibbles( const uint8_t* rsi ) +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { // Load 16 bytes from memory __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); @@ -531,9 +578,38 @@ static inline __m256i bytesFromNibbles( const uint8_t* rsi ) return bytes; } +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + const __m256i ones = _mm256_set1_epi16(1); + const __m256i summed_pairs = _mm256_madd_epi16(ones, x); + return _mm256_cvtepi32_ps(summed_pairs); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); +#if __AVXVNNI__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + return sum_i16_pairs_float(dot); +#endif +} + static inline __m128i packNibbles( __m256i bytes ) { // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh +#if __AVX512F__ + const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 + bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh + return _mm256_cvtepi16_epi8(bytes); // abcd_efgh +#else const __m256i lowByte = _mm256_set1_epi16( 0xFF ); __m256i high = _mm256_andnot_si256( lowByte, bytes ); __m256i low = _mm256_and_si256( lowByte, bytes ); @@ -544,26 +620,9 @@ static inline __m128i packNibbles( __m256i bytes ) __m128i r0 = _mm256_castsi256_si128( bytes ); __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); return _mm_packus_epi16( r0, r1 ); +#endif } -#elif __AVX__ - -static inline __m128i bytesFromNibbles( const uint8_t* rsi ) -{ - // Load 8 bytes from memory - __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m128i bytes = _mm_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes - const __m128i lowMask = _mm_set1_epi8( 0xF ); - __m128i high = _mm_andnot_si128( lowMask, bytes ); - __m128i low = _mm_and_si128( lowMask, bytes ); - high = _mm_slli_epi16( high, 4 ); - bytes = _mm_or_si128( low, high ); - return bytes; -} - +#else static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) { // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh @@ -580,6 +639,7 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) return _mm_packus_epi16( bytes1, bytes2); } #endif +#endif // __AVX__ || __AVX2__ || __AVX512F__ #if __ARM_NEON @@ -597,6 +657,18 @@ inline static uint16_t vaddvq_u8(uint8x16_t v) { (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); } +inline static int16_t vaddvq_s8(int8x16_t v) { + return + (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) + + (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) + + (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) + + (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) + + (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) + + (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) + + (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) + + 
(int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15); +} + inline static int32_t vaddvq_s16(int16x8_t v) { return (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + @@ -652,51 +724,83 @@ uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { #endif #endif -// method 5 -// blocks of QK elements -// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) -typedef struct { - float d; // delta - uint8_t qs[QK / 2]; // nibbles / quants -} block_q4_0; -static_assert(sizeof(block_q4_0) == sizeof(float) + QK / 2, "wrong q4_0 block size/padding"); -// method 4 -// blocks of QK elements -// represented with 2 floats (delta + min) and QK/2 8-bit ints (i.e QK 4-bit unsigned integer factors) +#define QK4_0 32 typedef struct { - float d; - float m; - uint8_t qs[QK / 2]; // nibbles / quants + float d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + float d; // delta + float m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; -static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding"); +static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK4_2 16 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qs[QK4_2 / 2]; // nibbles / quants +} block_q4_2; +static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); + +#define QK4_3 16 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qs[QK4_3 / 2]; // nibbles / quants +} block_q4_3; +static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding"); + +#define QK8_0 32 +typedef struct { + float d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +typedef struct { + float d; // delta + float s0; // d * sum(qs[i]) low + float s1; // d * sum(qs[i]) high + int8_t qs[QK8_1]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; - uint8_t pp[QK/2]; + uint8_t pp[QK4_0/2]; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max + float max = 0.0f; - for (int l = 0; l < QK; l++) { - const float v = x[i*QK + l]; - amax = MAX(amax, fabsf(v)); + for (int l = 0; l < QK4_0; l++) { + const float v = x[i*QK4_0 + l]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } } - const float d = amax / ((1 << 3) - 1); + const float d = max / -8; const float id = d ? 
1.0f/d : 0.0f; y[i].d = d; - for (int l = 0; l < QK; l += 2) { - const float v0 = x[i*QK + l + 0]*id; - const float v1 = x[i*QK + l + 1]*id; + for (int l = 0; l < QK4_0; l += 2) { + const float v0 = x[i*QK4_0 + l + 0]*id; + const float v1 = x[i*QK4_0 + l + 1]*id; - const uint8_t vi0 = (int8_t)roundf(v0) + 8; - const uint8_t vi1 = (int8_t)roundf(v1) + 8; + const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8); + const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8); assert(vi0 < 16); assert(vi1 < 16); @@ -709,35 +813,49 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r } static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; block_q4_0 * restrict y = vy; #if defined(__POWER9_VECTOR__) const vector float v85 = vec_splats(8.5f); + const vector signed int v15 = vec_splats(15); for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max + float max = 0.0f; + float min = 0.0f; vector float srcv [8]; - vector float asrcv[8]; - vector float amaxv[8]; + vector float maxv[8]; + vector float minv[8]; for (int l = 0; l < 8; l++) srcv[l] = *(vector float *)(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]); + //for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]); - for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]); - //for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]); - amaxv[0] = vec_max(amaxv[0], amaxv[2]); - amaxv[4] = vec_max(amaxv[4], amaxv[6]); - //for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]); - amaxv[0] = vec_max(amaxv[0], amaxv[4]); + for (int l = 0; l < 4; l++) maxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]); + //for (int l = 0; l < 2; l++) maxv[4*l] = vec_max(maxv[4*l], maxv[4*l+2]); + maxv[0] = vec_max(maxv[0], maxv[2]); + maxv[4] = vec_max(maxv[4], maxv[6]); + //for (int l = 0; l < 1; l++) maxv[8*l] = vec_max(maxv[8*l], maxv[8*l+4]); + maxv[0] = vec_max(maxv[0], maxv[4]); - amax = MAX( - MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)), - MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3))); + for (int l = 0; l < 4; l++) minv[2*l] = vec_min(asrcv[2*l], asrcv[2*l+1]); + //for (int l = 0; l < 2; l++) minv[4*l] = vec_min(minv[4*l], minv[4*l+2]); + minv[0] = vec_min(minv[0], minv[2]); + minv[4] = vec_min(minv[4], minv[6]); + //for (int l = 0; l < 1; l++) minv[8*l] = vec_min(minv[8*l], minv[8*l+4]); + minv[0] = vec_min(minv[0], minv[4]); - const float d = amax / ((1 << 3) - 1); + + max = MAX( + MAX(vec_extract(maxv[0], 0), vec_extract(maxv[0], 1)), + MAX(vec_extract(maxv[0], 2), vec_extract(maxv[0], 3))); + min = MIN( + MIN(vec_extract(minv[0], 0), vec_extract(minv[0], 1)), + MIN(vec_extract(minv[0], 2), vec_extract(minv[0], 3))); + + const float magnitude = max >= fabsf(min) ? max : min; + const float d = magnitude / -8; const float id = d ? 
1.0/d : 0.0; y[i].d = d; @@ -747,27 +865,33 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int for (int l = 0; l < 8; l++) { const vector float vf = vec_madd(srcv[l], vid, v85); const vector signed int vi = vec_signed(vf); + const vector signed int vc = vec_min(vi, v15); - pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4); - pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4); + pb[2*l + 0] = vec_extract(vc, 0) | (vec_extract(vc, 1) << 4); + pb[2*l + 1] = vec_extract(vc, 2) | (vec_extract(vc, 3) << 4); } } #elif __ARM_NEON for (int i = 0; i < nb; i++) { float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; + float32x4_t maxv[8]; + float32x4_t minv[8]; for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); - for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); - for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); - for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); + for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l+1]); + for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l+2]); + for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l+4]); - const float amax = vmaxvq_f32(amaxv[0]); + for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l+1]); + for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l+2]); + for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l+4]); - const float d = amax / ((1 << 3) - 1); + const float max = vmaxvq_f32(maxv[0]); + const float min = vminvq_f32(minv[0]); + + const float magnitude = max >= fabsf(min) ? max : min; + const float d = magnitude / -8; const float id = d ? 
1.0f/d : 0.0f; y[i].d = d; @@ -776,9 +900,10 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int const float32x4_t v = vmulq_n_f32(srcv[l], id); const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f)); const int32x4_t vi = vcvtq_s32_f32(vf); + const int32x4_t vc = vminq_s32(vi, vdupq_n_s32(15)); - y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4); - y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); + y[i].qs[2*l + 0] = vgetq_lane_s32(vc, 0) | (vgetq_lane_s32(vc, 1) << 4); + y[i].qs[2*l + 1] = vgetq_lane_s32(vc, 2) | (vgetq_lane_s32(vc, 3) << 4); } } #elif defined(__AVX2__) @@ -790,22 +915,31 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int __m256 v3 = _mm256_loadu_ps( x + 24 ); x += 32; - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + // Compute max for the block + __m256 max = _mm256_max_ps( v0, v1 ); + __m256 maxTmp = _mm256_max_ps( v2, v3 ); + max = _mm256_max_ps( max, maxTmp ); - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) ); max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); const float maxScalar = _mm_cvtss_f32( max4 ); + // Compute min for the block + __m256 min = _mm256_min_ps( v0, v1 ); + __m256 minTmp = _mm256_min_ps( v2, v3 ); + min = _mm256_min_ps( min, minTmp ); + + __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) ); + min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) ); + min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) ); + const float minScalar = _mm_cvtss_f32( min4 ); + // Quantize these floats - const float d = maxScalar / 7.0f; + const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar; + const float d = magnitude / -8.0f; y[i].d = d; - const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f; + const float id = ( magnitude != 0.0f ) ? -8.0f / magnitude : 0.0f; const __m256 mul = _mm256_set1_ps( id ); // Apply the multiplier @@ -838,9 +972,11 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); i0 = _mm256_permutevar8x32_epi32( i0, perm ); - // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ] + // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. 
+15 ] const __m256i off = _mm256_set1_epi8( 8 ); i0 = _mm256_add_epi8( i0, off ); + const __m256i maxNibble = _mm256_set1_epi8( 15 ); + i0 = _mm256_min_epi8( i0, maxNibble ); // Compress the vector into 4 bit/value, and store __m128i res = packNibbles( i0 ); @@ -855,22 +991,31 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int __m256 v3 = _mm256_loadu_ps( x + 24 ); x += 32; - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + // Compute max for the block + __m256 max = _mm256_max_ps( v0, v1 ); + __m256 maxTmp = _mm256_max_ps( v2, v3 ); + max = _mm256_max_ps( max, maxTmp ); - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) ); max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); const float maxScalar = _mm_cvtss_f32( max4 ); + // Compute min for the block + __m256 min = _mm256_min_ps( v0, v1 ); + __m256 minTmp = _mm256_min_ps( v2, v3 ); + min = _mm256_min_ps( min, minTmp ); + + __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) ); + min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) ); + min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) ); + const float minScalar = _mm_cvtss_f32( min4 ); + // Quantize these floats - const float d = maxScalar / 7.0f; + const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar; + const float d = magnitude / -8.0f; y[i].d = d; - const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f; + const float id = ( magnitude != 0.0f ) ? -8.0f / magnitude : 0.0f; const __m256 mul = _mm256_set1_ps( id ); // Apply the multiplier @@ -911,10 +1056,13 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int ni0 = _mm_packs_epi16( ni0, ni2 ); ni4 = _mm_packs_epi16( ni4, ni6 ); - // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ] - const __m128i off = _mm_set1_epi8( 8); + // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. 
+15 ] + const __m128i off = _mm_set1_epi8( 8 ); ni0 = _mm_add_epi8( ni0, off ); ni4 = _mm_add_epi8( ni4, off ); + const __m128i maxNibble = _mm_set1_epi8( 15 ); + ni0 = _mm_min_epi8( ni0, maxNibble ); + ni4 = _mm_min_epi8( ni4, maxNibble ); // Compress the vector into 4 bit/value, and store __m128i res = packNibbles( ni0, ni4 ); @@ -922,24 +1070,32 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int } #elif defined(__wasm_simd128__) for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max + float max = 0.0f; + float min = 0.0f; v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; + v128_t maxv[8]; + v128_t minv[8]; for (int l = 0; l < 8; l++) srcv[l] = wasm_v128_load(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = wasm_f32x4_abs(srcv[l]); - for (int l = 0; l < 4; l++) amaxv[2*l] = wasm_f32x4_max(asrcv[2*l], asrcv[2*l+1]); - for (int l = 0; l < 2; l++) amaxv[4*l] = wasm_f32x4_max(amaxv[4*l], amaxv[4*l+2]); - for (int l = 0; l < 1; l++) amaxv[8*l] = wasm_f32x4_max(amaxv[8*l], amaxv[8*l+4]); + for (int l = 0; l < 4; l++) maxv[2*l] = wasm_f32x4_max(srcv[2*l], srcv[2*l+1]); + for (int l = 0; l < 2; l++) maxv[4*l] = wasm_f32x4_max(maxv[4*l], maxv[4*l+2]); + for (int l = 0; l < 1; l++) maxv[8*l] = wasm_f32x4_max(maxv[8*l], maxv[8*l+4]); - amax = MAX( - MAX(wasm_f32x4_extract_lane(amaxv[0], 0), wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), wasm_f32x4_extract_lane(amaxv[0], 3))); + for (int l = 0; l < 4; l++) minv[2*l] = wasm_f32x4_min(srcv[2*l], srcv[2*l+1]); + for (int l = 0; l < 2; l++) minv[4*l] = wasm_f32x4_min(minv[4*l], minv[4*l+2]); + for (int l = 0; l < 1; l++) minv[8*l] = wasm_f32x4_min(minv[8*l], minv[8*l+4]); - const float d = amax / ((1 << 3) - 1); + max = MAX( + MAX(wasm_f32x4_extract_lane(maxv[0], 0), wasm_f32x4_extract_lane(maxv[0], 1)), + MAX(wasm_f32x4_extract_lane(maxv[0], 2), wasm_f32x4_extract_lane(maxv[0], 3))); + min = MIN( + MIN(wasm_f32x4_extract_lane(minv[0], 0), wasm_f32x4_extract_lane(minv[0], 1)), + MIN(wasm_f32x4_extract_lane(minv[0], 2), wasm_f32x4_extract_lane(minv[0], 3))); + + const float magnitude = max >= fabsf(min) ? max : min; + const float d = magnitude / -8; const float id = d ? 
1.0/d : 0.0; y[i].d = d; @@ -948,9 +1104,10 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id)); const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f)); const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf); + const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15)); - y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4); - y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4); + y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4); + y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4); } } #else @@ -960,19 +1117,19 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int } static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_1 == 0); + const int nb = k / QK4_1; block_q4_1 * restrict y = vy; - uint8_t pp[QK/2]; + uint8_t pp[QK4_1/2]; for (int i = 0; i < nb; i++) { float min = FLT_MAX; float max = -FLT_MAX; - for (int l = 0; l < QK; l++) { - const float v = x[i*QK + l]; + for (int l = 0; l < QK4_1; l++) { + const float v = x[i*QK4_1 + l]; if (v < min) min = v; if (v > max) max = v; } @@ -983,9 +1140,9 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric y[i].d = d; y[i].m = min; - for (int l = 0; l < QK; l += 2) { - const float v0 = (x[i*QK + l + 0] - min)*id; - const float v1 = (x[i*QK + l + 1] - min)*id; + for (int l = 0; l < QK4_1; l += 2) { + const float v0 = (x[i*QK4_1 + l + 0] - min)*id; + const float v1 = (x[i*QK4_1 + l + 1] - min)*id; const uint8_t vi0 = roundf(v0); const uint8_t vi1 = roundf(v1); @@ -1001,9 +1158,9 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric } static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) { - assert(k % QK == 0); + assert(k % QK4_1 == 0); - const int nb = k / QK; + const int nb = k / QK4_1; block_q4_1 * restrict y = vy; @@ -1087,7 +1244,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int float32x4_t minv[8]; float32x4_t maxv[8]; - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK + 4*l); + for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK4_1 + 4*l); for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]); for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]); @@ -1123,9 +1280,332 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int #endif } +// reference implementation for deterministic creation of model files +static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) { + assert(k % QK4_2 == 0); + + const int nb = k / QK4_2; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int l = 0; l < QK4_2; l++) { + const float v = x[i*QK4_2 + l]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -8; + + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int l = 0; l < QK4_2; l += 2) { + const float v0 = x[i*QK4_2 + l + 0]*id; + const float v1 = x[i*QK4_2 + l + 1]*id; + + const uint8_t vi0 = MIN(15, (uint8_t)(v0 + 8.5f)); + const uint8_t vi1 = MIN(15, (uint8_t)(v1 + 8.5f)); + + assert(vi0 < 16); + assert(vi1 < 16); + + y[i].qs[l/2] = vi0 | (vi1 << 4); + } + } +} + +static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int k) { + assert(k % QK4_2 == 0); + + block_q4_2 * restrict y = vy; + + quantize_row_q4_2_reference(x, y, k); +} + +static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) { + assert(k % QK4_3 == 0); + const int nb = k / QK4_3; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int l = 0; l < QK4_3; l++) { + const float v = x[i*QK4_3 + l]; + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); + + for (int l = 0; l < QK4_3; l += 2) { + const float v0 = (x[i*QK4_3 + l + 0] - min)*id; + const float v1 = (x[i*QK4_3 + l + 1] - min)*id; + + const uint8_t vi0 = (int) (v0 + 0.5f); + const uint8_t vi1 = (int) (v1 + 0.5f); + + assert(vi0 < 16); + assert(vi1 < 16); + + y[i].qs[l/2] = vi0 | (vi1 << 4); + } + } +} + +static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int k) { + assert(k % QK4_3 == 0); + + block_q4_3 * restrict y = vy; + + quantize_row_q4_3_reference(x, y, k); +} + +// reference implementation for deterministic creation of model files +static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int l = 0; l < QK8_0; l++) { + const float v = x[i*QK8_0 + l]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + for (int l = 0; l < QK8_0; ++l) { + const float v0 = x[i*QK8_0 + l]*id; + + y[i].qs[l] = roundf(v0); + } + } +} + +static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { + assert(k % QK8_0 == 0); + + block_q8_0 * restrict y = vy; + + quantize_row_q8_0_reference(x, y, k); +} + +// reference implementation for deterministic creation of model files +static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int l = 0; l < QK8_1; l++) { + const float v = x[i*QK8_1 + l]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = d; + + int sum0 = 0; + int sum1 = 0; + + for (int l = 0; l < QK8_1/2; ++l) { + const float v0 = x[i*QK8_1 + l]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + l]*id; + + y[i].qs[ l] = roundf(v0); + y[i].qs[QK8_1/2 + l] = roundf(v1); + + sum0 += y[i].qs[ l]; + sum1 += y[i].qs[QK8_1/2 + l]; + } + + y[i].s0 = d * sum0; + y[i].s1 = d * sum1; + } +} + +static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); + for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); + + for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); + for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); + for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int32x4_t accv0 = vdupq_n_s32(0); + int32x4_t accv1 = vdupq_n_s32(0); + + // low half + for (int l = 0; l < 4; l++) { + const float32x4_t v = vmulq_n_f32(srcv[l], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + + accv0 = vaddq_s32(accv0, vi); + } + + // high half + for (int l = 4; l < 8; l++) { + const float32x4_t v = vmulq_n_f32(srcv[l], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + + accv1 = vaddq_s32(accv1, vi); + } + + const int32_t sum0 = vaddvq_s32(accv0); + const int32_t sum1 = vaddvq_s32(accv1); + + y[i].s0 = d * sum0; + y[i].s1 = d * sum1; + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Compute the sum of the quants and set y[i].s + //y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + y[i].s0 = d * hsum_i32_8(_mm256_add_epi32(i0, i1)); + y[i].s1 = d * hsum_i32_8(_mm256_add_epi32(i2, i3)); + + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); + const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); + y[i].s0 = d * hsum_i32_4(s0); + y[i].s1 = d * hsum_i32_4(s1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + // scalar + quantize_row_q8_1_reference(x, y, k); +#endif +} + static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; const block_q4_0 * restrict x = vx; @@ -1136,9 +1616,9 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 32) { + for (int l = 0; l < QK4_0; l += 32) { // Load 32x4-bit integers into 32x8-bit integers - __m256i vx8 = bytesFromNibbles(pp+l/2); + __m256i vx8 = 
bytes_from_nibbles_32(pp+l/2); // Subtract 8 from the integers vx8 = _mm256_sub_epi8(vx8, _mm256_set1_epi8(8)); @@ -1158,7 +1638,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in // Scale and store for (int j = 0; j < 4; j++) { const __m256 result = _mm256_mul_ps(vf[j], d_v); - _mm256_storeu_ps(y + i * QK + l + j*8, result); + _mm256_storeu_ps(y + i * QK4_0 + l + j*8, result); } } } @@ -1168,7 +1648,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 16) { + for (int l = 0; l < QK4_0; l += 16) { // Load 16x4-bit integers into 8x8-bit integers const uint8x8_t v8 = vld1_u8(pp + l/2); @@ -1207,10 +1687,10 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in const float32x4_t r3 = vmulq_f32(vf_3, vd); // Store - vst1q_f32(y + i*QK + l + 0, r0); - vst1q_f32(y + i*QK + l + 4, r1); - vst1q_f32(y + i*QK + l + 8, r2); - vst1q_f32(y + i*QK + l + 12, r3); + vst1q_f32(y + i*QK4_0 + l + 0, r0); + vst1q_f32(y + i*QK4_0 + l + 4, r1); + vst1q_f32(y + i*QK4_0 + l + 8, r2); + vst1q_f32(y + i*QK4_0 + l + 12, r3); } } #else @@ -1220,7 +1700,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 2) { + for (int l = 0; l < QK4_0; l += 2) { const uint8_t vi = pp[l/2]; const int8_t vi0 = vi & 0xf; @@ -1231,19 +1711,19 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in //printf("d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\n", d, vi, vi0, vi1, v0, v1); - y[i*QK + l + 0] = v0; - y[i*QK + l + 1] = v1; + y[i*QK4_0 + l + 0] = v0; + y[i*QK4_0 + l + 1] = v1; - assert(!isnan(y[i*QK + l + 0])); - assert(!isnan(y[i*QK + l + 1])); + assert(!isnan(y[i*QK4_0 + l + 0])); + assert(!isnan(y[i*QK4_0 + l + 1])); } } #endif } static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_1 == 0); + const int nb = k / QK4_1; const block_q4_1 * restrict x = vx; @@ -1254,9 +1734,9 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 32) { + for (int l = 0; l < QK4_1; l += 32) { // Load 32x4-bit integers into 32x8-bit integers - __m256i vx8 = bytesFromNibbles(pp+l/2); + __m256i vx8 = bytes_from_nibbles_32(pp+l/2); // Convert to 16-bit int const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0)); @@ -1273,7 +1753,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in // Scale, add m and store for (int j = 0; j < 4; j++) { const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m); - _mm256_storeu_ps(y + i * QK + l + j*8, result); + _mm256_storeu_ps(y + i * QK4_1 + l + j*8, result); } } } @@ -1284,7 +1764,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 16) { + for (int l = 0; l < QK4_1; l += 16) { // Load 16x4-bit integers into 8x8-bit integers const uint8x8_t v8 = vld1_u8(pp + l/2); @@ -1315,10 +1795,10 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd); // Store - vst1q_f32(y + i*QK + l + 0, r0); - vst1q_f32(y + i*QK + l + 4, r1); - vst1q_f32(y + i*QK + l + 8, r2); - vst1q_f32(y + i*QK + l + 12, 
r3); + vst1q_f32(y + i*QK4_1 + l + 0, r0); + vst1q_f32(y + i*QK4_1 + l + 4, r1); + vst1q_f32(y + i*QK4_1 + l + 8, r2); + vst1q_f32(y + i*QK4_1 + l + 12, r3); } } #else @@ -1328,7 +1808,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 2) { + for (int l = 0; l < QK4_1; l += 2) { const uint8_t vi = pp[l/2]; const int8_t vi0 = vi & 0xf; @@ -1337,16 +1817,156 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in const float v0 = vi0*d + m; const float v1 = vi1*d + m; - y[i*QK + l + 0] = v0; - y[i*QK + l + 1] = v1; + y[i*QK4_1 + l + 0] = v0; + y[i*QK4_1 + l + 1] = v1; - assert(!isnan(y[i*QK + l + 0])); - assert(!isnan(y[i*QK + l + 1])); + assert(!isnan(y[i*QK4_1 + l + 0])); + assert(!isnan(y[i*QK4_1 + l + 1])); } } #endif } +static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, int k) { + assert(k % QK4_2 == 0); + const int nb = k / QK4_2; + + const block_q4_2 * restrict x = vx; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict pp = x[i].qs; + + for (int l = 0; l < QK4_2; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = (vi0 - 8)*d; + const float v1 = (vi1 - 8)*d; + + y[i*QK4_2 + l + 0] = v0; + y[i*QK4_2 + l + 1] = v1; + + assert(!isnan(y[i*QK4_2 + l + 0])); + assert(!isnan(y[i*QK4_2 + l + 1])); + } + } +} + +static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, int k) { + assert(k % QK4_3 == 0); + const int nb = k / QK4_3; + + const block_q4_3 * restrict x = vx; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); + + const uint8_t * restrict pp = x[i].qs; + + for (int l = 0; l < QK4_3; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = vi0*d + m; + const float v1 = vi1*d + m; + + y[i*QK4_3 + l + 0] = v0; + y[i*QK4_3 + l + 1] = v1; + + assert(!isnan(y[i*QK4_3 + l + 0])); + assert(!isnan(y[i*QK4_3 + l + 1])); + } + } +} + +static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + const block_q8_0 * restrict x = vx; + + for (int i = 0; i < nb; i++) { + const float d = x[i].d; + + const int8_t * restrict pp = x[i].qs; + + for (int l = 0; l < QK8_0; ++l) { + y[i*QK8_0 + l] = pp[l]*d; + } + } +} + +static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); + +static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { + [GGML_TYPE_Q4_0] = { + .dequantize_row_q = dequantize_row_q4_0, + .quantize_row_q = quantize_row_q4_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + 
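    // For reference: each entry in this table wires a weight type to everything the
    // generic mat-mul path needs. quantize_row_q_dot names the 8-bit format the
    // *activation* row should be quantized into (always matching the vec_dot_type
    // listed), and vec_dot_q is the mixed kernel that consumes the weight/activation
    // pair. A minimal sketch of the intended call sequence, with illustrative names
    // (src0_row, src1_f32, wdata, ne00 are placeholders, not code from this hunk):
    //
    //     const quantize_fns_t fns = quantize_fns[src0->type];  // e.g. GGML_TYPE_Q4_2
    //     fns.quantize_row_q_dot(src1_f32, wdata, ne00);        // f32 -> Q8_0/Q8_1
    //     float sum;
    //     fns.vec_dot_q(ne00, &sum, src0_row, wdata);           // 4-bit x 8-bit dot
    //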
[GGML_TYPE_Q4_1] = { + .dequantize_row_q = dequantize_row_q4_1, + .quantize_row_q = quantize_row_q4_1, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, + .quantize_row_q_dot = quantize_row_q8_1, + .vec_dot_q = ggml_vec_dot_q4_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + }, + [GGML_TYPE_Q4_2] = { + .dequantize_row_q = dequantize_row_q4_2, + .quantize_row_q = quantize_row_q4_2, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q4_2_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q4_3] = { + .dequantize_row_q = dequantize_row_q4_3, + .quantize_row_q = quantize_row_q4_3, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference, + .quantize_row_q_dot = quantize_row_q8_1, + .vec_dot_q = ggml_vec_dot_q4_3_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + }, + [GGML_TYPE_Q8_0] = { + .dequantize_row_q = dequantize_row_q8_0, + .quantize_row_q = quantize_row_q8_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q8_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q8_1] = { + .dequantize_row_q = NULL, // TODO + .quantize_row_q = quantize_row_q8_1, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference, + .quantize_row_q_dot = quantize_row_q8_1, + .vec_dot_q = NULL, // TODO + .vec_dot_type = GGML_TYPE_Q8_1, + }, +}; + +// For internal test use +quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { + GGML_ASSERT(i < GGML_TYPE_COUNT); + return quantize_fns[i]; +} + + // // simd mappings // @@ -1903,37 +2523,6 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float *s = sumf; } -#if __AVX512F__ && QK == 32 -static inline __m512 dot_q4_0_oneblock_avx512( - __m512 acc, - const block_q4_0 * restrict x, - const block_q4_0 * restrict y, - int i -) { - // Compute combined scale for the block - __m512 d = _mm512_set1_ps( x[i].d * y[i].d ); - - __m256i bx = bytesFromNibbles( x[i].qs ); - __m256i by = bytesFromNibbles( y[i].qs ); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
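// Context for this deletion: dot_q4_0_oneblock_avx512() served the old symmetric
// Q4_0 x Q4_0 product. With activations now quantized to Q8_0 and the kernel below
// reworked into ggml_vec_dot_q4_0_q8_0(), the helper has no remaining caller; this
// change adds no dedicated AVX512 replacement, so AVX512 machines simply take the
// new __AVX2__ branch instead.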
- const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - by = _mm256_sub_epi8( by, off ); - - // Sign-extend 16 signed bytes into int16_t - __m512i x32 = _mm512_cvtepi8_epi16( bx ); - __m512i y32 = _mm512_cvtepi8_epi16( by ); - // Compute products of int16_t integers, add pairwise - __m512i i64 = _mm512_madd_epi16( x32, y32 ); - - // Convert int32_t to float - __m512 p = _mm512_cvtepi32_ps( i64 ); - // Apply the scale, and accumulate - return _mm512_fmadd_ps( d, p, acc ); -} -#endif - inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { ggml_float sumf = 0.0; @@ -1970,67 +2559,62 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t *s = sumf; } -static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK; +static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / QK8_0; - assert(n % QK == 0); + assert(n % QK8_0 == 0); assert(nb % 2 == 0); const block_q4_0 * restrict x = vx; - const block_q4_0 * restrict y = vy; - - float sumf = 0.0; + const block_q8_0 * restrict y = vy; #if defined(__ARM_NEON) - float sum0 = 0.0f; - float sum1 = 0.0f; + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); for (int i = 0; i < nb; i += 2) { const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict y0 = &y[i + 0]; const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q4_0 * restrict y1 = &y[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; - const uint8x16_t m4b = vdupq_n_u8(0xf); - const int8x16_t s8b = vdupq_n_s8(0x8); + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int8x16_t s8b = vdupq_n_s8(0x8); const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v1_0 = vld1q_u8(y0->qs); const uint8x16_t v0_1 = vld1q_u8(x1->qs); - const uint8x16_t v1_1 = vld1q_u8(y1->qs); // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b)); - const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b)); + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4)); - - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b)); - const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4)); // sub 8 const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b); const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b); - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b); const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + // interleave + const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h); + const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h); + const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h); + const int8x16_t 
v1_1hs = vuzp2q_s8(v1_1l, v1_1h); #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t - int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls); - int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls); + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls), v0_0hs, v1_0hs); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls), v0_1hs, v1_1hs); - p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs); - p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs); - - sum0 += x0->d*y0->d*vaddvq_s32(p_0); - sum1 += x1->d*y1->d*vaddvq_s32(p_1); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); @@ -2042,124 +2626,41 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); - const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h); - const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h); + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h); - const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h); - - const int16x8_t p_0 = vaddq_s16(pl_0, ph_0); - const int16x8_t p_1 = vaddq_s16(pl_1, ph_1); - - sum0 += x0->d*y0->d*vaddvq_s16(p_0); - sum1 += x1->d*y1->d*vaddvq_s16(p_1); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); #endif } - sumf = sum0 + sum1; -#elif defined(__AVX512F__) - // Initialize accumulator with zeros - __m512 acc0 = _mm512_setzero_ps(); - __m512 acc1 = _mm512_setzero_ps(); - - const int superblock_size = 8; - const int superblock_count = nb / superblock_size; - - for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) { - int i = superblock_ix * superblock_size; - - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+0 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+1 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+2 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+3 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+4 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+5 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+6 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+7 ); - } - - // Remainders - for (int i = superblock_count * superblock_size; i < nb; ++i) { - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i ); - } - - // Horizontal sum of all lanes of the accumulator - sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 ); + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); - /* Prepare the constants we will need during execution */ - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - const __m256i offset_8 = _mm256_set1_epi16( 8 ); - -#define UNROLL_COUNT 8 - // make sure we only unroll multiples of the block count - assert(nb % UNROLL_COUNT == 0); - // Main loop - for (int i = 0; i < nb; i+=UNROLL_COUNT) { - // This loop will be unrolled by the 
compiler - for (int u=0;u we now have a vector of 8 int_32t */ - __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q ); - - /* Convert to vectore of 8 int32_t to 8 floats */ - __m256 q = _mm256_cvtepi32_ps( xy_q ); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( scale, q, acc ); - } + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); } - // Return horizontal sum of the acc vector - __m128 res = _mm256_extractf128_ps( acc, 1 ); - res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); - res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); - res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); - - sumf = _mm_cvtss_f32( res ); + *s = hsum_float_8(acc); #elif defined(__AVX__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -2172,13 +2673,12 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest __m128i i32[2]; for (int j = 0; j < 2; ++j) { // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes - __m128i bx = bytesFromNibbles( x[i].qs + 8*j ); - __m128i by = bytesFromNibbles( y[i].qs + 8*j ); + __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j); + __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j)); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. const __m128i off = _mm_set1_epi8( 8 ); bx = _mm_sub_epi8( bx, off ); - by = _mm_sub_epi8( by, off ); // Get absolute values of x vectors const __m128i ax = _mm_sign_epi8(bx, bx); @@ -2199,303 +2699,540 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); } - // Return horizontal sum of the acc vector - __m128 res = _mm256_extractf128_ps( acc, 1 ); - res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); - res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); - res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); - - sumf = _mm_cvtss_f32( res ); -#elif defined(__wasm_simd128__) - // wasm simd - float sum0 = 0.0f; - float sum1 = 0.0f; - - for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict y0 = &y[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q4_0 * restrict y1 = &y[i + 1]; - - const v128_t m4b = wasm_u8x16_splat(0xf); - const v128_t s8b = wasm_i8x16_splat(0x8); - - const v128_t v0_0 = wasm_v128_load(x0->qs); - const v128_t v0_1 = wasm_v128_load(y0->qs); - const v128_t v1_0 = wasm_v128_load(x1->qs); - const v128_t v1_1 = wasm_v128_load(y1->qs); - - // 4-bit -> 8-bit - const v128_t v0_0l = wasm_v128_and(v0_0, m4b); - const v128_t v1_0l = wasm_v128_and(v1_0, m4b); - - const v128_t v0_0h = wasm_u8x16_shr(v0_0, 4); - const v128_t v1_0h = wasm_u8x16_shr(v1_0, 4); - - const v128_t v0_1l = wasm_v128_and(v0_1, m4b); - const v128_t v1_1l = wasm_v128_and(v1_1, m4b); - - const v128_t v0_1h = wasm_u8x16_shr(v0_1, 4); - const v128_t v1_1h = wasm_u8x16_shr(v1_1, 4); - - // sub 8 - const v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b); - const v128_t v1_0ls = wasm_i8x16_sub(v1_0l, s8b); - - const v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b); - const v128_t v1_0hs = wasm_i8x16_sub(v1_0h, s8b); - - const v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b); - const v128_t v1_1ls = wasm_i8x16_sub(v1_1l, s8b); - - const v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b); - const v128_t v1_1hs = wasm_i8x16_sub(v1_1h, s8b); - - // dot product into int16x8_t - const v128_t pl0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0ls), wasm_i16x8_extend_low_i8x16(v1_0ls)); 
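// The new "*s = hsum_float_8(acc);" lines in this hunk replace the five-instruction
// horizontal reduction being deleted around here. The helper's definition sits in an
// earlier hunk not shown in this excerpt; assuming it simply factors out that deleted
// sequence unchanged (using the <immintrin.h> intrinsics already in use in this
// file), a matching sketch is:

static inline float hsum_float_8(const __m256 x) {
    __m128 res = _mm256_extractf128_ps(x, 1);          // upper 4 floats
    res = _mm_add_ps(res, _mm256_castps256_ps128(x));  // + lower 4 floats
    res = _mm_add_ps(res, _mm_movehl_ps(res, res));    // fold 4 lanes -> 2
    res = _mm_add_ss(res, _mm_movehdup_ps(res));       // fold 2 lanes -> 1
    return _mm_cvtss_f32(res);                         // scalar sum of all 8 lanes
}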
- const v128_t pl0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0ls), wasm_i16x8_extend_high_i8x16(v1_0ls)); - - const v128_t ph0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0hs), wasm_i16x8_extend_low_i8x16(v1_0hs)); - const v128_t ph0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0hs), wasm_i16x8_extend_high_i8x16(v1_0hs)); - - const v128_t pl1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1ls), wasm_i16x8_extend_low_i8x16(v1_1ls)); - const v128_t pl1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1ls), wasm_i16x8_extend_high_i8x16(v1_1ls)); - - const v128_t ph1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1hs), wasm_i16x8_extend_low_i8x16(v1_1hs)); - const v128_t ph1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1hs), wasm_i16x8_extend_high_i8x16(v1_1hs)); - - const v128_t pl_0 = wasm_i16x8_add(pl0l, pl0h); - const v128_t ph_0 = wasm_i16x8_add(ph0l, ph0h); - - const v128_t pl_1 = wasm_i16x8_add(pl1l, pl1h); - const v128_t ph_1 = wasm_i16x8_add(ph1l, ph1h); - - const v128_t p_0 = wasm_i16x8_add(pl_0, ph_0); - const v128_t p_1 = wasm_i16x8_add(pl_1, ph_1); - - sum0 += x0->d * y0->d * ( - wasm_i16x8_extract_lane(p_0, 0) + wasm_i16x8_extract_lane(p_0, 1) + - wasm_i16x8_extract_lane(p_0, 2) + wasm_i16x8_extract_lane(p_0, 3) + - wasm_i16x8_extract_lane(p_0, 4) + wasm_i16x8_extract_lane(p_0, 5) + - wasm_i16x8_extract_lane(p_0, 6) + wasm_i16x8_extract_lane(p_0, 7)); - sum1 += x1->d * y1->d * ( - wasm_i16x8_extract_lane(p_1, 0) + wasm_i16x8_extract_lane(p_1, 1) + - wasm_i16x8_extract_lane(p_1, 2) + wasm_i16x8_extract_lane(p_1, 3) + - wasm_i16x8_extract_lane(p_1, 4) + wasm_i16x8_extract_lane(p_1, 5) + - wasm_i16x8_extract_lane(p_1, 6) + wasm_i16x8_extract_lane(p_1, 7)); - } - - sumf = sum0 + sum1; + *s = hsum_float_8(acc); #else // scalar + float sumf = 0.0; for (int i = 0; i < nb; i++) { const float d0 = x[i].d; const float d1 = y[i].d; const uint8_t * restrict p0 = x[i].qs; - const uint8_t * restrict p1 = y[i].qs; + const int8_t * restrict p1 = y[i].qs; int sumi = 0; - for (int j = 0; j < QK/2; j++) { + for (int j = 0; j < QK8_0/2; j++) { const uint8_t v0 = p0[j]; - const uint8_t v1 = p1[j]; - const int8_t i0 = (int8_t) (v0 & 0xf) - 8; - const int8_t i1 = (int8_t) (v0 >> 4) - 8; + const int i0 = (int8_t) (v0 & 0xf) - 8; + const int i1 = (int8_t) (v0 >> 4) - 8; - const int8_t i2 = (int8_t) (v1 & 0xf) - 8; - const int8_t i3 = (int8_t) (v1 >> 4) - 8; + const int i2 = p1[2*j + 0]; + const int i3 = p1[2*j + 1]; sumi += i0*i2 + i1*i3; } - sumf += d0 * d1 * sumi; + sumf += d0*d1*sumi; } -#endif - *s = sumf; +#endif } -static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK; +static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / QK8_1; + + assert(n % QK8_1 == 0); + assert(nb % 2 == 0); const block_q4_1 * restrict x = vx; - const block_q4_1 * restrict y = vy; + const block_q8_1 * restrict y = vy; - float sumf = 0.0; + // TODO: add AVX / WASM SIMD / etc +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); -#if defined(__AVX2__) + float summs = 0; + + for (int i = 0; i < nb; i += 2) { + const block_q4_1 * restrict x0 = &x[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i + 0]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + summs += x0->m * (y0->s0 + y0->s1) + x1->m * (y1->s0 + y1->s1); + + const uint8x16_t 
m4b = vdupq_n_u8(0xf); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // interleave + const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h); + const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h); + const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h); + const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; +#elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); - // Accumulator for constant offsets - float acc_offset = 0.0f; + + float summs = 0; // Main loop for (int i = 0; i < nb; ++i) { const float * d0 = &x[i].d; const float * d1 = &y[i].d; - const float * m0 = &x[i].m; - const float * m1 = &y[i].m; + summs += x[i].m * (y[i].s0 + y[i].s1); const __m256 d0v = _mm256_broadcast_ss( d0 ); const __m256 d1v = _mm256_broadcast_ss( d1 ); - const __m256 m0v = _mm256_broadcast_ss( m0 ); - const __m256 m1v = _mm256_broadcast_ss( m1 ); - // Compute combined scale for the block - const __m256 scale_01 = _mm256_mul_ps( d0v, d1v ); - - // Compute cross scales for the block - const __m256 scale_0 = _mm256_mul_ps( d0v, m1v ); - const __m256 scale_1 = _mm256_mul_ps( m0v, d1v ); - const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0xAA /* 0b10101010 */ ); + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - __m256i bx = bytesFromNibbles( x[i].qs ); - __m256i by = bytesFromNibbles( y[i].qs ); + const __m256i bx = 
bytes_from_nibbles_32(x[i].qs); + const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); - // Now we have a vector with bytes in [ 0 .. 15 ] interval. + const __m256 xy = mul_sum_i8_pairs_float(bx, by); - // Sign-extend first 16 signed bytes into int16_t - __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) ); - __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); - // Compute products of int16_t integers, add pairwise - __m256i i32 = _mm256_madd_epi16( x16, y16 ); - - // Sign-extend last 16 signed bytes into int16_t vectors - __m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) ); - __m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); - // Accumulate products of int16_t integers - i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) ); - - // compute sums of unsigned bytes in bx, by in blocks of 8. - // This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000, - // which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400. - // so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ] - __m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() ); - __m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() ); - __m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) ); - __m256 sums = _mm256_cvtepi32_ps( sumsi ); - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps( i32 ); - // Apply the scale, and accumulate - // acc += d0*d1*x*y + d0*m1*x + d1*m0*y - acc = _mm256_fmadd_ps( scale_01, p, acc ); - acc = _mm256_fmadd_ps( cross_scales, sums, acc ); - // acc_offset += m0*m1 (for each entry in the block) - acc_offset += (*m0)*(*m1); + // Accumulate d0*d1*x*y + acc = _mm256_fmadd_ps( d0d1, xy, acc ); } - // Return horizontal sum of the acc vector - __m128 res = _mm256_extractf128_ps( acc, 1 ); - res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); - res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); - res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); - - sumf = _mm_cvtss_f32( res ) + acc_offset * QK; -#elif defined(__ARM_NEON) - float sum00 = 0.0f; - float sum01 = 0.0f; - float sum10 = 0.0f; - float sum11 = 0.0f; - - for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict y0 = &y[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q4_1 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0xf); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v1_0 = vld1q_u8(y0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - const uint8x16_t v1_1 = vld1q_u8(y1->qs); - - // 4-bit -> 8-bit - const uint8x16_t v0_0l = vandq_u8(v0_0, m4b); - const uint8x16_t v1_0l = vandq_u8(v1_0, m4b); - const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4); - const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4); - - const uint8x16_t v0_1l = vandq_u8(v0_1, m4b); - const uint8x16_t v1_1l = vandq_u8(v1_1, m4b); - const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4); - const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4); - - sum00 += x0->m*y0->m; - sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h)); - sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h)); - - sum00 += x1->m*y1->m; - sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h)); - sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h)); - -#if defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - uint32x4_t p_0 = vdotq_u32(vdupq_n_u32(0), v0_0l, v1_0l); - uint32x4_t p_1 = 
vdotq_u32(vdupq_n_u32(0), v0_1l, v1_1l); - - p_0 = vdotq_u32(p_0, v0_0h, v1_0h); - p_1 = vdotq_u32(p_1, v0_1h, v1_1h); - - sum11 += x0->d*y0->d*vaddvq_u32(p_0); - sum11 += x1->d*y1->d*vaddvq_u32(p_1); -#else - const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l)); - const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l)); - const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h)); - const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h)); - - const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l)); - const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l)); - const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h)); - const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h)); - - const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h); - const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h); - - const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h); - const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h); - - const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0); - const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1); - - sum11 += x0->d*y0->d*vaddvq_u16(p_0); - sum11 += x1->d*y1->d*vaddvq_u16(p_1); -#endif - } - - sumf = QK*sum00 + sum01 + sum10 + sum11; + *s = hsum_float_8(acc) + summs; #else // scalar + float sumf = 0.0; for (int i = 0; i < nb; i++) { const float d0 = x[i].d; + const float m0 = x[i].m; const float d1 = y[i].d; - const float m0 = x[i].m; - const float m1 = y[i].m; - const uint8_t * restrict p0 = x[i].qs; - const uint8_t * restrict p1 = y[i].qs; + const int8_t * restrict p1 = y[i].qs; - for (int j = 0; j < QK/2; j++) { + // TODO: this is very slow .. + for (int j = 0; j < QK8_1/2; j++) { const uint8_t v0 = p0[j]; - const uint8_t v1 = p1[j]; const float f0 = d0*(v0 & 0xf) + m0; const float f1 = d0*(v0 >> 4) + m0; - const float f2 = d1*(v1 & 0xf) + m1; - const float f3 = d1*(v1 >> 4) + m1; + const float f2 = d1*p1[2*j + 0]; + const float f3 = d1*p1[2*j + 1]; sumf += f0*f2 + f1*f3; } } + *s = sumf; #endif +} + +static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / QK8_0; + + assert(n % QK8_0 == 0); + assert(nb % 2 == 0); + assert(QK8_0 == 2*QK4_2); + + const block_q4_2 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i += 2) { + const block_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0]; + const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1]; + const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0]; + const block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1]; + + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); + const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs)); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = 
vsubq_s8(v0_1h, s8b); + + // interleave + const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs); + const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs); + const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs); + const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); + + sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( + vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_FP16_TO_FP32(x0_0->d)), + vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_FP16_TO_FP32(x0_1->d))), y0->d); + + sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( + vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_FP16_TO_FP32(x1_0->d)), + vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_FP16_TO_FP32(x1_1->d))), y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d)); + const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d)); + const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d)); + + __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs); + __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs); + __m256i bx = _mm256_set_m128i(bx1, bx0); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
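    // mul_sum_i8_pairs_float() is introduced in an earlier hunk that is not part of
    // this excerpt. Judging from the abs/sign + maddubs idiom visible in the __AVX__
    // branch of ggml_vec_dot_q4_0_q8_0 above, a plausible AVX2 shape is the sketch
    // below (an assumption, not a quote of the actual definition):
    //
    //     static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
    //         const __m256i ax  = _mm256_sign_epi8(x, x);        // |x|, usable as u8
    //         const __m256i sy  = _mm256_sign_epi8(y, x);        // y * sign(x)
    //         const __m256i dot = _mm256_maddubs_epi16(ax, sy);  // u8*s8, pairwise -> i16
    //         const __m256i sum = _mm256_madd_epi16(dot, _mm256_set1_epi16(1)); // i16 pairs -> i32
    //         return _mm256_cvtepi32_ps(sum);                    // 8 x i32 -> 8 x float
    //     }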
+ const __m256i off = _mm256_set1_epi8(8); + bx = _mm256_sub_epi8(bx, off); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps(d, q, acc); + } + + *s = hsum_float_8(acc); +#else + // scalar + float sumf = 0.0; + for (int i = 0; i < nb; i++) { + const uint8_t * restrict x0 = x[2*i + 0].qs; + const uint8_t * restrict x1 = x[2*i + 1].qs; + const int8_t * restrict y0 = y[i].qs; + + const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); + const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); + + int sumi_0 = 0; + int sumi_1 = 0; + + for (int j = 0; j < QK8_0/4; j++) { + const uint8_t v0 = x0[j]; + const uint8_t v1 = x1[j]; + + const int i0_0 = (int8_t) (v0 & 0xf) - 8; + const int i1_0 = (int8_t) (v0 >> 4) - 8; + + const int i0_1 = (int8_t) (v1 & 0xf) - 8; + const int i1_1 = (int8_t) (v1 >> 4) - 8; + + const int i2_0 = y0[2*j + 0]; + const int i3_0 = y0[2*j + 1]; + + const int i2_1 = y0[2*(j + QK8_0/4) + 0]; + const int i3_1 = y0[2*(j + QK8_0/4) + 1]; + + sumi_0 += i0_0*i2_0 + i1_0*i3_0; + sumi_1 += i0_1*i2_1 + i1_1*i3_1; + } + + sumf += (d0 * y[i].d) * sumi_0; + sumf += (d1 * y[i].d) * sumi_1; + } + *s = sumf; +#endif +} + +static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / QK8_1; + + assert(n % QK8_1 == 0); + assert(nb % 2 == 0); + assert(QK8_1 == 2*QK4_3); + + const block_q4_3 * restrict x = vx; + const block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs0 = 0.0f; + float summs1 = 0.0f; + + for (int i = 0; i < nb; ++i) { + const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0]; + const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1]; + + const block_q8_1 * restrict y0 = &y[i + 0]; + + summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0; + summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1; + + const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, vdupq_n_u8(0xf))); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + + // interleave + const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h); + const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + + const float x0_0d = GGML_FP16_TO_FP32(x0_0->d); + const float x0_1d = GGML_FP16_TO_FP32(x0_1->d); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph0), x0_1d*y0->d); +#endif + } + + *s = vaddvq_f32(vaddq_f32(sumv0, sumv1)) + summs0 + summs1; +#elif defined(__AVX2__) + // Initialize 
accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d)); + const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d)); + const __m256 dx = _mm256_set_m128(d1, d0); + + summs += GGML_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0 + + GGML_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1; + + const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs); + const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs); + const __m256i bx = _mm256_set_m128i(bx1, bx0); + + const __m256 dy = _mm256_broadcast_ss(&y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); + } + + *s = hsum_float_8(acc) + summs; +#else + // scalar + float sumf = 0.0; + for (int i = 0; i < nb; i++) { + const uint8_t * restrict x0 = x[2*i + 0].qs; + const uint8_t * restrict x1 = x[2*i + 1].qs; + const int8_t * restrict y0 = y[i].qs; + + const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); + const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m); + const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); + const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m); + + int sxy_0 = 0; + int sxy_1 = 0; + + for (int j = 0; j < QK8_1/4; j++) { + const uint8_t v0 = x0[j]; + const uint8_t v1 = x1[j]; + + const int x0_0 = v0 & 0xf; + const int x1_0 = v0 >> 4; + + const int x0_1 = v1 & 0xf; + const int x1_1 = v1 >> 4; + + const int y0_0 = y0[2*j + 0]; + const int y1_0 = y0[2*j + 1]; + + const int y0_1 = y0[2*(j + QK8_1/4) + 0]; + const int y1_1 = y0[2*(j + QK8_1/4) + 1]; + + sxy_0 += x0_0*y0_0 + x1_0*y1_0; + sxy_1 += x0_1*y0_1 + x1_1*y1_1; + } + + sumf += (d0*sxy_0 + d1*sxy_1)*y[i].d + m0*y[i].s0 + m1*y[i].s1; + } + *s = sumf; +#endif +} + +static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / QK8_0; + + assert(n % QK8_0 == 0); + assert(nb % 2 == 0); + assert(QK8_0 == QK8_0); + + const block_q8_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d); + +#else + const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); + const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); + const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); + const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); + + const int16x8_t p1_0 = 
vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); + const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); + const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); + const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); + + const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); + const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); + const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); + const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + const int8_t * restrict x0 = x[i].qs; + const int8_t * restrict y0 = y[i].qs; + + int sumi = 0; + + for (int j = 0; j < QK8_0; j++) { + const int v0 = x0[j]; + const int v1 = y0[j]; + + sumi += v0*v1; + } + + sumf += (x[i].d*y[i].d)*sumi; + } *s = sumf; +#endif } // compute GGML_VEC_DOT_UNROLL dot products at once @@ -2694,6 +3431,14 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #endif } +inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) { + ggml_float sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += (ggml_float)x[i]; + } + *s = sum; +} + inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE float max = -INFINITY; @@ -2742,24 +3487,32 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = 1, [GGML_TYPE_F16] = 1, - [GGML_TYPE_Q4_0] = QK, - [GGML_TYPE_Q4_1] = QK, + [GGML_TYPE_Q4_0] = QK4_0, + [GGML_TYPE_Q4_1] = QK4_1, + [GGML_TYPE_Q4_2] = QK4_2, + [GGML_TYPE_Q4_3] = QK4_3, + [GGML_TYPE_Q8_0] = QK8_0, + [GGML_TYPE_Q8_1] = QK8_1, [GGML_TYPE_I8] = 1, [GGML_TYPE_I16] = 1, [GGML_TYPE_I32] = 1, }; -static_assert(GGML_TYPE_COUNT == 7, "GGML_BLCK_SIZE is outdated"); +static_assert(GGML_TYPE_COUNT == 11, "GGML_BLCK_SIZE is outdated"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = sizeof(float), [GGML_TYPE_F16] = sizeof(ggml_fp16_t), [GGML_TYPE_Q4_0] = sizeof(block_q4_0), [GGML_TYPE_Q4_1] = sizeof(block_q4_1), + [GGML_TYPE_Q4_2] = sizeof(block_q4_2), + [GGML_TYPE_Q4_3] = sizeof(block_q4_3), + [GGML_TYPE_Q8_0] = sizeof(block_q8_0), + [GGML_TYPE_Q8_1] = sizeof(block_q8_1), [GGML_TYPE_I8] = sizeof(int8_t), [GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I32] = sizeof(int32_t), }; -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated"); +static_assert(GGML_TYPE_COUNT == 11, "GGML_TYPE_SIZE is outdated"); static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { @@ -2767,11 +3520,30 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = "f16", [GGML_TYPE_Q4_0] = "q4_0", [GGML_TYPE_Q4_1] = "q4_1", + [GGML_TYPE_Q4_2] = "q4_2", + [GGML_TYPE_Q4_3] = "q4_3", + [GGML_TYPE_Q8_0] = "q8_0", + [GGML_TYPE_Q8_1] = "q8_1", [GGML_TYPE_I8] = "i8", [GGML_TYPE_I16] = "i16", [GGML_TYPE_I32] = "i32", }; -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_NAME is outdated"); +static_assert(GGML_TYPE_COUNT == 11, "GGML_TYPE_NAME is outdated"); + +static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = false, + [GGML_TYPE_F16] = false, + [GGML_TYPE_Q4_0] = true, + [GGML_TYPE_Q4_1] = true, + 
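    // This lookup backs the new ggml_is_quantized() helper (added a few hunks below),
    // which lets generic code ask "is this any block-quantized type?" instead of
    // matching Q4_0/Q4_1/... case by case. The reworked dup paths later in this diff
    // use it exactly that way:
    //
    //     } else if (ggml_is_quantized(dst->type)) {
    //         quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
    //         ...
    //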
[GGML_TYPE_Q4_2] = true, + [GGML_TYPE_Q4_3] = true, + [GGML_TYPE_Q8_0] = true, + [GGML_TYPE_Q8_1] = true, + [GGML_TYPE_I8] = false, + [GGML_TYPE_I16] = false, + [GGML_TYPE_I32] = false, +}; +static_assert(GGML_TYPE_COUNT == 11, "GGML_IS_QUANTIZED is outdated"); static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", @@ -3033,6 +3805,10 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct (t0->ne[3] == t1->ne[3]); } +bool ggml_is_quantized(enum ggml_type type) { + return GGML_IS_QUANTIZED[type]; +} + static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } @@ -3143,6 +3919,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } + // initialize cuBLAS + #if defined(GGML_USE_CUBLAS) + ggml_init_cublas(); + #endif + is_first_call = false; } @@ -3444,14 +4225,6 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { char * const data = tensor->data; switch (tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { assert(tensor->nb[0] == sizeof(int8_t)); @@ -3487,7 +4260,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { ggml_vec_set_f32(nc, (float *)(data + i*n1), value); } } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -3504,14 +4277,6 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { char * const data = tensor->data; switch (tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { assert(tensor->nb[0] == sizeof(int8_t)); @@ -3547,7 +4312,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { ggml_vec_set_f32(nc, (float *)(data + i*n1), value); } } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -3558,14 +4323,6 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { switch (tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -3591,7 +4348,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -3602,14 +4359,6 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { switch (tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -3635,7 +4384,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { GGML_ASSERT(tensor->nb[0] == sizeof(float)); ((float *)(tensor->data))[i] = value; } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -3644,14 +4393,6 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { switch 
(tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -3677,7 +4418,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -3688,14 +4429,6 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { switch (tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -3721,7 +4454,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { GGML_ASSERT(tensor->nb[0] == sizeof(float)); ((float *)(tensor->data))[i] = value; } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5121,7 +5854,6 @@ static void ggml_compute_forward_dup_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -5133,6 +5865,11 @@ static void ggml_compute_forward_dup_f16( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; const size_t nb02 = src0->nb[2]; @@ -5143,19 +5880,40 @@ static void ggml_compute_forward_dup_f16( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); + // parallelize by elements + const int ne = ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + memcpy( + ((char *) dst->data + ie0*nb0), + ((char *) src0->data + ie0*nb00), + (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); + return; } + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + if (src0->type == dst->type && - src0->ne[0] == dst->ne[0] && - src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + ne00 == ne0 && + nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { memcpy( ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), @@ -5169,21 +5927,21 @@ static void ggml_compute_forward_dup_f16( // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy if (ggml_is_contiguous(dst)) { - if (src0->nb[0] == sizeof(ggml_fp16_t)) { + if (nb00 == 
sizeof(ggml_fp16_t)) { if (dst->type == GGML_TYPE_F16) { size_t id = 0; - const size_t rs = ne00*nb00; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; - - memcpy(dst_ptr, src0_ptr, rs); - - id++; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; } + id += rs * (ne01 - ir1); } } } else if (dst->type == GGML_TYPE_F32) { @@ -5192,14 +5950,39 @@ static void ggml_compute_forward_dup_f16( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); id++; } } + id += ne00 * (ne01 - ir1); + } + } + } else if (ggml_is_quantized(dst->type)) { + quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + size_t id = 0; + size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + for (int i00 = 0; i00 < ne00; i00++) { + src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); + } + + quantize_row_q(src0_f32, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); } } } else { @@ -5214,7 +5997,8 @@ static void ggml_compute_forward_dup_f16( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -5222,6 +6006,7 @@ static void ggml_compute_forward_dup_f16( id++; } } + id += ne00 * (ne01 - ir1); } } } else if (dst->type == GGML_TYPE_F16) { @@ -5230,7 +6015,8 @@ static void ggml_compute_forward_dup_f16( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -5238,6 +6024,7 @@ static void ggml_compute_forward_dup_f16( id++; } } + id += ne00 * (ne01 - ir1); } } } else { @@ -5256,7 +6043,20 @@ static void ggml_compute_forward_dup_f16( if (dst->type == GGML_TYPE_F16) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for 
(int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); @@ -5277,25 +6077,51 @@ static void ggml_compute_forward_dup_f16( } } } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } } } } else if (dst->type == GGML_TYPE_F32) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); - if (++i10 == ne00) { + if (++i10 == ne0) { i10 = 0; - if (++i11 == ne01) { + if (++i11 == ne1) { i11 = 0; - if (++i12 == ne02) { + if (++i12 == ne2) { i12 = 0; - if (++i13 == ne03) { + if (++i13 == ne3) { i13 = 0; } } @@ -5303,6 +6129,19 @@ static void ggml_compute_forward_dup_f16( } } } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } } } } else { @@ -5314,7 +6153,6 @@ static void ggml_compute_forward_dup_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -5326,6 +6164,11 @@ static void ggml_compute_forward_dup_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; const size_t nb02 = src0->nb[2]; @@ -5336,19 +6179,40 @@ static void ggml_compute_forward_dup_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); + // parallelize by elements + const int ne = ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + memcpy( + ((char *) dst->data + ie0*nb0), + ((char *) src0->data + ie0*nb00), + (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); + return; } + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + if (src0->type == dst->type && - src0->ne[0] == dst->ne[0] && - src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + ne00 == ne0 && + nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { // 
copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { memcpy( ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), @@ -5361,21 +6225,21 @@ static void ggml_compute_forward_dup_f32( if (ggml_is_contiguous(dst)) { // TODO: simplify - if (src0->nb[0] == sizeof(float)) { + if (nb00 == sizeof(float)) { if (dst->type == GGML_TYPE_F32) { size_t id = 0; - const size_t rs = ne00*nb00; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; - - memcpy(dst_ptr, src0_ptr, rs); - - id++; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; } + id += rs * (ne01 - ir1); } } } else if (dst->type == GGML_TYPE_F16) { @@ -5384,7 +6248,8 @@ static void ggml_compute_forward_dup_f32( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -5392,6 +6257,25 @@ static void ggml_compute_forward_dup_f32( id++; } } + id += ne00 * (ne01 - ir1); + } + } + } else if (ggml_is_quantized(dst->type)) { + quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + + size_t id = 0; + size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + quantize_row_q(src0_ptr, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); } } } else { @@ -5406,7 +6290,8 @@ static void ggml_compute_forward_dup_f32( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -5414,6 +6299,7 @@ static void ggml_compute_forward_dup_f32( id++; } } + id += ne00 * (ne01 - ir1); } } } else if (dst->type == GGML_TYPE_F16) { @@ -5422,7 +6308,8 @@ static void ggml_compute_forward_dup_f32( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -5430,6 +6317,7 @@ static void ggml_compute_forward_dup_f32( id++; } } + id += ne00 * (ne01 - ir1); } } } else { @@ -5441,6 +6329,7 @@ static void ggml_compute_forward_dup_f32( } // dst counters + int64_t i10 = 0; int64_t i11 = 0; int64_t i12 = 0; @@ -5449,20 +6338,33 @@ static void ggml_compute_forward_dup_f32( if (dst->type == GGML_TYPE_F32) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 
= 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); memcpy(dst_ptr, src0_ptr, sizeof(float)); - if (++i10 == dst->ne[0]) { + if (++i10 == ne0) { i10 = 0; - if (++i11 == dst->ne[1]) { + if (++i11 == ne1) { i11 = 0; - if (++i12 == dst->ne[2]) { + if (++i12 == ne2) { i12 = 0; - if (++i13 == dst->ne[3]) { + if (++i13 == ne3) { i13 = 0; } } @@ -5470,25 +6372,51 @@ static void ggml_compute_forward_dup_f32( } } } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } } } } else if (dst->type == GGML_TYPE_F16) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); - if (++i10 == dst->ne[0]) { + if (++i10 == ne0) { i10 = 0; - if (++i11 == dst->ne[1]) { + if (++i11 == ne1) { i11 = 0; - if (++i12 == dst->ne[2]) { + if (++i12 == ne2) { i12 = 0; - if (++i13 == dst->ne[3]) { + if (++i13 == ne3) { i13 = 0; } } @@ -5496,6 +6424,19 @@ static void ggml_compute_forward_dup_f32( } } } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } } } } else { @@ -5516,12 +6457,7 @@ static void ggml_compute_forward_dup( { ggml_compute_forward_dup_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5587,6 +6523,212 @@ static void ggml_compute_forward_add_f32( } } +static void ggml_compute_forward_add_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + if (nb10 == 
sizeof(float)) { + for (int j = ith; j < n; j += nth) { + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); + for (int i = 0; i < nc; i++) { + float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void ggml_compute_forward_add_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + if (nb10 == sizeof(ggml_fp16_t)) { + for (int j = ith; j < n; j += nth) { + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); + for (int i = 0; i < nc; i++) { + ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr)); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void ggml_compute_forward_add_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + //const int64_t ne10 = src1->ne[0]; + //const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + //const int64_t ne0 = dst->ne[0]; + //const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + const enum ggml_type type = src0->type; + dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 
== (int) GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // total rows in src0 + const int nr = ne01*ne02*ne03; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + // src1 and dst are same shape as src0 => same indices + const int i13 = i03; + const int i12 = i02; + const int i11 = i01; + + const int i3 = i03; + const int i2 = i02; + const int i1 = i01; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); + void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + assert(ne00 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne00); + // add src1 + ggml_vec_acc_f32(ne00, wdata, src1_row); + // quantize row to dst + quantize_row_q(wdata, dst_row, ne00); + } +} + static void ggml_compute_forward_add( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -5597,13 +6739,27 @@ static void ggml_compute_forward_add( { ggml_compute_forward_add_f32(params, src0, src1, dst); } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); + } + } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: + case GGML_TYPE_Q8_0: + { + ggml_compute_forward_add_q_f32(params, src0, src1, dst); + } break; + default: { GGML_ASSERT(false); } break; @@ -5649,13 +6805,7 @@ static void ggml_compute_forward_sub( { ggml_compute_forward_sub_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5701,13 +6851,7 @@ static void ggml_compute_forward_mul( { ggml_compute_forward_mul_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5753,13 +6897,7 @@ static void ggml_compute_forward_div( { ggml_compute_forward_div_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5801,13 +6939,7 @@ static void ggml_compute_forward_sqr( { ggml_compute_forward_sqr_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case 
GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5849,13 +6981,7 @@ static void ggml_compute_forward_sqrt( { ggml_compute_forward_sqrt_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5887,15 +7013,20 @@ static void ggml_compute_forward_sum_f32( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; + ggml_float sum = 0; + ggml_float row_sum = 0; + for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_f32(ne00, - (float *) (dst->data), + ggml_vec_sum_ggf(ne00, + &row_sum, (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + sum += row_sum; } } } + ((float *) dst->data)[0] = sum; } static void ggml_compute_forward_sum( @@ -5907,13 +7038,7 @@ static void ggml_compute_forward_sum( { ggml_compute_forward_sum_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5984,13 +7109,7 @@ static void ggml_compute_forward_mean( { ggml_compute_forward_mean_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6048,13 +7167,7 @@ static void ggml_compute_forward_repeat( { ggml_compute_forward_repeat_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6096,13 +7209,7 @@ static void ggml_compute_forward_abs( { ggml_compute_forward_abs_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6144,13 +7251,7 @@ static void ggml_compute_forward_sgn( { ggml_compute_forward_sgn_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6192,13 +7293,7 @@ static void ggml_compute_forward_neg( { ggml_compute_forward_neg_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6240,13 +7335,7 @@ static void ggml_compute_forward_step( { ggml_compute_forward_step_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6288,13 +7377,7 @@ static void ggml_compute_forward_relu( { ggml_compute_forward_relu_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { 
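
Note on the parallelization idiom used throughout these hunks: every multi-threaded op splits its total row count nr across nth threads the same way, giving thread ith one contiguous chunk, with the last chunk clamped. A minimal sketch of the idiom (row_range is a hypothetical helper named only for illustration; ith/nth come from ggml_compute_params):

    // each thread ith of nth processes rows [ir0, ir1)
    static void row_range(int nr, int ith, int nth, int * ir0, int * ir1) {
        const int dr = (nr + nth - 1)/nth; // rows per thread, rounded up
        *ir0 = dr*ith;                     // first row owned by this thread
        *ir1 = MIN(*ir0 + dr, nr);         // one past the last row, clamped for the final thread
    }

ggml_compute_forward_add_q_f32 above then recovers the 3D indices from the flat row index ir as i03 = ir/(ne02*ne01), i02 = (ir - i03*ne02*ne01)/ne01, i01 = ir - i03*ne02*ne01 - i02*ne01.
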
GGML_ASSERT(false); } break; @@ -6353,13 +7436,7 @@ static void ggml_compute_forward_gelu( { ggml_compute_forward_gelu_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6420,13 +7497,7 @@ static void ggml_compute_forward_silu( { ggml_compute_forward_silu_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6506,13 +7577,7 @@ static void ggml_compute_forward_norm( { ggml_compute_forward_norm_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6586,13 +7651,7 @@ static void ggml_compute_forward_rms_norm( { ggml_compute_forward_rms_norm_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6602,7 +7661,7 @@ static void ggml_compute_forward_rms_norm( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( @@ -6642,7 +7701,7 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(__AVX2__) || defined(__AVX__) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(__AVX2__) || defined(__AVX__) const int64_t ne10 = src1->ne[0]; #endif const int64_t ne11 = src1->ne[1]; @@ -6730,7 +7789,7 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -6744,6 +7803,19 @@ static void ggml_compute_forward_mul_mat_f32( return; } +#if defined(GGML_USE_CUBLAS) + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne10; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size, y_size, d_size; + float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); + float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); + float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); +#endif + for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); @@ -6751,16 +7823,38 @@ static void ggml_compute_forward_mul_mat_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); +#if defined(GGML_USE_CUBLAS) + // copy data to device + CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, g_cudaStream)); + 
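
Note on the cublasSgemm call that follows: cuBLAS is column-major while ggml tensors are row-major, and the patch reuses the trick from the existing cblas path. The row-major x (ne01 x ne00) uploaded to d_X is a ne00 x ne01 matrix in column-major terms, so CUBLAS_OP_T with lda = ne00 presents it as x itself; the result D (ne01 x ne11, column-major, ldc = ne01) is exactly the row-major ne11 x ne01 output d, so no transpose of the result is needed. Condensed (alpha = 1, beta = 0; g_cublasH and CUBLAS_CHECK are the patch's own helpers):

    // column-major view: D(ne01 x ne11) = X^T(ne01 x ne00) * Y(ne10 x ne11)
    CUBLAS_CHECK(cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
                             ne01, ne11, ne10,
                             &alpha, d_X, ne00,
                                     d_Y, ne10,
                             &beta,  d_D, ne01));
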
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); + + // compute + CUBLAS_CHECK( + cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, d_X, ne00, + d_Y, ne10, + &beta, d_D, ne01)); + + // copy data to host + CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); +#else // zT = y * xT cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); +#endif } } - - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne01, ne11, ne02, ne03); +#if defined(GGML_USE_CUBLAS) + CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); +#endif + //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; } @@ -6889,7 +7983,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -6905,10 +7999,35 @@ static void ggml_compute_forward_mul_mat_f16_f32( return; } - float * const wdata = params->wdata; +#if defined(GGML_USE_CUBLAS) + ggml_fp16_t * const wdata = params->wdata; + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne10; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size, y_size, d_size; + float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); + float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); + float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); +#else + float * const wdata = params->wdata; +#endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { +#if defined(GGML_USE_CUBLAS) + // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16 + { + size_t id = 0; + for (int64_t i01 = 0; i01 < ne11; ++i01) { + for (int64_t i00 = 0; i00 < ne10; ++i00) { + wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10)); + } + } + } +#else { size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { @@ -6917,7 +8036,31 @@ static void ggml_compute_forward_mul_mat_f16_f32( } } } +#endif +#if defined(GGML_USE_CUBLAS) + const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03); + const ggml_fp16_t * y = (ggml_fp16_t *) wdata; + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // copy data to device + CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream)); + CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); + + // compute + CUBLAS_CHECK( + cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, d_X, CUDA_R_16F, ne00, + d_Y, CUDA_R_16F, ne10, + &beta, d_D, CUDA_R_32F, ne01, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT)); + + // copy data to host + CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); +#else const float * x = wdata; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); @@ -6929,9 +8072,16 @@ static void 
ggml_compute_forward_mul_mat_f16_f32( 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); +#endif } } +#if defined(GGML_USE_CUBLAS) + CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); +#endif /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ return; @@ -7015,27 +8165,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } -static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { - [GGML_TYPE_Q4_0] = { - .dequantize_row_q = dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, - .vec_dot_q = ggml_vec_dot_q4_0, - }, - [GGML_TYPE_Q4_1] = { - .dequantize_row_q = dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, - .vec_dot_q = ggml_vec_dot_q4_1, - }, -}; - -// For internal test use -quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { - GGML_ASSERT(i < GGML_TYPE_COUNT); - return quantize_fns[i]; -} - static void ggml_compute_forward_mul_mat_q_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -7083,8 +8212,9 @@ static void ggml_compute_forward_mul_mat_q_f32( GGML_ASSERT(ne3 == ne13); const enum ggml_type type = src0->type; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; - vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; + quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot; + vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; + enum ggml_type const vec_dot_type = quantize_fns[type].vec_dot_type; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); @@ -7104,7 +8234,7 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -7118,11 +8248,58 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } +#if defined(GGML_USE_CUBLAS) + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne10; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size, y_size, d_size, q_size; + float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); + float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); + float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); + float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size); + + void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL; + if (type == GGML_TYPE_Q4_0) { + dequantize_row_q_cuda = dequantize_row_q4_0_cuda; + } + else if (type == GGML_TYPE_Q4_1) { + dequantize_row_q_cuda = dequantize_row_q4_1_cuda; + } + else if (type == GGML_TYPE_Q4_2) { + dequantize_row_q_cuda = dequantize_row_q4_2_cuda; + } + else if (type == GGML_TYPE_Q4_3) { + dequantize_row_q_cuda = dequantize_row_q4_3_cuda; + } + else if (type == GGML_TYPE_Q8_0) { + dequantize_row_q_cuda = dequantize_row_q8_0_cuda; + } + else { + GGML_ASSERT(false); + } +#else float * const wdata = params->wdata; dequantize_row_q_t const dequantize_row_q = 
quantize_fns[type].dequantize_row_q; +#endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + +#if defined(GGML_USE_CUBLAS) + // copy and dequantize on device + CUDA_CHECK( + cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02, + GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, g_cudaStream)); + + dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream); + CUDA_CHECK(cudaGetLastError()); +#else { size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { @@ -7130,21 +8307,42 @@ static void ggml_compute_forward_mul_mat_q_f32( id += ne00; } } - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); +#endif - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); +#if defined(GGML_USE_CUBLAS) + // copy data to device + CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); + + // compute + CUBLAS_CHECK( + cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, d_X, ne00, + d_Y, ne10, + &beta, d_D, ne01)); + + // copy data to host + CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); +#else // zT = y * xT cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); +#endif } } +#if defined(GGML_USE_CUBLAS) + CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); + ggml_cuda_pool_free(d_Q, q_size); +#endif //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; @@ -7153,12 +8351,12 @@ static void ggml_compute_forward_mul_mat_q_f32( if (params->type == GGML_TASK_INIT) { char * wdata = params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type]; + const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { - quantize_row_q((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); wdata += row_size; } } @@ -7184,7 +8382,7 @@ static void ggml_compute_forward_mul_mat_q_f32( const int ir1 = MIN(ir0 + dr, nr); void * wdata = params->wdata; - const size_t row_size = ne00*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type]; + const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; for (int ir = ir0; ir < ir1; ++ir) { // src0 indices @@ -7232,6 +8430,10 @@ static void ggml_compute_forward_mul_mat( switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: { ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); } break; @@ -7243,42 +8445,11 @@ static void ggml_compute_forward_mul_mat( { ggml_compute_forward_mul_mat_f32(params, src0, src1, dst); } break; - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; } - -#if 0 - if (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_Q4_1) { - static int first = 8; 
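
Background on the vec_dot_type plumbing in this hunk: previously src1 rows were quantized to src0's own type before the integer dot product; quantize_fns now carries a separate vec_dot_type plus a matching quantize_row_q_dot, so for example Q4_x weights can be dotted against Q8_0-quantized activations. A sketch of the TASK_INIT conversion for one (i12, i13) slice under that scheme (names as in the patch):

    quantize_row_q_t const quantize_row_q_dot = quantize_fns[src0->type].quantize_row_q_dot;
    const enum ggml_type type_q = quantize_fns[src0->type].vec_dot_type;
    const size_t row_size = ne10*GGML_TYPE_SIZE[type_q]/GGML_BLCK_SIZE[type_q]; // bytes per quantized row

    char * wdata = params->wdata;
    for (int64_t i11 = 0; i11 < ne11; ++i11) {
        // quantize one f32 row of src1 into the shared work buffer
        quantize_row_q_dot((float *) ((char *) src1->data + i11*nb11), (void *) wdata, ne10);
        wdata += row_size;
    }
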
- printf("src0: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src0->ne[0], src0->ne[1], src0->ne[2]); - printf("src1: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src1->ne[0], src1->ne[1], src1->ne[2]); - printf("dst: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", dst->ne[0], dst->ne[1], dst->ne[2]); - if (first) { - --first; - } else { - for (int k = 0; k < dst->ne[1]; ++k) { - for (int j = 0; j < dst->ne[0]/16; ++j) { - for (int i = 0; i < 16; ++i) { - printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); - } - printf("\n"); - } - printf("\n"); - } - printf("\n"); - exit(0); - } - } else { - printf("aaaa src0: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src0->ne[0], src0->ne[1], src0->ne[2]); - printf("aaaa src1: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src1->ne[0], src1->ne[1], src1->ne[2]); - printf("aaaa dst: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", dst->ne[0], dst->ne[1], dst->ne[2]); - } -#endif } // ggml_compute_forward_scale @@ -7328,13 +8499,7 @@ static void ggml_compute_forward_scale( { ggml_compute_forward_scale_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -7495,6 +8660,10 @@ static void ggml_compute_forward_get_rows( switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; @@ -7506,10 +8675,7 @@ static void ggml_compute_forward_get_rows( { ggml_compute_forward_get_rows_f32(params, src0, src1, dst); } break; - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -7582,13 +8748,7 @@ static void ggml_compute_forward_diag_mask_inf( { ggml_compute_forward_diag_mask_inf_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -7676,13 +8836,7 @@ static void ggml_compute_forward_soft_max( { ggml_compute_forward_soft_max_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -7739,9 +8893,11 @@ static void ggml_compute_forward_rope_f32( const float theta_scale = powf(10000.0, -2.0f/n_dims); + const bool is_neox = mode & 2; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int p = (mode == 0 ? n_past + i2 : i2); + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int p = ((mode & 1) == 0 ? 
n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -7754,14 +8910,25 @@ static void ggml_compute_forward_rope_f32( theta *= theta_scale; - const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + if (!is_neox) { + const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = src[0]; - const float x1 = src[1]; + const float x0 = src[0]; + const float x1 = src[1]; - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[1] = x0*sin_theta + x1*cos_theta; + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[1] = x0*sin_theta + x1*cos_theta; + } else { + const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } } } } @@ -7816,9 +8983,11 @@ static void ggml_compute_forward_rope_f16( const float theta_scale = powf(10000.0, -2.0f/n_dims); + const bool is_neox = mode & 2; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int p = (mode == 0 ? n_past + i2 : i2); + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int p = ((mode & 1) == 0 ? n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -7831,14 +9000,25 @@ static void ggml_compute_forward_rope_f16( theta *= theta_scale; - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + if (!is_neox) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = ggml_fp16_to_fp32(src[0]); - const float x1 = ggml_fp16_to_fp32(src[1]); + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[1]); - dst_data[0] = ggml_fp32_to_fp16(x0*cos_theta - x1*sin_theta); - dst_data[1] = ggml_fp32_to_fp16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } else { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } } } } @@ -7859,12 +9039,7 @@ static void ggml_compute_forward_rope( { ggml_compute_forward_rope_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -8127,12 +9302,7 @@ static void 
ggml_compute_forward_conv_1d_1s( { ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -8395,12 +9565,7 @@ static void ggml_compute_forward_conv_1d_2s( { ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -8880,12 +10045,7 @@ static void ggml_compute_forward_flash_attn( { ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -9091,12 +10251,7 @@ static void ggml_compute_forward_flash_ff( { GGML_ASSERT(false); // TODO } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -9140,13 +10295,7 @@ static void ggml_compute_forward_map_unary( { ggml_compute_forward_map_unary_f32(params, src0, dst, fun); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -9195,13 +10344,7 @@ static void ggml_compute_forward_map_binary( { ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -9951,13 +11094,29 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) struct ggml_tensor * node = cgraph->nodes[i]; switch (node->op) { + case GGML_OP_CPY: case GGML_OP_DUP: { - node->n_tasks = 1; + node->n_tasks = n_threads; + + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; + } + + work_size = MAX(work_size, cur); } break; case GGML_OP_ADD: { node->n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; + } + + work_size = MAX(work_size, cur); } break; case GGML_OP_SUB: case GGML_OP_MUL: @@ -10002,7 +11161,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t cur = 0; if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning @@ -10018,15 +11177,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) #endif } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; - } else if (quantize_fns[node->src0->type].vec_dot_q && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + } else if (ggml_is_quantized(node->src0->type) && node->src1->type 
== GGML_TYPE_F32) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); } else #endif { - cur = GGML_TYPE_SIZE[node->src0->type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[node->src0->type]; + const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; + cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; } } else { GGML_ASSERT(false); @@ -10038,7 +11198,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = n_threads; } break; - case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -10355,9 +11514,9 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - perf_total_per_op_us[node->op] += node->perf_time_us; + perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us); - GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 ", %" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, @@ -10371,13 +11530,17 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * node = cgraph->leafs[i]; - GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 "] %8s\n", + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", i, node->ne[0], node->ne[1], GGML_OP_LABEL[node->op]); } for (int i = 0; i < GGML_OP_COUNT; i++) { + if (perf_total_per_op_us[i] == 0) { + continue; + } + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0); } @@ -11201,16 +12364,16 @@ enum ggml_opt_result ggml_opt( //////////////////////////////////////////////////////////////////////////////// size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; for (int j = 0; j < n; j += k) { - block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK; + block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK4_0; quantize_row_q4_0_reference(src + j, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK; l += 2) { + for (int l = 0; l < QK4_0; l += 2) { const uint8_t vi0 = y[i].qs[l/2] & 0xF; const uint8_t vi1 = y[i].qs[l/2] >> 4; @@ -11220,20 +12383,20 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * } } - return (n/QK*sizeof(block_q4_0)); + return (n/QK4_0*sizeof(block_q4_0)); } size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_1 == 0); + const int nb = k / QK4_1; for (int j = 0; j < n; j += k) { - block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK; + block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK4_1; quantize_row_q4_1_reference(src + j, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK; l += 2) { + for (int l = 0; l < QK4_1; l += 2) { const uint8_t vi0 = y[i].qs[l/2] & 0xF; const uint8_t vi1 = y[i].qs[l/2] >> 4; @@ -11243,7 +12406,113 @@ size_t 
ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * } } - return (n/QK*sizeof(block_q4_1)); + return (n/QK4_1*sizeof(block_q4_1)); +} + +size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK4_2 == 0); + const int nb = k / QK4_2; + + for (int j = 0; j < n; j += k) { + block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2; + + quantize_row_q4_2_reference(src + j, y, k); + + for (int i = 0; i < nb; i++) { + for (int l = 0; l < QK4_2; l += 2) { + const uint8_t vi0 = y[i].qs[l/2] & 0xF; + const uint8_t vi1 = y[i].qs[l/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK4_2*sizeof(block_q4_2)); +} + +size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK4_3 == 0); + const int nb = k / QK4_3; + + for (int j = 0; j < n; j += k) { + block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3; + + quantize_row_q4_3_reference(src + j, y, k); + + for (int i = 0; i < nb; i++) { + for (int l = 0; l < QK4_3; l += 2) { + const uint8_t vi0 = y[i].qs[l/2] & 0xF; + const uint8_t vi1 = y[i].qs[l/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK4_3*sizeof(block_q4_3)); +} + +size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int j = 0; j < n; j += k) { + block_q8_0 * restrict y = (block_q8_0 *)dst + j/QK8_0; + + quantize_row_q8_0_reference(src + j, y, k); + + for (int i = 0; i < nb; i++) { + for (int l = 0; l < QK8_0; ++l) { + const int8_t vi = y[i].qs[l]; + + hist[vi/16 + 8]++; + } + } + } + + return (n/QK8_0*sizeof(block_q8_0)); +} + +size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { + size_t result = 0; + switch (type) { + case GGML_TYPE_Q4_0: + { + GGML_ASSERT(start % QK4_0 == 0); + block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; + result = ggml_quantize_q4_0(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_1: + { + GGML_ASSERT(start % QK4_1 == 0); + block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; + result = ggml_quantize_q4_1(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_2: + { + GGML_ASSERT(start % QK4_2 == 0); + block_q4_2 * block = (block_q4_2*)dst + start / QK4_2; + result = ggml_quantize_q4_2(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_3: + { + GGML_ASSERT(start % QK4_3 == 0); + block_q4_3 * block = (block_q4_3*)dst + start / QK4_3; + result = ggml_quantize_q4_3(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(start % QK8_0 == 0); + block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; + result = ggml_quantize_q8_0(src + start, block, n, n, hist); + } break; + default: + assert(false); + } + return result; } //////////////////////////////////////////////////////////////////////////////// @@ -11272,6 +12541,22 @@ int ggml_cpu_has_avx512(void) { #endif } +int ggml_cpu_has_avx512_vbmi(void) { +#if defined(__AVX512VBMI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vnni(void) { +#if defined(__AVX512VNNI__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_fma(void) { #if defined(__FMA__) return 1; @@ -11321,7 +12606,15 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) + return 1; 
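
Usage note for the quantization helpers above: ggml_quantize_chunk dispatches on type and quantizes elements [start, start + n) of a flat f32 buffer, which is what allows chunk-per-thread model conversion. Every per-type function accumulates into the same 16-bucket histogram: the 4-bit formats index it with each nibble directly, and Q8_0 folds its signed bytes into the same range via vi/16 + 8. A minimal call sketch (src_f32, dst_buf and n are placeholders):

    int64_t hist[16] = {0}; // counts of emitted quant values, shared 16-bucket layout
    const size_t bytes = ggml_quantize_chunk(GGML_TYPE_Q4_2, src_f32, dst_buf,
                                             /*start =*/ 0, /*n =*/ n, hist);
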
+#else + return 0; +#endif +} + +int ggml_cpu_has_cublas(void) { +#if defined(GGML_USE_CUBLAS) return 1; #else return 0; diff --git a/ggml.h b/ggml.h index 617298a95..8300a0c62 100644 --- a/ggml.h +++ b/ggml.h @@ -169,14 +169,27 @@ // // -#ifdef __cplusplus -extern "C" { +#ifdef GGML_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_BUILD +# define GGML_API __declspec(dllexport) +# else +# define GGML_API __declspec(dllimport) +# endif +# else +# define GGML_API __attribute__ ((visibility ("default"))) +# endif +#else +# define GGML_API #endif #include #include #include +#define GGML_FILE_MAGIC 0x67676d6c // "ggml" +#define GGML_FILE_VERSION 1 + #define GGML_MAX_DIMS 4 #define GGML_MAX_NODES 4096 #define GGML_MAX_PARAMS 16 @@ -184,662 +197,691 @@ extern "C" { #define GGML_MAX_OPT 4 #define GGML_DEFAULT_N_THREADS 4 -#ifdef __ARM_NEON -// we use the built-in 16-bit float type -typedef __fp16 ggml_fp16_t; -#else -typedef uint16_t ggml_fp16_t; +#ifdef __cplusplus +extern "C" { #endif -// convert FP16 <-> FP32 -float ggml_fp16_to_fp32(ggml_fp16_t x); -ggml_fp16_t ggml_fp32_to_fp16(float x); - -struct ggml_object; -struct ggml_context; - -enum ggml_type { - // explicitly numbered values are used in llama.cpp files - GGML_TYPE_F32 = 0, - GGML_TYPE_F16 = 1, - GGML_TYPE_Q4_0 = 2, - GGML_TYPE_Q4_1 = 3, - GGML_TYPE_I8, - GGML_TYPE_I16, - GGML_TYPE_I32, - GGML_TYPE_COUNT, -}; - -// available tensor operations: -enum ggml_op { - GGML_OP_NONE = 0, - - GGML_OP_DUP, - GGML_OP_ADD, - GGML_OP_SUB, - GGML_OP_MUL, - GGML_OP_DIV, - GGML_OP_SQR, - GGML_OP_SQRT, - GGML_OP_SUM, - GGML_OP_MEAN, - GGML_OP_REPEAT, - GGML_OP_ABS, - GGML_OP_SGN, - GGML_OP_NEG, - GGML_OP_STEP, - GGML_OP_RELU, - GGML_OP_GELU, - GGML_OP_SILU, - GGML_OP_NORM, // normalize - GGML_OP_RMS_NORM, - - GGML_OP_MUL_MAT, - - GGML_OP_SCALE, - GGML_OP_CPY, - GGML_OP_CONT, - GGML_OP_RESHAPE, - GGML_OP_VIEW, - GGML_OP_PERMUTE, - GGML_OP_TRANSPOSE, - GGML_OP_GET_ROWS, - GGML_OP_DIAG_MASK_INF, - GGML_OP_SOFT_MAX, - GGML_OP_ROPE, - GGML_OP_CONV_1D_1S, - GGML_OP_CONV_1D_2S, - - GGML_OP_FLASH_ATTN, - GGML_OP_FLASH_FF, - - GGML_OP_MAP_UNARY, - GGML_OP_MAP_BINARY, - - GGML_OP_COUNT, -}; - - -// ggml object -struct ggml_object { - size_t offs; - size_t size; - - struct ggml_object * next; - - char padding[8]; -}; - -static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - -// n-dimensional tensor -struct ggml_tensor { - enum ggml_type type; - - int n_dims; - int64_t ne[GGML_MAX_DIMS]; // number of elements - size_t nb[GGML_MAX_DIMS]; // stride in bytes: - // nb[0] = sizeof(type) - // nb[1] = nb[0] * ne[0] + padding - // nb[i] = nb[i-1] * ne[i-1] - - // compute data - enum ggml_op op; - - bool is_param; - - struct ggml_tensor * grad; - struct ggml_tensor * src0; - struct ggml_tensor * src1; - struct ggml_tensor * opt[GGML_MAX_OPT]; - - // thread scheduling - int n_tasks; - - // performance - int perf_runs; - int64_t perf_cycles; - int64_t perf_time_us; - - void * data; - char padding[8]; -}; - -// computation graph -struct ggml_cgraph { - int n_nodes; - int n_leafs; - int n_threads; - - size_t work_size; - struct ggml_tensor * work; - - struct ggml_tensor * nodes[GGML_MAX_NODES]; - struct ggml_tensor * grads[GGML_MAX_NODES]; - struct ggml_tensor * leafs[GGML_MAX_NODES]; - - // performance - int perf_runs; - int64_t perf_cycles; - int64_t perf_time_us; -}; - -// scratch buffer -struct ggml_scratch { - size_t offs; - size_t size; - void * data; -}; - -struct ggml_init_params { - // memory pool - size_t mem_size; // bytes - void * 
mem_buffer; // if NULL, memory will be allocated internally - bool no_alloc; // don't allocate memory for the tensor data -}; - -void ggml_time_init(void); // call this once at the beginning of the program -int64_t ggml_time_ms(void); -int64_t ggml_time_us(void); -int64_t ggml_cycles(void); -int64_t ggml_cycles_per_ms(void); - -void ggml_print_object (const struct ggml_object * obj); -void ggml_print_objects(const struct ggml_context * ctx); - -int64_t ggml_nelements(const struct ggml_tensor * tensor); -size_t ggml_nbytes (const struct ggml_tensor * tensor); +#ifdef __ARM_NEON + // we use the built-in 16-bit float type + typedef __fp16 ggml_fp16_t; +#else + typedef uint16_t ggml_fp16_t; +#endif + + // convert FP16 <-> FP32 + GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); + GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); + + struct ggml_object; + struct ggml_context; + + enum ggml_type { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + GGML_TYPE_Q4_2 = 4, + GGML_TYPE_Q4_3 = 5, + GGML_TYPE_Q8_0 = 6, + GGML_TYPE_Q8_1 = 7, + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_COUNT, + }; + + // available tensor operations: + enum ggml_op { + GGML_OP_NONE = 0, + + GGML_OP_DUP, + GGML_OP_ADD, + GGML_OP_SUB, + GGML_OP_MUL, + GGML_OP_DIV, + GGML_OP_SQR, + GGML_OP_SQRT, + GGML_OP_SUM, + GGML_OP_MEAN, + GGML_OP_REPEAT, + GGML_OP_ABS, + GGML_OP_SGN, + GGML_OP_NEG, + GGML_OP_STEP, + GGML_OP_RELU, + GGML_OP_GELU, + GGML_OP_SILU, + GGML_OP_NORM, // normalize + GGML_OP_RMS_NORM, + + GGML_OP_MUL_MAT, + + GGML_OP_SCALE, + GGML_OP_CPY, + GGML_OP_CONT, + GGML_OP_RESHAPE, + GGML_OP_VIEW, + GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, + GGML_OP_DIAG_MASK_INF, + GGML_OP_SOFT_MAX, + GGML_OP_ROPE, + GGML_OP_CONV_1D_1S, + GGML_OP_CONV_1D_2S, + + GGML_OP_FLASH_ATTN, + GGML_OP_FLASH_FF, + + GGML_OP_MAP_UNARY, + GGML_OP_MAP_BINARY, + + GGML_OP_COUNT, + }; + + + // ggml object + struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + char padding[8]; + }; + + static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + + // n-dimensional tensor + struct ggml_tensor { + enum ggml_type type; + + int n_dims; + int64_t ne[GGML_MAX_DIMS]; // number of elements + size_t nb[GGML_MAX_DIMS]; // stride in bytes: + // nb[0] = sizeof(type) + // nb[1] = nb[0] * ne[0] + padding + // nb[i] = nb[i-1] * ne[i-1] + + // compute data + enum ggml_op op; + + bool is_param; + + struct ggml_tensor * grad; + struct ggml_tensor * src0; + struct ggml_tensor * src1; + struct ggml_tensor * opt[GGML_MAX_OPT]; + + // thread scheduling + int n_tasks; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + + void * data; + char padding[8]; + }; + + // computation graph + struct ggml_cgraph { + int n_nodes; + int n_leafs; + int n_threads; + + size_t work_size; + struct ggml_tensor * work; + + struct ggml_tensor * nodes[GGML_MAX_NODES]; + struct ggml_tensor * grads[GGML_MAX_NODES]; + struct ggml_tensor * leafs[GGML_MAX_NODES]; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + }; + + // scratch buffer + struct ggml_scratch { + size_t offs; + size_t size; + void * data; + }; + + struct ggml_init_params { + // memory pool + size_t mem_size; // bytes + void * mem_buffer; // if NULL, memory will be allocated internally + bool no_alloc; // don't allocate memory for the tensor data + }; + + // misc + + GGML_API void ggml_time_init(void); // call this once at the beginning of the program + GGML_API 
int64_t ggml_time_ms(void); + GGML_API int64_t ggml_time_us(void); + GGML_API int64_t ggml_cycles(void); + GGML_API int64_t ggml_cycles_per_ms(void); + + GGML_API void ggml_print_object (const struct ggml_object * obj); + GGML_API void ggml_print_objects(const struct ggml_context * ctx); + + GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + + GGML_API int ggml_blck_size (enum ggml_type type); + GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block + GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float + + GGML_API const char * ggml_type_name(enum ggml_type type); + + GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); + + GGML_API bool ggml_is_quantized(enum ggml_type type); + + // main + + GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); + GGML_API void ggml_free(struct ggml_context * ctx); + + GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); + + GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); + + GGML_API struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t *ne); + + GGML_API struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1); + + GGML_API struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); + GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); + + GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); + + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); + GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + + GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + + GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); + + GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); + GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); -int ggml_blck_size (enum ggml_type type); -size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block -float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float - -const char * ggml_type_name(enum ggml_type type); - -size_t ggml_element_size(const struct ggml_tensor * tensor); - -struct ggml_context * ggml_init(struct ggml_init_params params); -void ggml_free(struct ggml_context * ctx); - -size_t ggml_used_mem(const struct ggml_context * ctx); - 
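
The GGML_API macro now prefixing each declaration is the usual shared-library export pattern: under GGML_SHARED it expands to dllexport/dllimport on MSVC (depending on GGML_BUILD) or to default visibility elsewhere, and to nothing for static builds, so existing consumers compile unchanged. A minimal consumer of the re-declared API, as a sketch (every call used here is declared in this header; the pool size is an arbitrary choice):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,   // let ggml allocate the pool internally
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);
        struct ggml_tensor * t = ggml_new_f32(ctx, 2.0f); // scalar f32 tensor
        printf("%.1f\n", ggml_get_f32_1d(t, 0));
        ggml_free(ctx);
        return 0;
    }
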
-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); - -struct ggml_tensor * ggml_new_tensor( - struct ggml_context * ctx, - enum ggml_type type, - int n_dims, - const int64_t *ne); - -struct ggml_tensor * ggml_new_tensor_1d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0); - -struct ggml_tensor * ggml_new_tensor_2d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1); - -struct ggml_tensor * ggml_new_tensor_3d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1, - int64_t ne2); - -struct ggml_tensor * ggml_new_tensor_4d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int64_t ne3); - -struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); -struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); - -struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); -struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); - -struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); -struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); -struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); - -int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); -void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); - -float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); -void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); - - void * ggml_get_data (const struct ggml_tensor * tensor); -float * ggml_get_data_f32(const struct ggml_tensor * tensor); - -// -// operations on tensors with backpropagation -// - -struct ggml_tensor * ggml_dup( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_add( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_sub( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_mul( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_div( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_sqr( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_sqrt( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// return scalar -// TODO: compute sum along rows -struct ggml_tensor * ggml_sum( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// mean along rows -struct ggml_tensor * ggml_mean( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// if a is the same shape as b, and a is not parameter, return a -// otherwise, return a new tensor: repeat(a) to fit in b -struct ggml_tensor * ggml_repeat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_abs( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_sgn( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_neg( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_step( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_relu( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// TODO: double-check this computation is correct -struct 
ggml_tensor * ggml_gelu( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_silu( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// normalize along rows -// TODO: eps is hardcoded to 1e-5 for now -struct ggml_tensor * ggml_norm( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_rms_norm( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// A: m rows, n columns -// B: p rows, n columns (i.e. we transpose it internally) -// result is m columns, p rows -struct ggml_tensor * ggml_mul_mat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -// -// operations on tensors without backpropagation -// - -// in-place, returns view(a) -struct ggml_tensor * ggml_scale( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -// a -> b, return view(b) -struct ggml_tensor * ggml_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -// make contiguous -struct ggml_tensor * ggml_cont( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// return view(a), b specifies the new shape -// TODO: when we start computing gradient, make a copy instead of view -struct ggml_tensor * ggml_reshape( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -// return view(a) -// TODO: when we start computing gradient, make a copy instead of view -struct ggml_tensor * ggml_reshape_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1); - -// return view(a) -// TODO: when we start computing gradient, make a copy instead of view -struct ggml_tensor * ggml_reshape_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2); - -// offset in bytes -struct ggml_tensor * ggml_view_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - size_t offset); - -struct ggml_tensor * ggml_view_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - size_t nb1, // row stride in bytes - size_t offset); - -struct ggml_tensor * ggml_view_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2, - size_t nb1, // row stride in bytes - size_t nb2, // slice stride in bytes - size_t offset); - -struct ggml_tensor * ggml_permute( - struct ggml_context * ctx, - struct ggml_tensor * a, - int axis0, - int axis1, - int axis2, - int axis3); - -// alias for ggml_permute(ctx, a, 1, 0, 2, 3) -struct ggml_tensor * ggml_transpose( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_get_rows( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -// set elements above the diagonal to -INF -// in-place, returns view(a) -struct ggml_tensor * ggml_diag_mask_inf( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past); - -// in-place, returns view(a) -struct ggml_tensor * ggml_soft_max( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// rotary position embedding -// in-place, returns view(a) -// if mode == 1, skip n_past elements -// TODO: avoid creating a new tensor every time -struct ggml_tensor * ggml_rope( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past, - int n_dims, - int mode); - -// padding = 1 -// TODO: we don't support extra parameters for now -// that's why we are hard-coding the stride, padding, and dilation -// not great .. 
-struct ggml_tensor * ggml_conv_1d_1s( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_conv_1d_2s( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_flash_attn( - struct ggml_context * ctx, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - bool masked); - -struct ggml_tensor * ggml_flash_ff( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b0, - struct ggml_tensor * b1, - struct ggml_tensor * c0, - struct ggml_tensor * c1); - -// Mapping operations -typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); -typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); - -struct ggml_tensor * ggml_map_unary_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_unary_op_f32_t fun); - -struct ggml_tensor * ggml_map_binary_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_binary_op_f32_t fun); - -// -// automatic differentiation -// - -void ggml_set_param( - struct ggml_context * ctx, - struct ggml_tensor * tensor); - -void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); - -struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); -struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); -void ggml_graph_reset (struct ggml_cgraph * cgraph); - -// print info and performance information for the graph -void ggml_graph_print(const struct ggml_cgraph * cgraph); - -// dump the graph into a file using the dot format -void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); - -// -// optimization -// - -// optimization methods -enum ggml_opt_type { - GGML_OPT_ADAM, - GGML_OPT_LBFGS, -}; - -// linesearch methods -enum ggml_linesearch { - GGML_LINESEARCH_DEFAULT = 1, - - GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, - GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, - GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, -}; - -// optimization return values -enum ggml_opt_result { - GGML_OPT_OK = 0, - GGML_OPT_DID_NOT_CONVERGE, - GGML_OPT_NO_CONTEXT, - GGML_OPT_INVALID_WOLFE, - GGML_OPT_FAIL, - - GGML_LINESEARCH_FAIL = -128, - GGML_LINESEARCH_MINIMUM_STEP, - GGML_LINESEARCH_MAXIMUM_STEP, - GGML_LINESEARCH_MAXIMUM_ITERATIONS, - GGML_LINESEARCH_INVALID_PARAMETERS, -}; - -// optimization parameters -// -// see ggml.c (ggml_opt_default_params) for default values -// -struct ggml_opt_params { - enum ggml_opt_type type; - - int n_threads; - - // delta-based convergence test // - // if past == 0 - disabled - // if past > 0: - // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) + // operations on tensors with backpropagation // - int past; - float delta; - // maximum number of iterations without improvement + GGML_API struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * 
ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // return scalar + // TODO: compute sum along rows + GGML_API struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // mean along rows + GGML_API struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // if a is the same shape as b, and a is not parameter, return a + // otherwise, return a new tensor: repeat(a) to fit in b + GGML_API struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // TODO: double-check this computation is correct + GGML_API struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_silu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // normalize along rows + // TODO: eps is hardcoded to 1e-5 for now + GGML_API struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // A: m rows, n columns + // B: p rows, n columns (i.e. we transpose it internally) + // result is m columns, p rows + GGML_API struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // - // if 0 - disabled - // if > 0: - // assume convergence if no cost improvement in this number of iterations + // operations on tensors without backpropagation // - int max_no_improvement; - bool print_forward_graph; - bool print_backward_graph; + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); - // ADAM parameters - struct { - int n_iter; + // a -> b, return view(b) + GGML_API struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); - float alpha; // learning rate - float beta1; - float beta2; - float eps; // epsilon for numerical stability - float eps_f; // epsilon for convergence test - float eps_g; // epsilon for convergence test - } adam; + // make contiguous + GGML_API struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); - // LBFGS parameters - struct { - int m; // number of corrections to approximate the inv. 
Hessian
-    int n_iter;
-    int max_linesearch;
+    // return view(a), b specifies the new shape
+    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);

-    float eps;      // convergence tolerance
-    float ftol;     // line search tolerance
-    float wolfe;
-    float min_step;
-    float max_step;
+    // return view(a)
+    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1);

-    enum ggml_linesearch linesearch;
-  } lbfgs;
-};
+    // return view(a)
+    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2);

-struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+    // offset in bytes
+    GGML_API struct ggml_tensor * ggml_view_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            size_t                offset);

-// optimize the function defined by the tensor f
-enum ggml_opt_result ggml_opt(
-        struct ggml_context * ctx,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f);
+    GGML_API struct ggml_tensor * ggml_view_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            size_t                nb1, // row stride in bytes
+            size_t                offset);

-//
-// quantization
-//
+    GGML_API struct ggml_tensor * ggml_view_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            size_t                nb1, // row   stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                offset);

-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API struct ggml_tensor * ggml_permute(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   axis0,
+            int                   axis1,
+            int                   axis2,
+            int                   axis3);

-//
-// system info
-//
+    // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
+    GGML_API struct ggml_tensor * ggml_transpose(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);

-int ggml_cpu_has_avx(void);
-int ggml_cpu_has_avx2(void);
-int ggml_cpu_has_avx512(void);
-int ggml_cpu_has_fma(void);
-int ggml_cpu_has_neon(void);
-int ggml_cpu_has_arm_fma(void);
-int ggml_cpu_has_f16c(void);
-int ggml_cpu_has_fp16_va(void);
-int ggml_cpu_has_wasm_simd(void);
-int ggml_cpu_has_blas(void);
-int ggml_cpu_has_sse3(void);
-int ggml_cpu_has_vsx(void);
+    GGML_API struct ggml_tensor * ggml_get_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // set elements above the diagonal to -INF
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // rotary position embedding
+    // in-place, returns view(a)
+    // if (mode & 1) != 0, skip n_past elements
+    // if (mode & 2) != 0, GPT-NeoX style
+    // TODO: avoid creating a new tensor every time
+    GGML_API struct ggml_tensor * ggml_rope(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
+    // padding = 1
+    // TODO: we don't support extra parameters for now
+    // that's why we are hard-coding the stride, padding, and dilation
+    // not great ..
+    GGML_API struct ggml_tensor * ggml_conv_1d_1s(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_conv_1d_2s(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_flash_attn(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            bool                  masked);
+
+    GGML_API struct ggml_tensor * ggml_flash_ff(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b0,
+            struct ggml_tensor  * b1,
+            struct ggml_tensor  * c0,
+            struct ggml_tensor  * c1);
+
+    // Mapping operations
+    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+    GGML_API struct ggml_tensor * ggml_map_unary_f32(
+            struct ggml_context        * ctx,
+            struct ggml_tensor         * a,
+            const  ggml_unary_op_f32_t   fun);
+
+    GGML_API struct ggml_tensor * ggml_map_binary_f32(
+            struct ggml_context         * ctx,
+            struct ggml_tensor          * a,
+            struct ggml_tensor          * b,
+            const  ggml_binary_op_f32_t   fun);
+
+    //
+    // automatic differentiation
+    //
+
+    GGML_API void ggml_set_param(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * tensor);
+
+    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+
+    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
+    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
+
+    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
+
+    // print info and performance information for the graph
+    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
+
+    // dump the graph into a file using the dot format
+    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
+
+    //
+    // optimization
+    //
+
+    // optimization methods
+    enum ggml_opt_type {
+        GGML_OPT_ADAM,
+        GGML_OPT_LBFGS,
+    };
+
+    // linesearch methods
+    enum ggml_linesearch {
+        GGML_LINESEARCH_DEFAULT = 1,
+
+        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
+        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
+        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
+    };
+
+    // optimization return values
+    enum ggml_opt_result {
+        GGML_OPT_OK = 0,
+        GGML_OPT_DID_NOT_CONVERGE,
+        GGML_OPT_NO_CONTEXT,
+        GGML_OPT_INVALID_WOLFE,
+        GGML_OPT_FAIL,
+
+        GGML_LINESEARCH_FAIL = -128,
+        GGML_LINESEARCH_MINIMUM_STEP,
+        GGML_LINESEARCH_MAXIMUM_STEP,
+        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
+        GGML_LINESEARCH_INVALID_PARAMETERS,
+    };
+
+    // optimization parameters
+    //
+    //   see ggml.c (ggml_opt_default_params) for default values
+    //
+    struct ggml_opt_params {
+        enum ggml_opt_type type;
+
+        int n_threads;
+
+        // delta-based convergence test
+        //
+        //   if past == 0 - disabled
+        //   if past > 0:
+        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
+        //
+        int past;
+        float delta;
+
+        // maximum number of iterations without improvement
+        //
+        //   if 0 - disabled
+        //   if > 0:
+        //     assume convergence if no cost improvement in this number of iterations
+        //
+        int max_no_improvement;
+
+        bool print_forward_graph;
+        bool print_backward_graph;
+
+        // ADAM parameters
+        struct {
+            int n_iter;
+
+            float alpha; // learning rate
+            float beta1;
+            float beta2;
+            float eps;   // epsilon for numerical stability
+            float eps_f; // epsilon for convergence test
+            float eps_g; // epsilon for convergence test
+        } adam;
+
+        // LBFGS parameters
+        struct {
+            int m; // number of corrections to approximate the inv. Hessian
+            int n_iter;
+            int max_linesearch;
+
+            float eps;      // convergence tolerance
+            float ftol;     // line search tolerance
+            float wolfe;
+            float min_step;
+            float max_step;
+
+            enum ggml_linesearch linesearch;
+        } lbfgs;
+    };
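This parameter struct feeds ggml_opt_default_params and ggml_opt, declared just below. A toy sketch of the intended flow, assuming a context ctx initialized as in the earlier example (the objective and shapes are arbitrary):

    // minimize f(x) = sum(x^2) over a parameter tensor x
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(x, 3.0f);
    ggml_set_param(ctx, x); // mark x as a trainable parameter

    struct ggml_tensor * f = ggml_sum(ctx, ggml_sqr(ctx, x));

    struct ggml_opt_params opt = ggml_opt_default_params(GGML_OPT_ADAM);
    opt.adam.n_iter = 100;

    enum ggml_opt_result res = ggml_opt(ctx, opt, f);
    // on success res == GGML_OPT_OK and the data of x is driven towards 0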
+    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+
+    // optimize the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt(
+            struct ggml_context * ctx,
+            struct ggml_opt_params params,
+            struct ggml_tensor * f);
+
+    //
+    // quantization
+    //
+
+    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
+
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+
+    //
+    // system info
+    //
+
+    GGML_API int ggml_cpu_has_avx        (void);
+    GGML_API int ggml_cpu_has_avx2       (void);
+    GGML_API int ggml_cpu_has_avx512     (void);
+    GGML_API int ggml_cpu_has_avx512_vbmi(void);
+    GGML_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_API int ggml_cpu_has_fma        (void);
+    GGML_API int ggml_cpu_has_neon       (void);
+    GGML_API int ggml_cpu_has_arm_fma    (void);
+    GGML_API int ggml_cpu_has_f16c       (void);
+    GGML_API int ggml_cpu_has_fp16_va    (void);
+    GGML_API int ggml_cpu_has_wasm_simd  (void);
+    GGML_API int ggml_cpu_has_blas       (void);
+    GGML_API int ggml_cpu_has_cublas     (void);
+    GGML_API int ggml_cpu_has_sse3       (void);
+    GGML_API int ggml_cpu_has_vsx        (void);

-//
-// Internal types and functions exposed for tests and benchmarks
-//
+    //
+    // Internal types and functions exposed for tests and benchmarks
+    //

 #ifdef __cplusplus
-// restrict not standard in C++
+    // restrict not standard in C++
 #define GGML_RESTRICT
 #else
 #define GGML_RESTRICT restrict
 #endif

-typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+    typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

-typedef struct {
-    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t   quantize_row_q;
-    quantize_row_q_t   quantize_row_q_reference;
-    vec_dot_q_t        vec_dot_q;
-} quantize_fns_t;
+    typedef struct {
+        dequantize_row_q_t dequantize_row_q;
+        quantize_row_q_t   quantize_row_q;
+        quantize_row_q_t   quantize_row_q_reference;
+        quantize_row_q_t   quantize_row_q_dot;
+        vec_dot_q_t        vec_dot_q;
+        enum ggml_type     vec_dot_type;
+    } quantize_fns_t;

-quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
#ifdef  __cplusplus
}
#endif
diff --git a/llama.cpp b/llama.cpp
index be8c4cdc1..25203c9e9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1,6 +1,8 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
+#include <cstdint>
 #endif
 
 #include "llama_util.h"
@@ -9,6 +11,7 @@
 #include "ggml.h"
 
 #include <array>
+#include <cinttypes>
 #include <ctime>
 #include <fstream>
 #include <random>
@@ -21,6 +24,10 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
+#include <sstream>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -41,36 +48,52 @@ static const size_t MB = 1024*1024;
 
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_7B,  512ull*MB },
-    { MODEL_13B, 512ull*MB },
-    { MODEL_30B, 512ull*MB },
-    { MODEL_65B, 512ull*MB },
-};
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+        { MODEL_7B,   512ull * MB },
+        { MODEL_13B,  512ull * MB },
+        { MODEL_30B,  512ull * MB },
+        { MODEL_65B, 1024ull * MB },
+    };
+    return _MEM_REQ_SCRATCH0;
+}
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
-    { MODEL_7B,  512ull*MB },
-    { MODEL_13B, 512ull*MB },
-    { MODEL_30B, 512ull*MB },
-    { MODEL_65B, 512ull*MB },
-};
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+        { MODEL_7B,   512ull * MB },
+        { MODEL_13B,  512ull * MB },
+        { MODEL_30B,  512ull * MB },
+        { MODEL_65B, 1024ull * MB },
+    };
+    return _MEM_REQ_SCRATCH1;
+}
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
-    { MODEL_7B,  1026ull*MB },
-    { MODEL_13B, 1608ull*MB },
-    { MODEL_30B, 3124ull*MB },
-    { MODEL_65B, 5120ull*MB },
-};
+static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+{
+    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+        { MODEL_7B,  1026ull * MB },
+        { MODEL_13B, 1608ull * MB },
+        { MODEL_30B, 3124ull * MB },
+        { MODEL_65B, 5120ull * MB },
+    };
+    return _MEM_REQ_KV_SELF;
+}
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> MEM_REQ_EVAL = {
-    { MODEL_7B,   768ull*MB },
-    { MODEL_13B, 1024ull*MB },
-    { MODEL_30B, 1280ull*MB },
-    { MODEL_65B, 1536ull*MB },
-};
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+{
+    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+        { MODEL_7B,   768ull * MB },
+        { MODEL_13B, 1024ull * MB },
+        { MODEL_30B, 1280ull * MB },
+        { MODEL_65B, 1536ull * MB },
+    };
+    return _MEM_REQ_EVAL;
+}
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
@@ -261,12 +284,12 @@ static size_t checked_div(size_t a, size_t b) {
 }
 
 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
-    std::string ret = "[" + std::to_string(ne.at(0));
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
     for (size_t i = 1; i < ne.size(); i++) {
-        ret += " x " + std::to_string(ne.at(i));
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
     }
-    ret += "]";
-    return ret;
+    return buf;
 }
 
 static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
@@ -459,6 +482,9 @@ struct llama_file_loader {
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
+                case GGML_TYPE_Q4_2:
+                case GGML_TYPE_Q4_3:
+                case GGML_TYPE_Q8_0:
                     break;
                 default: {
                     throw format("unrecognized tensor type %u\n", shard.type);
@@ -531,6 +557,9 @@ struct llama_file_saver {
            case GGML_TYPE_F16:
            case GGML_TYPE_Q4_0:
            case GGML_TYPE_Q4_1:
+           case GGML_TYPE_Q4_2:
+           case GGML_TYPE_Q4_3:
+           case GGML_TYPE_Q8_0:
                break;
            default: LLAMA_ASSERT(false);
        }
@@ -616,6 +645,7 @@ struct llama_model_loader {
            throw
format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); } + return get_tensor_for(lt); } @@ -818,6 +848,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: return "mostly Q4_1, some F16"; + case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; + case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3"; + case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; default: return "unknown, may not work"; } } @@ -898,13 +931,13 @@ static void llama_model_load_internal( const size_t mem_required = ctx_size + mmapped_size + - MEM_REQ_SCRATCH0.at(model.type) + - MEM_REQ_SCRATCH1.at(model.type) + - MEM_REQ_EVAL.at (model.type); + MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH1().at(model.type) + + MEM_REQ_EVAL().at(model.type); // this is the memory required by one llama_state const size_t mem_required_state = - scale*MEM_REQ_KV_SELF.at(model.type); + scale*MEM_REQ_KV_SELF().at(model.type); fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); @@ -941,8 +974,8 @@ static void llama_model_load_internal( ml->ggml_ctx = ctx; model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); - model.norm = ml->get_tensor("norm.weight", {n_embd}); - model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); + model.norm = ml->get_tensor("norm.weight", {n_embd}); + model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { @@ -1046,7 +1079,7 @@ static bool llama_eval_internal( // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads; + gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 
1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1220,9 +1253,11 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);
 
+#ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
-    //ggml_graph_print(&gf);
+    ggml_graph_print(&gf);
+#endif
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
@@ -1546,14 +1581,21 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
        default: throw format("invalid output file type %d\n", ftype);
    };
 
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1562,6 +1604,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1569,7 +1614,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         tensor.data = read_data.addr;
         model_loader->load_data_for(tensor);
 
-        printf("[%zu/%zu] %36s - %s, type = %6s, ",
+        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
                ++idx, model_loader->tensors_map.tensors.size(),
                tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
                ggml_type_name(tensor.type));
@@ -1580,6 +1625,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
 
+        // uncomment this to keep the output layer in FP16
+        //if (tensor.name == "output.weight") {
+        //    quantize = false;
+        //}
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -1615,17 +1665,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
 
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            switch (new_type) {
-                case GGML_TYPE_Q4_0:
-                    {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_1:
-                    {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                default:
-                    LLAMA_ASSERT(false);
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if ((int) workers.size() < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+            }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1724,17 +1794,17 @@ struct llama_context * llama_init_from_file(
         if (params.logits_all) {
             ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
         } else {
-            ctx->logits.reserve(hparams.n_ctx);
+            ctx->logits.reserve(hparams.n_vocab);
         }
 
         if (params.embedding){
             ctx->embedding.resize(hparams.n_embd);
         }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
-        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
     return ctx;
@@ -1747,9 +1817,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
@@ -1757,31 +1828,439 @@ int llama_model_quantize(
     }
 }
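A call-site sketch for the new quantize signature; the paths are placeholders, and nthread <= 0 defers to std::thread::hardware_concurrency() as implemented above:

    #include <cstdio>
    #include "llama.h"

    int main() {
        // re-quantize an F16 model to Q4_2, letting the library pick the thread count
        const int rc = llama_model_quantize(
                "models/7B/ggml-model-f16.bin",   // placeholder input path
                "models/7B/ggml-model-q4_2.bin",  // placeholder output path
                LLAMA_FTYPE_MOSTLY_Q4_2,
                /*nthread =*/ 0);
        if (rc != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }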
-// Returns the KV cache that will contain the context for the
-// ongoing prediction with the model.
-const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.addr;
+int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+    auto & model = ctx->model;
+
+    const int64_t t_start_lora_us = ggml_time_us();
+
+    auto fin = std::ifstream(path_lora, std::ios::binary);
+    if (!fin) {
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+        return 1;
+    }
+
+    // verify magic and version
+    {
+        uint32_t magic;
+        fin.read((char *) &magic, sizeof(magic));
+        if (magic != 'ggla') {
+            fprintf(stderr, "%s: bad file magic\n", __func__);
+            return 1;
+        }
+        uint32_t format_version;
+        fin.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: unsupported file version\n", __func__ );
+            return 1;
+        }
+    }
+
+    int32_t lora_r;
+    int32_t lora_alpha;
+    fin.read((char *) &lora_r, sizeof(lora_r));
+    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+    float scaling = (float)lora_alpha / (float)lora_r;
+
+    fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+    // create a temporary ggml context to store the lora tensors
+    // todo: calculate size from biggest possible tensor
+    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    struct ggml_init_params params;
+    params.mem_size   = lora_buf.size();
+    params.mem_buffer = lora_buf.data();
+    params.no_alloc   = false;
+
+    ggml_context * lora_ctx = ggml_init(params);
+    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+    // create a name -> tensor map of the model to accelerate lookups
+    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+    for (auto & kv: model.tensors_by_name) {
+        model_tensors.insert(kv);
+    }
+
+
+    // load base model
+    std::unique_ptr<llama_model_loader> model_loader;
+    ggml_context * base_ctx = NULL;
+    llama_buffer base_buf;
+    if (path_base_model) {
+        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+        size_t ctx_size, mmapped_size;
+        model_loader->calc_sizes(&ctx_size, &mmapped_size);
+        base_buf.resize(ctx_size);
+
+        ggml_init_params base_params;
+        base_params.mem_size   = base_buf.size;
+        base_params.mem_buffer = base_buf.addr;
+        base_params.no_alloc   = model_loader->use_mmap;
+
+        base_ctx = ggml_init(base_params);
+
+        model_loader->ggml_ctx = base_ctx;
+
+        // maybe this should be in llama_model_loader
+        if (model_loader->use_mmap) {
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+        }
+    }
+
+    // read tensors and apply
+    bool warned = false;
+    int n_tensors = 0;
+    while (true) {
+        int32_t n_dims;
+        int32_t length;
+        int32_t ftype;
+
+        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+        if (fin.eof()) {
+            break;
+        }
+
+        int32_t ne[2] = { 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        }
+
+        std::string name(length, 0);
+        fin.read(&name[0], length);
+
+        // check for lora suffix and get the type of tensor
+        const std::string lora_suffix = ".lora";
+        size_t pos = name.rfind(lora_suffix);
+        if (pos == std::string::npos) {
+            fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+            return 1;
+        }
+
+        
std::string lora_type = name.substr(pos + lora_suffix.length()); + std::string base_name = name; + base_name.erase(pos); + // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); + + if (model_tensors.find(base_name.data()) == model_tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); + return 1; + } + + // create ggml tensor + ggml_type wtype; + switch (ftype) { + case 0: wtype = GGML_TYPE_F32; break; + case 1: wtype = GGML_TYPE_F16; break; + default: + { + fprintf(stderr, "%s: invalid tensor data type '%d'\n", + __func__, ftype); + return false; + } + } + ggml_tensor* lora_tensor; + if (n_dims == 2) { + lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); + } + else { + fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims); + return 1; + } + + // load tensor data + size_t offset = fin.tellg(); + size_t tensor_data_size = ggml_nbytes(lora_tensor); + offset = (offset + 31) & -32; + fin.seekg(offset); + fin.read((char*)lora_tensor->data, tensor_data_size); + + lora_tensors[name] = lora_tensor; + + // check if we have both A and B tensors and apply + if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && + lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + + ggml_tensor * dest_t = model_tensors[base_name]; + ggml_tensor * base_t; + if (model_loader) { + // load from base model + if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { + fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + return 1; + } + size_t idx = model_loader->tensors_map.name_to_idx[base_name]; + llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }); + lt.data = (uint8_t *) lt.ggml_tensor->data; + model_loader->load_data_for(lt); + lt.ggml_tensor->data = lt.data; + } + else { + base_t = dest_t; + } + + if (ggml_is_quantized(base_t->type)) { + if (!warned) { + fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + } + + ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; + ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; + + if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { + fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + return 1; + } + + // w = w + BA*s + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA = ggml_scale(lora_ctx, BA, scale_tensor); + } + + ggml_tensor * r; + if (base_t == dest_t) { + r = ggml_add_inplace(lora_ctx, dest_t, BA); + } + else { + r = ggml_add(lora_ctx, base_t, BA); + r = ggml_cpy(lora_ctx, r, dest_t); + } + + struct ggml_cgraph gf = ggml_build_forward(r); + gf.n_threads = n_threads; + ggml_graph_compute(lora_ctx, &gf); + + // we won't need these tensors again, reset the context to save memory + ggml_free(lora_ctx); + lora_ctx = ggml_init(params); + lora_tensors.clear(); + + n_tensors++; + if (n_tensors % 4 == 0) + fprintf(stderr, "."); + } + } + + // TODO: this should be in a destructor, it will leak on failure + 
ggml_free(lora_ctx); + if (base_ctx) { + ggml_free(base_ctx); + } + + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); + + return 0; } -// Returns the size of the KV cache -size_t llama_get_kv_cache_size(struct llama_context * ctx) { - return ctx->model.kv_self.buf.size; +int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + } catch (const std::string & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str()); + return 1; + } } int llama_get_kv_cache_token_count(struct llama_context * ctx) { return ctx->model.kv_self.n; } -// Sets the KV cache containing the current context for the model -void llama_set_kv_cache( - struct llama_context * ctx, - const uint8_t * kv_cache, - size_t n_size, - int n_token_count) { - // Make sure we have the same kv cache setup - LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size); - memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size); - ctx->model.kv_self.n = n_token_count; +#define LLAMA_MAX_RNG_STATE 64*1024 + +// Returns the size of the state +size_t llama_get_state_size(struct llama_context * ctx) { + // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. + // for reference, std::mt19937(1337) serializes to 6701 bytes. + const size_t s_rng_size = sizeof(size_t); + const size_t s_rng = LLAMA_MAX_RNG_STATE; + const size_t s_logits_capacity = sizeof(size_t); + const size_t s_logits_size = sizeof(size_t); + const size_t s_logits = ctx->logits.capacity() * sizeof(float); + const size_t s_embedding_size = sizeof(size_t); + const size_t s_embedding = ctx->embedding.size() * sizeof(float); + const size_t s_kv_size = sizeof(size_t); + const size_t s_kv_ntok = sizeof(int); + const size_t s_kv = ctx->model.kv_self.buf.size; + + const size_t s_total = ( + + s_rng_size + + s_rng + + s_logits_capacity + + s_logits_size + + s_logits + + s_embedding_size + + s_embedding + + s_kv_size + + s_kv_ntok + + s_kv + ); + + return s_total; +} + +// Copies the state to the specified destination address +size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) { + uint8_t * out = dest; + + // copy rng + { + std::stringstream rng_ss; + rng_ss << ctx->rng; + + const size_t rng_size = rng_ss.str().size(); + char rng_buf[LLAMA_MAX_RNG_STATE]; + + memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE); + memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); + + memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size); + memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE; + } + + // copy logits + { + const size_t logits_cap = ctx->logits.capacity(); + const size_t logits_size = ctx->logits.size(); + + memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap); + memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size); + + if (logits_size) { + memcpy(out, ctx->logits.data(), logits_size * sizeof(float)); + } + + out += logits_cap * sizeof(float); + } + + // copy embeddings + { + const size_t embedding_size = ctx->embedding.size(); + + memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size); + + if (embedding_size) { + memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float)); + out += embedding_size * sizeof(float); + } + } + + // copy 
kv cache
+    {
+        const size_t kv_size = ctx->model.kv_self.buf.size;
+        const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+        if (kv_size) {
+            memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+        }
+    }
+
+    const size_t written  = out - dest;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(written == expected);
+
+    return written;
+}
+
+// Sets the state reading from the specified source address
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * in = src;
+
+    // set rng
+    {
+        size_t rng_size;
+        char   rng_buf[LLAMA_MAX_RNG_STATE];
+
+        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);
+        memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+        std::stringstream rng_ss;
+        rng_ss.str(std::string(&rng_buf[0], rng_size));
+        rng_ss >> ctx->rng;
+
+        LLAMA_ASSERT(rng_ss.fail() == false);
+    }
+
+    // set logits
+    {
+        size_t logits_cap;
+        size_t logits_size;
+
+        memcpy(&logits_cap,  in, sizeof(logits_cap));  in += sizeof(logits_cap);
+        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+        LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+        if (logits_size) {
+            ctx->logits.resize(logits_size);
+            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+        }
+
+        in += logits_cap * sizeof(float);
+    }
+
+    // set embeddings
+    {
+        size_t embedding_size;
+
+        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+        LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+        if (embedding_size) {
+            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+            in += embedding_size * sizeof(float);
+        }
+    }
+
+    // set kv cache
+    {
+        size_t kv_size;
+        int    kv_ntok;
+
+        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+        if (kv_size) {
+            LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+            void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+            void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+            memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+            ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+            ctx->model.kv_self.v->data = v_data;
+
+        }
+
+        ctx->model.kv_self.n = kv_ntok;
+    }
+
+    const size_t nread    = in - src;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(nread == expected);
+
+    return nread;
+}
 
 int llama_eval(
@@ -1914,18 +2393,20 @@ const char * llama_print_system_info(void) {
     static std::string s;
 
     s = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
+    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
+    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
+    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
+    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
+    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
+    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
+    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
+    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
+    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
+    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
+    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
+    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
 
     return s.c_str();
 }
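The state functions above supersede the removed llama_get_kv_cache/llama_set_kv_cache pair. A sketch of a save/restore round trip, assuming ctx is an initialized llama_context and with error handling elided:

    #include <vector>
    #include "llama.h"

    void snapshot_roundtrip(struct llama_context * ctx) {
        // serialize rng, logits, embeddings and the KV cache into one buffer
        const size_t n_state = llama_get_state_size(ctx);
        std::vector<uint8_t> state(n_state);

        const size_t written = llama_copy_state_data(ctx, state.data());

        // ... run more inference, then roll the context back ...
        const size_t read = llama_set_state_data(ctx, state.data());

        // both calls return the number of bytes processed; the implementation
        // asserts that each matches llama_get_state_size(ctx)
        (void) written; (void) read;
    }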
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; + s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; + s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; + s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; return s.c_str(); } @@ -1934,3 +2415,4 @@ const char * llama_print_system_info(void) { std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { return ctx->model.tensors_by_name; } + diff --git a/llama.h b/llama.h index 192217593..ab41798d8 100644 --- a/llama.h +++ b/llama.h @@ -72,6 +72,9 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors }; LLAMA_API struct llama_context_params llama_context_default_params(); @@ -91,27 +94,39 @@ extern "C" { // TODO: not great API - very likely to change // Returns 0 on success + // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given LLAMA_API int llama_model_quantize( const char * fname_inp, const char * fname_out, - enum llama_ftype ftype); + enum llama_ftype ftype, + int nthread); - // Returns the KV cache that will contain the context for the - // ongoing prediction with the model. - LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx); - - // Returns the size of the KV cache - LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx); + // Apply a LoRA adapter to a loaded model + // path_base_model is the path to a higher quality model to use as a base for + // the layers modified by the adapter. Can be NULL to use the current loaded model. + // The model needs to be reloaded before applying a new adapter, otherwise the adapter + // will be applied on top of the previous one + // Returns 0 on success + LLAMA_API int llama_apply_lora_from_file( + struct llama_context * ctx, + const char * path_lora, + const char * path_base_model, + int n_threads); // Returns the number of tokens in the KV cache LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx); - // Sets the KV cache containing the current context for the model - LLAMA_API void llama_set_kv_cache( - struct llama_context * ctx, - const uint8_t * kv_cache, - size_t n_size, - int n_token_count); + // Returns the size in bytes of the state (rng, logits, embedding and kv_cache) + LLAMA_API size_t llama_get_state_size(struct llama_context * ctx); + + // Copies the state to the specified destination address. + // Destination needs to have allocated enough memory. 
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
diff --git a/llama_util.h b/llama_util.h
index 653bf7138..acb207e65 100755
--- a/llama_util.h
+++ b/llama_util.h
@@ -21,6 +21,9 @@
         #if defined(_POSIX_MAPPED_FILES)
             #include <sys/mman.h>
         #endif
+        #if defined(_POSIX_MEMLOCK_RANGE)
+            #include <sys/resource.h>
+        #endif
     #endif
 #endif
@@ -43,8 +46,12 @@
 } while (0)
 
 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
     va_list ap, ap2;
     va_start(ap, fmt);
@@ -57,7 +64,7 @@ static std::string format(const char * fmt, ...) {
     va_end(ap2);
     va_end(ap);
     return std::string(buf.data(), size);
-};
+}
 
 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
@@ -164,7 +171,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -172,15 +179,16 @@ struct llama_mmap {
         flags |= MAP_POPULATE;
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
         if (addr == MAP_FAILED) {
             throw format("mmap failed: %s", strerror(errno));
         }
 
-        // Advise the kernel to preload the mapped memory
-        if (madvise(addr, file->size, MADV_WILLNEED)) {
-            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                    strerror(errno));
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
         }
     }
 
@@ -190,14 +198,13 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
 
         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
 
         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
         DWORD error = GetLastError();
-        CloseHandle(hFile);
 
         if (hMapping == NULL) {
             throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -212,13 +219,15 @@ struct llama_mmap {
         }
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        // Advise the kernel to preload the mapped memory
-        WIN32_MEMORY_RANGE_ENTRY range;
-        range.VirtualAddress = addr;
-        range.NumberOfBytes = (SIZE_T)size;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -297,8 +306,18 @@ struct llama_mlock {
        if (!mlock(addr, size)) {
             return true;
         } else {
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
-                    size, this->size, std::strerror(errno));
+            char* errmsg = std::strerror(errno);
+            bool suggest = (errno == ENOMEM);
+
+            // Check if the resource limit is fine after all
+            struct rlimit lock_limit;
+            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                suggest = false;
+            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                suggest = false;
+
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
             return false;
         }
     }
diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt
new file mode 100644
index 000000000..03e1d2c04
--- /dev/null
+++ b/pocs/CMakeLists.txt
@@ -0,0 +1,12 @@
+# dependencies
+
+find_package(Threads REQUIRED)
+
+# third-party
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+if (EMSCRIPTEN)
+else()
+    add_subdirectory(vdot)
+endif()
diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt
new file mode 100644
index 000000000..fb89a1cd4
--- /dev/null
+++ b/pocs/vdot/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(TARGET vdot)
+add_executable(${TARGET} vdot.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TARGET q8dot)
+add_executable(${TARGET} q8dot.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp
new file mode 100644
index 000000000..5748c8ac2
--- /dev/null
+++ b/pocs/vdot/q8dot.cpp
@@ -0,0 +1,172 @@
+#include <cstdio>
+#include <type_traits>
+#include <vector>
+#include <algorithm>
+#include <random>
+#include <chrono>
+#include <cstdlib>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <array>
+
+#include <ggml.h>
+
+constexpr int kVecSize = 1 << 16;
+
+// Copy-pasted from ggml.c
+#define QK4_0 32
+typedef struct {
+    float   d;              // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+typedef struct {
+    float   d;              // delta
+    float   m;              // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+// Copy-pasted from ggml.c
+#define QK8_0 32
+typedef struct {
+    float  d;         // delta
+    float  s;         // d * sum(qs[i])
+    int8_t qs[QK8_0]; // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == 2*sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+
+static_assert(QK4_1 == QK8_0, "QK4_1 and QK8_0 must be the same");
+static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same");
+
+template <typename TQ>
+void fillQ4blocks(std::vector<TQ>& blocks, std::mt19937& rndm) {
+    for (auto& b : blocks) {
+        b.d = 1;
+        for (int i=0; i<QK4_1/2; ++i) {
+            uint8_t v1 = rndm() >> 28;
+            uint8_t v2 = rndm() >> 28;
+            b.qs[i] = v1 | (v2 << 4);
+        }
+    }
+}
+
+void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) {
+    for (auto& b : blocks) {
+        b.d = 1;
+        int sum = 0;
+        for (int i=0; i<QK8_0; ++i) {
+            b.qs[i] = (rndm() >> 24) - 128;
+            sum += b.qs[i];
+        }
+        b.s = b.d * sum;
+    }
+}
+
+float simpleDot(const block_q4_0& x, const block_q8_0& y) {
+    int s1 = 0; //, s2 = 0;
+    for (int i=0; i<QK4_0/2; i+=2) {
+        int v1 = x.qs[i] & 0xf;
+        int v2 = x.qs[i] >> 4;
+        int v3 = x.qs[i+1] & 0xf;
+        int v4 = x.qs[i+1] >> 4;
+        int j = 2*i;
+        s1 += v1*y.qs[j] + v2*y.qs[j+1] + v3*y.qs[j+2] + v4*y.qs[j+3];
+        //s2 += y.qs[j] + y.qs[j+1] + y.qs[j+2] + y.qs[j+3];
+    }
+    return y.d * x.d * s1 - 8 * x.d * y.s;
+    //return y.d * x.d * (s1 - 8 * s2);
+}
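Why the q4_0 kernel above can hoist the -8 offset out of the inner loop: each q4_0 weight decodes as x.d * (v - 8) and each q8_0 entry as y.d * q, so the correction term factors through the per-block sum that fillQ80blocks precomputes in y.s. A short derivation (the helper name below is ours, for illustration only):

    // dot(x, y) = sum_i x.d*(v_i - 8) * y.d*q_i
    //           = x.d*y.d * sum_i v_i*q_i  -  8*x.d * (y.d * sum_i q_i)
    // and y.s == y.d * sum_i q_i, so only s1 = sum_i v_i*q_i is accumulated per block
    inline float q4_0_dot_from_sums(const block_q4_0& x, const block_q8_0& y, int s1) {
        return y.d * x.d * s1 - 8 * x.d * y.s; // same identity the kernel returns
    }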
+
+float simpleDot(const block_q4_1& x, const block_q8_0& y) {
+    int s1 = 0; //, s2 = 0;
+    for (int i=0; i<QK4_1/2; i+=2) {
+        int v1 = x.qs[i] & 0xf;
+        int v2 = x.qs[i] >> 4;
+        int v3 = x.qs[i+1] & 0xf;
+        int v4 = x.qs[i+1] >> 4;
+        int j = 2*i;
+        s1 += v1*y.qs[j] + v2*y.qs[j+1] + v3*y.qs[j+2] + v4*y.qs[j+3];
+        //s2 += y.qs[j] + y.qs[j+1] + y.qs[j+2] + y.qs[j+3];
+    }
+    return y.d * x.d * s1 + y.s * x.m;
+    //return y.d * (x.d * s1 + x.m * s2);
+}
+
+struct Stat {
+    double sum = 0, sumt = 0, sumt2 = 0, maxt = 0;
+    int nloop = 0;
+    void addResult(double s, double t) {
+        sum += s;
+        sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
+        ++nloop;
+    }
+    void reportResult(const char* title) const {
+        if (nloop < 1) {
+            printf("%s(%s): no result\n",__func__,title);
+            return;
+        }
+        printf("============ %s\n",title);
+        printf("<dot> = %g\n",sum/nloop);
+        auto t = sumt/nloop, dt = sumt2/nloop - t*t;
+        if (dt > 0) dt = sqrt(dt);
+        printf("