diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 2c0ae4e2a..1c9633cdf 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -227,20 +227,20 @@ effectiveStdenv.mkDerivation ( ) ] ++ optionals useRocm [ - (cmakeFeature "CMAKE_C_COMPILER" "hipcc") - (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc") - - # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM - # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt - # and select the line that matches the current nixpkgs version of rocBLAS. - # Should likely use `rocmPackages.clr.gpuTargets`. - "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" + (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang") + (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets)) ] ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) ]; + # Environment variables needed for ROCm + env = optionals useRocm { + ROCM_PATH = "${rocmPackages.clr}"; + HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode"; + }; + # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, # if they haven't been added yet. postInstall = '' diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0742443c6..0109cc004 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -392,6 +392,33 @@ jobs: cmake -DLLAMA_VULKAN=ON .. cmake --build . --config Release -j $(nproc) + ubuntu-22-cmake-hip: + runs-on: ubuntu-22.04 + container: rocm/dev-ubuntu-22.04:6.0.2 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev + + - name: Build with native CMake HIP support + id: cmake_build + run: | + cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON + cmake --build build --config Release -j $(nproc) + + - name: Build with legacy HIP support + id: cmake_build_legacy_hip + run: | + cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON + cmake --build build2 --config Release -j $(nproc) + ubuntu-22-cmake-sycl: runs-on: ubuntu-22.04 @@ -989,6 +1016,37 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip name: llama-bin-win-sycl-x64.zip + windows-latest-cmake-hip: + runs-on: windows-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Install + id: depends + run: | + $ErrorActionPreference = "Stop" + write-host "Downloading AMD HIP SDK Installer" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + write-host "Installing AMD HIP SDK" + Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait + write-host "Completed AMD HIP SDK installation" + + - name: Verify ROCm + id: verify + run: | + & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + + - name: Build + id: cmake_build + run: | + $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) + $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" + cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON + cmake --build build --config Release + ios-xcode-build: runs-on: macos-latest diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ab6a45a6..990e34b86 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -555,16 +555,37 @@ if (LLAMA_VULKAN) endif() if (LLAMA_HIPBLAS) - list(APPEND CMAKE_PREFIX_PATH /opt/rocm) + if ($ENV{ROCM_PATH}) + set(ROCM_PATH $ENV{ROCM_PATH}) + else() + set(ROCM_PATH /opt/rocm) + endif() + list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) - if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") - message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") + # CMake on Windows doesn't support the HIP language yet + if(WIN32) + set(CXX_IS_HIPCC TRUE) + else() + string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") endif() - if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") - message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") - endif() + if(CXX_IS_HIPCC) + if(LINUX) + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." + " Prefer setting the HIP compiler directly. See README for details.") + endif() + else() + # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. + if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_ARGETS}) + endif() + cmake_minimum_required(VERSION 3.21) + enable_language(HIP) + endif() find_package(hip REQUIRED) find_package(hipblas REQUIRED) find_package(rocblas REQUIRED) @@ -598,13 +619,18 @@ if (LLAMA_HIPBLAS) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) + if (CXX_IS_HIPCC) + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device) + else() + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) + endif() if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) endif() if (LLAMA_SYCL) diff --git a/Makefile b/Makefile index 3fa56d13a..22d521856 100644 --- a/Makefile +++ b/Makefile @@ -560,10 +560,10 @@ endif # LLAMA_VULKAN ifdef LLAMA_HIPBLAS ifeq ($(wildcard /opt/rocm),) ROCM_PATH ?= /usr - GPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) + AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) else ROCM_PATH ?= /opt/rocm - GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) + AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) endif HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc LLAMA_CUDA_DMMV_X ?= 32 @@ -575,7 +575,7 @@ ifdef LLAMA_HIP_UMA endif # LLAMA_HIP_UMA MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas - HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) + HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS)) HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) diff --git a/README.md b/README.md index 5d6217d13..7dd6fc0eb 100644 --- a/README.md +++ b/README.md @@ -528,13 +528,28 @@ Building the program with BLAS support may lead to some performance improvements ``` - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash - CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \ - cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ + cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build --config Release -- -j 16 ``` On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs). + Note that if you get the following error: + ``` + clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library + ``` + Try searching for a directory under `HIP_PATH` that contains the file + `oclc_abi_version_400.bc`. Then, add the following to the start of the + command: `HIP_DEVICE_LIB_PATH=`, so something + like: + ```bash + HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \ + HIP_DEVICE_LIB_PATH= \ + cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + && cmake --build build -- -j 16 + ``` + - Using `make` (example for target gfx1030, build with 16 CPU threads): ```bash make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 @@ -543,10 +558,8 @@ Building the program with BLAS support may lead to some performance improvements - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): ```bash set PATH=%HIP_PATH%\bin;%PATH% - mkdir build - cd build - cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release .. - cmake --build . + cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release + cmake --build build ``` Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`. diff --git a/common/common.cpp b/common/common.cpp index 96130ad54..e624fc7f3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2553,7 +2553,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha size_t pos_start = 0; size_t pos_found = 0; - if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) { + if (std::isspace(data_str[0]) || std::isspace(data_str.back())) { data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index af5366b33..53333ff22 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -20,12 +20,13 @@ # - Update llama.cpp with the new pre-tokenizer if necessary # # TODO: generate tokenizer tests for llama.cpp -# TODO: automate the update of convert-hf-to-gguf.py # import json import logging import os +import pathlib +import re import sys from enum import IntEnum, auto from hashlib import sha256 @@ -35,6 +36,7 @@ from transformers import AutoTokenizer logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger("convert-hf-to-gguf-update") +sess = requests.Session() class TOKENIZER_TYPE(IntEnum): @@ -85,76 +87,44 @@ models = [ {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, ] -# make directory "models/tokenizers" if it doesn't exist -if not os.path.exists("models/tokenizers"): - os.makedirs("models/tokenizers") - def download_file_with_auth(url, token, save_path): headers = {"Authorization": f"Bearer {token}"} - response = requests.get(url, headers=headers) - if response.status_code == 200: - with open(save_path, 'wb') as f: - f.write(response.content) - logger.info(f"File {save_path} downloaded successfully") - else: - logger.info(f"Failed to download file. Status code: {response.status_code}") + response = sess.get(url, headers=headers) + response.raise_for_status() + os.makedirs(os.path.dirname(save_path), exist_ok=True) + with open(save_path, 'wb') as f: + f.write(response.content) + logger.info(f"File {save_path} downloaded successfully") -# download the tokenizer models -for model in models: - # set mapping +def download_model(model): name = model["name"] repo = model["repo"] tokt = model["tokt"] - # NOTE: We should always be using resolve to download files - url_resolve = f"{repo}/resolve/main" + os.makedirs(f"models/tokenizers/{name}", exist_ok=True) - # set dir paths - model_name_or_path = f"models/tokenizers/{name}" - model_tokenizer_path = f"{model_name_or_path}/tokenizer.json" - - # check dir path - if os.path.exists(model_name_or_path): # Still TOCTOU? - logger.info(f"Directory {model_name_or_path} already exists - skipping") - continue - - os.makedirs(model_name_or_path, exist_ok=True) - - logger.info(f"Downloading {name} to {model_name_or_path}") - - # model and repo urls are not the same - download_file_with_auth( - url=f"{url_resolve}/tokenizer.json", - token=token, - save_path=model_tokenizer_path - ) - - # Get the models hyper params - download_file_with_auth( - url=f"{url_resolve}/config.json", - token=token, - save_path=f"{model_name_or_path}/config.json" - ) - - # Handle sentencepiece tokenizer + files = ["config.json", "tokenizer.json", "tokenizer_config.json"] if tokt == TOKENIZER_TYPE.SPM: - download_file_with_auth( - url=f"{url_resolve}/tokenizer.model", - token=token, - save_path=f"{model_name_or_path}/tokenizer.model" - ) + files.append("tokenizer.model") + + for file in files: + save_path = f"models/tokenizers/{name}/{file}" + if os.path.isfile(save_path): + logger.info(f"{name}: File {save_path} already exists - skipping") + continue + download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) + + +for model in models: + try: + download_model(model) + except Exception as e: + logger.error(f"Failed to download model {model['name']}. Error: {e}") - # Get the tokenizer config - download_file_with_auth( - url=f"{url_resolve}/tokenizer_config.json", - token=token, - save_path=f"{model_name_or_path}/tokenizer_config.json" - ) # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: -# TODO: auto-update convert-hf-to-gguf.py with the generated function src_ifs = "" for model in models: @@ -186,7 +156,7 @@ for model in models: logger.info(f"chkhsh: {chkhsh}") # print the "pre_tokenizer" content from the tokenizer.json - with open(model_tokenizer_path, "r", encoding="utf-8") as f: + with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f: cfg = json.load(f) normalizer = cfg["normalizer"] logger.info("normalizer: " + json.dumps(normalizer, indent=4)) @@ -243,11 +213,18 @@ src_func = f""" return res """ -print(src_func) # noqa: NP100 +convert_py_pth = pathlib.Path("convert-hf-to-gguf.py") +convert_py = convert_py_pth.read_text() +convert_py = re.sub( + r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)", + lambda m: m.group(1) + src_func + m.group(3), + convert_py, + flags=re.DOTALL | re.MULTILINE, +) -logger.info("\n") -logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!") -logger.info("\n") +convert_py_pth.write_text(convert_py) + +logger.info("+++ convert-hf-to-gguf.py was updated") # generate tests for each tokenizer model diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index e9156a832..805dd6a2f 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -412,6 +412,7 @@ class Model: # NOTE: this function is generated by convert-hf-to-gguf-update.py # do not modify it manually! # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # is specific for the BPE pre-tokenizer used by the model @@ -511,6 +512,7 @@ class Model: logger.debug(f"chkhsh: {chkhsh}") return res + # Marker: End get_vocab_base_pre def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() @@ -548,7 +550,7 @@ class Model: # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.special_tokens - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} for i in range(vocab_size): if i not in reverse_vocab: diff --git a/docs/debugging-tests.md b/docs/debugging-tests.md index 51a125e19..18407f688 100644 --- a/docs/debugging-tests.md +++ b/docs/debugging-tests.md @@ -1,6 +1,6 @@ # Debugging Tests Tips -## How to run & debug a specific test without anything else to keep the feedback loop short? +## How to run & execute or debug a specific test without anything else to keep the feedback loop short? There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number. @@ -10,13 +10,27 @@ For example, running the following command will output an interactive list from It will then build & run in the debugger for you. +To just execute a test and get back a PASS or FAIL message run: + ```bash ./scripts/debug-test.sh test-tokenizer +``` + +To test in GDB use the `-g` flag to enable gdb test mode. + +```bash +./scripts/debug-test.sh -g test-tokenizer # Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows: >>> b main ``` +To speed up the testing loop, if you know your test number you can just run it similar to below: + +```bash +./scripts/debug-test.sh test 23 +``` + For further reference use `debug-test.sh -h` to print help.   @@ -41,7 +55,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON .. make -j ``` -#### Step 3.1: Identify Test Command for Debugging +#### Step 3: Find all tests available that matches REGEX The output of this command will give you the command & arguments needed to run GDB. @@ -69,11 +83,13 @@ Labels: main ... ``` -So for test #1 we can tell these two pieces of relevant information: +#### Step 4: Identify Test Command for Debugging + +So for test #1 above we can tell these two pieces of relevant information: * Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0` * Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf` -#### Step 3.2: Run GDB on test command +#### Step 5: Run GDB on test command Based on the ctest 'test command' report above we can then run a gdb session via this command below: diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 41f377376..7c15d2aa4 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -56,6 +56,10 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv, params); exit(0); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + print_usage(argc, argv, params); + exit(0); } } return true; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7e0d068f8..b598076d3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2387,6 +2387,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); + printf(" --rpc SERVERS comma separated list of RPC servers\n"); printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n"); printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); @@ -2439,6 +2440,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, break; } sparams.port = std::stoi(argv[i]); + } else if (arg == "--rpc") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rpc_servers = argv[i]; } else if (arg == "--host") { if (++i >= argc) { invalid_param = true; diff --git a/ggml-cuda/fattn-tile-f16.cu b/ggml-cuda/fattn-tile-f16.cu new file mode 100644 index 000000000..d2a1077ed --- /dev/null +++ b/ggml-cuda/fattn-tile-f16.cu @@ -0,0 +1,395 @@ +#include "common.cuh" +#include "fattn-common.cuh" +#include "fattn-tile-f16.cuh" + +#define FATTN_KQ_STRIDE_TILE_F16 64 + +template // D == head size +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +__launch_bounds__(nwarps*WARP_SIZE, 1) +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +static __global__ void flash_attn_tile_ext_f16( + const char * __restrict__ Q, + const char * __restrict__ K, + const char * __restrict__ V, + const char * __restrict__ mask, + float * __restrict__ dst, + float2 * __restrict__ dst_meta, + const float scale, + const float max_bias, + const float m0, + const float m1, + const uint32_t n_head_log2, + const int ne00, + const int ne01, + const int ne02, + const int ne03, + const int ne10, + const int ne11, + const int ne12, + const int ne13, + const int ne31, + const int nb31, + const int nb01, + const int nb02, + const int nb03, + const int nb11, + const int nb12, + const int nb13, + const int ne0, + const int ne1, + const int ne2, + const int ne3) { +#if FP16_AVAILABLE + //In this kernel Q, K, V are matrices while i, j, k are matrix indices. + + const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. + const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. + + const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. + const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0); + const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio)); + const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) mask + ne11*ic0; + + const int stride_KV2 = nb11 / sizeof(half2); + + half slopeh = __float2half(1.0f); + + // ALiBi + if (max_bias > 0.0f) { + const uint32_t h = blockIdx.y; + + const float base = h < n_head_log2 ? m0 : m1; + const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slopeh = __float2half(powf(base, exph)); + } + + static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); + + __shared__ half KQ[ncols*FATTN_KQ_STRIDE_TILE_F16]; + half2 * KQ2 = (half2 *) KQ; + + __shared__ half2 KV_tmp[FATTN_KQ_STRIDE_TILE_F16][D/2 + 1]; // Pad D to avoid memory bank conflicts. + + half kqmax[ncols/nwarps]; +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + kqmax[j0/nwarps] = -HALF_MAX_HALF; + } + half2 kqsum[ncols/nwarps] = {{0.0f, 0.0f}}; + + half2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}}; + + // Convert Q to half2 and store in registers: + __shared__ half2 Q_h2[ncols][D/2]; +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i]; + Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y); + } + } + + __syncthreads(); + + const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F16; + for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F16) { + // Calculate KQ tile and keep track of new maximum KQ values: + + half kqmax_new[ncols/nwarps]; +#pragma unroll + for (int j = 0; j < ncols/nwarps; ++j) { + kqmax_new[j] = kqmax[j]; + } + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += nwarps) { + const int i_KQ = i_KQ_0 + threadIdx.y; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) { + const int k_KQ = k_KQ_0 + threadIdx.x; + + KV_tmp[i_KQ][k_KQ] = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ]; + } + } + + __syncthreads(); + + half2 sum2[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE][ncols/nwarps] = {{{0.0f, 0.0f}}}; + +#pragma unroll + for (int k_KQ = 0; k_KQ < D/2; ++k_KQ) { + half2 K_k[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE]; + half2 Q_k[ncols/nwarps]; + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) { + const int i_KQ = i_KQ_0 + threadIdx.x; + + K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ]; + } +#pragma unroll + for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { + const int j_KQ = j_KQ_0 + threadIdx.y; + + Q_k[j_KQ_0/nwarps] = Q_h2[j_KQ][k_KQ]; + } + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) { +#pragma unroll + for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { + sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE]*Q_k[j_KQ_0/nwarps]; + } + } + } + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) { + const int i_KQ = i_KQ_0 + threadIdx.x; + +#pragma unroll + for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { + const int j_KQ = j_KQ_0 + threadIdx.y; + + half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]); + sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f); + + kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum); + + KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F16 + i_KQ] = sum; + } + } + + __syncthreads(); + +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]); + const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new[j0/nwarps])); + kqmax[j0/nwarps] = kqmax_new[j0/nwarps]; + +#pragma unroll + for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F16/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const half2 diff = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] - __half2half2(kqmax[j0/nwarps]); + const half2 val = h2exp(diff); + kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + val; + KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] = val; + } + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale; + } + } + + __syncthreads(); + +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += nwarps) { + const int k = k0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + KV_tmp[k][i] = V_h2[(k_VKQ_0 + k)*stride_KV2 + i]; + } + } + + __syncthreads(); + +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += 2) { + half2 V_k[(D/2)/WARP_SIZE][2]; + half2 KQ_k[ncols/nwarps]; + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + V_k[i0/WARP_SIZE][0] = KV_tmp[k0 + 0][i]; + V_k[i0/WARP_SIZE][1] = KV_tmp[k0 + 1][i]; + } +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + KQ_k[j0/nwarps] = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + k0/2]; + } + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][0]* __low2half2(KQ_k[j0/nwarps]); + VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][1]*__high2half2(KQ_k[j0/nwarps]); + } + } + } + + __syncthreads(); + } + +#pragma unroll + for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) { + const int j_VKQ = j_VKQ_0 + threadIdx.y; + + half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]); + kqsum_j = warp_reduce_sum(kqsum_j); + +#pragma unroll + for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) { + const int i0 = i00 + 2*threadIdx.x; + + half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)]; + if (parallel_blocks == 1) { + dst_val /= __half2half2(kqsum_j); + } + const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; + dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = __low2float(dst_val); + dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = __high2float(dst_val); + } + + if (parallel_blocks != 1 && threadIdx.x == 0) { + dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j); + } + } +#else + NO_DEVICE_CODE; +#endif // FP16_AVAILABLE +} + +template void launch_fattn_tile_f16( + const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask, + ggml_cuda_pool & pool, cudaStream_t main_stream +) { + ggml_cuda_pool_alloc dst_tmp(pool); + ggml_cuda_pool_alloc dst_tmp_meta(pool); + + if (parallel_blocks > 1) { + dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV)); + dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV)); + } + + constexpr int nwarps = 8; + const dim3 block_dim(WARP_SIZE, nwarps, 1); + const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]); + const int shmem = 0; + + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float)); + + const uint32_t n_head = Q->ne[2]; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + flash_attn_tile_ext_f16 + <<>> ( + (const char *) Q->data, + (const char *) K->data, + (const char *) V->data, + mask ? ((const char *) mask->data) : nullptr, + parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr, + scale, max_bias, m0, m1, n_head_log2, + Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], + K->ne[0], K->ne[1], K->ne[2], K->ne[3], + mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0, + Q->nb[1], Q->nb[2], Q->nb[3], + K->nb[1], K->nb[2], K->nb[3], + KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3] + ); + CUDA_CHECK(cudaGetLastError()); + + if (parallel_blocks == 1) { + return; + } + + const dim3 block_dim_combine(D, 1, 1); + const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z); + const int shmem_combine = 0; + + flash_attn_combine_results + <<>> + (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data); + CUDA_CHECK(cudaGetLastError()); +} + +void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * Q = dst->src[0]; + const ggml_tensor * K = dst->src[1]; + const ggml_tensor * V = dst->src[2]; + + const ggml_tensor * mask = dst->src[3]; + + ggml_tensor * KQV = dst; + + const int32_t precision = KQV->op_params[2]; + GGML_ASSERT(precision == GGML_PREC_DEFAULT); + GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128."); + + if (Q->ne[1] <= 16) { + constexpr int cols_per_block = 16; + constexpr int parallel_blocks = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_tile_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_tile_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + if (Q->ne[1] <= 32) { + constexpr int cols_per_block = 32; + constexpr int parallel_blocks = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_tile_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_tile_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + constexpr int cols_per_block = 32; + constexpr int parallel_blocks = 1; + switch (Q->ne[0]) { + case 64: + launch_fattn_tile_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_tile_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } +} diff --git a/ggml-cuda/fattn-tile-f16.cuh b/ggml-cuda/fattn-tile-f16.cuh new file mode 100644 index 000000000..ffc587842 --- /dev/null +++ b/ggml-cuda/fattn-tile-f16.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml-cuda/fattn-tile-f32.cu new file mode 100644 index 000000000..176895edd --- /dev/null +++ b/ggml-cuda/fattn-tile-f32.cu @@ -0,0 +1,393 @@ +#include "common.cuh" +#include "fattn-common.cuh" +#include "fattn-tile-f32.cuh" + +#define FATTN_KQ_STRIDE_TILE_F32 32 + +template // D == head size +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +__launch_bounds__(nwarps*WARP_SIZE, 1) +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +static __global__ void flash_attn_tile_ext_f32( + const char * __restrict__ Q, + const char * __restrict__ K, + const char * __restrict__ V, + const char * __restrict__ mask, + float * __restrict__ dst, + float2 * __restrict__ dst_meta, + const float scale, + const float max_bias, + const float m0, + const float m1, + const uint32_t n_head_log2, + const int ne00, + const int ne01, + const int ne02, + const int ne03, + const int ne10, + const int ne11, + const int ne12, + const int ne13, + const int ne31, + const int nb31, + const int nb01, + const int nb02, + const int nb03, + const int nb11, + const int nb12, + const int nb13, + const int ne0, + const int ne1, + const int ne2, + const int ne3) { + //In this kernel Q, K, V are matrices while i, j, k are matrix indices. + + const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. + const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. + + const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. + const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0); + const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio)); + const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) mask + ne11*ic0; + + const int stride_KV2 = nb11 / sizeof(half2); + + float slope = 1.0f; + + // ALiBi + if (max_bias > 0.0f) { + const uint32_t h = blockIdx.y; + + const float base = h < n_head_log2 ? m0 : m1; + const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = powf(base, exph); + } + + static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); + + __shared__ float KQ[ncols*FATTN_KQ_STRIDE_TILE_F32]; + + __shared__ float KV_tmp[FATTN_KQ_STRIDE_TILE_F32][D + 1]; // Pad D to avoid memory bank conflicts. + float2 * KV_tmp2 = (float2 *) KV_tmp; + + float kqmax[ncols/nwarps]; +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + kqmax[j0/nwarps] = -FLT_MAX/2.0f; + } + float kqsum[ncols/nwarps] = {0.0f}; + + float2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}}; + + // Convert Q to half2 and store in registers: + __shared__ float Q_f[ncols][D]; +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) { + float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x]; + Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale; + Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale; + } + } + + __syncthreads(); + + const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F32; + for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F32) { + // Calculate KQ tile and keep track of new maximum KQ values: + + float kqmax_new[ncols/nwarps]; +#pragma unroll + for (int j = 0; j < ncols/nwarps; ++j) { + kqmax_new[j] = kqmax[j]; + } + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += nwarps) { + const int i_KQ = i_KQ_0 + threadIdx.y; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) { + const half2 tmp = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x]; + KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] = __low2float(tmp); + KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp); + } + } + + __syncthreads(); + + float sum[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE][ncols/nwarps] = {{0.0f}}; + +#pragma unroll + for (int k_KQ = 0; k_KQ < D; ++k_KQ) { + float K_k[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE]; + float Q_k[ncols/nwarps]; + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) { + const int i_KQ = i_KQ_0 + threadIdx.x; + + K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ]; + } +#pragma unroll + for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { + const int j_KQ = j_KQ_0 + threadIdx.y; + + Q_k[j_KQ_0/nwarps] = Q_f[j_KQ][k_KQ]; + } + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) { +#pragma unroll + for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { + sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE] * Q_k[j_KQ_0/nwarps]; + } + } + } + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) { + const int i_KQ = i_KQ_0 + threadIdx.x; + +#pragma unroll + for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { + const int j_KQ = j_KQ_0 + threadIdx.y; + + sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; + + kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]); + + KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F32 + i_KQ] = sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]; + } + } + + __syncthreads(); + +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]); + const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]); + kqmax[j0/nwarps] = kqmax_new[j0/nwarps]; + + float kqsum_add = 0.0f; +#pragma unroll + for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F32; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const float diff = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] - kqmax[j0/nwarps]; + const float val = expf(diff); + kqsum_add += val; + KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] = val; + } + kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add; + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale; + VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale; + } + } + + __syncthreads(); + +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F32; k0 += nwarps) { + const int k = k0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + KV_tmp2[k*(D/2) + i].x = __low2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]); + KV_tmp2[k*(D/2) + i].y = __high2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]); + } + } + + __syncthreads(); + +#pragma unroll + for (int k = 0; k < FATTN_KQ_STRIDE_TILE_F32; ++k) { + float2 V_k[(D/2)/WARP_SIZE]; + float KQ_k[ncols/nwarps]; + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + V_k[i0/WARP_SIZE] = KV_tmp2[k*(D/2) + i]; + } +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + KQ_k[j0/nwarps] = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + k]; + } + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + VKQ[j0/nwarps][i0/WARP_SIZE].x += V_k[i0/WARP_SIZE].x*KQ_k[j0/nwarps]; + VKQ[j0/nwarps][i0/WARP_SIZE].y += V_k[i0/WARP_SIZE].y*KQ_k[j0/nwarps]; + } + } + } + + __syncthreads(); + } + +#pragma unroll + for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) { + const int j_VKQ = j_VKQ_0 + threadIdx.y; + + float kqsum_j = kqsum[j_VKQ_0/nwarps]; + kqsum_j = warp_reduce_sum(kqsum_j); + +#pragma unroll + for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) { + const int i0 = i00 + 2*threadIdx.x; + + float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)]; + if (parallel_blocks == 1) { + dst_val.x /= kqsum_j; + dst_val.y /= kqsum_j; + } + const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; + dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = dst_val.x; + dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = dst_val.y; + } + + if (parallel_blocks != 1 && threadIdx.x == 0) { + dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j); + } + } +} + +template void launch_fattn_tile_f32( + const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask, + ggml_cuda_pool & pool, cudaStream_t main_stream +) { + ggml_cuda_pool_alloc dst_tmp(pool); + ggml_cuda_pool_alloc dst_tmp_meta(pool); + + if (parallel_blocks > 1) { + dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV)); + dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV)); + } + + constexpr int nwarps = 8; + const dim3 block_dim(WARP_SIZE, nwarps, 1); + const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]); + const int shmem = 0; + + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float)); + + const uint32_t n_head = Q->ne[2]; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + flash_attn_tile_ext_f32 + <<>> ( + (const char *) Q->data, + (const char *) K->data, + (const char *) V->data, + mask ? ((const char *) mask->data) : nullptr, + parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr, + scale, max_bias, m0, m1, n_head_log2, + Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], + K->ne[0], K->ne[1], K->ne[2], K->ne[3], + mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0, + Q->nb[1], Q->nb[2], Q->nb[3], + K->nb[1], K->nb[2], K->nb[3], + KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3] + ); + CUDA_CHECK(cudaGetLastError()); + + if (parallel_blocks == 1) { + return; + } + + const dim3 block_dim_combine(D, 1, 1); + const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z); + const int shmem_combine = 0; + + flash_attn_combine_results + <<>> + (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data); + CUDA_CHECK(cudaGetLastError()); +} + +void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * Q = dst->src[0]; + const ggml_tensor * K = dst->src[1]; + const ggml_tensor * V = dst->src[2]; + + const ggml_tensor * mask = dst->src[3]; + + ggml_tensor * KQV = dst; + + const int32_t precision = KQV->op_params[2]; + GGML_ASSERT(precision == GGML_PREC_DEFAULT); + GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128."); + + if (Q->ne[1] <= 16) { + constexpr int cols_per_block = 16; + constexpr int parallel_blocks = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_tile_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_tile_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + if (Q->ne[1] <= 32) { + constexpr int cols_per_block = 32; + constexpr int parallel_blocks = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_tile_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_tile_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + constexpr int cols_per_block = 32; + constexpr int parallel_blocks = 1; + switch (Q->ne[0]) { + case 64: + launch_fattn_tile_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_tile_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } +} diff --git a/ggml-cuda/fattn-tile-f32.cuh b/ggml-cuda/fattn-tile-f32.cuh new file mode 100644 index 000000000..b1c546c80 --- /dev/null +++ b/ggml-cuda/fattn-tile-f32.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/fattn-vec-f16.cu b/ggml-cuda/fattn-vec-f16.cu index cbf5f7835..a18be5ddc 100644 --- a/ggml-cuda/fattn-vec-f16.cu +++ b/ggml-cuda/fattn-vec-f16.cu @@ -57,7 +57,7 @@ static __global__ void flash_attn_vec_ext_f16( // ALiBi if (max_bias > 0.0f) { - const int h = blockIdx.y; + const uint32_t h = blockIdx.y; const float base = h < n_head_log2 ? m0 : m1; const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; @@ -232,11 +232,8 @@ static __global__ void flash_attn_vec_ext_f16( dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; } - if (parallel_blocks != 1 && tid != 0) { -#pragma unroll - for (int j = 0; j < ncols; ++j) { - dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]); - } + if (parallel_blocks != 1 && threadIdx.x < ncols) { + dst_meta[(ic0 + threadIdx.x)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[threadIdx.x], kqsum[threadIdx.x]); } #else NO_DEVICE_CODE; diff --git a/ggml-cuda/fattn-vec-f32.cu b/ggml-cuda/fattn-vec-f32.cu index 40c336ce3..91fcdc8c3 100644 --- a/ggml-cuda/fattn-vec-f32.cu +++ b/ggml-cuda/fattn-vec-f32.cu @@ -56,7 +56,7 @@ static __global__ void flash_attn_vec_ext_f32( // ALiBi if (max_bias > 0.0f) { - const int h = blockIdx.y; + const uint32_t h = blockIdx.y; const float base = h < n_head_log2 ? m0 : m1; const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; @@ -221,11 +221,8 @@ static __global__ void flash_attn_vec_ext_f32( dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; } - if (parallel_blocks != 1 && tid != 0) { -#pragma unroll - for (int j = 0; j < ncols; ++j) { - dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]); - } + if (parallel_blocks != 1 && threadIdx.x < ncols) { + dst_meta[(ic0 + threadIdx.x)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[threadIdx.x], kqsum[threadIdx.x]); } } diff --git a/ggml-cuda/fattn.cu b/ggml-cuda/fattn.cu index 419f8e752..a1918e258 100644 --- a/ggml-cuda/fattn.cu +++ b/ggml-cuda/fattn.cu @@ -1,5 +1,7 @@ #include "common.cuh" #include "fattn-common.cuh" +#include "fattn-tile-f16.cuh" +#include "fattn-tile-f32.cuh" #include "fattn-vec-f16.cuh" #include "fattn-vec-f32.cuh" #include "fattn.cuh" @@ -88,7 +90,7 @@ static __global__ void flash_attn_ext_f16( // ALiBi if (max_bias > 0.0f) { - const int h = blockIdx.y; + const uint32_t h = blockIdx.y; const float base = h < n_head_log2 ? m0 : m1; const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; @@ -541,13 +543,31 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst const int32_t precision = KQV->op_params[2]; + // On AMD the tile kernels perform poorly, use the vec kernel instead: + if (cc >= CC_OFFSET_AMD) { + if (precision == GGML_PREC_DEFAULT) { + ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst); + } else { + ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); + } + return; + } + if (!fast_fp16_available(cc)) { - ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); + if (Q->ne[1] <= 8) { + ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); + } else { + ggml_cuda_flash_attn_ext_tile_f32(ctx, dst); + } return; } if (!fp16_mma_available(cc)) { - ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst); + if (Q->ne[1] <= 8) { + ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst); + } else { + ggml_cuda_flash_attn_ext_tile_f16(ctx, dst); + } return; } diff --git a/ggml-quants.c b/ggml-quants.c index f13599f6b..9b291d522 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1986,7 +1986,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri for (int j = 0; j < QK_K/16; ++j) { if (quant_weights) { - const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL; + const float * qw = quant_weights + QK_K * i + 16*j; for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]); } else { for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l]; diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index ba392009f..4a9bfa52d 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -134,7 +134,13 @@ static bool set_no_delay(sockfd_t sockfd) { int flag = 1; // set TCP_NODELAY to disable Nagle's algorithm int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int)); - return ret >= 0; + return ret == 0; +} + +static bool set_reuse_addr(sockfd_t sockfd) { + int flag = 1; + int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int)); + return ret == 0; } static std::shared_ptr socket_connect(const char * host, int port) { @@ -181,7 +187,10 @@ static std::shared_ptr create_server_socket(const char * host, int por if (sock == nullptr) { return nullptr; } - + if (!set_reuse_addr(sockfd)) { + fprintf(stderr, "Failed to set SO_REUSEADDR\n"); + return nullptr; + } struct sockaddr_in serv_addr; serv_addr.sin_family = AF_INET; serv_addr.sin_addr.s_addr = inet_addr(host); diff --git a/ggml.c b/ggml.c index 684d84235..55152bce4 100644 --- a/ggml.c +++ b/ggml.c @@ -165,9 +165,6 @@ void ggml_print_backtrace(void) { #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 -#define GGML_SILU_FP16 -// #define GGML_CROSS_ENTROPY_EXP_FP16 -// #define GGML_FLASH_ATTN_EXP_FP16 #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -318,12 +315,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16]; // precomputed quick gelu table for f16 (128 KB) static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; -// precomputed silu table for f16 (128 KB) -static ggml_fp16_t ggml_table_silu_f16[1 << 16]; - -// precomputed exp table for f16 (128 KB) -static ggml_fp16_t ggml_table_exp_f16[1 << 16]; - // precomputed f32 table for f16 (256 KB) (ggml-impl.h) float ggml_table_f32_f16[1 << 16]; @@ -2085,52 +2076,291 @@ inline static float ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); } -//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { -// const uint16_t * i16 = (const uint16_t *) x; -// for (int i = 0; i < n; ++i) { -// y[i] = ggml_table_silu_f16[i16[i]]; -// } -//} +#if defined(__ARM_NEON) -#ifdef GGML_SILU_FP16 -inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { - uint16_t t; - for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); - memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]); - } +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static float32x4_t ggml_v_expf(float32x4_t x) { + const float32x4_t r = vdupq_n_f32(0x1.8p23f); + const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f)); + const float32x4_t n = vsubq_f32(z, r); + const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n, + vdupq_n_f32(0x1.7f7d1cp-20f)); + const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23); + const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1)))); + const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126)); + const float32x4_t u = vmulq_f32(b, b); + const float32x4_t j = vfmaq_f32( + vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b), + vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b), + vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u); + if (!vpaddd_u64(vreinterpretq_u64_u32(c))) + return vfmaq_f32(k, j, k); + const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000)); + const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000))); + const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d)); + return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1), + vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j))); } + +// computes silu x/(1+exp(-x)) in single precision vector +inline static float32x4_t ggml_v_silu(float32x4_t x) { + const float32x4_t one = vdupq_n_f32(1.0f); + const float32x4_t zero = vdupq_n_f32(0.0f); + const float32x4_t neg_x = vsubq_f32(zero, x); + const float32x4_t exp_neg_x = ggml_v_expf(neg_x); + const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x); + return vdivq_f32(x, one_plus_exp_neg_x); +} + +#elif defined(__AVX512F__) && defined(__AVX512DQ__) + +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static __m512 ggml_v_expf(__m512 x) { + const __m512 r = _mm512_set1_ps(0x1.8p23f); + const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r); + const __m512 n = _mm512_sub_ps(z, r); + const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f), + _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x)); + const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23); + const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1)))); + const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ); + const __m512 u = _mm512_mul_ps(b, b); + const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b, + _mm512_set1_ps(0x1.573e2ep-5f)), u, + _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b, + _mm512_set1_ps(0x1.fffdb6p-2f))), + u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b)); + if (_mm512_kortestz(c, c)) + return _mm512_fmadd_ps(j, k, k); + const __m512i g = _mm512_and_si512( + _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)), + _mm512_set1_epi32(0x82000000u)); + const __m512 s1 = + _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u))); + const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g)); + const __mmask16 d = + _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ); + return _mm512_mask_blend_ps( + d, _mm512_mask_blend_ps( + c, _mm512_fmadd_ps(k, j, k), + _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)), + _mm512_mul_ps(s1, s1)); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static __m512 ggml_v_silu(__m512 x) { + const __m512 one = _mm512_set1_ps(1); + const __m512 zero = _mm512_setzero_ps(); + const __m512 neg_x = _mm512_sub_ps(zero, x); + const __m512 exp_neg_x = ggml_v_expf(neg_x); + const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x); + return _mm512_div_ps(x, one_plus_exp_neg_x); +} + +#elif defined(__AVX2__) && defined(__FMA__) + +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static __m256 ggml_v_expf(__m256 x) { + const __m256 r = _mm256_set1_ps(0x1.8p23f); + const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r); + const __m256 n = _mm256_sub_ps(z, r); + const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f), + _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x)); + const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23); + const __m256 k = _mm256_castsi256_ps( + _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1)))); + const __m256i c = _mm256_castps_si256( + _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), + _mm256_set1_ps(126), _CMP_GT_OQ)); + const __m256 u = _mm256_mul_ps(b, b); + const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b, + _mm256_set1_ps(0x1.573e2ep-5f)), u, + _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b, + _mm256_set1_ps(0x1.fffdb6p-2f))), + u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b)); + if (!_mm256_movemask_ps(_mm256_castsi256_ps(c))) + return _mm256_fmadd_ps(j, k, k); + const __m256i g = _mm256_and_si256( + _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)), + _mm256_set1_epi32(0x82000000u)); + const __m256 s1 = + _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u))); + const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g)); + const __m256i d = _mm256_castps_si256( + _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), + _mm256_set1_ps(192), _CMP_GT_OQ)); + return _mm256_or_ps( + _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)), + _mm256_andnot_ps( + _mm256_castsi256_ps(d), + _mm256_or_ps( + _mm256_and_ps(_mm256_castsi256_ps(c), + _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)), + _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k))))); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static __m256 ggml_v_silu(__m256 x) { + const __m256 one = _mm256_set1_ps(1); + const __m256 zero = _mm256_setzero_ps(); + const __m256 neg_x = _mm256_sub_ps(zero, x); + const __m256 exp_neg_x = ggml_v_expf(neg_x); + const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x); + return _mm256_div_ps(x, one_plus_exp_neg_x); +} + +#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON + +#if defined(__FMA__) +#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z) +#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z) #else -inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { - for (int i = 0; i < n; ++i) { +#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z) +#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y)) +#endif + +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static __m128 ggml_v_expf(__m128 x) { + const __m128 r = _mm_set1_ps(0x1.8p23f); + const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r); + const __m128 n = _mm_sub_ps(z, r); + const __m128 b = + NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x)); + const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23); + const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1)))); + const __m128i c = + _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126))); + const __m128 u = _mm_mul_ps(b, b); + const __m128 j = + MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u, + MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))), + u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b)); + if (!_mm_movemask_epi8(c)) + return MADD128(j, k, k); + const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())), + _mm_set1_epi32(0x82000000u)); + const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u))); + const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g)); + const __m128i d = + _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192))); + return _mm_or_ps( + _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)), + _mm_andnot_ps(_mm_castsi128_ps(d), + _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)), + _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k))))); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static __m128 ggml_v_silu(__m128 x) { + const __m128 one = _mm_set1_ps(1); + const __m128 zero = _mm_setzero_ps(); + const __m128 neg_x = _mm_sub_ps(zero, x); + const __m128 exp_neg_x = ggml_v_expf(neg_x); + const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x); + return _mm_div_ps(x, one_plus_exp_neg_x); +} + +#endif // __ARM_NEON / __AVX2__ / __SSE2__ + +static void ggml_vec_silu_f32(const int n, float * y, const float * x) { + int i = 0; +#if defined(__AVX512F__) && defined(__AVX512DQ__) + for (; i + 15 < n; i += 16) { + _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i))); + } +#elif defined(__AVX2__) && defined(__FMA__) + for (; i + 7 < n; i += 8) { + _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i))); + } +#elif defined(__SSE2__) + for (; i + 3 < n; i += 4) { + _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i))); + } +#elif defined(__ARM_NEON) + for (; i + 3 < n; i += 4) { + vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i))); + } +#endif + for (; i < n; ++i) { y[i] = ggml_silu_f32(x[i]); } } + +static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { + int i = 0; + ggml_float sum = 0; +#if defined(__AVX512F__) && defined(__AVX512DQ__) + for (; i + 15 < n; i += 16) { + __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i), + _mm512_set1_ps(max))); + _mm512_storeu_ps(y + i, val); + sum += (ggml_float)_mm512_reduce_add_ps(val); + } +#elif defined(__AVX2__) && defined(__FMA__) + for (; i + 7 < n; i += 8) { + __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i), + _mm256_set1_ps(max))); + _mm256_storeu_ps(y + i, val); + __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1), + _mm256_castps256_ps128(val)); + val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2)); + val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2)); + sum += (ggml_float)_mm_cvtss_f32(val2); + } +#elif defined(__SSE2__) + for (; i + 3 < n; i += 4) { + __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i), + _mm_set1_ps(max))); + _mm_storeu_ps(y + i, val); +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) + val = _mm_add_ps(val, _mm_movehl_ps(val, val)); + val = _mm_add_ss(val, _mm_movehdup_ps(val)); +#else + __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1)); + val = _mm_add_ps(val, tmp); + tmp = _mm_movehl_ps(tmp, val); + val = _mm_add_ss(val, tmp); #endif + sum += (ggml_float)_mm_cvtss_f32(val); + } +#elif defined(__ARM_NEON) + for (; i + 3 < n; i += 4) { + float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i), + vdupq_n_f32(max))); + vst1q_f32(y + i, val); + sum += (ggml_float)vaddvq_f32(val); + } +#endif + for (; i < n; ++i) { + float val = expf(x[i] - max); + sum += (ggml_float)val; + y[i] = val; + } + return sum; +} inline static float ggml_silu_backward_f32(float x, float dy) { const float s = 1.0f/(1.0f + expf(-x)); return dy*s*(1.0f + x*(1.0f - s)); } -#ifdef GGML_SILU_FP16 -inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { - for (int i = 0; i < n; ++i) { - // we did not use x[i] to compute forward silu but its f16 equivalent - // take derivative at f16 of x[i]: - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); - float usedx = GGML_FP16_TO_FP32(fp16); - dx[i] = ggml_silu_backward_f32(usedx, dy[i]); - } -} -#else inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { for (int i = 0; i < n; ++i) { dx[i] = ggml_silu_backward_f32(x[i], dy[i]); } } -#endif inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE @@ -2922,8 +3152,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); - ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); - ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); } const uint64_t t_end = ggml_time_us(); UNUSED(t_end); @@ -13600,22 +13828,7 @@ static void ggml_compute_forward_soft_max_f32( float max = -INFINITY; ggml_vec_max_f32(nc, &max, wp); - ggml_float sum = 0.0; - - uint16_t scvt; - for (int i = 0; i < nc; i++) { - if (wp[i] == -INFINITY) { - dp[i] = 0.0f; - } else { - // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max); - ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max); - memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]); - sum += (ggml_float)val; - dp[i] = val; - } - } - + ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max); assert(sum > 0.0); sum = 1.0/sum; @@ -15374,37 +15587,7 @@ static void ggml_compute_forward_flash_attn_f32( vvexpf(S, S, &Mup); ggml_vec_sum_f32(Mup, &sum, S); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { - if (i >= masked_begin) { - break; - } - float * SS = S + i; - - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (i + j >= masked_begin) { - break; - } else if (SS[j] == -INFINITY) { - SS[j] = 0.0f; - } else { -#ifndef GGML_FLASH_ATTN_EXP_FP16 - const float val = expf(SS[j] - max); -#else - ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]); -#endif - sump[j] += (ggml_float)val; - SS[j] = val; - } - } - } - - for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } + sum = ggml_vec_soft_max_f32(Mup, S, S, max); #endif } @@ -15586,28 +15769,7 @@ static void ggml_compute_forward_flash_attn_f16( vvexpf(S, S, &Mup); ggml_vec_sum_f32(Mup, &sum, S); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { - float * SS = S + i; - - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (SS[j] == -INFINITY) { - SS[j] = 0.0f; - } else { - ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]); - sump[j] += (ggml_float)val; - SS[j] = val; - } - } - } - - for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } + sum = ggml_vec_soft_max_f32(Mup, S, S, max); #endif } @@ -16234,38 +16396,7 @@ static void ggml_compute_forward_flash_attn_back_f32( vvexpf(SM, SM, &Mup); ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { - if (i >= masked_begin) { - break; - } - float * SR = S + i; - float * SW = SM + i; - - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (i + j >= masked_begin) { - break; - } else if (SR[j] == -INFINITY) { - SW[j] = 0.0f; - } else { -#ifndef GGML_FLASH_ATTN_EXP_FP16 - const float val = expf(SR[j] - max); -#else - ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]); -#endif - sump[j] += (ggml_float)val; - SW[j] = val; - } - } - } - - for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } + sum = ggml_vec_soft_max_f32(Mup, SM, S, max); #endif } @@ -17291,35 +17422,15 @@ static void ggml_compute_forward_cross_entropy_loss_f32( assert(!isnan(s1[i])); } #endif + // soft_max - ggml_float sum = 0.0; - { - float max = -INFINITY; - ggml_vec_max_f32(nc, &max, s0); - - uint16_t scvt; UNUSED(scvt); - for (int i = 0; i < nc; i++) { - if (s0[i] == -INFINITY) { - st[i] = 0.0f; - } else { -#ifndef GGML_CROSS_ENTROPY_EXP_FP16 - const float s = s0[i] - max; - const float val = expf(s); -#else - ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); - memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]); -#endif - sum += (ggml_float)val; - st[i] = val; - } - } - - assert(sum > 0.0); - // sum = 1.0/sum; - } - // avoid log(0) by rescaling from [0..1] to [eps..1] + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + ggml_float sum = ggml_vec_soft_max_f32(nc, st, s0, max); + assert(sum > 0.0); sum = (1.0 - eps) / sum; + + // avoid log(0) by rescaling from [0..1] to [eps..1] ggml_vec_scale_f32(nc, st, sum); ggml_vec_add1_f32(nc, st, st, eps); ggml_vec_log_f32(nc, st, st); @@ -17409,32 +17520,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( #endif // soft_max - ggml_float sum = 0.0; - { - float max = -INFINITY; - ggml_vec_max_f32(nc, &max, s0); - - uint16_t scvt; UNUSED(scvt); - for (int i = 0; i < nc; i++) { - if (s0[i] == -INFINITY) { - ds0[i] = 0.0f; - } else { -#ifndef GGML_CROSS_ENTROPY_EXP_FP16 - const float s = s0[i] - max; - const float val = expf(s); -#else - ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); - memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]); -#endif - sum += (ggml_float)val; - ds0[i] = val; - } - } - - assert(sum > 0.0); - sum = (1.0 - eps)/sum; - } + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max); + assert(sum > 0.0); + sum = (1.0 - eps) / sum; // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr ggml_vec_scale_f32(nc, ds0, sum); diff --git a/llama.cpp b/llama.cpp index dd7729e57..aa4bd5fba 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6625,6 +6625,7 @@ static struct ggml_tensor * llm_build_kqv( const int64_t n_embd_head_k = hparams.n_embd_head_k; const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_head_v = hparams.n_embd_head_v; + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); cb(q, "q", il); @@ -6647,8 +6648,8 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * v = ggml_view_3d(ctx, kv.v_l[il], n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv.v_l[il]->type, n_embd_head_k), + ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv.v_l[il]->type, n_embd_head_v), 0); cb(v, "v", il); @@ -6658,7 +6659,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); } - cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); @@ -6703,7 +6704,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens); + cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens); cb(cur, "kqv_merged_cont", il); } @@ -12827,6 +12828,13 @@ static std::vector llama_tokenize_internal(const llama_vocab & } } + if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) { + LLAMA_LOG_WARN( + "%s: Added a BOS token to the prompt as specified by the model but the prompt " + "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " + "Are you sure this is what you want?\n", __FUNCTION__); + } + if (add_special && vocab.special_add_eos == 1) { GGML_ASSERT(vocab.special_eos_id != -1); output.push_back(vocab.special_eos_id); @@ -12853,6 +12861,13 @@ static std::vector llama_tokenize_internal(const llama_vocab & } } + if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) { + LLAMA_LOG_WARN( + "%s: Added a BOS token to the prompt as specified by the model but the prompt " + "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " + "Are you sure this is what you want?\n", __FUNCTION__); + } + if (add_special && vocab.special_add_eos == 1) { GGML_ASSERT(vocab.special_add_eos != -1); output.push_back(vocab.special_eos_id); @@ -13913,9 +13928,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_ // Sample the next word X using top-k sampling llama_sample_top_k(nullptr, candidates, int(k), 1); - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; llama_token X = llama_sample_token(ctx, candidates); t_start_sample_us = ggml_time_us(); @@ -13929,9 +13942,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_ // Update mu using the learning rate and error *mu = *mu - eta * e; - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; return X; } diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh index 231a23d69..7b2b601a9 100755 --- a/scripts/debug-test.sh +++ b/scripts/debug-test.sh @@ -1,117 +1,203 @@ #!/bin/bash -test_suite=${1:-} -test_number=${2:-} PROG=${0##*/} build_dir="build-ci-debug" -if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then - echo "Usage: $PROG [OPTION]... (test_number)" - echo "Debug specific ctest program." - echo - echo "Options:" - echo " -h, --help Display this help and exit" - echo - echo "Arguments:" - echo " (Mandatory) Supply one regex to the script to filter tests" - echo " (test_number) (Optional) Test number to run a specific test" - echo - echo "Example:" - echo " $PROG test-tokenizer" - echo " $PROG test-tokenizer 3" - echo - exit 0 -fi +# Print Color Commands +red=$(tput setaf 1) +green=$(tput setaf 2) +yellow=$(tput setaf 3) +blue=$(tput setaf 4) +magenta=$(tput setaf 5) +cyan=$(tput setaf 6) +normal=$(tput sgr0) -# Function to select and debug a test -function select_test() { - test_suite=${1:-test} - test_number=${2:-} - # Sanity Check If Tests Is Detected - printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n" - tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1')) - if [ ${#tests[@]} -eq 0 ] - then - echo "No tests avaliable... check your compliation process..." - echo "Exiting." - exit 1 - fi +# Print Help Message +#################### - if [ -z $test_number ] - then - # List out avaliable tests - printf "Which test would you like to debug?\n" - id=0 - for s in "${tests[@]}" - do - echo "Test# ${id}" - echo " $s" - ((id++)) - done +print_full_help() { + cat << EOF +Usage: $PROG [OPTION]... (test_number) +Debug specific ctest program. - # Prompt user which test they wanted to run - printf "\nRun test#? " - read test_number - else - printf "\nUser Already Requested #${test_number}" - fi +Options: + -h, --help display this help and exit + -g run in gdb mode - # Start GDB with the requested test binary and arguments - printf "Debugging(GDB) test: ${tests[test_number]}\n" - # Change IFS (Internal Field Separator) - sIFS=$IFS - IFS=$'\n' +Arguments: + (Mandatory) Supply one regex to the script to filter tests + (test_number) (Optional) Test number to run a specific test - # Get test args - gdb_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' )) - IFS=$sIFS - printf "Debug arguments: ${gdb_args[test_number]}\n\n" - - # Expand paths if needed - args=() - for x in $(echo ${gdb_args[test_number]} | sed -e 's/"\/\"//') - do - args+=($(echo $x | sed -e 's/.*\/..\//..\//')) - done - - # Execute debugger - echo "gdb args: ${args[@]}" - gdb --args ${args[@]} +Example: + $PROG test-tokenizer + $PROG test-tokenizer 3 +EOF } +abort() { + echo "Error: $1" >&2 + cat << EOF >&2 +Usage: $PROG [OPTION]... (test_number) +Debug specific ctest program. +Refer to --help for full instructions. +EOF + exit 1 +} + + +# Dependency Sanity Check +######################### + +check_dependency() { + command -v "$1" >/dev/null 2>&1 || { + abort "$1 is required but not found. Please install it and try again." + } +} + +check_dependency ctest +check_dependency cmake + + # Step 0: Check the args -if [ -z "$test_suite" ] -then - echo "Usage: $PROG [OPTION]... (test_number)" - echo "Supply one regex to the script to filter tests," - echo "and optionally a test number to run a specific test." - echo "Use --help flag for full instructions" - exit 1 +######################## + +if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then + print_full_help >&2 + exit 0 fi +# Parse command-line options +gdb_mode=false +while getopts "g" opt; do + case $opt in + g) + gdb_mode=true + echo "gdb_mode Mode Enabled" + ;; + esac +done + +# Shift the option parameters +shift $((OPTIND - 1)) + +# Positionial Argument Processing : +if [ -z "${1}" ]; then + abort "Test regex is required" +else + test_suite=${1:-} +fi + +# Positionial Argument Processing : (test_number) +test_number=${2:-} + + # Step 1: Reset and Setup folder context +######################################## + ## Sanity check that we are actually in a git repo repo_root=$(git rev-parse --show-toplevel) if [ ! -d "$repo_root" ]; then - echo "Error: Not in a Git repository." - exit 1 + abort "Not in a Git repository." fi -## Reset folder to root context of git repo -pushd "$repo_root" || exit 1 +## Reset folder to root context of git repo and Create and enter build directory +pushd "$repo_root" +rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir" -## Create and enter build directory -rm -rf "$build_dir" && mkdir "$build_dir" || exit 1 # Step 2: Setup Build Environment and Compile Test Binaries -cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON || exit 1 -pushd "$build_dir" && make -j || exit 1 +########################################################### -# Step 3: Debug the Test -select_test "$test_suite" "$test_number" +# Note: test-eval-callback requires -DLLAMA_CURL +cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment" +pushd "$build_dir" +make -j || abort "Failed to compile" +popd > /dev/null || exit 1 -# Step 4: Return to the directory from which the user ran the command. -popd || exit 1 -popd || exit 1 -popd || exit 1 + +# Step 3: Find all tests available that matches REGEX +#################################################### + +# Ctest Gather Tests +# `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex) +# `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB. +# `-V` : Verbose Mode +printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n" +pushd "$build_dir" +tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1')) +if [ ${#tests[@]} -eq 0 ]; then + abort "No tests avaliable... check your compliation process..." +fi +popd > /dev/null || exit 1 + + +# Step 4: Identify Test Command for Debugging +############################################# + +# Select test number +if [ -z $test_number ]; then + # List out avaliable tests + printf "Which test would you like to debug?\n" + id=0 + for s in "${tests[@]}" + do + echo "Test# ${id}" + echo " $s" + ((id++)) + done + + # Prompt user which test they wanted to run + printf "\nRun test#? " + read test_number + +else + printf "\nUser Already Requested #${test_number}\n" + +fi + +# Grab all tests commands +pushd "$build_dir" +sIFS=$IFS # Save Initial IFS (Internal Field Separator) +IFS=$'\n' # Change IFS (Internal Field Separator) (So we split ctest output by newline rather than by spaces) +test_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' )) # Get test args +IFS=$sIFS # Reset IFS (Internal Field Separator) +popd > /dev/null || exit 1 + +# Grab specific test command +single_test_name="${tests[test_number]}" +single_test_command="${test_args[test_number]}" + + +# Step 5: Execute or GDB Debug +############################## + +printf "${magenta}Running Test #${test_number}: ${single_test_name}${normal}\n" +printf "${cyan}single_test_command: ${single_test_command}${normal}\n" + +if [ "$gdb_mode" = "true" ]; then + # Execute debugger + pushd "$repo_root" || exit 1 + eval "gdb --args ${single_test_command}" + popd > /dev/null || exit 1 + +else + # Execute Test + pushd "$repo_root" || exit 1 + eval "${single_test_command}" + exit_code=$? + popd > /dev/null || exit 1 + + # Print Result + printf "${blue}Ran Test #${test_number}: ${single_test_name}${normal}\n" + printf "${yellow}Command: ${single_test_command}${normal}\n" + if [ $exit_code -eq 0 ]; then + printf "${green}TEST PASS${normal}\n" + else + printf "${red}TEST FAIL${normal}\n" + fi + +fi + +# Return to the directory from which the user ran the command. +popd > /dev/null || exit 1