diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index eefd87878..9b03d19bc 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -91,6 +91,12 @@ jobs: echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT fi + - name: Downcase github.repository_owner + run: | + echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV + env: + GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' + - name: Build and push Docker image (versioned) if: github.event_name == 'push' uses: docker/build-push-action@v4 @@ -98,7 +104,7 @@ jobs: context: . push: true platforms: ${{ matrix.config.platforms }} - tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" + tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" file: ${{ matrix.config.dockerfile }} - name: Build and push Docker image (tagged) @@ -107,5 +113,5 @@ jobs: context: . push: ${{ github.event_name == 'push' }} platforms: ${{ matrix.config.platforms }} - tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}" + tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}" file: ${{ matrix.config.dockerfile }} diff --git a/Makefile b/Makefile index 5f864c935..dfc8b2418 100644 --- a/Makefile +++ b/Makefile @@ -688,7 +688,7 @@ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml $(CXX) $(CXXFLAGS) -c $< -o $@ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h -COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o +COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o common.o: common/common.cpp $(COMMON_H_DEPS) $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -756,7 +756,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS) +batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -788,7 +788,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o 
$(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) diff --git a/Package.swift b/Package.swift index 8b7195869..fbd0973be 100644 --- a/Package.swift +++ b/Package.swift @@ -2,6 +2,44 @@ import PackageDescription +var sources = [ + "ggml.c", + "llama.cpp", + "unicode.cpp", + "unicode-data.cpp", + "ggml-alloc.c", + "ggml-backend.c", + "ggml-quants.c", +] + +var resources: [Resource] = [] +var linkerSettings: [LinkerSetting] = [] +var cSettings: [CSetting] = [ + .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), + .unsafeFlags(["-fno-objc-arc"]), + // NOTE: NEW_LAPACK will required iOS version 16.4+ + // We should consider add this in the future when we drop support for iOS 14 + // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) + // .define("ACCELERATE_NEW_LAPACK"), + // .define("ACCELERATE_LAPACK_ILP64") +] + +#if canImport(Darwin) +sources.append("ggml-metal.m") +resources.append(.process("ggml-metal.metal")) +linkerSettings.append(.linkedFramework("Accelerate")) +cSettings.append( + contentsOf: [ + .define("GGML_USE_ACCELERATE"), + .define("GGML_USE_METAL") + ] +) +#endif + +#if os(Linux) + cSettings.append(.define("_GNU_SOURCE")) +#endif + let package = Package( name: "llama", platforms: [ @@ -28,34 +66,11 @@ let package = Package( "ggml-cuda.h", "Makefile" ], - sources: [ - "ggml.c", - "llama.cpp", - "unicode.cpp", - "unicode-data.cpp", - "ggml-alloc.c", - "ggml-backend.c", - "ggml-quants.c", - "ggml-metal.m", - ], - resources: [ - .process("ggml-metal.metal") - ], + sources: sources, + resources: resources, publicHeadersPath: "spm-headers", - cSettings: [ - .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), - .define("GGML_USE_ACCELERATE"), - .unsafeFlags(["-fno-objc-arc"]), - .define("GGML_USE_METAL"), - // NOTE: NEW_LAPACK will required iOS version 16.4+ - // We should consider add this in the future when we drop support for iOS 14 - // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) - // .define("ACCELERATE_NEW_LAPACK"), - // .define("ACCELERATE_LAPACK_ILP64") - ], - linkerSettings: [ - .linkedFramework("Accelerate") - ] + cSettings: cSettings, + linkerSettings: linkerSettings ) ], cxxLanguageStandard: .cxx11 diff --git a/README-sycl.md b/README-sycl.md index 169d2ca0b..f6dbfd878 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -68,7 +68,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, | Intel GPU | Status | Verified Model | |-------------------------------|---------|---------------------------------------| -| Intel Data Center Max Series | Support | Max 1550 | +| Intel Data Center Max Series | Support | Max 1550, 1100 | | Intel Data Center Flex Series | Support | Flex 170 | | Intel Arc Series | Support | Arc 770, 730M | | Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake | @@ -84,8 +84,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, - **Execution Unit (EU)** - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use. -### Nvidia GPU -The BLAS acceleration on Nvidia GPU through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. 
Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment) +### Other Vendor GPU **Verified devices** @@ -94,14 +93,9 @@ The BLAS acceleration on Nvidia GPU through oneAPI can be obtained using the Nvi | Ampere Series | Support | A100, A4000 | | Ampere Series *(Mobile)* | Support | RTX 40 Series | -*Notes:* - - Support for Nvidia targets through oneAPI is currently limited to Linux platforms. - - - Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs. - - ## Docker The docker build option is currently limited to *intel GPU* targets. + ### Build image ```sh # Using FP16 @@ -168,29 +162,10 @@ Platform #0: Intel(R) OpenCL HD Graphics - **Nvidia GPU** In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed. -Installation can be verified by running the following: -```sh -nvidia-smi -``` -Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*: -``` -+---------------------------------------------------------------------------------------+ -| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 | -|-----------------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | -|=========================================+======================+======================| -| 0 NVIDIA A100-PCIE-40GB On | 00000000:8D:00.0 Off | 0 | -| N/A 36C P0 57W / 250W | 4MiB / 40960MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+----------------------+----------------------+ -``` - 2. **Install IntelĀ® oneAPI Base toolkit** -- **Base installation** +- **For Intel GPU** The base toolkit can be obtained from the official [IntelĀ® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page. @@ -202,10 +177,10 @@ Upon a successful installation, SYCL is enabled for the available intel devices, - **Adding support to Nvidia GPUs** -**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup. +**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup. -**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs. +**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. 
A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs. ```sh git clone https://github.com/oneapi-src/oneMKL @@ -237,7 +212,7 @@ When targeting an intel GPU, the user should expect one or more level-zero devic - **Nvidia GPU** -Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow: +Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow: ``` [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix] [opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix] @@ -260,6 +235,9 @@ cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icp # Option 2: Use FP32 by default cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + +#build all binary +cmake --build . --config Release -j -v ``` #### Nvidia GPU @@ -278,6 +256,10 @@ cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=i # Option 2: Use FP32 by default cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + +#build all binary +cmake --build . --config Release -j -v + ``` ### III. Run the inference @@ -357,7 +339,6 @@ Otherwise, you can run the script: *Notes:* -- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue. - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow: ```sh @@ -438,7 +419,7 @@ cd build cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON -make +make -j ``` Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions: @@ -525,7 +506,6 @@ Otherwise, run the following wrapper script: Note: -- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue. - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow: ```sh @@ -557,12 +537,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512 ## Known Issues -- Hanging during startup - - llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang. - - - **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable. - - `Split-mode:[row]` is not supported. ## Q&A @@ -574,7 +548,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 - General compiler error: - - Remove build folder or try a clean-build. + - Remove **build** folder or try a clean-build. - I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux. 
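Note on the docker.yml hunk at the top of this patch: the new step lowercases `github.repository_owner` with bash's `${parameter@L}` case-conversion expansion, because GHCR image references must be all lowercase. A minimal sketch of the same expansion outside of GitHub Actions follows; the owner value is hypothetical, and `@L` assumes a sufficiently recent bash on the runner.

```sh
#!/usr/bin/env bash
# Sketch of the ${parameter@L} case conversion used in the workflow step above.
# "Example-Org" is a hypothetical mixed-case owner; GHCR tags must be lowercase.
GITHUB_REPOSITORY_OWNER="Example-Org"

repository_owner_lowercase="${GITHUB_REPOSITORY_OWNER@L}"
echo "repository_owner_lowercase=${repository_owner_lowercase}"   # prints: example-org

# The workflow appends the same assignment to $GITHUB_ENV so later steps can
# build image tags such as:
echo "ghcr.io/${repository_owner_lowercase}/llama.cpp:latest"
```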
diff --git a/build.zig b/build.zig index 6ac1bc334..ea9da4bc6 100644 --- a/build.zig +++ b/build.zig @@ -128,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void { const clip = make.obj("clip", "examples/llava/clip.cpp"); const llava = make.obj("llava", "examples/llava/llava.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser }); - _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/ci/run.sh b/ci/run.sh index 19776b5f7..085dfd42f 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -153,6 +153,52 @@ function gg_sum_ctest_release { gg_printf '```\n' } +# test_scripts_debug + +function gg_run_test_scripts_debug { + cd ${SRC} + + set -e + + (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log + + set +e +} + +function gg_sum_test_scripts_debug { + gg_printf '### %s\n\n' 
"${ci}" + + gg_printf 'Runs test scripts in debug mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)" + gg_printf '```\n' + gg_printf '\n' +} + +# test_scripts_release + +function gg_run_test_scripts_release { + cd ${SRC} + + set -e + + (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log + + set +e +} + +function gg_sum_test_scripts_release { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs test scripts in release mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)" + gg_printf '```\n' + gg_printf '\n' +} + function gg_get_model { local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf" local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" @@ -642,6 +688,9 @@ test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run embd_bge_small + test $ret -eq 0 && gg_run test_scripts_debug + test $ret -eq 0 && gg_run test_scripts_release + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_CUDA} ]; then test $ret -eq 0 && gg_run open_llama_3b_v2 diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 1d840e5f7..0ec8d6d8d 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -47,9 +47,6 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() -set(TARGET json-schema-to-grammar) -add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h) - set(TARGET common) add_library(${TARGET} STATIC @@ -63,6 +60,7 @@ add_library(${TARGET} STATIC grammar-parser.h grammar-parser.cpp json.hpp + json-schema-to-grammar.cpp train.h train.cpp ngram-cache.h diff --git a/common/common.cpp b/common/common.cpp index dda514785..52576cba3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1,4 +1,6 @@ #include "common.h" +#include "json.hpp" +#include "json-schema-to-grammar.h" #include "llama.h" #include @@ -68,6 +70,8 @@ #define LLAMA_CURL_MAX_HEADER_LENGTH 256 #endif // LLAMA_USE_CURL +using json = nlohmann::ordered_json; + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -1148,6 +1152,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa ); return true; } + if (arg == "-j" || arg == "--json-schema") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); + return true; + } if (arg == "--override-kv") { if (++i >= argc) { invalid_param = true; @@ -1353,6 +1365,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); printf(" --grammar-file FNAME file to read grammar from\n"); + printf(" -j SCHEMA, --json-schema SCHEMA\n"); + printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n"); + printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n"); printf(" --cfg-negative-prompt PROMPT\n"); printf(" negative prompt to use for guidance. 
(default: empty)\n"); printf(" --cfg-negative-prompt-file FNAME\n"); diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index e1ac09e02..b51d68307 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -43,17 +43,18 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]") class Model(ABC): _model_classes: dict[str, type[Model]] = {} - def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): + def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool): self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out self.is_big_endian = is_big_endian self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.use_temp_file = use_temp_file self.is_safetensors = self._is_model_safetensors() self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) @property @@ -2459,6 +2460,7 @@ def parse_args() -> argparse.Namespace: "model", type=Path, help="directory containing model file", ) + parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)") return parser.parse_args() @@ -2502,7 +2504,7 @@ def main() -> None: with torch.inference_mode(): model_class = Model.from_model_architecture(hparams["architectures"][0]) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file) print("Set model parameters") model_instance.set_gguf_parameters() diff --git a/examples/gguf-split/README.md b/examples/gguf-split/README.md index ddb1f7649..ad1d86651 100644 --- a/examples/gguf-split/README.md +++ b/examples/gguf-split/README.md @@ -5,5 +5,6 @@ CLI to split / merge GGUF files. **Command line options:** - `--split`: split GGUF to multiple GGUF, default operation. +- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`. - `--split-max-tensors`: maximum tensors in each split: default(128) - `--merge`: merge multiple GGUF to a single GGUF. 
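The gguf-split README entries above gain a `--split-max-size` option alongside `--split-max-tensors` and `--merge`. A hedged usage sketch, mirroring the new `examples/gguf-split/tests.sh` added further down in this patch; paths and the resulting shard count are illustrative.

```sh
#!/usr/bin/env bash
# Illustrative gguf-split invocations; the model path and the shard count in the
# output names (here -00001-of-00006) depend on the input model and chosen limits.
./build/bin/gguf-split --split-max-tensors 28 ./gemma-1.1-2b-it.Q8_0.gguf ./out/ggml-model-split
./build/bin/gguf-split --split-max-size 2G    ./gemma-1.1-2b-it.Q8_0.gguf ./out/ggml-model-split-2G

# Merging starts from the first shard and writes a single GGUF back out:
./build/bin/gguf-split --merge ./out/ggml-model-split-00001-of-00006.gguf ./out/ggml-model-merge.gguf
```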
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 24acbf02a..39c75e0a7 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -59,10 +59,10 @@ static size_t split_str_to_n_bytes(std::string str) { int n; if (str.back() == 'M') { sscanf(str.c_str(), "%d", &n); - n_bytes = n * 1024 * 1024; // megabytes + n_bytes = (size_t)n * 1024 * 1024; // megabytes } else if (str.back() == 'G') { sscanf(str.c_str(), "%d", &n); - n_bytes = n * 1024 * 1024 * 1024; // gigabytes + n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes } else { throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back())); } diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh new file mode 100644 index 000000000..879522f7e --- /dev/null +++ b/examples/gguf-split/tests.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +set -eu + +if [ $# -lt 1 ] +then + echo "usage: $0 path_to_build_binary [path_to_temp_folder]" + echo "example: $0 ../../build/bin ../../tmp" + exit 1 +fi + +if [ $# -gt 1 ] +then + TMP_DIR=$2 +else + TMP_DIR=/tmp +fi + +set -x + +SPLIT=$1/gguf-split +MAIN=$1/main +WORK_PATH=$TMP_DIR/gguf-split +CUR_DIR=$(pwd) + +mkdir -p "$WORK_PATH" + +# Clean up in case of previously failed test +rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf + +# 1. Get a model +( + cd $WORK_PATH + "$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf +) +echo PASS + +# 2. Split with max tensors strategy +$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split +echo PASS +echo + +# 2b. Test the sharded model is loading properly +$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32 +echo PASS +echo + +# 3. Merge +$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf +echo PASS +echo + +# 3b. Test the merged model is loading properly +$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32 +echo PASS +echo + +# 4. Split with no tensor in metadata +#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors +#echo PASS +#echo + +# 4b. Test the sharded model is loading properly +#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32 +#echo PASS +#echo + +# 5. Merge +#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf +#echo PASS +#echo + +# 5b. Test the merged model is loading properly +#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32 +#echo PASS +#echo + +# 6. Split with size strategy +$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G +echo PASS +echo + +# 6b. Test the sharded model is loading properly +$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32 +echo PASS +echo + +# Clean up +rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf diff --git a/examples/main/README.md b/examples/main/README.md index 10a589ceb..649f4e0f3 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -304,10 +304,12 @@ These options help improve the performance and memory usage of the LLaMA models. 
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation. -### Grammars +### Grammars & JSON schemas - `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax. +- `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead. + ### Quantization For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize). diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 946e34883..9a1576594 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -30,7 +30,7 @@ install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) -target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) find_package(OpenSSL REQUIRED) target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 68661ee4d..9fe61eb1b 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -51,26 +51,6 @@ margin-bottom: 0.5em; } - button, input, textarea, .button, a.button, select { - color: #666; - border: 1px solid #ddd; - border-radius: 4px; - line-height: 1.5em; - padding: 0.25em 0.25em; - text-decoration: none; - font-size: 1.1rem; - } - - button { - border: 1px solid #2a8aad; - background: #3584e4; - font-weight: normal; - color: #fff; - } - button:disabled { - background: #9cbce5; - } - #write form { margin: 1em 0 0 0; display: flex; @@ -587,7 +567,7 @@ runCompletion(); } return html` -
+
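The `examples/main/README.md` hunk above documents the new `-j` / `--json-schema` flag wired up in `common/common.cpp` earlier in this patch. A hedged usage sketch follows; the model path and prompt are placeholders, and for schemas with external `$ref`s the README recommends going through `examples/json_schema_to_grammar.py` instead.

```sh
#!/usr/bin/env bash
# Constrain generation with an inline JSON schema (converted to a GBNF grammar internally).
# Model path and prompt are placeholders.
./main -m ./models/7B/ggml-model-Q4_K_M.gguf \
  -j '{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}' \
  -p 'Ten facts about llamas: '

# Equivalent route for schemas that use external $refs:
./main -m ./models/7B/ggml-model-Q4_K_M.gguf \
  --grammar "$(python examples/json_schema_to_grammar.py myschema.json)"
```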
diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index f20391d7a..db46d57ca 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -20,4 +20,4 @@ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx #cmake --build . --config Release --target llama-bench #build all binary -cmake --build . --config Release -v +cmake --build . --config Release -j -v diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index c979a52f6..7b39a18c0 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -12,6 +12,7 @@ if [ $# -gt 0 ]; then GGML_SYCL_SINGLE_GPU=1 else GGML_SYCL_DEVICE=0 + GGML_SYCL_SINGLE_GPU=0 fi #export GGML_SYCL_DEBUG=1 diff --git a/flake.lock b/flake.lock index ed48dd8da..2e2ab4932 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1712163089, - "narHash": "sha256-Um+8kTIrC19vD4/lUCN9/cU9kcOsD1O1m+axJqQPyMM=", + "lastModified": 1712791164, + "narHash": "sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "fd281bd6b7d3e32ddfa399853946f782553163b5", + "rev": "1042fd8b148a9105f3c0aca3a6177fd1d9360ba5", "type": "github" }, "original": { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index bff8ad9d9..e62f45b61 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1946,7 +1946,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst); - } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { diff --git a/ggml-metal.m b/ggml-metal.m index 38da384b1..0207b787a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -37,6 +37,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_DIV_ROW, GGML_METAL_KERNEL_TYPE_SCALE, GGML_METAL_KERNEL_TYPE_SCALE_4, + GGML_METAL_KERNEL_TYPE_CLAMP, GGML_METAL_KERNEL_TYPE_TANH, GGML_METAL_KERNEL_TYPE_RELU, GGML_METAL_KERNEL_TYPE_GELU, @@ -468,6 +469,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true); @@ -713,6 +715,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const case GGML_OP_MUL: case GGML_OP_DIV: case GGML_OP_SCALE: + case GGML_OP_CLAMP: case GGML_OP_SQR: case GGML_OP_SUM_ROWS: return true; @@ -1154,6 +1157,25 @@ static enum ggml_status ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_CLAMP: + { + id pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline; + + float min; + float max; + memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float)); + memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float)); + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&min length:sizeof(min) atIndex:2]; + [encoder setBytes:&max length:sizeof(max) atIndex:3]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; case GGML_OP_UNARY: switch (ggml_get_unary_op(gf->nodes[i])) { case GGML_UNARY_OP_TANH: diff --git a/ggml-metal.metal b/ggml-metal.metal index 3a823e65b..56748166c 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -213,6 +213,15 @@ kernel void kernel_scale_4( dst[tpig] = src0[tpig] * scale; } +kernel void kernel_clamp( + device const float * src0, + device float * dst, + constant float & min, + constant float & max, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] < min ? min : (src0[tpig] > max ? max : src0[tpig]); +} + kernel void kernel_relu( device const float * src0, device float * dst, diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 55a1eedb5..f5bb7da86 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)( #define SYCL_SCALE_BLOCK_SIZE 256 #define SYCL_CLAMP_BLOCK_SIZE 256 #define SYCL_ROPE_BLOCK_SIZE 256 -#define SYCL_SOFT_MAX_BLOCK_SIZE 1024 #define SYCL_ALIBI_BLOCK_SIZE 32 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32 #define SYCL_QUANTIZE_BLOCK_SIZE 256 @@ -13080,11 +13079,13 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float * const int nrows_y, const float scale, const float max_bias, dpct::queue_ptr stream) { int nth = WARP_SIZE; - while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2; + int max_block_size = g_work_group_size; + while (nth < ncols_x && nth < max_block_size) nth *= 2; + if (nth>max_block_size) nth = max_block_size; + const sycl::range<3> block_dims(1, 1, nth); const sycl::range<3> block_nums(1, 1, nrows_x); const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE); - static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); @@ -13094,6 +13095,12 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float * const size_t local_mem_size = stream->get_device().get_info(); if (n_local_scratch*sizeof(float) < local_mem_size) { + if (ncols_x > max_block_size) { + soft_max_f32_submitter(x, mask, pos, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + return; + } switch (ncols_x) { case 32: soft_max_f32_submitter(x, mask, pos, dst, ncols_x, nrows_y, scale, @@ -15989,73 +15996,76 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) { static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) try { -#if 0 - ggml_sycl_mul_mat_id_sycl(dst); - // TODO: mmq/mmv support -#endif - - const int64_t nb11 = src1->nb[1]; - const int64_t nb1 = dst->nb[1]; - - const struct ggml_tensor * ids = src0; - const int32_t id = ((int32_t *) dst->op_params)[0]; - const int32_t n_as = ((int32_t *) dst->op_params)[1]; - - std::vector 
ids_host(ggml_nbytes(ids)); - + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT && + "mul_mat_id does not support split buffers"); + const ggml_tensor *ids = dst->src[2]; const dpct::queue_ptr stream = g_syclStreams[g_main_device][0]; - if (ids->backend == GGML_BACKEND_TYPE_GPU) { - const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; - SYCL_CHECK(CHECK_TRY_ERROR( - stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)).wait())); - // SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); - } else { - memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); - } + const size_t nb11 = src1->nb[1]; + const size_t nb1 = dst->nb[1]; - const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; - const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; + const int32_t id = ((int32_t *)dst->op_params)[0]; + const int32_t n_as = src0->ne[2]; + std::vector ids_host(ggml_nbytes(ids)); + const char *ids_dev = (const char *)ids->data; + + SYCL_CHECK(CHECK_TRY_ERROR( + stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); + SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); + + const ggml_tensor_extra_gpu *src0_extra = + (const ggml_tensor_extra_gpu *)src0->extra; + const ggml_tensor_extra_gpu *src1_extra = + (const ggml_tensor_extra_gpu *)src1->extra; + const ggml_tensor_extra_gpu *dst_extra = + (const ggml_tensor_extra_gpu *)dst->extra; + + ggml_tensor_extra_gpu src0_row_extra; ggml_tensor_extra_gpu src1_row_extra; ggml_tensor_extra_gpu dst_row_extra; + ggml_tensor src0_row = *src0; ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; src1_row.backend = GGML_BACKEND_TYPE_GPU; dst_row.backend = GGML_BACKEND_TYPE_GPU; + src0_row.extra = &src0_row_extra; src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? - (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; - char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? - (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; + char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU + ? (char *)src0->data + : (char *)src0_extra->data_device[g_main_device]; + char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU + ? (char *)src1->data + : (char *)src1_extra->data_device[g_main_device]; + char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU + ? 
(char *)dst->data + : (char *)dst_extra->data_device[g_main_device]; + + src0_row.ne[2] = 1; + src0_row.ne[3] = 1; + src0_row.nb[3] = src0->nb[2]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - //int32_t row_id; - //SYCL_CHECK(syclMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), syclMemcpyDeviceToHost, g_syclStreams[g_main_device][0])); - //SYCL_CHECK(syclStreamSynchronize(g_syclStreams[g_main_device][0])); - - const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + const int32_t row_id = + *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] + + id * ids->nb[0]); GGML_ASSERT(row_id >= 0 && row_id < n_as); - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + src0_row_extra.data_device[g_main_device] = + src0_original + row_id * src0->nb[2]; + src1_row_extra.data_device[g_main_device] = + src1_original + i01 * src1->nb[1]; + dst_row_extra.data_device[g_main_device] = + dst_original + i01 * dst->nb[1]; - src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; - src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set? - - dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1]; - dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set? - - ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row); + ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row); } } else { sycl_pool_alloc src1_contiguous(sizeof(float)*ggml_nelements(src1)); @@ -16065,8 +16075,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); for (int32_t row_id = 0; row_id < n_as; ++row_id) { - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - int64_t num_src1_rows = 0; for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); @@ -16079,7 +16087,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, SYCL_CHECK(CHECK_TRY_ERROR( stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11, - src1_original + i01 * nb11, nb11).wait())); + src1_original + i01 * nb11, nb11))); num_src1_rows++; } @@ -16087,6 +16095,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, continue; } + src0_row_extra.data_device[g_main_device] = + src0_original + row_id * src0->nb[2]; + src1_row.ne[1] = num_src1_rows; dst_row.ne[1] = num_src1_rows; @@ -16098,7 +16109,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, dst_row.nb[2] = num_src1_rows*nb1; dst_row.nb[3] = num_src1_rows*nb1; - ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row); + ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row); num_src1_rows = 0; for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { @@ -16112,7 +16123,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy( dst_original + i01 * nb1, - dst_contiguous.get() + num_src1_rows * nb1, nb1).wait())); + dst_contiguous.get() + num_src1_rows * nb1, nb1))); num_src1_rows++; } } @@ -16814,11 +16825,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, const dpct::queue_ptr stream = g_syclStreams[ctx->device][0]; SYCL_CHECK( CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); - + char* host_buf = (char*)malloc(size); + memcpy(host_buf, data, size); 
SYCL_CHECK( CHECK_TRY_ERROR((*stream) - .memcpy((char *)tensor->data + offset, data, size) + .memcpy((char *)tensor->data + offset, host_buf, size) .wait())); + free(host_buf); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ diff --git a/llama.cpp b/llama.cpp index b93c1abcd..a5ef2fd8f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13063,6 +13063,11 @@ struct llama_beam_search_data { } llama_logit_info logit_info(ctx); std::vector next_tokens = logit_info.top_k(n_beams); + + // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode() + // call in loop() will conclusively fill in the kv slot once the beams converge at this position. + llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + size_t i=0; if (next_beams.size() < n_beams) { for (; next_beams.size() < n_beams ; ++i) { @@ -15473,6 +15478,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) { GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch); ctx->output_ids[id] = i; } + + ctx->n_outputs = n_outputs; } } @@ -16625,6 +16632,21 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "### Response:\n"; } + } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) { + // CohereForAI/c4ai-command-r-plus + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; + } else if (role == "user") { + ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; + } else if (role == "assistant") { + ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; + } + } + if (add_ass) { + ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"; + } } else { // template not supported return -1; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b5d7bb59c..89f23ca2d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -25,7 +25,7 @@ function(llama_test source) add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE common json-schema-to-grammar) + target_link_libraries(${TEST_TARGET} PRIVATE common) add_test( NAME ${TEST_TARGET} WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 73c3536fd..522cc7d0d 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -45,6 +45,8 @@ int main(void) { // Orca-Vicuna // No template included in tokenizer_config.json, so this template likely needs to be manually set. 
"{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}", + // CohereForAI/c4ai-command-r-plus + "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}" }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -69,6 +71,8 @@ int main(void) { "You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there\nUSER: Who are you\nASSISTANT: I am an assistant \nUSER: Another question\nASSISTANT:", // Orca-Vicuna "SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there\nUSER: Who are you\nASSISTANT: I am an assistant \nUSER: Another question\nASSISTANT:", + // CohereForAI/c4ai-command-r-plus + "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", }; std::vector formatted_chat(1024); int32_t res;