From 83c54e6da58f1970556741b143bd26e30b1f46af Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Sat, 27 May 2023 15:18:25 +0300
Subject: [PATCH 1/5] [CI] CLBlast: Fix directory name (#1606)

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d5c2cdea5..245b454dd 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -187,7 +187,7 @@ jobs:
           curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
           curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
-          rename-item $env:RUNNER_TEMP/clblast_release_dir clblast
+          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
           foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
             $txt = Get-Content -Path $f -Raw
             $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8

From 93618031c7ccdd949d976370f24953d261048575 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 27 May 2023 16:19:56 +0300
Subject: [PATCH 2/5] ggml : add ggml_tensor_overhead()

---
 ggml.c | 12 ++++++++++++
 ggml.h |  6 +++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/ggml.c b/ggml.c
index c24992260..14972464b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3808,6 +3808,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }
 
+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
@@ -14527,6 +14531,14 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 }
 
 struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
 
diff --git a/ggml.h b/ggml.h
index 0c90f5064..558138280 100644
--- a/ggml.h
+++ b/ggml.h
@@ -380,9 +380,6 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
-    // use this to compute the memory overhead of a tensor
-    static const size_t GGML_TENSOR_OVERHEAD = (GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16);
-
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -444,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);

From 0ecb1bbbeb16e36a2ea7a5ce525c6c59ef74312b Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Sat, 27 May 2023 17:24:06 +0300
Subject: [PATCH 3/5] [CI] Fix openblas (#1613)

* Fix OpenBLAS build

* Fix `LLAMA_BLAS_VENDOR` CMake variable that should be a string and not
  a boolean.
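For context on patch 2 above: `ggml_tensor_overhead()` replaces the `GGML_TENSOR_OVERHEAD` header constant, so callers can size scratch contexts without depending on ggml internals. A minimal sketch of such a caller, assuming the `ggml_init_params` fields (`mem_size`, `mem_buffer`, `no_alloc`) as they exist around these patches; the 1024-tensor budget is a made-up number for this sketch, not a ggml constant:

```cpp
#include "ggml.h"

int main(void) {
    // Reserve space for up to 1024 tensor headers without allocating any
    // tensor data; 1024 is an arbitrary budget chosen for this sketch.
    struct ggml_init_params params = {
        /*.mem_size   =*/ 1024 * ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,  // let ggml allocate the pool itself
        /*.no_alloc   =*/ true,  // tensor metadata only, no data buffers
    };
    struct ggml_context * ctx = ggml_init(params);
    // ... build tensors/graphs whose data lives elsewhere ...
    ggml_free(ctx);
    return 0;
}
```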
---
 .github/workflows/build.yml | 4 ++--
 CMakeLists.txt | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 245b454dd..41f2dee28 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -165,7 +165,7 @@ jobs:
           - build: 'clblast'
             defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
+            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 
     steps:
       - name: Clone
@@ -213,7 +213,6 @@ jobs:
           cd build
           cmake .. ${{ matrix.defines }}
           cmake --build . --config Release
-          cp ../LICENSE ./bin/Release/llama.cpp.txt
 
       - name: Add clblast.dll
         id: add_clblast_dll
@@ -258,6 +257,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
+          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
           7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
 
       - name: Upload artifacts
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31c5bd91d..21f4ec9dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,7 +66,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
-option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")

From 97c9b77c4fc5e2283755c4418759cfc5fc73ad05 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Sat, 27 May 2023 18:47:55 +0300
Subject: [PATCH 4/5] Add documentation about CLBlast (#1604)

Installing, compiling and using.

---
 README.md | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 79 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index f88e520ee..00571d8e1 100644
--- a/README.md
+++ b/README.md
@@ -240,11 +240,11 @@ In order to build llama.cpp you have three different options.
 
 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
 
-- Accelerate Framework:
+- **Accelerate Framework**:
 
   This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
 
-- OpenBLAS:
+- **OpenBLAS**:
 
   This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
 
@@ -278,11 +278,11 @@ Building the program with BLAS support may lead to some performance improvements
 
     cmake --build . --config Release
     ```
 
-- BLIS
+- **BLIS**
 
   Check [BLIS.md](BLIS.md) for more information.
 
-- Intel MKL
+- **Intel MKL**
 
   By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
@@ -293,7 +293,7 @@ Building the program with BLAS support may lead to some performance improvements
     cmake --build . -config Release
     ```
 
-- cuBLAS
+- **cuBLAS**
 
   This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
   - Using `make`:
@@ -308,8 +308,81 @@ Building the program with BLAS support may lead to some performance improvements
     cmake .. -DLLAMA_CUBLAS=ON
     cmake --build . --config Release
     ```
+  Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication, results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
 
-Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
+- **CLBlast**
+
+  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
+
+  You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
+    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+
+  - <details>
+    <summary>Installing the OpenCL SDK from source</summary>
+
+    ```sh
+    git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
+    mkdir OpenCL-SDK/build
+    cd OpenCL-SDK/build
+    cmake .. -DBUILD_DOCS=OFF \
+      -DBUILD_EXAMPLES=OFF \
+      -DBUILD_TESTING=OFF \
+      -DOPENCL_SDK_BUILD_SAMPLES=OFF \
+      -DOPENCL_SDK_TEST_SAMPLES=OFF
+    cmake --build . --config Release
+    cmake --install . --prefix /some/path
+    ```
+  </details>
+
+  Installing CLBlast: it may be found in your operating system's packages.
+
+  - <details>
+    <summary>If not, then installing from source:</summary>
+
+    ```sh
+    git clone https://github.com/CNugteren/CLBlast.git
+    mkdir CLBlast/build
+    cd CLBlast/build
+    cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+    cmake --build . --config Release
+    cmake --install . --prefix /some/path
+    ```
+
+    Where `/some/path` is where the built library will be installed (default is `/usr/local`).
+  </details>
+
+  Building:
+
+  - Build with make:
+    ```sh
+    make LLAMA_CLBLAST=1
+    ```
+  - CMake:
+    ```sh
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+    cmake --build . --config Release
+    ```
+
+  Running:
+
+  The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+
+  To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
+  The selection can be a number (starting from 0) or a text string to search:
+
+  ```sh
+  GGML_OPENCL_PLATFORM=1 ./main ...
+  GGML_OPENCL_DEVICE=2 ./main ...
+  GGML_OPENCL_PLATFORM=Intel ./main ...
+  GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
+  ```
+
+  The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
+  Using the variables it is possible to select a CPU-based driver as well, if so desired.
+
+  You can get a list of platforms and devices from the `clinfo -l` command, etc.
 
 ### Prepare Data & Run
 

From 0df7d63e5ba0ab8856476e121a03b985d6f15c9d Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Sat, 27 May 2023 11:04:14 -0600
Subject: [PATCH 5/5] Include server in releases + other build system cleanups
 (#1610)

Set `LLAMA_BUILD_SERVER` in workflow so the `server` example gets built.
This currently only applies to Windows builds because it seems like only
Windows binary artifacts are included in releases.

Add `server` example target to `Makefile` (still uses the
`LLAMA_BUILD_SERVER` define and does not build by default).

Fix issue where `vdot` binary wasn't removed when running `make clean`.

Fix compile warnings in `server` example.

Add `.hpp` files to trigger workflow (the server example has one).
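The compile-warning fixes below are mostly `-Wsign-compare` issues: loop counters declared `int` but compared against unsigned `size_t` container sizes. A standalone illustration of the pattern the patch switches to (hypothetical data, not code from the server):

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<const char *> tokens = {"a", "b", "c"};

    // size() returns an unsigned size_t; using size_t for the index keeps
    // the comparison same-signed and silences -Wsign-compare. A cast such
    // as int32_t(i) is only needed where the value may go negative, as in
    // the `int32_t(i) - 1 < n_past` check in the patch.
    for (size_t i = 0; i < tokens.size(); i++) {
        std::printf("%zu: %s\n", i, tokens[i]);
    }
    return 0;
}
```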
---
 .github/workflows/build.yml | 16 ++++++++--------
 Makefile | 13 +++++++++++--
 examples/server/server.cpp | 16 ++++++++--------
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 41f2dee28..c98cbcbbe 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
 
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -157,15 +157,15 @@ jobs:
       matrix:
         include:
           - build: 'avx2'
-            defines: ''
+            defines: '-DLLAMA_BUILD_SERVER=ON'
           - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
           - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 
     steps:
      - name: Clone
@@ -292,7 +292,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_CUBLAS=ON
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
           cmake --build . --config Release
 
       - name: Get commit hash
diff --git a/Makefile b/Makefile
index 804307b53..70bd5e90a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,11 @@
 # Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+	BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -210,7 +216,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
 
 #
 # Examples
@@ -237,6 +243,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7209a2b52..3904412cb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -61,7 +61,7 @@ struct llama_server_context
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
         // compare the evaluated prompt with the new prompt
         int new_prompt_len = 0;
-        for (int i = 0;i < prompt_tokens.size(); i++) {
+        for (size_t i = 0; i < prompt_tokens.size(); i++) {
             if (i < processed_tokens.size() &&
                 processed_tokens[i] == prompt_tokens[i])
             {
@@ -71,7 +71,7 @@ struct llama_server_context
             {
                 embd_inp.push_back(prompt_tokens[i]);
                 if(new_prompt_len == 0) {
-                    if(i - 1 < n_past) {
+                    if(int32_t(i) - 1 < n_past) {
                         processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
                     }
                     // Evaluate the new fragment prompt from the last token processed.
@@ -136,7 +136,7 @@ struct llama_server_context
         {
             // out of user input, sample next token
             const float temp = params.temp;
-            const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+            // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
             const float top_p = params.top_p;
             const float tfs_z = params.tfs_z;
             const float typical_p = params.typical_p;
@@ -306,12 +306,12 @@ struct llama_server_context
       // Avoid add the no show words to the response
       for (std::vector<llama_token> word_tokens : no_show_words)
       {
-        int match_token = 1;
+        size_t match_token = 1;
         if (tokens_predicted.front() == word_tokens.front())
         {
           bool execute_matching = true;
           if (tokens_predicted.size() > 1) { // if previus tokens had been tested
-            for (int i = 1; i < word_tokens.size(); i++)
+            for (size_t i = 1; i < word_tokens.size(); i++)
             {
               if (i >= tokens_predicted.size()) {
                 match_token = i;
@@ -601,7 +601,7 @@ int main(int argc, char **argv)
 
     Server svr;
 
-    svr.Get("/", [](const Request &req, Response &res)
+    svr.Get("/", [](const Request &, Response &res)
             { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
 
     svr.Post("/completion", [&llama](const Request &req, Response &res)
@@ -649,7 +649,7 @@ int main(int argc, char **argv)
                 {"tokens_predicted", llama.num_tokens_predicted}};
             return res.set_content(data.dump(), "application/json");
         }
-        catch (json::exception e)
+        catch (const json::exception &e)
         {
             // Some tokens have bad UTF-8 strings, the json parser is very sensitive
             json data = {
@@ -701,7 +701,7 @@ int main(int argc, char **argv)
                 {"content", result },
                 {"stop", !llama.has_next_token }};
             return res.set_content(data.dump(), "application/json");
-        } catch (json::exception e) {
+        } catch (const json::exception &e) {
             // Some tokens have bad UTF-8 strings, the json parser is very sensitive
             json data = {
                 {"content", "" },