From 921dcee00a55d9aba3b3026d0509d31ac8386e2a Mon Sep 17 00:00:00 2001
From: Pavol Rusnak
Date: Fri, 5 May 2023 16:43:36 +0200
Subject: [PATCH 1/5] readme: add missing info (#1324)

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f1fa63542..233c5c5e1 100644
--- a/README.md
+++ b/README.md
@@ -18,10 +18,12 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 
 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
-- AVX2 support for x86 architectures
+- AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
-- 4-bit integer quantization support
+- 4-bit, 5-bit and 8-bit integer quantization support
 - Runs on the CPU
+- OpenBLAS support
+- cuBLAS and CLBlast support
 
 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
 Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
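The README entries above map to build-time switches. As a quick reference, this is a sketch of how each BLAS backend is enabled (the `LLAMA_OPENBLAS` and `LLAMA_CUBLAS` flags appear verbatim in patches 2 and 3 below; `LLAMA_CLBLAST` is an assumed flag name inferred from the CLBlast support the README now mentions, and is not exercised anywhere in this series):

```bash
# OpenBLAS (flag shown in the README hunk of PATCH 3/5)
make LLAMA_OPENBLAS=1

# cuBLAS via CMake (flags shown in the CI workflow of PATCH 2/5)
mkdir build && cd build
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release

# CLBlast -- assumed flag name, not confirmed by any patch in this series
make LLAMA_CLBLAST=1
```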
From a3b85b28da84c67c3406807aef5e0457bcc4b00f Mon Sep 17 00:00:00 2001
From: Erik Scholz
Date: Fri, 5 May 2023 22:56:09 +0200
Subject: [PATCH 2/5] ci : add cublas to windows release (#1271)

---
 .github/workflows/build.yml | 77 +++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 179080576..18bb33f94 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -210,6 +210,82 @@ jobs:
         path: |
           llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
 
+  windows-latest-cmake-cublas:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        cuda: ['12.1.0', '11.7.1']
+        build: ['cublas']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - uses: Jimver/cuda-toolkit@v0.2.10
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda }}
+          # TODO(green-sky): _dev seems to fail, and non dev are not enough
+          #sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_CUBLAS=ON
+          cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '12.1.0' }}
+        # TODO(green-sky): paths are cuda 12 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '11.7.1' }}
+        # TODO(green-sky): paths are cuda 11 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Upload Cuda runtime
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 
@@ -221,6 +297,7 @@ jobs:
       - macOS-latest-make
       - macOS-latest-cmake
       - windows-latest-cmake
+      - windows-latest-cmake-cublas
 
     steps:
       - name: Download artifacts
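The job above packs the CUDA runtime DLLs (`cudart64_*`, `cublas64_*`, `cublasLt64_*`) into a separate `cudart-llama-*.zip` so the cuBLAS binaries can run on machines without a local CUDA toolkit. As a sketch, the build and packaging steps translate to roughly the following PowerShell on a machine with the CUDA 12 toolkit installed (`$env:CUDA_PATH` is assumed to point at the toolkit, as the installer sets it; the DLL names are the CUDA 12 ones used above):

```powershell
# Mirror of the workflow's "Build" step (run from the repository root)
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release

# Mirror of "Copy and pack Cuda runtime": collect the redistributable DLLs
# next to the binaries so both can be shipped together
cd ..
mkdir .\build\bin\cudart
cp "$env:CUDA_PATH\bin\cudart64_12.dll"   .\build\bin\cudart\
cp "$env:CUDA_PATH\bin\cublas64_12.dll"   .\build\bin\cudart\
cp "$env:CUDA_PATH\bin\cublasLt64_12.dll" .\build\bin\cudart\
```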
From 173d0e6419e8f8f3c1f4f13201b777f4c60629f3 Mon Sep 17 00:00:00 2001
From: DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
Date: Fri, 5 May 2023 23:57:14 +0200
Subject: [PATCH 3/5] makefile: automatic Arch Linux detection (#1332)

This commit ports a detection method used in koboldcpp's Makefile so that
the -lcblas linker option is set automatically on Arch Linux.
---
 Makefile  | 6 +++++-
 README.md | 1 -
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 260b2487f..0ddff9961 100644
--- a/Makefile
+++ b/Makefile
@@ -107,7 +107,11 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 ifdef LLAMA_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
-	LDFLAGS += -lopenblas
+	ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
+		LDFLAGS += -lopenblas -lcblas
+	else
+		LDFLAGS += -lopenblas
+	endif
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
diff --git a/README.md b/README.md
index 233c5c5e1..19cc94aa2 100644
--- a/README.md
+++ b/README.md
@@ -216,7 +216,6 @@ Building the program with BLAS support may lead to some performance improvements
   ```bash
   make LLAMA_OPENBLAS=1
   ```
-  Note: In order to build on Arch Linux with OpenBLAS support enabled you must edit the Makefile adding at the end of the line 105: `-lcblas`
 
 - On Windows:
 
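The Makefile condition takes the `-lcblas` branch whenever the `grep` produces any output. This is a small sketch of what `ifneq ($(shell ...),)` evaluates, runnable in any shell — on Arch and Arch-derived distros, `/etc/os-release` contains either the string "Arch Linux" or `ID_LIKE=arch`, and the extra `-lcblas` is needed there because, as we read the commit, Arch packages the CBLAS interface separately from OpenBLAS:

```bash
# Non-empty grep output is exactly the condition under which the Makefile
# appends -lcblas; no match (or a missing file) takes the else branch.
if grep -q -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null; then
    echo "Arch-like system: linking with -lopenblas -lcblas"
else
    echo "other system:     linking with -lopenblas"
fi
```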
From 3924088512d9e12e90ed6dbf28a6c5712481d33e Mon Sep 17 00:00:00 2001
From: Jed Fox
Date: Sat, 6 May 2023 17:01:47 -0400
Subject: [PATCH 4/5] Remove default arguments from sampling functions (#1343)

---
 .gitignore              | 1 +
 examples/main/main.cpp  | 8 ++++----
 llama.cpp               | 2 +-
 llama.h                 | 8 ++++----
 tests/test-sampling.cpp | 8 ++++----
 5 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore
index e479c6180..6f275fea4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ build-sanitize-addr/
 build-sanitize-thread/
 
 models/*
+*.bin
 
 /main
 /quantize
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 43dca8eb5..5ac151e14 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -444,10 +444,10 @@ int main(int argc, char ** argv) {
                 id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
             } else {
                 // Temperature sampling
-                llama_sample_top_k(ctx, &candidates_p, top_k);
-                llama_sample_tail_free(ctx, &candidates_p, tfs_z);
-                llama_sample_typical(ctx, &candidates_p, typical_p);
-                llama_sample_top_p(ctx, &candidates_p, top_p);
+                llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+                llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
+                llama_sample_typical(ctx, &candidates_p, typical_p, 1);
+                llama_sample_top_p(ctx, &candidates_p, top_p, 1);
                 llama_sample_temperature(ctx, &candidates_p, temp);
                 id = llama_sample_token(ctx, &candidates_p);
             }
diff --git a/llama.cpp b/llama.cpp
index 85af4dc49..c36c6ced6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1791,7 +1791,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
 
     // Sample the next word X using top-k sampling
-    llama_sample_top_k(nullptr, candidates, int(k));
+    llama_sample_top_k(nullptr, candidates, int(k), 1);
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
diff --git a/llama.h b/llama.h
index e993c464a..58c6e0699 100644
--- a/llama.h
+++ b/llama.h
@@ -202,16 +202,16 @@ extern "C" {
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 8ce59af3d..9174c1e37 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -32,7 +32,7 @@ void test_top_k(const std::vector<float> & probs,
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     llama_sample_softmax(nullptr, &candidates_p);
     DUMP(&candidates_p);
-    llama_sample_top_k(nullptr, &candidates_p, k);
+    llama_sample_top_k(nullptr, &candidates_p, k, 1);
     DUMP(&candidates_p);
 
     assert(candidates_p.size == expected_probs.size());
@@ -57,7 +57,7 @@ void test_top_p(const std::vector<float> & probs,
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     llama_sample_softmax(nullptr, &candidates_p);
     DUMP(&candidates_p);
-    llama_sample_top_p(nullptr, &candidates_p, p);
+    llama_sample_top_p(nullptr, &candidates_p, p, 1);
     DUMP(&candidates_p);
 
     assert(candidates_p.size == expected_probs.size());
@@ -80,7 +80,7 @@ void test_tfs(const std::vector<float> & probs,
 
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     DUMP(&candidates_p);
-    llama_sample_tail_free(nullptr, &candidates_p, z);
+    llama_sample_tail_free(nullptr, &candidates_p, z, 1);
     DUMP(&candidates_p);
 
     assert(candidates_p.size == expected_probs.size());
@@ -103,7 +103,7 @@ void test_typical(const std::vector<float> & probs,
 
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     DUMP(&candidates_p);
-    llama_sample_typical(nullptr, &candidates_p, p);
+    llama_sample_typical(nullptr, &candidates_p, p, 1);
     DUMP(&candidates_p);
 
     assert(candidates_p.size == expected_probs.size());

From 1b0fd454650ef4d68a980e3225488b79e6e9af25 Mon Sep 17 00:00:00 2001
From: swittk
Date: Sun, 7 May 2023 10:03:23 +0700
Subject: [PATCH 5/5] ggml : Allow usage of CLBlast alongside Accelerate.framework (#1336)

Minor edit in ggml.c: defining GGML_USE_ACCELERATE would previously prevent
OpenCL from being loaded at all. Gives a minor speedup in prompt eval time.
---
 ggml.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml.c b/ggml.c
index 4d49242a4..1b89bdd89 100644
--- a/ggml.c
+++ b/ggml.c
@@ -137,6 +137,9 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
+#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
+#include "ggml-opencl.h"
+#endif
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
 #elif defined(GGML_USE_CUBLAS)
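With the defaults removed in patch 4, every caller now spells out `min_keep`, the smallest number of candidates each filter must leave in the array. This also keeps `llama.h` usable as a plain C header, since default arguments are a C++-only feature — our reading of the motivation, not stated in the patch. A minimal sketch of the resulting call sequence, mirroring the `main.cpp` hunk above (`ctx` and `candidates_p` are assumed to be set up as in `examples/main/main.cpp`; the parameter values are illustrative, not project defaults):

```cpp
// Each filter may shrink candidates_p, but must keep at least min_keep entries.
const int   top_k     = 40;     // illustrative values
const float tfs_z     = 1.00f;
const float typical_p = 1.00f;
const float top_p     = 0.95f;
const float temp      = 0.80f;

llama_sample_top_k     (ctx, &candidates_p, top_k,     /*min_keep=*/1);
llama_sample_tail_free (ctx, &candidates_p, tfs_z,     /*min_keep=*/1);
llama_sample_typical   (ctx, &candidates_p, typical_p, /*min_keep=*/1);
llama_sample_top_p     (ctx, &candidates_p, top_p,     /*min_keep=*/1);
llama_sample_temperature(ctx, &candidates_p, temp);
llama_token id = llama_sample_token(ctx, &candidates_p);
```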