From 1359b6aba55d5b0410f6adaa0aa2e49bbfd01d84 Mon Sep 17 00:00:00 2001 From: Senemu <10880819+Senemu@users.noreply.github.com> Date: Wed, 24 May 2023 06:16:22 +0000 Subject: [PATCH 01/11] chat-persistent.sh : use bracket expressions in grep (#1564) --- examples/chat-persistent.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh index b32284b49..e0c251e5b 100755 --- a/examples/chat-persistent.sh +++ b/examples/chat-persistent.sh @@ -23,8 +23,8 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin" NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt" NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin" -SESSION_SIZE_MSG_PATTERN='main: session file matches \d+ / \d+' -SAMPLE_TIME_MSG_PATTERN='sample time =\s+\d+.\d+ ms /\s+\d+' +SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+' +SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+' SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d" CTX_SIZE=2048 From c31bbe934b9666af42f32ce12d32cae9160e5dc4 Mon Sep 17 00:00:00 2001 From: Evan Jones Date: Wed, 24 May 2023 02:24:01 -0400 Subject: [PATCH 02/11] readme : add docs for chat-persistent.sh (#1568) * readme : add docs for chat-persistent.sh * Update README.md --- README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/README.md b/README.md index 102cde43f..f88e520ee 100644 --- a/README.md +++ b/README.md @@ -391,6 +391,25 @@ Note the use of `--color` to distinguish between user input and generated text. ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) +### Persistent Interaction + +The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. + +```bash +# Start a new chat +PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh + +# Resume that chat +PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh + +# Start a different chat with the same prompt/model +PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh + +# Different prompt cache for different prompt/model +PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ + CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh +``` + ### Instruction mode with Alpaca 1. 
First, download the `ggml` Alpaca model into the `./models` folder From ac7876ac20124a15a44fd6317721ff1aa2538806 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 24 May 2023 10:30:09 +0300 Subject: [PATCH 03/11] Update CLBlast to 1.6.0 (#1580) * Update CLBlast to 1.6.0 --- .github/workflows/build.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 49b478d99..d5c2cdea5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -151,7 +151,7 @@ jobs: env: OPENBLAS_VERSION: 0.3.23 OPENCL_VERSION: 2023.04.17 - CLBLAST_VERSION: 1.5.3 + CLBLAST_VERSION: 1.6.0 strategy: matrix: @@ -184,13 +184,13 @@ jobs: id: get_clblast if: ${{ matrix.build == 'clblast' }} run: | - curl.exe -o $env:RUNNER_TEMP/clblast.zip -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-Windows-x64.zip" + curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z" curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/clblast - tar.exe -xvf $env:RUNNER_TEMP/clblast.zip -C $env:RUNNER_TEMP/clblast + 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z + rename-item $env:RUNNER_TEMP/clblast_release_dir clblast foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) { $txt = Get-Content -Path $f -Raw - $txt.Replace('C:/dependencies/opencl/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8 + $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8 } - name: Download OpenBLAS From 1fcdcc28b119a6608774d52de905931bd5f8a43d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 25 May 2023 23:07:29 +0200 Subject: [PATCH 04/11] cuda : performance optimizations (#1530) * xor hack * block y dim * loop unrolling * Fixed cmake LLAMA_CUDA_BY option * Removed hipblas compatibility code * Define GGML_CUDA_DMMV_BLOCK_Y if not defined * Fewer iters, more ops per iter * Renamed DMMV X/Y compilation options --- CMakeLists.txt | 52 ++++++++++++----------- Makefile | 12 +++++- ggml-cuda.cu | 110 +++++++++++++++++++++++++++++++------------------ 3 files changed, 110 insertions(+), 64 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 39db2e3fc..31c5bd91d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,42 +37,44 @@ endif() # # general -option(LLAMA_STATIC "llama: static link libraries" OFF) -option(LLAMA_NATIVE "llama: enable -march=native flag" OFF) -option(LLAMA_LTO "llama: enable link time optimization" OFF) +option(LLAMA_STATIC "llama: static link libraries" OFF) +option(LLAMA_NATIVE "llama: enable -march=native flag" OFF) +option(LLAMA_LTO "llama: enable link time optimization" OFF) # debug -option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) -option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) -option(LLAMA_GPROF "llama: enable gprof" OFF) +option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) +option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) +option(LLAMA_GPROF "llama: enable gprof" OFF) # sanitizers -option(LLAMA_SANITIZE_THREAD "llama: enable 
thread sanitizer" OFF) -option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) -option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) +option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) +option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) +option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) # instruction set specific -option(LLAMA_AVX "llama: enable AVX" ON) -option(LLAMA_AVX2 "llama: enable AVX2" ON) -option(LLAMA_AVX512 "llama: enable AVX512" OFF) -option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) -option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) -option(LLAMA_FMA "llama: enable FMA" ON) +option(LLAMA_AVX "llama: enable AVX" ON) +option(LLAMA_AVX2 "llama: enable AVX2" ON) +option(LLAMA_AVX512 "llama: enable AVX512" OFF) +option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) +option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) +option(LLAMA_FMA "llama: enable FMA" ON) # in MSVC F16C is implied with AVX2/AVX512 if (NOT MSVC) - option(LLAMA_F16C "llama: enable F16C" ON) + option(LLAMA_F16C "llama: enable F16C" ON) endif() # 3rd party libs -option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) -option(LLAMA_BLAS "llama: use BLAS" OFF) -option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic) -option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) -option(LLAMA_CLBLAST "llama: use CLBlast" OFF) +option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) +option(LLAMA_BLAS "llama: use BLAS" OFF) +option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic) +option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") +set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") +option(LLAMA_CLBLAST "llama: use CLBlast" OFF) -option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_SERVER "llama: build server example" OFF) +option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_SERVER "llama: build server example" OFF) # # Build info header @@ -184,6 +186,8 @@ if (LLAMA_CUBLAS) set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h) add_compile_definitions(GGML_USE_CUBLAS) + add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) + add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) if (LLAMA_STATIC) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) diff --git a/Makefile b/Makefile index 08e250314..804307b53 100644 --- a/Makefile +++ b/Makefile @@ -133,9 +133,19 @@ ifdef LLAMA_CUBLAS OBJS += ggml-cuda.o NVCC = nvcc NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native +ifdef LLAMA_CUDA_DMMV_X + NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) +else + NVCCFLAGS += -DGGML_CUDA_DMMV_X=32 +endif # LLAMA_CUDA_DMMV_X +ifdef LLAMA_CUDA_DMMV_Y + NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) +else + NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1 +endif # LLAMA_CUDA_DMMV_Y ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ -endif +endif # LLAMA_CUBLAS ifdef LLAMA_CLBLAST CFLAGS += -DGGML_USE_CLBLAST CXXFLAGS += 
-DGGML_USE_CLBLAST diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 35d2e457c..98170a3ae 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -83,9 +83,19 @@ typedef struct { } block_q8_0; static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); +#define WARP_SIZE 32 + #define CUDA_MUL_BLOCK_SIZE 256 + #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec + +// dmmv = dequantize_mul_mat_vec +#ifndef GGML_CUDA_DMMV_X +#define GGML_CUDA_DMMV_X 32 +#endif +#ifndef GGML_CUDA_DMMV_Y +#define GGML_CUDA_DMMV_Y 1 +#endif static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -200,41 +210,51 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k) dequantize_kernel(vx, ib, iqs, v0, v1); } -template +template static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) { - const int row = blockIdx.x; + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter const int y_offset = qr == 1 ? 1 : qk/2; - __shared__ float tmp[block_size]; // separate sum for each thread - tmp[tid] = 0; + float tmp = 0; // partial sum for thread in warp - for (int i = 0; i < ncols/block_size; i += 2) { - const int col = i*block_size + 2*tid; - const int ib = (row*ncols + col)/qk; // block index - const int iqs = (col%qk)/qr; // quant index + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index const int iybs = col - col%qk; // y block start index - // dequantize - float v0, v1; - dequantize_kernel(vx, ib, iqs, v0, v1); +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter - // matrix multiplication - tmp[tid] += v0 * y[iybs + iqs + 0]; - tmp[tid] += v1 * y[iybs + iqs + y_offset]; + // dequantize + float v0, v1; + dequantize_kernel(vx, ib, iqs + j/qr, v0, v1); + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + + // matrix multiplication + tmp += v0 * y[iybs + iqs + j/qr + 0]; + tmp += v1 * y[iybs + iqs + j/qr + y_offset]; + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 + } } // sum up partial sums and write back result __syncthreads(); - for (int s=block_size/2; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); } + if (tid == 0) { - dst[row] = tmp[0]; + dst[row] = tmp; } } @@ -269,33 +289,43 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu } static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const dim3 
block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols); } static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols); } static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols); } static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols); } static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols); } static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -304,9 +334,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c } static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + dequantize_mul_mat_vec<1, 1, convert_f16> + <<>>(vx, y, dst, ncols); } static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { From 66874d4fbcc7866377246efbcee938e8cc9c7d76 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Thu, 25 May 2023 20:18:01 -0600 Subject: [PATCH 05/11] Some improvements to loading the session with --prompt-cache (#1550) Improvements to loading the session with `--prompt-cache` in the `main` example. 1. Fix an issue where the `--seed` parameter was ignored when loading a cached prompt. 2. When loading a cached prompt, you previously had to specify the saved prompt (or a prefix of it) again. This pull changes that behavior to default to the prompt that was cached if a prompt wasn't specified by the user. 
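As a minimal usage sketch of the behavior described above (not part of the patch; the model path, prompt text, and token counts are illustrative):

```bash
# First run: evaluate the prompt and cache the model state to prompt.bin
./main -m ./models/7B/ggml-model-q4_0.bin --prompt-cache prompt.bin \
    -p "Transcript of a dialog between a user and an assistant." -n 64

# Later run: with this change, omitting -p falls back to the cached prompt,
# and --seed now takes effect even though the state is restored from the cache
./main -m ./models/7B/ggml-model-q4_0.bin --prompt-cache prompt.bin --seed 42 -n 64
```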
--- examples/main/README.md | 2 +- examples/main/main.cpp | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/main/README.md b/examples/main/README.md index 7c03f92c8..e71ba6173 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -272,7 +272,7 @@ These options help improve the performance and memory usage of the LLaMA models. ### Prompt Caching -- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. +- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation. ### Quantization diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 47b418d97..c7c591537 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -134,8 +134,6 @@ int main(int argc, char ** argv) { return 0; } - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); std::string path_session = params.path_prompt_cache; std::vector session_tokens; @@ -155,6 +153,7 @@ int main(int argc, char ** argv) { return 1; } session_tokens.resize(n_token_count_out); + llama_set_rng_seed(ctx, params.seed); fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); } else { @@ -163,7 +162,16 @@ int main(int argc, char ** argv) { } // tokenize the prompt - auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + std::vector embd_inp; + + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + + embd_inp = ::llama_tokenize(ctx, params.prompt, true); + } else { + embd_inp = session_tokens; + } const int n_ctx = llama_n_ctx(ctx); @@ -181,7 +189,9 @@ int main(int argc, char ** argv) { } n_matching_session_tokens++; } - if (n_matching_session_tokens >= embd_inp.size()) { + if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { + fprintf(stderr, "%s: using full prompt from session file\n", __func__); + } else if (n_matching_session_tokens >= embd_inp.size()) { fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", From bdbda1b17afb78e8613d03c8210a57fac632397b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 12:22:05 +0300 Subject: [PATCH 06/11] ggml : sync ggml core (minor additions, e.g. 
ggml_get_tensor_by_name()) --- ggml.c | 46 +++++++++++++++++++++++++++++++++++++--------- ggml.h | 12 +++++++++++- 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/ggml.c b/ggml.c index c0e7ec05c..c24992260 100644 --- a/ggml.c +++ b/ggml.c @@ -3494,7 +3494,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { }; static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated"); -static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { +static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "NONE", "DUP", @@ -3749,6 +3749,9 @@ const char * ggml_type_name(enum ggml_type type) { return GGML_TYPE_NAME[type]; } +const char * ggml_op_name(enum ggml_op op) { + return GGML_OP_NAME[op]; +} size_t ggml_element_size(const struct ggml_tensor * tensor) { return GGML_TYPE_SIZE[tensor->type]; @@ -4017,6 +4020,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) return result; } +void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { + ctx->no_alloc = no_alloc; +} + // IMPORTANT: // when creating "opt" tensors, always save and load the scratch buffer // this is an error prone process, but it is necessary to support inplace @@ -4061,7 +4068,7 @@ struct ggml_tensor * ggml_new_tensor_impl( struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); if (ctx->scratch.data == NULL || data != NULL) { - size_needed += sizeof(struct ggml_tensor); + size_needed += GGML_TENSOR_SIZE; if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", @@ -4077,14 +4084,15 @@ struct ggml_tensor * ggml_new_tensor_impl( }; } else { if (ctx->scratch.offs + size_needed > ctx->scratch.size) { - GGML_PRINT("%s: not enough space in the scratch memory\n", __func__); + GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + __func__, ctx->scratch.offs + size_needed, ctx->scratch.size); assert(false); return NULL; } - if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) { + if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) { GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size); + __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; } @@ -4093,7 +4101,7 @@ struct ggml_tensor * ggml_new_tensor_impl( *obj_new = (struct ggml_object) { .offs = cur_end + GGML_OBJECT_SIZE, - .size = sizeof(struct ggml_tensor), + .size = GGML_TENSOR_SIZE, .next = NULL, }; @@ -13792,11 +13800,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * // reached a leaf node, not part of the gradient graph (e.g. 
a constant) GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs); + } + cgraph->leafs[cgraph->n_leafs] = node; cgraph->n_leafs++; } else { GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes); + } + cgraph->nodes[cgraph->n_nodes] = node; cgraph->grads[cgraph->n_nodes] = node->grad; cgraph->n_nodes++; @@ -14510,6 +14526,18 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } +struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + if (strcmp(node->name, name) == 0) { + return node; + } + } + + return NULL; +} + void ggml_graph_print(const struct ggml_cgraph * cgraph) { int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0}; @@ -14527,7 +14555,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], - GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, (double) node->perf_cycles / (double) ggml_cycles_per_ms(), (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, (double) node->perf_time_us / 1000.0, @@ -14541,7 +14569,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", i, node->ne[0], node->ne[1], - GGML_OP_LABEL[node->op]); + GGML_OP_NAME[node->op]); } for (int i = 0; i < GGML_OP_COUNT; i++) { @@ -14549,7 +14577,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { continue; } - GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0); + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0); } GGML_PRINT("========================================\n"); diff --git a/ggml.h b/ggml.h index c22d93836..0c90f5064 100644 --- a/ggml.h +++ b/ggml.h @@ -198,6 +198,7 @@ #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_OPT 4 +#define GGML_MAX_NAME 32 #define GGML_DEFAULT_N_THREADS 4 #define GGML_ASSERT(x) \ @@ -372,11 +373,16 @@ extern "C" { void * data; - char name[32]; + char name[GGML_MAX_NAME]; char padding[16]; }; + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + + // use this to compute the memory overhead of a tensor + static const size_t GGML_TENSOR_OVERHEAD = (GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16); + // computation graph struct ggml_cgraph { int n_nodes; @@ -429,6 +435,7 @@ extern "C" { GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float GGML_API const char * ggml_type_name(enum ggml_type type); + GGML_API const char * ggml_op_name (enum ggml_op op); GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); @@ -445,6 +452,7 @@ extern "C" { GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); GGML_API struct ggml_tensor * ggml_new_tensor( struct ggml_context 
* ctx, @@ -970,6 +978,8 @@ extern "C" { GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name); + // print info and performance information for the graph GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); From 83c54e6da58f1970556741b143bd26e30b1f46af Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 27 May 2023 15:18:25 +0300 Subject: [PATCH 07/11] [CI] CLBlast: Fix directory name (#1606) --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d5c2cdea5..245b454dd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -187,7 +187,7 @@ jobs: curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z" curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE" 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z - rename-item $env:RUNNER_TEMP/clblast_release_dir clblast + rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) { $txt = Get-Content -Path $f -Raw $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8 From 93618031c7ccdd949d976370f24953d261048575 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 16:19:56 +0300 Subject: [PATCH 08/11] ggml : add ggml_tensor_overhead() --- ggml.c | 12 ++++++++++++ ggml.h | 6 +++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index c24992260..14972464b 100644 --- a/ggml.c +++ b/ggml.c @@ -3808,6 +3808,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { return wtype; } +size_t ggml_tensor_overhead(void) { + return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16; +} + static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } @@ -14527,6 +14531,14 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) { + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_tensor * leaf = cgraph->leafs[i]; + + if (strcmp(leaf->name, name) == 0) { + return leaf; + } + } + for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; diff --git a/ggml.h b/ggml.h index 0c90f5064..558138280 100644 --- a/ggml.h +++ b/ggml.h @@ -380,9 +380,6 @@ extern "C" { static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); - // use this to compute the memory overhead of a tensor - static const size_t GGML_TENSOR_OVERHEAD = (GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16); - // computation graph struct ggml_cgraph { int n_nodes; @@ -444,6 +441,9 @@ extern "C" { // TODO: temporary until model loading of ggml examples is refactored GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + // use this to compute the memory overhead of a tensor + GGML_API size_t ggml_tensor_overhead(void); + // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); From 
0ecb1bbbeb16e36a2ea7a5ce525c6c59ef74312b Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 27 May 2023 17:24:06 +0300 Subject: [PATCH 09/11] [CI] Fix openblas (#1613) * Fix OpenBLAS build * Fix `LLAMA_BLAS_VENDOR` CMake variable that should be a string and not a boolean. --- .github/workflows/build.yml | 4 ++-- CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 245b454dd..41f2dee28 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -165,7 +165,7 @@ jobs: - build: 'clblast' defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' - build: 'openblas' - defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"' + defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' steps: - name: Clone @@ -213,7 +213,6 @@ jobs: cd build cmake .. ${{ matrix.defines }} cmake --build . --config Release - cp ../LICENSE ./bin/Release/llama.cpp.txt - name: Add clblast.dll id: add_clblast_dll @@ -258,6 +257,7 @@ jobs: id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | + Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt 7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\* - name: Upload artifacts diff --git a/CMakeLists.txt b/CMakeLists.txt index 31c5bd91d..21f4ec9dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,7 +66,7 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) -option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic) +set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") From 97c9b77c4fc5e2283755c4418759cfc5fc73ad05 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 27 May 2023 18:47:55 +0300 Subject: [PATCH 10/11] Add documentation about CLBlast (#1604) Installing, compiling and using. --- README.md | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f88e520ee..00571d8e1 100644 --- a/README.md +++ b/README.md @@ -240,11 +240,11 @@ In order to build llama.cpp you have three different options. Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it: -- Accelerate Framework: +- **Accelerate Framework**: This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions. -- OpenBLAS: +- **OpenBLAS**: This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine. @@ -278,11 +278,11 @@ Building the program with BLAS support may lead to some performance improvements cmake --build . 
--config Release ``` -- BLIS +- **BLIS** Check [BLIS.md](BLIS.md) for more information. -- Intel MKL +- **Intel MKL** By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by: @@ -293,7 +293,7 @@ Building the program with BLAS support may lead to some performance improvements cmake --build . -config Release ``` -- cuBLAS +- **cuBLAS** This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). - Using `make`: @@ -308,8 +308,81 @@ Building the program with BLAS support may lead to some performance improvements cmake .. -DLLAMA_CUBLAS=ON cmake --build . --config Release ``` + Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1. -Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1. +- **CLBlast** + + OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU. + + You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK). + - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed. + + -
+ Installing the OpenCL SDK from source + + ```sh + git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git + mkdir OpenCL-SDK/build + cd OpenCL-SDK/build + cmake .. -DBUILD_DOCS=OFF \ + -DBUILD_EXAMPLES=OFF \ + -DBUILD_TESTING=OFF \ + -DOPENCL_SDK_BUILD_SAMPLES=OFF \ + -DOPENCL_SDK_TEST_SAMPLES=OFF + cmake --build . --config Release + cmake --install . --prefix /some/path + ``` +
+ + Installing CLBlast: it may be found in your operating system's packages. + + -
+ If not, then installing from source: + + ```sh + git clone https://github.com/CNugteren/CLBlast.git + mkdir CLBlast/build + cd CLBlast/build + cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF + cmake --build . --config Release + cmake --install . --prefix /some/path + ``` + + Where `/some/path` is where the built library will be installed (default is `/usr/local`). +
+ + Building: + + - Build with make: + ```sh + make LLAMA_CLBLAST=1 + ``` + - CMake: + ```sh + mkdir build + cd build + cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path + cmake --build . --config Release + ``` + + Running: + + The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does. + + To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`. + The selection can be a number (starting from 0) or a text string to search: + + ```sh + GGML_OPENCL_PLATFORM=1 ./main ... + GGML_OPENCL_DEVICE=2 ./main ... + GGML_OPENCL_PLATFORM=Intel ./main ... + GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ... + ``` + + The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful. + Using the variables it is possible to select a CPU-based driver as well, if so desired. + + You can get a list of platforms and devices from the `clinfo -l` command, etc. ### Prepare Data & Run From 0df7d63e5ba0ab8856476e121a03b985d6f15c9d Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Sat, 27 May 2023 11:04:14 -0600 Subject: [PATCH 11/11] Include server in releases + other build system cleanups (#1610) Set `LLAMA_BUILD_SERVER` in workflow so the `server` example gets build. This currently only applies to Windows builds because it seems like only Windows binary artifacts are included in releases. Add `server` example target to `Makefile` (still uses `LLAMA_BUILD_SERVER` define and does not build by default) Fix issue where `vdot` binary wasn't removed when running `make clean`. Fix compile warnings in `server` example. Add `.hpp` files to trigger workflow (the server example has one). 
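As a minimal sketch of how the opt-in `server` target described above might be built (assuming a standard llama.cpp checkout; not part of the patch):

```bash
# Make build: defining LLAMA_BUILD_SERVER adds the server example to the default targets
make LLAMA_BUILD_SERVER=1

# CMake build, mirroring the -DLLAMA_BUILD_SERVER=ON flag this patch adds to the CI workflow
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON
cmake --build . --config Release
```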
--- .github/workflows/build.yml | 16 ++++++++-------- Makefile | 13 +++++++++++-- examples/server/server.cpp | 16 ++++++++-------- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 41f2dee28..c98cbcbbe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,10 +10,10 @@ on: push: branches: - master - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp'] + paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp'] pull_request: types: [opened, synchronize, reopened] - paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp'] + paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp'] env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} @@ -157,15 +157,15 @@ jobs: matrix: include: - build: 'avx2' - defines: '' + defines: '-DLLAMA_BUILD_SERVER=ON' - build: 'avx' - defines: '-DLLAMA_AVX2=OFF' + defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF' - build: 'avx512' - defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' - build: 'clblast' - defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' + defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' - build: 'openblas' - defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' steps: - name: Clone @@ -292,7 +292,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_CUBLAS=ON + cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON cmake --build . --config Release - name: Get commit hash diff --git a/Makefile b/Makefile index 804307b53..70bd5e90a 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,11 @@ # Define the default target now so that it is always the first target -default: main quantize quantize-stats perplexity embedding vdot +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot + +ifdef LLAMA_BUILD_SERVER + BUILD_TARGETS += server +endif + +default: $(BUILD_TARGETS) ifndef UNAME_S UNAME_S := $(shell uname -s) @@ -210,7 +216,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h + rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h # # Examples @@ -237,6 +243,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7209a2b52..3904412cb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -61,7 +61,7 @@ struct llama_server_context std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); // compare the evaluated prompt with the new prompt int new_prompt_len = 0; - for (int i = 0;i < prompt_tokens.size(); i++) { + for (size_t i = 0; i < prompt_tokens.size(); i++) { if (i < processed_tokens.size() && processed_tokens[i] == prompt_tokens[i]) { @@ -71,7 +71,7 @@ struct llama_server_context { embd_inp.push_back(prompt_tokens[i]); if(new_prompt_len == 0) { - if(i - 1 < n_past) { + if(int32_t(i) - 1 < n_past) { processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end()); } // Evaluate the new fragment prompt from the last token processed. @@ -136,7 +136,7 @@ struct llama_server_context { // out of user input, sample next token const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; const float top_p = params.top_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; @@ -306,12 +306,12 @@ struct llama_server_context // Avoid add the no show words to the response for (std::vector word_tokens : no_show_words) { - int match_token = 1; + size_t match_token = 1; if (tokens_predicted.front() == word_tokens.front()) { bool execute_matching = true; if (tokens_predicted.size() > 1) { // if previus tokens had been tested - for (int i = 1; i < word_tokens.size(); i++) + for (size_t i = 1; i < word_tokens.size(); i++) { if (i >= tokens_predicted.size()) { match_token = i; @@ -601,7 +601,7 @@ int main(int argc, char **argv) Server svr; - svr.Get("/", [](const Request &req, Response &res) + svr.Get("/", [](const Request &, Response &res) { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); svr.Post("/completion", [&llama](const Request &req, Response &res) @@ -649,7 +649,7 @@ int main(int argc, char **argv) {"tokens_predicted", llama.num_tokens_predicted}}; return res.set_content(data.dump(), "application/json"); } - catch (json::exception e) + catch (const json::exception &e) { // Some tokens have bad UTF-8 strings, the json parser is very sensitive json data = { @@ -701,7 +701,7 @@ int main(int argc, char **argv) {"content", result }, {"stop", !llama.has_next_token }}; return res.set_content(data.dump(), "application/json"); - } catch (json::exception e) { + } catch (const json::exception &e) { // Some tokens have bad UTF-8 strings, the json parser is very sensitive json data = { {"content", "" },