diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
index 491d67676..01b3111d9 100644
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
- apt-get install -y build-essential python3 python3-pip
+ apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
index 2e629f8ce..fc34a0c18 100644
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
- apt-get install -y build-essential
+ apt-get install -y build-essential git
WORKDIR /app
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d5c2cdea5..c98cbcbbe 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
push:
branches:
- master
- paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+ paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
pull_request:
types: [opened, synchronize, reopened]
- paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+ paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -157,15 +157,15 @@ jobs:
matrix:
include:
- build: 'avx2'
- defines: ''
+ defines: '-DLLAMA_BUILD_SERVER=ON'
- build: 'avx'
- defines: '-DLLAMA_AVX2=OFF'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
- build: 'avx512'
- defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast'
- defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas'
- defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
steps:
- name: Clone
@@ -187,7 +187,7 @@ jobs:
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
- rename-item $env:RUNNER_TEMP/clblast_release_dir clblast
+ rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
@@ -213,7 +213,6 @@ jobs:
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
- cp ../LICENSE ./bin/Release/llama.cpp.txt
- name: Add clblast.dll
id: add_clblast_dll
@@ -258,6 +257,7 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
+ Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -292,7 +292,7 @@ jobs:
run: |
mkdir build
cd build
- cmake .. -DLLAMA_CUBLAS=ON
+ cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
cmake --build . --config Release
- name: Get commit hash
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31c5bd91d..21f4ec9dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,7 +66,7 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
-option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
diff --git a/Makefile b/Makefile
index 804307b53..8e8d426c5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,11 @@
# Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+ BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)
ifndef UNAME_S
UNAME_S := $(shell uname -s)
@@ -38,7 +44,11 @@ CFLAGS = -I. -O3 -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
LDFLAGS =
-ifndef LLAMA_DEBUG
+ifdef LLAMA_DEBUG
+ CFLAGS += -O0 -g
+ CXXFLAGS += -O0 -g
+ LDFLAGS += -g
+else
CFLAGS += -DNDEBUG
CXXFLAGS += -DNDEBUG
endif
@@ -210,7 +220,7 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean:
- rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+ rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
#
# Examples
@@ -237,6 +247,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \
diff --git a/README.md b/README.md
index f88e520ee..00571d8e1 100644
--- a/README.md
+++ b/README.md
@@ -240,11 +240,11 @@ In order to build llama.cpp you have three different options.
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
-- Accelerate Framework:
+- **Accelerate Framework**:
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
-- OpenBLAS:
+- **OpenBLAS**:
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
@@ -278,11 +278,11 @@ Building the program with BLAS support may lead to some performance improvements
cmake --build . --config Release
```
-- BLIS
+- **BLIS**
Check [BLIS.md](BLIS.md) for more information.
-- Intel MKL
+- **Intel MKL**
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
@@ -293,7 +293,7 @@ Building the program with BLAS support may lead to some performance improvements
cmake --build . -config Release
```
-- cuBLAS
+- **cuBLAS**
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
- Using `make`:
@@ -308,8 +308,81 @@ Building the program with BLAS support may lead to some performance improvements
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release
```
+  Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication, results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
-Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
+- **CLBlast**
+
+ OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
+
+ You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
+ - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+
+ -
+ Installing the OpenCL SDK from source
+
+ ```sh
+ git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
+ mkdir OpenCL-SDK/build
+ cd OpenCL-SDK/build
+ cmake .. -DBUILD_DOCS=OFF \
+ -DBUILD_EXAMPLES=OFF \
+ -DBUILD_TESTING=OFF \
+ -DOPENCL_SDK_BUILD_SAMPLES=OFF \
+ -DOPENCL_SDK_TEST_SAMPLES=OFF
+ cmake --build . --config Release
+ cmake --install . --prefix /some/path
+ ```
+
+
+ Installing CLBlast: it may be found in your operating system's packages.
+
+ -
+ If not, then installing from source:
+
+ ```sh
+ git clone https://github.com/CNugteren/CLBlast.git
+ mkdir CLBlast/build
+    cd CLBlast/build
+ cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+ cmake --build . --config Release
+ cmake --install . --prefix /some/path
+ ```
+
+  Where `/some/path` is where the built library will be installed (default is `/usr/local`).
+
+
+ Building:
+
+ - Build with make:
+ ```sh
+ make LLAMA_CLBLAST=1
+ ```
+ - CMake:
+ ```sh
+ mkdir build
+ cd build
+ cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+ cmake --build . --config Release
+ ```
+
+ Running:
+
+ The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+
+ To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
+ The selection can be a number (starting from 0) or a text string to search:
+
+ ```sh
+ GGML_OPENCL_PLATFORM=1 ./main ...
+ GGML_OPENCL_DEVICE=2 ./main ...
+ GGML_OPENCL_PLATFORM=Intel ./main ...
+ GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
+ ```
+
+ The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
+ Using the variables it is possible to select a CPU-based driver as well, if so desired.
+
+ You can get a list of platforms and devices from the `clinfo -l` command, etc.
### Prepare Data & Run
diff --git a/examples/common.cpp b/examples/common.cpp
index 1308f8410..32247cef7 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -251,6 +251,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.model = argv[i];
+ } else if (arg == "-a" || arg == "--alias") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.model_alias = argv[i];
} else if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
@@ -283,7 +289,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers = std::stoi(argv[i]);
+#else
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
@@ -410,7 +421,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
- fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value\n");
+ fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
+ fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
@@ -421,8 +433,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n");
+#endif
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
diff --git a/examples/common.h b/examples/common.h
index 2b66382a6..fea9aa81a 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -45,6 +45,7 @@ struct gpt_params {
float mirostat_eta = 0.10f; // learning rate
std::string model = "models/7B/ggml-model.bin"; // model path
+ std::string model_alias = "unknown"; // model alias
std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
diff --git a/examples/main/README.md b/examples/main/README.md
index e71ba6173..dd0874977 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -69,8 +69,8 @@ In this section, we cover the most commonly used options for running the `main`
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
-- `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-- `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
## Input Prompts
@@ -136,9 +136,9 @@ During text generation, LLaMA models have a limited context size, which means th
### Context Size
-The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
+The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
-- `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
### Keep Prompt
@@ -146,7 +146,7 @@ The `--keep` option allows users to retain the original prompt when the model ru
- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
-By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
+By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
## Generation Flags
@@ -154,11 +154,11 @@ The following options allow you to control the text generation process and fine-
### Number of Tokens to Predict
-- `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
-The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
### Temperature
@@ -170,33 +170,33 @@ Example usage: `--temp 0.5`
### Repeat Penalty
-- `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
-- `--repeat_last_n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx_size).
+- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
-The `repeat_penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
-The `repeat_last_n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx_size`).
+The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
-Example usage: `--repeat_penalty 1.15 --repeat_last_n 128 --no-penalize-nl`
+Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
### Top-K Sampling
-- `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40).
+- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).
-Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
+Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
-Example usage: `--top_k 30`
+Example usage: `--top-k 30`
### Top-P Sampling
-- `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+- `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
-Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
+Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
-Example usage: `--top_p 0.95`
+Example usage: `--top-p 0.95`
### Tail Free Sampling (TFS)
@@ -217,16 +217,16 @@ Example usage: `--typical 0.9`
### Mirostat Sampling
- `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
-- `--mirostat_lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
-- `--mirostat_ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
+- `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+- `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).
-The `--mirostat_lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
+The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
-The `--mirostat_ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
+The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
-Example usage: `--mirostat 2 --mirostat_lr 0.05 --mirostat_ent 3.0`
+Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
### Logit Bias
@@ -264,11 +264,11 @@ These options help improve the performance and memory usage of the LLaMA models.
### Memory Float 32
-- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
+- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
### Batch Size
-- `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+- `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
### Prompt Caching
@@ -285,5 +285,6 @@ These options provide extra functionality and customization when running the LLa
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
+- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index c7c591537..6131f5b46 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -360,6 +360,12 @@ int main(int argc, char ** argv) {
}
}
if (i > 0) {
+ // check if we've used up all the prompt but not all cached tokens
+ if (embd.size() == i && n_session_consumed < (int) session_tokens.size()) {
+            // force re-evaluation of the last token to recalculate logits
+ i--;
+ n_past--;
+ }
embd.erase(embd.begin(), embd.begin() + i);
}
}
diff --git a/examples/server/README.md b/examples/server/README.md
index 089e8908c..bba513c7e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -285,7 +285,8 @@ Test();
## Common Options
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
-- `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
- `--port`: Set the port to listen. Default: `8080`.
@@ -304,7 +305,7 @@ The RNG seed is used to initialize the random number generator that influences t
### Memory Float 32
-- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
+- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement but does not appear to increase generation quality in a measurable way. Not recommended.
## Limitations:
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7209a2b52..9aa7db255 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -61,7 +61,7 @@ struct llama_server_context
std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
// compare the evaluated prompt with the new prompt
int new_prompt_len = 0;
- for (int i = 0;i < prompt_tokens.size(); i++) {
+ for (size_t i = 0; i < prompt_tokens.size(); i++) {
if (i < processed_tokens.size() &&
processed_tokens[i] == prompt_tokens[i])
{
@@ -71,7 +71,7 @@ struct llama_server_context
{
embd_inp.push_back(prompt_tokens[i]);
if(new_prompt_len == 0) {
- if(i - 1 < n_past) {
+ if(int32_t(i) - 1 < n_past) {
processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
}
// Evaluate the new fragment prompt from the last token processed.
@@ -136,7 +136,7 @@ struct llama_server_context
{
// out of user input, sample next token
const float temp = params.temp;
- const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+ // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
@@ -306,12 +306,12 @@ struct llama_server_context
// Avoid add the no show words to the response
for (std::vector<llama_token> word_tokens : no_show_words)
{
- int match_token = 1;
+ size_t match_token = 1;
if (tokens_predicted.front() == word_tokens.front())
{
bool execute_matching = true;
if (tokens_predicted.size() > 1) { // if previus tokens had been tested
- for (int i = 1; i < word_tokens.size(); i++)
+ for (size_t i = 1; i < word_tokens.size(); i++)
{
if (i >= tokens_predicted.size()) {
match_token = i;
@@ -385,7 +385,9 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
- fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
+ fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
+ fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
+ fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
fprintf(stderr, " --embedding enable embedding mode\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
if (llama_mlock_supported())
@@ -396,12 +398,16 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
{
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n");
+#endif
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
- fprintf(stderr, " -host ip address to listen (default 127.0.0.1)\n");
- fprintf(stderr, " -port PORT port to listen (default 8080)\n");
+ fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
+ fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
+ fprintf(stderr, " --host ip address to listen (default 127.0.0.1)\n");
+ fprintf(stderr, " --port PORT port to listen (default 8080)\n");
fprintf(stderr, "\n");
}
@@ -453,6 +459,15 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
}
params.model = argv[i];
}
+ else if (arg == "-a" || arg == "--alias")
+ {
+ if (++i >= argc)
+ {
+ invalid_param = true;
+ break;
+ }
+ params.model_alias = argv[i];
+ }
else if (arg == "--embedding")
{
params.embedding = true;
@@ -462,7 +477,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
server_print_usage(argc, argv, default_params);
exit(0);
}
- else if (arg == "-c" || arg == "--ctx_size")
+ else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
{
if (++i >= argc)
{
@@ -471,7 +486,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
}
params.n_ctx = std::stoi(argv[i]);
}
- else if (arg == "--memory_f32")
+ else if (arg == "--memory-f32" || arg == "--memory_f32")
{
params.memory_f16 = false;
}
@@ -482,7 +497,12 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
invalid_param = true;
break;
}
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers = std::stoi(argv[i]);
+#else
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
}
else
{
@@ -601,7 +621,7 @@ int main(int argc, char **argv)
Server svr;
- svr.Get("/", [](const Request &req, Response &res)
+ svr.Get("/", [](const Request &, Response &res)
{ res.set_content("llama.cpp server works</br>", "text/html"); });
svr.Post("/completion", [&llama](const Request &req, Response &res)
@@ -645,11 +665,12 @@ int main(int argc, char **argv)
try
{
json data = {
+ {"model", llama.params.model_alias },
{"content", llama.generated_text },
{"tokens_predicted", llama.num_tokens_predicted}};
return res.set_content(data.dump(), "application/json");
}
- catch (json::exception e)
+ catch (const json::exception &e)
{
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
json data = {
@@ -701,7 +722,7 @@ int main(int argc, char **argv)
{"content", result },
{"stop", !llama.has_next_token }};
return res.set_content(data.dump(), "application/json");
- } catch (json::exception e) {
+ } catch (const json::exception &e) {
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
json data = {
{"content", "" },
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index 197868863..09a69314a 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -497,16 +497,11 @@ void ggml_cl_init(void) {
size_t ext_str_size;
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
- char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size);
+ char *ext_buffer = (char *)alloca(ext_str_size + 1);
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+ ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
// Check if ext_buffer contains cl_khr_fp16
- for (size_t i = 0; i < ext_str_size - 12; i++) {
- if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
- fp16_support = true;
- break;
- }
- }
- free(ext_buffer);
+ fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
cl_context_properties properties[] = {
@@ -795,7 +790,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_CL) {
- d_X = *(cl_mem*) src0->data;
+ d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
}
@@ -871,7 +866,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_CL) {
- d_X = *(cl_mem*) src0->data;
+ d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
}
@@ -998,7 +993,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
} else if (src0->backend == GGML_BACKEND_CL) {
- d_Q = *(cl_mem*) src0->data;
+ d_Q = (cl_mem) src0->data;
} else {
GGML_ASSERT(false);
}
@@ -1145,14 +1140,13 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
size_t q_size;
- cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem));
- *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+ cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
// copy tensor to device
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
int i = i3*ne2 + i2;
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
}
}
diff --git a/ggml.c b/ggml.c
index 0018e29df..91552c94c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -186,10 +186,12 @@ typedef double ggml_float;
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
+#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
+#endif
#ifdef __F16C__
@@ -3494,7 +3496,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"NONE",
"DUP",
@@ -3749,6 +3751,9 @@ const char * ggml_type_name(enum ggml_type type) {
return GGML_TYPE_NAME[type];
}
+const char * ggml_op_name(enum ggml_op op) {
+ return GGML_OP_NAME[op];
+}
size_t ggml_element_size(const struct ggml_tensor * tensor) {
return GGML_TYPE_SIZE[tensor->type];
@@ -3805,6 +3810,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
return wtype;
}
+size_t ggml_tensor_overhead(void) {
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
@@ -4017,6 +4026,18 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
return result;
}
+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+ ctx->no_alloc = no_alloc;
+}
+
+void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+ return ctx->mem_buffer;
+}
+
+size_t ggml_get_mem_size(struct ggml_context * ctx) {
+ return ctx->mem_size;
+}
+
// IMPORTANT:
// when creating "opt" tensors, always save and load the scratch buffer
// this is an error prone process, but it is necessary to support inplace
@@ -4061,7 +4082,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
if (ctx->scratch.data == NULL || data != NULL) {
- size_needed += sizeof(struct ggml_tensor);
+ size_needed += GGML_TENSOR_SIZE;
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4098,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
};
} else {
if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
- GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+ __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
assert(false);
return NULL;
}
- if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+ if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
- __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+ __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false);
return NULL;
}
@@ -4093,7 +4115,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
*obj_new = (struct ggml_object) {
.offs = cur_end + GGML_OBJECT_SIZE,
- .size = sizeof(struct ggml_tensor),
+ .size = GGML_TENSOR_SIZE,
.next = NULL,
};
@@ -4509,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor(
return result;
}
+struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+ struct ggml_object * obj = ctx->objects_begin;
+
+ char * const mem_buffer = ctx->mem_buffer;
+
+ while (obj != NULL) {
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+ if (strcmp(cur->name, name) == 0) {
+ return cur;
+ }
+
+ obj = obj->next;
+ }
+
+ return NULL;
+}
+
////////////////////////////////////////////////////////////////////////////////
// ggml_dup
@@ -6303,7 +6342,7 @@ struct ggml_tensor * ggml_alibi(
ggml_scratch_save(ctx);
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_head;
@@ -13799,11 +13838,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
// reached a leaf node, not part of the gradient graph (e.g. a constant)
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
+ if (strlen(node->name) == 0) {
+ snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+ }
+
cgraph->leafs[cgraph->n_leafs] = node;
cgraph->n_leafs++;
} else {
GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
+ if (strlen(node->name) == 0) {
+ snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+ }
+
cgraph->nodes[cgraph->n_nodes] = node;
cgraph->grads[cgraph->n_nodes] = node->grad;
cgraph->n_nodes++;
@@ -14517,6 +14564,481 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
}
}
+struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
+ for (int i = 0; i < cgraph->n_leafs; i++) {
+ struct ggml_tensor * leaf = cgraph->leafs[i];
+
+ if (strcmp(leaf->name, name) == 0) {
+ return leaf;
+ }
+ }
+
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ struct ggml_tensor * node = cgraph->nodes[i];
+
+ if (strcmp(node->name, name) == 0) {
+ return node;
+ }
+ }
+
+ return NULL;
+}
+
+static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
+ const int64_t * ne = tensor->ne;
+ const size_t * nb = tensor->nb;
+
+ fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+ ggml_type_name(tensor->type),
+ ggml_op_name (tensor->op),
+ tensor->n_dims,
+ ne[0], ne[1], ne[2], ne[3],
+ nb[0], nb[1], nb[2], nb[3],
+ tensor->data,
+ tensor->name);
+}
+
+static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
+ const int64_t * ne = tensor->ne;
+ const size_t * nb = tensor->nb;
+
+ fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+ arg,
+ ggml_type_name(tensor->type),
+ ggml_op_name (tensor->op),
+ tensor->n_dims,
+ ne[0], ne[1], ne[2], ne[3],
+ nb[0], nb[1], nb[2], nb[3],
+ tensor->n_tasks,
+ tensor->data,
+ tensor->name);
+}
+
+void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
+ assert(cgraph->work == NULL);
+ assert(cgraph->work_size == 0);
+
+ uint64_t size_eval = 0;
+
+ // compute size of intermediate results
+ // TODO: does not take into account scratch buffers !!!!
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
+ size_eval += ggml_nbytes(cgraph->nodes[i]);
+ }
+
+ // print
+ {
+ FILE * fout = stdout;
+
+ fprintf(fout, "\n");
+ fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
+ fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
+ fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
+ fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
+ fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
+
+ // header
+ fprintf(fout, "\n");
+ fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
+ "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
+
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
+ ggml_graph_export_leaf(cgraph->leafs[i], fout);
+
+ GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
+ GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
+ GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
+ }
+
+ // header
+ fprintf(fout, "\n");
+ fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
+ "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
+
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
+ ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
+
+ if (cgraph->nodes[i]->src0) {
+ ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
+ }
+
+ if (cgraph->nodes[i]->src1) {
+ ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
+ }
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ if (cgraph->nodes[i]->opt[j]) {
+ ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+ }
+ }
+
+ fprintf(fout, "\n");
+ }
+
+ fprintf(fout, "\n");
+ }
+
+ // write binary data
+ {
+ FILE * fout = fopen(fname, "wb");
+
+ if (!fout) {
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+ return;
+ }
+
+ // header
+ {
+ const uint32_t magic = GGML_FILE_MAGIC;
+ const uint32_t version = GGML_FILE_VERSION;
+ const uint32_t n_leafs = cgraph->n_leafs;
+ const uint32_t nodes = cgraph->n_nodes;
+
+ fwrite(&magic, sizeof(uint32_t), 1, fout);
+ fwrite(&version, sizeof(uint32_t), 1, fout);
+ fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
+ fwrite(&nodes, sizeof(uint32_t), 1, fout);
+ fwrite(&size_eval, sizeof(uint64_t), 1, fout);
+ }
+
+ // leafs
+ {
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
+ const struct ggml_tensor * tensor = cgraph->leafs[i];
+
+ const uint32_t type = tensor->type;
+ const uint32_t op = tensor->op;
+ const uint32_t n_dims = tensor->n_dims;
+
+ fwrite(&type, sizeof(uint32_t), 1, fout);
+ fwrite(&op, sizeof(uint32_t), 1, fout);
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ const uint64_t ne = tensor->ne[j];
+ const uint64_t nb = tensor->nb[j];
+
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
+ }
+
+ // store the pointer address
+ {
+ const uint64_t ptr = (uint64_t) tensor->data;
+
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
+ }
+
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+ // dump the data
+ // TODO: pad this to 32 byte boundary
+ {
+ const size_t size = ggml_nbytes(tensor);
+
+ fwrite(tensor->data, sizeof(char), size, fout);
+ }
+ }
+ }
+
+ // nodes
+ {
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
+ const struct ggml_tensor * tensor = cgraph->nodes[i];
+
+ const uint32_t type = tensor->type;
+ const uint32_t op = tensor->op;
+ const uint32_t n_dims = tensor->n_dims;
+
+ fwrite(&type, sizeof(uint32_t), 1, fout);
+ fwrite(&op, sizeof(uint32_t), 1, fout);
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ const uint64_t ne = tensor->ne[j];
+ const uint64_t nb = tensor->nb[j];
+
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
+ }
+
+ // store the pointer address
+ {
+ const uint64_t ptr = (uint64_t) tensor->data;
+
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
+ }
+
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+ // output the op arguments
+ {
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+ args[0] = tensor->src0;
+ args[1] = tensor->src1;
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ args[2 + j] = tensor->opt[j];
+ }
+
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+ if (args[j]) {
+ int32_t idx = -1;
+
+ // check if leaf
+ {
+ for (int k = 0; k < cgraph->n_leafs; ++k) {
+ if (args[j] == cgraph->leafs[k]) {
+ idx = k;
+ break;
+ }
+ }
+ }
+
+ // check if node
+ if (idx == -1) {
+ for (int k = 0; k < cgraph->n_nodes; ++k) {
+ if (args[j] == cgraph->nodes[k]) {
+ idx = GGML_MAX_NODES + k;
+ break;
+ }
+ }
+ }
+
+ if (idx == -1) {
+ fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+ return;
+ }
+
+ fwrite(&idx, sizeof(int32_t), 1, fout);
+ } else {
+ const int32_t nul = -1;
+
+ fwrite(&nul, sizeof(int32_t), 1, fout);
+ }
+ }
+ }
+ }
+ }
+
+ fclose(fout);
+ }
+}
+
+struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
+ assert(*ctx_data == NULL);
+ assert(*ctx_eval == NULL);
+
+ struct ggml_cgraph result = { 0 };
+
+ struct ggml_tensor * data = NULL;
+
+ // read file into data
+ {
+ FILE * fin = fopen(fname, "rb");
+
+ if (!fin) {
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+ return result;
+ }
+
+ size_t fsize = 0;
+
+ fseek(fin, 0, SEEK_END);
+ fsize = ftell(fin);
+ fseek(fin, 0, SEEK_SET);
+
+ // create the data context
+ {
+ const size_t overhead = 1*ggml_tensor_overhead();
+
+ struct ggml_init_params params = {
+ .mem_size = fsize + overhead,
+ .mem_buffer = NULL,
+ .no_alloc = false,
+ };
+
+ *ctx_data = ggml_init(params);
+
+ if (!*ctx_data) {
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+ return result;
+ }
+ }
+
+ data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
+
+ fread(data->data, sizeof(char), fsize, fin);
+
+ fclose(fin);
+ }
+
+ // populate result
+ {
+ char * ptr = (char *) data->data;
+
+ const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
+
+ if (magic != GGML_FILE_MAGIC) {
+ fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
+ return result;
+ }
+
+ const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
+
+ if (version != GGML_FILE_VERSION) {
+ fprintf(stderr, "%s: invalid version number\n", __func__);
+ return result;
+ }
+
+ const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
+ const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
+ const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
+
+ result.n_leafs = n_leafs;
+ result.n_nodes = n_nodes;
+
+ // create the data context
+ {
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
+
+ struct ggml_init_params params = {
+ .mem_size = size_eval + overhead,
+ .mem_buffer = NULL,
+ .no_alloc = true,
+ };
+
+ *ctx_eval = ggml_init(params);
+
+ if (!*ctx_eval) {
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+ return result;
+ }
+ }
+
+ // leafs
+ {
+ uint32_t type;
+ uint32_t op;
+ uint32_t n_dims;
+
+ for (uint32_t i = 0; i < n_leafs; ++i) {
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+ int64_t ne[GGML_MAX_DIMS];
+ size_t nb[GGML_MAX_DIMS];
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ uint64_t ne_cur;
+ uint64_t nb_cur;
+
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+ ne[j] = ne_cur;
+ nb[j] = nb_cur;
+ }
+
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+ tensor->op = (enum ggml_op) op;
+
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+ tensor->data = (void *) ptr;
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ tensor->nb[j] = nb[j];
+ }
+
+ result.leafs[i] = tensor;
+
+ ptr += ggml_nbytes(tensor);
+
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+ }
+ }
+
+ ggml_set_no_alloc(*ctx_eval, false);
+
+ // nodes
+ {
+ uint32_t type;
+ uint32_t op;
+ uint32_t n_dims;
+
+ for (uint32_t i = 0; i < n_nodes; ++i) {
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+ int64_t ne[GGML_MAX_DIMS];
+ size_t nb[GGML_MAX_DIMS];
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ uint64_t ne_cur;
+ uint64_t nb_cur;
+
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+ ne[j] = ne_cur;
+ nb[j] = nb_cur;
+ }
+
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+ tensor->op = (enum ggml_op) op;
+
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ tensor->nb[j] = nb[j];
+ }
+
+ // parse args
+ {
+ struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
+ &tensor->src0,
+ &tensor->src1,
+ };
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ args[2 + j] = &tensor->opt[j];
+ }
+
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+ const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
+
+ if (arg_idx == -1) {
+ continue;
+ }
+
+ if (arg_idx < GGML_MAX_NODES) {
+ *args[j] = result.leafs[arg_idx];
+ } else {
+ *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+ }
+ }
+ }
+
+ result.nodes[i] = tensor;
+
+ fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+ }
+ }
+ }
+
+ return result;
+}
+
void ggml_graph_print(const struct ggml_cgraph * cgraph) {
int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
@@ -14534,7 +15056,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
i,
node->ne[0], node->ne[1], node->ne[2],
- GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+ GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
(double) node->perf_time_us / 1000.0,
@@ -14548,7 +15070,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
i,
node->ne[0], node->ne[1],
- GGML_OP_LABEL[node->op]);
+ GGML_OP_NAME[node->op]);
}
for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14556,7 +15078,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
continue;
}
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
}
GGML_PRINT("========================================\n");
diff --git a/ggml.h b/ggml.h
index c22d93836..60c0ad8bf 100644
--- a/ggml.h
+++ b/ggml.h
@@ -198,6 +198,7 @@
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
+#define GGML_MAX_NAME 32
#define GGML_DEFAULT_N_THREADS 4
#define GGML_ASSERT(x) \
@@ -372,11 +373,13 @@ extern "C" {
void * data;
- char name[32];
+ char name[GGML_MAX_NAME];
char padding[16];
};
+ static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
// computation graph
struct ggml_cgraph {
int n_nodes;
@@ -429,6 +432,7 @@ extern "C" {
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
GGML_API const char * ggml_type_name(enum ggml_type type);
+ GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
@@ -437,6 +441,9 @@ extern "C" {
// TODO: temporary until model loading of ggml examples is refactored
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+ // use this to compute the memory overhead of a tensor
+ GGML_API size_t ggml_tensor_overhead(void);
+
// main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -444,7 +451,11 @@ extern "C" {
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
- GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+ GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
+ GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
@@ -484,6 +495,8 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
+ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
@@ -970,6 +983,11 @@ extern "C" {
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+ GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+ GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
// print info and performance information for the graph
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
diff --git a/llama.cpp b/llama.cpp
index 5cec6062f..52966b2c5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -42,6 +42,7 @@
// available llama models
enum e_model {
MODEL_UNKNOWN,
+ MODEL_3B,
MODEL_7B,
MODEL_13B,
MODEL_30B,
@@ -58,6 +59,7 @@ static const size_t MB = 1024*1024;
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
{
static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 128ull * MB },
{ MODEL_7B, 512ull * MB },
{ MODEL_13B, 512ull * MB },
{ MODEL_30B, 512ull * MB },
@@ -69,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
{
static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 128ull * MB },
{ MODEL_7B, 512ull * MB },
{ MODEL_13B, 512ull * MB },
{ MODEL_30B, 512ull * MB },
@@ -81,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
{
static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 682ull * MB },
{ MODEL_7B, 1026ull * MB },
{ MODEL_13B, 1608ull * MB },
{ MODEL_30B, 3124ull * MB },
@@ -94,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
{
static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 512ull * MB },
{ MODEL_7B, 768ull * MB },
{ MODEL_13B, 1024ull * MB },
{ MODEL_30B, 1280ull * MB },
@@ -899,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
static const char *llama_model_type_name(e_model type) {
switch (type) {
+ case MODEL_3B: return "3B";
case MODEL_7B: return "7B";
case MODEL_13B: return "13B";
case MODEL_30B: return "30B";
@@ -932,6 +938,7 @@ static void llama_model_load_internal(
{
switch (hparams.n_layer) {
+ case 26: model.type = e_model::MODEL_3B; break;
case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break;
case 60: model.type = e_model::MODEL_30B; break;
diff --git a/llama.h b/llama.h
index 37bae5357..c6b0a2889 100644
--- a/llama.h
+++ b/llama.h
@@ -31,6 +31,11 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif