Merge branch 'ggerganov:master' into theme
This commit is contained in:
commit
765cd50afa
50 changed files with 3551 additions and 1374 deletions
2
.github/workflows/bench.yml
vendored
2
.github/workflows/bench.yml
vendored
|
@ -32,7 +32,7 @@ on:
|
|||
- cron: '04 2 * * *'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}-${{ github.event.inputs.sha }}
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
|
|
57
.github/workflows/build.yml
vendored
57
.github/workflows/build.yml
vendored
|
@ -593,6 +593,63 @@ jobs:
|
|||
run: |
|
||||
make swift
|
||||
|
||||
windows-msys2:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
|
||||
- { sys: CLANG64, env: clang-x86_64, build: Release }
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup ${{ matrix.sys }}
|
||||
uses: msys2/setup-msys2@v2
|
||||
with:
|
||||
update: true
|
||||
msystem: ${{matrix.sys}}
|
||||
install: >-
|
||||
base-devel
|
||||
mingw-w64-${{matrix.env}}-toolchain
|
||||
mingw-w64-${{matrix.env}}-cmake
|
||||
mingw-w64-${{matrix.env}}-openblas
|
||||
|
||||
- name: Build using make
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
make -j $(nproc)
|
||||
|
||||
- name: Clean after building using make
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
make clean
|
||||
|
||||
- name: Build using make w/ OpenBLAS
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
make LLAMA_OPENBLAS=1 -j $(nproc)
|
||||
|
||||
- name: Build using CMake
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
cmake -B build
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
|
||||
- name: Clean after building using CMake
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
rm -rf build
|
||||
|
||||
- name: Build using CMake w/ OpenBLAS
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-latest
|
||||
|
||||
|
|
29
.github/workflows/server.yml
vendored
29
.github/workflows/server.yml
vendored
|
@ -23,7 +23,7 @@ on:
|
|||
- cron: '2 4 * * *'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
|
@ -41,23 +41,16 @@ jobs:
|
|||
sanitizer: ""
|
||||
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
|
||||
|
||||
container:
|
||||
image: ubuntu:latest
|
||||
ports:
|
||||
- 8888
|
||||
options: --cpus 4
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get -y install \
|
||||
sudo apt-get update
|
||||
sudo apt-get -y install \
|
||||
build-essential \
|
||||
xxd \
|
||||
git \
|
||||
cmake \
|
||||
python3-pip \
|
||||
curl \
|
||||
wget \
|
||||
language-pack-en \
|
||||
|
@ -70,6 +63,17 @@ jobs:
|
|||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Tests dependencies
|
||||
id: test_dependencies
|
||||
run: |
|
||||
pip install -r examples/server/tests/requirements.txt
|
||||
|
||||
- name: Verify server deps
|
||||
id: verify_server_deps
|
||||
run: |
|
||||
|
@ -100,10 +104,6 @@ jobs:
|
|||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
|
||||
|
||||
- name: Tests dependencies
|
||||
id: test_dependencies
|
||||
run: |
|
||||
pip install -r examples/server/tests/requirements.txt
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
|
@ -129,6 +129,7 @@ jobs:
|
|||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
|
|
|
@ -43,17 +43,7 @@ else()
|
|||
set(LLAMA_METAL_DEFAULT OFF)
|
||||
endif()
|
||||
|
||||
# TODO: fix this for Android CI
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
|
||||
#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
|
||||
# set(LLAMA_LLAMAFILE_DEFAULT OFF)
|
||||
#else()
|
||||
# set(LLAMA_LLAMAFILE_DEFAULT ON)
|
||||
#endif()
|
||||
|
||||
# TODO: temporary disable until MoE is fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6716
|
||||
set(LLAMA_LLAMAFILE_DEFAULT OFF)
|
||||
set(LLAMA_LLAMAFILE_DEFAULT ON)
|
||||
|
||||
# general
|
||||
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
|
||||
|
|
17
Makefile
17
Makefile
|
@ -27,6 +27,17 @@ ifndef UNAME_M
|
|||
UNAME_M := $(shell uname -m)
|
||||
endif
|
||||
|
||||
# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
|
||||
# of non-gcc compilers don't have to provide g++ alias or wrapper.
|
||||
DEFCC := cc
|
||||
DEFCXX := c++
|
||||
ifeq ($(origin CC),default)
|
||||
CC := $(DEFCC)
|
||||
endif
|
||||
ifeq ($(origin CXX),default)
|
||||
CXX := $(DEFCXX)
|
||||
endif
|
||||
|
||||
# Mac OS + Arm can report x86_64
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
|
@ -384,10 +395,6 @@ ifdef LLAMA_OPENBLAS
|
|||
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
||||
endif # LLAMA_OPENBLAS
|
||||
|
||||
# TODO: temporary disable until MoE is fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6716
|
||||
LLAMA_NO_LLAMAFILE := 1
|
||||
|
||||
ifndef LLAMA_NO_LLAMAFILE
|
||||
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
||||
OBJS += sgemm.o
|
||||
|
@ -772,7 +779,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
|
||||
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
|
|
|
@ -229,12 +229,12 @@ source /opt/intel/oneapi/setvars.sh
|
|||
# Build LLAMA with MKL BLAS acceleration for intel GPU
|
||||
mkdir -p build && cd build
|
||||
|
||||
# Option 1: Use FP16 for better performance in long-prompt inference
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
|
||||
# Option 2: Use FP32 by default
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# Option 2: Use FP16
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -j -v
|
||||
```
|
||||
|
@ -250,12 +250,12 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
|||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
||||
mkdir -p build && cd build
|
||||
|
||||
# Option 1: Use FP16 for better performance in long-prompt inference
|
||||
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
|
||||
# Option 2: Use FP32 by default
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# Option 2: Use FP16
|
||||
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -j -v
|
||||
|
||||
|
@ -416,6 +416,10 @@ mkdir -p build
|
|||
cd build
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
# Option 2: Or FP16
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
|
||||
make -j
|
||||
|
|
10
README.md
10
README.md
|
@ -93,6 +93,7 @@ Typically finetunes of the base models below are supported as well.
|
|||
|
||||
- [X] LLaMA 🦙
|
||||
- [x] LLaMA 2 🦙🦙
|
||||
- [x] LLaMA 3 🦙🦙🦙
|
||||
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
|
||||
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
|
||||
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
|
||||
|
@ -119,8 +120,9 @@ Typically finetunes of the base models below are supported as well.
|
|||
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
|
||||
- [x] [Gemma](https://ai.google.dev/gemma)
|
||||
- [x] [Mamba](https://github.com/state-spaces/mamba)
|
||||
- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
|
||||
- [x] [Xverse](https://huggingface.co/models?search=xverse)
|
||||
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
|
||||
- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
|
||||
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||
- [x] [OLMo](https://allenai.org/olmo)
|
||||
|
@ -135,6 +137,8 @@ Typically finetunes of the base models below are supported as well.
|
|||
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
|
||||
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
|
||||
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
|
||||
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
|
||||
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
|
||||
|
||||
**HTTP server**
|
||||
|
||||
|
@ -1117,7 +1121,9 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
|
|||
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
|
||||
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
|
||||
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
|
||||
- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
|
||||
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
|
||||
|
||||

|
||||
|
||||
### Docs
|
||||
|
||||
|
|
|
@ -160,7 +160,9 @@ function gg_run_test_scripts_debug {
|
|||
|
||||
set -e
|
||||
|
||||
# TODO: too slow, run on dedicated node
|
||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
#(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
@ -184,6 +186,7 @@ function gg_run_test_scripts_release {
|
|||
set -e
|
||||
|
||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
|
|
@ -234,15 +234,63 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
return result;
|
||||
}
|
||||
|
||||
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
||||
const char * sep = strchr(data, '=');
|
||||
if (sep == nullptr || sep - data >= 128) {
|
||||
fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
llama_model_kv_override kvo;
|
||||
std::strncpy(kvo.key, data, sep - data);
|
||||
kvo.key[sep - data] = 0;
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.val_i64 = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.val_f64 = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.val_bool = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
kvo.val_bool = false;
|
||||
} else {
|
||||
fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
} else if (strncmp(sep, "str:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||
if (strlen(sep) > 127) {
|
||||
fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
strncpy(kvo.val_str, sep, 127);
|
||||
kvo.val_str[127] = '\0';
|
||||
} else {
|
||||
fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
overrides.emplace_back(std::move(kvo));
|
||||
return true;
|
||||
}
|
||||
|
||||
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
|
||||
llama_sampling_params& sparams = params.sparams;
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
if (arg == "-s" || arg == "--seed") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
// This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
|
||||
params.seed = std::stoul(argv[i]);
|
||||
sparams.seed = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-t" || arg == "--threads") {
|
||||
|
@ -1087,6 +1135,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
params.n_print = std::stoi(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--check-tensors") {
|
||||
params.check_tensors = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--ppl-output-type") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -1238,47 +1290,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
char* sep = strchr(argv[i], '=');
|
||||
if (sep == nullptr || sep - argv[i] >= 128) {
|
||||
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
struct llama_model_kv_override kvo;
|
||||
std::strncpy(kvo.key, argv[i], sep - argv[i]);
|
||||
kvo.key[sep - argv[i]] = 0;
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
}
|
||||
else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
}
|
||||
else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
}
|
||||
else if (std::strcmp(sep, "false") == 0) {
|
||||
kvo.bool_value = false;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (!parse_kv_override(argv[i], params.kv_overrides)) {
|
||||
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
params.kv_overrides.push_back(kvo);
|
||||
return true;
|
||||
}
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
|
@ -1549,9 +1565,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" -ptc N, --print-token-count N\n");
|
||||
printf(" print token count every N tokens (default: %d)\n", params.n_print);
|
||||
printf(" --check-tensors check model tensor data for invalid values\n");
|
||||
printf("\n");
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_print_usage();
|
||||
|
@ -1772,6 +1789,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
|||
mparams.tensor_split = params.tensor_split;
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
mparams.check_tensors = params.check_tensors;
|
||||
if (params.kv_overrides.empty()) {
|
||||
mparams.kv_overrides = NULL;
|
||||
} else {
|
||||
|
@ -2326,12 +2344,12 @@ std::vector<llama_token> llama_tokenize(
|
|||
return result;
|
||||
}
|
||||
|
||||
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
||||
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
|
||||
std::vector<char> result(8, 0);
|
||||
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
|
||||
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
|
||||
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
|
|
|
@ -86,8 +86,8 @@ struct gpt_params {
|
|||
|
||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
|
||||
llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
|
@ -161,6 +161,7 @@ struct gpt_params {
|
|||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
bool warmup = true; // warmup run
|
||||
bool check_tensors = false; // validate tensor data
|
||||
|
||||
std::string cache_type_k = "f16"; // KV cache data type for the K
|
||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||
|
@ -170,6 +171,8 @@ struct gpt_params {
|
|||
std::string image = ""; // path to an image file
|
||||
};
|
||||
|
||||
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
|
||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
||||
|
@ -237,11 +240,12 @@ std::vector<llama_token> llama_tokenize(
|
|||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
// tokenizes a token into a piece
|
||||
// tokenizes a token into a piece, optionally renders special/control tokens
|
||||
// should work similar to Python's `tokenizer.id_to_piece`
|
||||
std::string llama_token_to_piece(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token);
|
||||
llama_token token,
|
||||
bool special = true);
|
||||
|
||||
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
|
||||
// that takes into account the tokenizer type and decides how to handle the leading space
|
||||
|
|
|
@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
|||
// INTERNAL, DO NOT USE
|
||||
// USE LOG() INSTEAD
|
||||
//
|
||||
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
|
||||
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
|
||||
#define LOG_IMPL(str, ...) \
|
||||
do { \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
|
@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
|||
// INTERNAL, DO NOT USE
|
||||
// USE LOG_TEE() INSTEAD
|
||||
//
|
||||
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
|
||||
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
do { \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#define LLAMA_API_INTERNAL
|
||||
#include "sampling.h"
|
||||
#include <random>
|
||||
|
||||
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
|
||||
struct llama_sampling_context * result = new llama_sampling_context();
|
||||
|
@ -33,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
|
|||
|
||||
result->prev.resize(params.n_prev);
|
||||
|
||||
llama_sampling_set_rng_seed(result, params.seed);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -62,6 +66,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
|
|||
ctx->cur.clear();
|
||||
}
|
||||
|
||||
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
|
||||
if (seed == LLAMA_DEFAULT_SEED) {
|
||||
seed = time(NULL);
|
||||
}
|
||||
ctx->rng.seed(seed);
|
||||
}
|
||||
|
||||
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
|
||||
if (dst->grammar) {
|
||||
llama_grammar_free(dst->grammar);
|
||||
|
@ -203,7 +214,7 @@ static llama_token llama_sampling_sample_impl(
|
|||
|
||||
sampler_queue(ctx_main, params, cur_p, min_keep);
|
||||
|
||||
id = llama_sample_token(ctx_main, &cur_p);
|
||||
id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
|
||||
|
||||
//{
|
||||
// const int n_top = 10;
|
||||
|
|
|
@ -4,9 +4,10 @@
|
|||
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
// sampler types
|
||||
enum class llama_sampler_type : char {
|
||||
|
@ -20,25 +21,26 @@ enum class llama_sampler_type : char {
|
|||
|
||||
// sampling parameters
|
||||
typedef struct llama_sampling_params {
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
|
||||
|
||||
std::vector<llama_sampler_type> samplers_sequence = {
|
||||
llama_sampler_type::TOP_K,
|
||||
|
@ -79,6 +81,8 @@ struct llama_sampling_context {
|
|||
// TODO: replace with ring-buffer
|
||||
std::vector<llama_token> prev;
|
||||
std::vector<llama_token_data> cur;
|
||||
|
||||
std::mt19937 rng;
|
||||
};
|
||||
|
||||
#include "common.h"
|
||||
|
@ -93,6 +97,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
|
|||
// - reset grammar
|
||||
void llama_sampling_reset(llama_sampling_context * ctx);
|
||||
|
||||
// Set the sampler seed
|
||||
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
|
||||
|
||||
// Copy the sampler context
|
||||
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
|
||||
|
||||
|
|
|
@ -363,6 +363,16 @@ class Model(ABC):
|
|||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
||||
|
||||
if vocab_size > len(tokens):
|
||||
pad_count = vocab_size - len(tokens)
|
||||
print(
|
||||
f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
|
||||
)
|
||||
for i in range(1, pad_count + 1):
|
||||
tokens.append(f"[PAD{i}]")
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
||||
|
||||
assert len(tokens) == vocab_size
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
|
@ -1789,6 +1799,12 @@ class QwenModel(Model):
|
|||
class Qwen2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN2
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
|
||||
@Model.register("Qwen2MoeForCausalLM")
|
||||
class Qwen2MoeModel(Model):
|
||||
|
@ -1979,6 +1995,91 @@ class Phi2Model(Model):
|
|||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
|
||||
@Model.register("Phi3ForCausalLM")
|
||||
class Phi3MiniModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.PHI3
|
||||
|
||||
def set_vocab(self):
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||
|
||||
if not tokenizer_path.is_file():
|
||||
print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
||||
|
||||
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||
|
||||
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||
scores: list[float] = [-10000.0] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
||||
|
||||
for token_id in range(tokenizer.vocab_size()):
|
||||
|
||||
piece = tokenizer.id_to_piece(token_id)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(token_id)
|
||||
|
||||
toktype = SentencePieceTokenTypes.NORMAL
|
||||
if tokenizer.is_unknown(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
elif tokenizer.is_control(token_id):
|
||||
toktype = SentencePieceTokenTypes.CONTROL
|
||||
elif tokenizer.is_unused(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
elif tokenizer.is_byte(token_id):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
|
||||
tokens[token_id] = text
|
||||
scores[token_id] = score
|
||||
toktypes[token_id] = toktype
|
||||
|
||||
added_tokens_file = self.dir_model / 'added_tokens.json'
|
||||
if added_tokens_file.is_file():
|
||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||
added_tokens_json = json.load(f)
|
||||
|
||||
for key in added_tokens_json:
|
||||
token_id = added_tokens_json[key]
|
||||
if (token_id >= vocab_size):
|
||||
print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
|
||||
tokens[token_id] = key.encode("utf-8")
|
||||
scores[token_id] = -1000.0
|
||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
|
||||
|
||||
rot_pct = 1.0
|
||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
rms_eps = self.find_hparam(["rms_norm_eps"])
|
||||
|
||||
self.gguf_writer.add_name("Phi3")
|
||||
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
|
||||
|
||||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
self.gguf_writer.add_feed_forward_length(8192)
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_head_count(n_head)
|
||||
self.gguf_writer.add_head_count_kv(n_head)
|
||||
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
|
||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
|
||||
@Model.register("PlamoForCausalLM")
|
||||
class PlamoModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.PLAMO
|
||||
|
|
16
examples/gguf-split/tests.sh
Normal file → Executable file
16
examples/gguf-split/tests.sh
Normal file → Executable file
|
@ -4,16 +4,16 @@ set -eu
|
|||
|
||||
if [ $# -lt 1 ]
|
||||
then
|
||||
echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
|
||||
echo "example: $0 ../../build/bin ../../tmp"
|
||||
exit 1
|
||||
echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
|
||||
echo "example: $0 ../../build/bin ../../tmp"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $# -gt 1 ]
|
||||
then
|
||||
TMP_DIR=$2
|
||||
TMP_DIR=$2
|
||||
else
|
||||
TMP_DIR=/tmp
|
||||
TMP_DIR=/tmp
|
||||
fi
|
||||
|
||||
set -x
|
||||
|
@ -21,7 +21,7 @@ set -x
|
|||
SPLIT=$1/gguf-split
|
||||
MAIN=$1/main
|
||||
WORK_PATH=$TMP_DIR/gguf-split
|
||||
CUR_DIR=$(pwd)
|
||||
ROOT_DIR=$(realpath $(dirname $0)/../../)
|
||||
|
||||
mkdir -p "$WORK_PATH"
|
||||
|
||||
|
@ -30,8 +30,8 @@ rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
|
|||
|
||||
# 1. Get a model
|
||||
(
|
||||
cd $WORK_PATH
|
||||
"$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
|
||||
cd $WORK_PATH
|
||||
"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
|
||||
)
|
||||
echo PASS
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@ struct Stats {
|
|||
};
|
||||
|
||||
struct StatParams {
|
||||
std::string dataset;
|
||||
std::string ofile = "imatrix.dat";
|
||||
int n_output_frequency = 10;
|
||||
int verbosity = 1;
|
||||
|
@ -46,7 +47,7 @@ private:
|
|||
std::vector<float> m_src1_data;
|
||||
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
|
||||
//
|
||||
void save_imatrix(const char * file_name) const;
|
||||
void save_imatrix(const char * file_name, const char * dataset) const;
|
||||
void keep_imatrix(int ncall) const;
|
||||
};
|
||||
|
||||
|
@ -199,7 +200,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
|||
}
|
||||
|
||||
void IMatrixCollector::save_imatrix() const {
|
||||
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
|
||||
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
|
||||
}
|
||||
|
||||
void IMatrixCollector::keep_imatrix(int ncall) const {
|
||||
|
@ -207,24 +208,33 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
|
|||
if (file_name.empty()) file_name = "imatrix.dat";
|
||||
file_name += ".at_";
|
||||
file_name += std::to_string(ncall);
|
||||
save_imatrix(file_name.c_str());
|
||||
save_imatrix(file_name.c_str(), m_params.dataset.c_str());
|
||||
}
|
||||
|
||||
void IMatrixCollector::save_imatrix(const char * fname) const {
|
||||
void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
|
||||
std::ofstream out(fname, std::ios::binary);
|
||||
int n_entries = m_stats.size();
|
||||
out.write((const char*)&n_entries, sizeof(n_entries));
|
||||
for (auto& p : m_stats) {
|
||||
out.write((const char *) &n_entries, sizeof(n_entries));
|
||||
for (const auto & p : m_stats) {
|
||||
int len = p.first.size();
|
||||
out.write((const char*)&len, sizeof(len));
|
||||
out.write((const char *) &len, sizeof(len));
|
||||
out.write(p.first.c_str(), len);
|
||||
out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
|
||||
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
|
||||
int nval = p.second.values.size();
|
||||
out.write((const char*)&nval, sizeof(nval));
|
||||
if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
|
||||
out.write((const char *) &nval, sizeof(nval));
|
||||
if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
|
||||
}
|
||||
|
||||
// Write the number of call the matrix was computed with
|
||||
out.write((const char *) &m_last_call, sizeof(m_last_call));
|
||||
|
||||
// Write the dataset name at the end of the file to later on specify it in quantize
|
||||
int n_dataset = strlen(dataset);
|
||||
out.write((const char *) &n_dataset, sizeof(n_dataset));
|
||||
out.write(dataset, n_dataset);
|
||||
|
||||
if (m_params.verbosity > 0) {
|
||||
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
|
||||
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -547,6 +557,29 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
gpt_params params;
|
||||
params.n_batch = 512;
|
||||
if (!gpt_params_parse(args.size(), args.data(), params)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.logits_all = true;
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
sparams.dataset = params.prompt_file;
|
||||
g_collector.set_parameters(std::move(sparams));
|
||||
|
||||
if (!combine_files.empty()) {
|
||||
|
@ -585,28 +618,6 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
gpt_params params;
|
||||
params.n_batch = 512;
|
||||
if (!gpt_params_parse(args.size(), args.data(), params)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.logits_all = true;
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
|
|
|
@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
|
|||
#define TN_POS_EMBD "%s.position_embd.weight"
|
||||
#define TN_CLASS_EMBD "v.class_embd"
|
||||
#define TN_PATCH_EMBD "v.patch_embd.weight"
|
||||
#define TN_PATCH_BIAS "v.patch_embd.bias"
|
||||
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
|
||||
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
||||
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
||||
|
@ -425,6 +426,7 @@ struct clip_vision_model {
|
|||
// embeddings
|
||||
struct ggml_tensor * class_embedding;
|
||||
struct ggml_tensor * patch_embeddings;
|
||||
struct ggml_tensor * patch_bias;
|
||||
struct ggml_tensor * position_embeddings;
|
||||
|
||||
struct ggml_tensor * pre_ln_w;
|
||||
|
@ -501,6 +503,11 @@ struct clip_ctx {
|
|||
bool use_gelu = false;
|
||||
int32_t ftype = 1;
|
||||
|
||||
bool has_class_embedding = true;
|
||||
bool has_pre_norm = true;
|
||||
bool has_post_norm = false;
|
||||
bool has_patch_bias = false;
|
||||
|
||||
struct gguf_context * ctx_gguf;
|
||||
struct ggml_context * ctx_data;
|
||||
|
||||
|
@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
|
||||
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
|
||||
const int num_positions = num_patches + 1;
|
||||
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
||||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
const int d_head = hidden_size / n_head;
|
||||
|
@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
|
||||
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
|
||||
|
||||
if (ctx->has_patch_bias) {
|
||||
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
}
|
||||
|
||||
// concat class_embeddings and patch_embeddings
|
||||
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||
struct ggml_tensor * embeddings = inp;
|
||||
if (ctx->has_class_embedding) {
|
||||
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
}
|
||||
ggml_set_name(embeddings, "embeddings");
|
||||
ggml_set_input(embeddings);
|
||||
|
||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||
|
||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
|
||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||
ggml_set_name(positions, "positions");
|
||||
|
@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
||||
|
||||
// pre-layernorm
|
||||
{
|
||||
if (ctx->has_pre_norm) {
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
ggml_set_name(embeddings, "pre_ln");
|
||||
|
||||
|
@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
embeddings = cur;
|
||||
}
|
||||
|
||||
// post-layernorm
|
||||
if (ctx->has_post_norm) {
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
ggml_set_name(embeddings, "post_ln");
|
||||
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
|
||||
}
|
||||
|
||||
// llava projector
|
||||
{
|
||||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||
|
@ -1148,12 +1170,39 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
|
||||
new_clip->has_class_embedding = true;
|
||||
} catch (const std::exception& e) {
|
||||
new_clip->has_class_embedding = false;
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
new_clip->has_pre_norm = true;
|
||||
} catch (std::exception & e) {
|
||||
new_clip->has_pre_norm = false;
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
|
||||
vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
|
||||
new_clip->has_post_norm = true;
|
||||
} catch (std::exception & e) {
|
||||
new_clip->has_post_norm = false;
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
|
||||
new_clip->has_patch_bias = true;
|
||||
} catch (std::exception & e) {
|
||||
new_clip->has_patch_bias = false;
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
} catch(const std::exception& e) {
|
||||
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
|
||||
}
|
||||
|
@ -1325,7 +1374,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|||
}
|
||||
|
||||
// Linear interpolation between two points
|
||||
inline float lerp(float s, float e, float t) {
|
||||
inline float clip_lerp(float s, float e, float t) {
|
||||
return s + (e - s) * t;
|
||||
}
|
||||
// Bilinear resize function
|
||||
|
@ -1347,17 +1396,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
|
|||
float y_lerp = py - y_floor;
|
||||
|
||||
for (int c = 0; c < 3; c++) {
|
||||
float top = lerp(
|
||||
float top = clip_lerp(
|
||||
static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
|
||||
static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
|
||||
x_lerp
|
||||
);
|
||||
float bottom = lerp(
|
||||
float bottom = clip_lerp(
|
||||
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
|
||||
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
|
||||
x_lerp
|
||||
);
|
||||
dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
|
||||
dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,7 +30,6 @@ int main(int argc, char ** argv){
|
|||
|
||||
// load the model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||
|
||||
// tokenize the prompt
|
||||
|
|
|
@ -38,7 +38,6 @@ int main(int argc, char ** argv){
|
|||
|
||||
// load the model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||
|
||||
// tokenize the prompt
|
||||
|
|
|
@ -240,7 +240,6 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
session_tokens.resize(n_token_count_out);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
set(TARGET quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
#include <unordered_map>
|
||||
#include <fstream>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
|
||||
struct quant_option {
|
||||
std::string name;
|
||||
|
@ -53,6 +52,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||
};
|
||||
|
||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
|
||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset";
|
||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
|
||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
|
||||
|
||||
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
std::string ftype_str;
|
||||
|
@ -97,6 +100,7 @@ static void usage(const char * executable) {
|
|||
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
||||
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
|
||||
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
|
||||
printf(" --keep-split: will generate quatized model in the same shards as input");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
|
||||
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
||||
|
@ -112,7 +116,7 @@ static void usage(const char * executable) {
|
|||
exit(1);
|
||||
}
|
||||
|
||||
static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
||||
static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
||||
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
|
||||
if (!in) {
|
||||
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
|
||||
|
@ -159,18 +163,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
|
|||
printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
|
||||
}
|
||||
}
|
||||
printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
|
||||
|
||||
// latest imatrix version contains the dataset filename at the end of the file
|
||||
int m_last_call = 0;
|
||||
if (in.peek() != EOF) {
|
||||
in.read((char *)&m_last_call, sizeof(m_last_call));
|
||||
int dataset_len;
|
||||
in.read((char *)&dataset_len, sizeof(dataset_len));
|
||||
std::vector<char> dataset_as_vec(dataset_len);
|
||||
in.read(dataset_as_vec.data(), dataset_len);
|
||||
imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
|
||||
printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
|
||||
}
|
||||
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
|
||||
return m_last_call;
|
||||
}
|
||||
|
||||
static void prepare_imatrix(const std::string & imatrix_file,
|
||||
static int prepare_imatrix(const std::string & imatrix_file,
|
||||
std::string & imatrix_dataset,
|
||||
const std::vector<std::string> & included_weights,
|
||||
const std::vector<std::string> & excluded_weights,
|
||||
std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
||||
int m_last_call = -1;
|
||||
if (!imatrix_file.empty()) {
|
||||
load_imatrix(imatrix_file, imatrix_data);
|
||||
m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
|
||||
}
|
||||
if (imatrix_data.empty()) {
|
||||
return;
|
||||
return m_last_call;
|
||||
}
|
||||
if (!excluded_weights.empty()) {
|
||||
for (auto& name : excluded_weights) {
|
||||
|
@ -196,6 +215,7 @@ static void prepare_imatrix(const std::string & imatrix_file,
|
|||
if (!imatrix_data.empty()) {
|
||||
printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
|
||||
}
|
||||
return m_last_call;
|
||||
}
|
||||
|
||||
static ggml_type parse_ggml_type(const char * arg) {
|
||||
|
@ -210,43 +230,6 @@ static ggml_type parse_ggml_type(const char * arg) {
|
|||
return result;
|
||||
}
|
||||
|
||||
static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
||||
const char* sep = strchr(data, '=');
|
||||
if (sep == nullptr || sep - data >= 128) {
|
||||
fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
llama_model_kv_override kvo;
|
||||
std::strncpy(kvo.key, data, sep - data);
|
||||
kvo.key[sep - data] = 0;
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
kvo.bool_value = false;
|
||||
} else {
|
||||
fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
overrides.emplace_back(std::move(kvo));
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3) {
|
||||
usage(argv[0]);
|
||||
|
@ -300,6 +283,8 @@ int main(int argc, char ** argv) {
|
|||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--keep-split")) {
|
||||
params.keep_split = true;
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
@ -313,10 +298,43 @@ int main(int argc, char ** argv) {
|
|||
usage(argv[0]);
|
||||
}
|
||||
|
||||
std::string imatrix_dataset;
|
||||
std::unordered_map<std::string, std::vector<float>> imatrix_data;
|
||||
prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
|
||||
int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
|
||||
if (!imatrix_data.empty()) {
|
||||
params.imatrix = &imatrix_data;
|
||||
{
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
|
||||
kvo.val_str[127] = '\0';
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
if (!imatrix_dataset.empty()) {
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
|
||||
kvo.val_str[127] = '\0';
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
|
||||
{
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.val_i64 = imatrix_data.size();
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
|
||||
if (m_last_call > 0) {
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.val_i64 = m_last_call;
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
}
|
||||
if (!kv_overrides.empty()) {
|
||||
kv_overrides.emplace_back();
|
||||
|
@ -332,20 +350,28 @@ int main(int argc, char ** argv) {
|
|||
std::string fname_out;
|
||||
|
||||
std::string ftype_str;
|
||||
std::string suffix = ".gguf";
|
||||
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||
std::string fpath;
|
||||
const size_t pos = fname_inp.find_last_of("/\\");
|
||||
if (pos != std::string::npos) {
|
||||
fpath = fname_inp.substr(0, pos + 1);
|
||||
}
|
||||
// export as [inp path]/ggml-model-[ftype].gguf
|
||||
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
|
||||
|
||||
// export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
|
||||
fname_out = fpath + "ggml-model-" + ftype_str;
|
||||
if (!params.keep_split) {
|
||||
fname_out += suffix;
|
||||
}
|
||||
arg_idx++;
|
||||
if (ftype_str == "COPY") {
|
||||
params.only_copy = true;
|
||||
}
|
||||
} else {
|
||||
fname_out = argv[arg_idx];
|
||||
if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
|
||||
fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
|
||||
}
|
||||
arg_idx++;
|
||||
|
||||
if (argc <= arg_idx) {
|
||||
|
|
65
examples/quantize/tests.sh
Normal file
65
examples/quantize/tests.sh
Normal file
|
@ -0,0 +1,65 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -eu
|
||||
|
||||
if [ $# -lt 1 ]
|
||||
then
|
||||
echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
|
||||
echo "example: $0 ../../build/bin ../../tmp"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $# -gt 1 ]
|
||||
then
|
||||
TMP_DIR=$2
|
||||
else
|
||||
TMP_DIR=/tmp
|
||||
fi
|
||||
|
||||
set -x
|
||||
|
||||
SPLIT=$1/gguf-split
|
||||
QUANTIZE=$1/quantize
|
||||
MAIN=$1/main
|
||||
WORK_PATH=$TMP_DIR/quantize
|
||||
ROOT_DIR=$(realpath $(dirname $0)/../../)
|
||||
|
||||
mkdir -p "$WORK_PATH"
|
||||
|
||||
# Clean up in case of previously failed test
|
||||
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
|
||||
|
||||
# 1. Get a model
|
||||
(
|
||||
cd $WORK_PATH
|
||||
"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
|
||||
)
|
||||
echo PASS
|
||||
|
||||
# 2. Split model
|
||||
$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
# 3. Requant model with '--keep_split'
|
||||
$QUANTIZE --allow-requantize --keep_split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
# 3a. Test the requanted model is loading properly
|
||||
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
# 4. Requant mode without '--keep_split'
|
||||
$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
# 4b. Test the requanted model is loading properly
|
||||
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
# Clean up
|
||||
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
|
|
@ -90,7 +90,8 @@ export default function () {
|
|||
"model": model,
|
||||
"stream": true,
|
||||
"seed": 42,
|
||||
"max_tokens": max_tokens
|
||||
"max_tokens": max_tokens,
|
||||
"stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
|
||||
}
|
||||
|
||||
const params = {method: 'POST', body: JSON.stringify(payload)};
|
||||
|
|
|
@ -881,11 +881,11 @@
|
|||
.replace(/&/g, '&')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
|
||||
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
|
||||
.replace(/__(.*?)__/g, '<strong>$1</strong>')
|
||||
.replace(/\*(.*?)\*/g, '<em>$1</em>')
|
||||
.replace(/_(.*?)_/g, '<em>$1</em>')
|
||||
.replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
|
||||
.replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
|
||||
.replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
|
||||
.replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
|
||||
.replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
|
||||
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
|
||||
.replace(/`(.*?)`/g, '<code>$1</code>')
|
||||
.replace(/\n/gim, '<br />');
|
||||
|
|
|
@ -854,7 +854,7 @@ struct server_context {
|
|||
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
|
||||
slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
|
||||
slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
|
||||
slot.params.seed = json_value(data, "seed", default_params.seed);
|
||||
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
|
||||
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||
|
||||
|
@ -1028,7 +1028,6 @@ struct server_context {
|
|||
send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
}
|
||||
llama_set_rng_seed(ctx, slot.params.seed);
|
||||
}
|
||||
|
||||
slot.command = SLOT_COMMAND_LOAD_PROMPT;
|
||||
|
@ -1118,7 +1117,7 @@ struct server_context {
|
|||
|
||||
bool process_token(completion_token_output & result, server_slot & slot) {
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
const std::string token_str = llama_token_to_piece(ctx, result.tok);
|
||||
const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
|
||||
slot.sampled = result.tok;
|
||||
|
||||
// search stop word and delete it
|
||||
|
@ -1208,6 +1207,27 @@ struct server_context {
|
|||
LOG_VERBOSE("eos token found", {});
|
||||
}
|
||||
|
||||
auto n_ctx_train = llama_n_ctx_train(model);
|
||||
if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
|
||||
&& slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
|
||||
LOG_WARNING("n_predict is not set and self-context extend is disabled."
|
||||
" Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
|
||||
{ "id_slot", slot.id },
|
||||
{ "params.n_predict", slot.params.n_predict },
|
||||
{ "slot.n_prompt_tokens", slot.n_prompt_tokens },
|
||||
{ "slot.n_decoded", slot.n_decoded },
|
||||
{ "slot.n_predict", slot.n_predict },
|
||||
{ "n_slots", params.n_parallel },
|
||||
{ "slot.n_ctx", slot.n_ctx },
|
||||
{ "n_ctx", n_ctx },
|
||||
{ "n_ctx_train", n_ctx_train },
|
||||
{ "ga_n", slot.ga_n },
|
||||
});
|
||||
slot.truncated = true;
|
||||
slot.stopped_limit = true;
|
||||
slot.has_next_token = false; // stop prediction
|
||||
}
|
||||
|
||||
LOG_VERBOSE("next token", {
|
||||
{"id_slot", slot.id},
|
||||
{"id_task", slot.id_task},
|
||||
|
@ -2142,7 +2162,7 @@ struct server_context {
|
|||
});
|
||||
|
||||
// process the created batch of tokens
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
||||
|
||||
for (auto & slot : slots) {
|
||||
|
@ -2372,7 +2392,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
|
|||
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
|
||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
|
||||
printf(" --chat-template JINJA_TEMPLATE\n");
|
||||
|
@ -2803,43 +2823,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
char * sep = strchr(argv[i], '=');
|
||||
if (sep == nullptr || sep - argv[i] >= 128) {
|
||||
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
|
||||
struct llama_model_kv_override kvo;
|
||||
std::strncpy(kvo.key, argv[i], sep - argv[i]);
|
||||
kvo.key[sep - argv[i]] = 0;
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
kvo.bool_value = false;
|
||||
} else {
|
||||
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (!parse_kv_override(argv[i], params.kv_overrides)) {
|
||||
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.kv_overrides.push_back(kvo);
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
server_print_usage(argv[0], default_params, default_sparams);
|
||||
|
|
57
examples/server/tests/features/results.feature
Normal file
57
examples/server/tests/features/results.feature
Normal file
|
@ -0,0 +1,57 @@
|
|||
@llama.cpp
|
||||
@results
|
||||
Feature: Results
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
|
||||
And a model file test-model-00001-of-00003.gguf
|
||||
And 128 as batch size
|
||||
And 256 KV cache size
|
||||
And 128 max tokens to predict
|
||||
|
||||
Scenario Outline: Multi users completion
|
||||
Given <n_slots> slots
|
||||
And continuous batching
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Given 42 as seed
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
|
||||
Given 42 as seed
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
|
||||
Given 42 as seed
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
|
||||
Given 42 as seed
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
|
||||
Given 42 as seed
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
|
||||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
Then all predictions are equal
|
||||
Examples:
|
||||
| n_slots |
|
||||
| 1 |
|
||||
| 2 |
|
|
@ -61,6 +61,7 @@ def step_server_config(context, server_fqdn, server_port):
|
|||
context.server_metrics = False
|
||||
context.server_process = None
|
||||
context.seed = None
|
||||
context.draft = None
|
||||
context.server_seed = None
|
||||
context.user_api_key = None
|
||||
context.response_format = None
|
||||
|
@ -107,6 +108,11 @@ def step_n_gpu_layer(context, ngl):
|
|||
context.n_gpu_layer = ngl
|
||||
|
||||
|
||||
@step('{draft:d} as draft')
|
||||
def step_draft(context, draft):
|
||||
context.draft = draft
|
||||
|
||||
|
||||
@step('{n_ctx:d} KV cache size')
|
||||
def step_n_ctx(context, n_ctx):
|
||||
context.n_ctx = n_ctx
|
||||
|
@ -254,6 +260,15 @@ def step_n_tokens_predicted(context, predicted_n):
|
|||
assert_n_tokens_predicted(context.completion, predicted_n)
|
||||
|
||||
|
||||
@step('all predictions are equal')
|
||||
@async_run_until_complete
|
||||
async def step_predictions_equal(context):
|
||||
n_completions = await gather_tasks_results(context)
|
||||
assert n_completions >= 2, "need at least 2 completions"
|
||||
assert_all_predictions_equal(context.tasks_result)
|
||||
context.tasks_result = []
|
||||
|
||||
|
||||
@step('the completion is truncated')
|
||||
def step_assert_completion_truncated(context):
|
||||
step_assert_completion_truncated(context, '')
|
||||
|
@ -1020,6 +1035,23 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
|
|||
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
|
||||
f' {n_predicted} <> {expected_predicted_n}')
|
||||
|
||||
def assert_all_predictions_equal(completion_responses):
|
||||
content_0 = completion_responses[0]['content']
|
||||
|
||||
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
|
||||
print(f"content 0: {content_0}")
|
||||
|
||||
i = 1
|
||||
for response in completion_responses[1:]:
|
||||
content = response['content']
|
||||
|
||||
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
|
||||
print(f"content {i}: {content}")
|
||||
|
||||
assert content == content_0, "contents not equal"
|
||||
|
||||
i += 1
|
||||
|
||||
|
||||
async def gather_tasks_results(context):
|
||||
n_tasks = len(context.concurrent_tasks)
|
||||
|
@ -1148,6 +1180,8 @@ def start_server_background(context):
|
|||
server_args.extend(['--ubatch-size', context.n_ubatch])
|
||||
if context.n_gpu_layer:
|
||||
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
|
||||
if context.draft is not None:
|
||||
server_args.extend(['--draft', context.draft])
|
||||
if context.server_continuous_batching:
|
||||
server_args.append('--cont-batching')
|
||||
if context.server_embeddings:
|
||||
|
|
|
@ -4,9 +4,8 @@ set -eu
|
|||
|
||||
if [ $# -lt 1 ]
|
||||
then
|
||||
# Start @llama.cpp scenario
|
||||
behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
|
||||
# Start @llama.cpp scenario
|
||||
behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
|
||||
else
|
||||
behave "$@"
|
||||
behave "$@"
|
||||
fi
|
||||
|
||||
|
|
6
flake.lock
generated
6
flake.lock
generated
|
@ -20,11 +20,11 @@
|
|||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1713537308,
|
||||
"narHash": "sha256-XtTSSIB2DA6tOv+l0FhvfDMiyCmhoRbNB+0SeInZkbk=",
|
||||
"lastModified": 1714076141,
|
||||
"narHash": "sha256-Drmja/f5MRHZCskS6mvzFqxEaZMeciScCTFxWVLqWEY=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "5c24cf2f0a12ad855f444c30b2421d044120c66f",
|
||||
"rev": "7bb2ccd8cdc44c91edba16c48d2c8f331fb3d856",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
16
ggml-alloc.c
16
ggml-alloc.c
|
@ -371,16 +371,16 @@ struct ggml_gallocr {
|
|||
};
|
||||
|
||||
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
||||
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
|
||||
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
|
||||
GGML_ASSERT(galloc != NULL);
|
||||
|
||||
galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
|
||||
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
|
||||
GGML_ASSERT(galloc->bufts != NULL);
|
||||
|
||||
galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
|
||||
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
|
||||
GGML_ASSERT(galloc->buffers != NULL);
|
||||
|
||||
galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
|
||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
||||
|
||||
for (int i = 0; i < n_bufs; i++) {
|
||||
|
@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||
free(galloc->hash_set.keys);
|
||||
free(galloc->hash_values);
|
||||
galloc->hash_set.size = hash_size;
|
||||
galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
|
||||
galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
|
||||
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
|
||||
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
|
||||
GGML_ASSERT(galloc->hash_set.keys != NULL);
|
||||
GGML_ASSERT(galloc->hash_values != NULL);
|
||||
} else {
|
||||
|
@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||
// set the node_allocs from the hash table
|
||||
if (galloc->n_nodes < graph->n_nodes) {
|
||||
free(galloc->node_allocs);
|
||||
galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
|
||||
galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
|
||||
GGML_ASSERT(galloc->node_allocs != NULL);
|
||||
}
|
||||
galloc->n_nodes = graph->n_nodes;
|
||||
|
@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||
}
|
||||
if (galloc->n_leafs < graph->n_leafs) {
|
||||
free(galloc->leaf_allocs);
|
||||
galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
|
||||
galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
|
||||
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
||||
}
|
||||
galloc->n_leafs = graph->n_leafs;
|
||||
|
|
|
@ -1725,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
||||
|
||||
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
||||
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
||||
|
||||
// initialize hash table
|
||||
sched->hash_set = ggml_hash_set_new(graph_size);
|
||||
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
|
||||
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
|
||||
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
||||
sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
|
||||
|
||||
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
||||
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
|
||||
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
|
||||
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
||||
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
||||
|
||||
sched->n_backends = n_backends;
|
||||
|
||||
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
||||
|
||||
const int initial_splits_capacity = 16;
|
||||
sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
|
||||
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
||||
sched->splits_capacity = initial_splits_capacity;
|
||||
|
||||
for (int b = 0; b < n_backends; b++) {
|
||||
|
@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|||
|
||||
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||
// reset state for the next run
|
||||
size_t hash_size = sched->hash_set.size;
|
||||
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
|
||||
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
|
||||
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
||||
if (!sched->is_reset) {
|
||||
size_t hash_size = sched->hash_set.size;
|
||||
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
|
||||
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
|
||||
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
||||
|
||||
sched->is_reset = true;
|
||||
sched->is_reset = true;
|
||||
}
|
||||
sched->is_alloc = false;
|
||||
}
|
||||
|
||||
|
@ -1972,10 +1974,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
|||
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
||||
struct ggml_hash_set hash_set = {
|
||||
/* .size = */ graph->visited_hash_table.size,
|
||||
/* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
|
||||
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
|
||||
};
|
||||
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
|
||||
bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
|
||||
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
||||
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
||||
|
|
|
@ -5,16 +5,16 @@
|
|||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
||||
const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
||||
const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t ib = i/qk; // block index
|
||||
const int iqs = (i%qk)/qr; // quant index
|
||||
const int iybs = i - i%qk; // y block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
const int64_t iqs = (i%qk)/qr; // quant index
|
||||
const int64_t iybs = i - i%qk; // y block start index
|
||||
const int64_t y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
// dequantize
|
||||
dfloat2 v;
|
||||
|
@ -29,7 +29,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
|
|||
#if __CUDA_ARCH__ >= CC_PASCAL
|
||||
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
||||
|
||||
const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
|
||||
const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
|
||||
const int * x0 = ((int *) vx) + blockIdx.x * nint;
|
||||
half2 * y2 = (half2 *) (y + i0);
|
||||
|
||||
|
@ -73,9 +73,9 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
|
|||
const int64_t i = blockIdx.x;
|
||||
|
||||
// assume 32 threads
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8;
|
||||
const int ir = tid%8;
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8;
|
||||
const int64_t ir = tid%8;
|
||||
const int64_t ib = 8*i + ir;
|
||||
if (ib >= nb32) {
|
||||
return;
|
||||
|
@ -101,9 +101,9 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
|
|||
const int64_t i = blockIdx.x;
|
||||
|
||||
// assume 32 threads
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8;
|
||||
const int ir = tid%8;
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8;
|
||||
const int64_t ir = tid%8;
|
||||
const int64_t ib = 8*i + ir;
|
||||
if (ib >= nb32) {
|
||||
return;
|
||||
|
@ -127,14 +127,14 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int n = tid/32;
|
||||
const int l = tid - 32*n;
|
||||
const int is = 8*n + l/16;
|
||||
const int64_t n = tid/32;
|
||||
const int64_t l = tid - 32*n;
|
||||
const int64_t is = 8*n + l/16;
|
||||
|
||||
const uint8_t q = x[i].qs[32*n + l];
|
||||
dst_t * y = yy + i*QK_K + 128*n;
|
||||
|
@ -146,8 +146,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
|||
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
||||
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
||||
#else
|
||||
const int is = tid/16; // 0 or 1
|
||||
const int il = tid%16; // 0...15
|
||||
const int64_t is = tid/16; // 0 or 1
|
||||
const int64_t il = tid%16; // 0...15
|
||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
||||
float dall = __low2half(x[i].dm);
|
||||
|
@ -161,19 +161,19 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_q3_K * x = (const block_q3_K *) vx;
|
||||
|
||||
#if QK_K == 256
|
||||
const int r = threadIdx.x/4;
|
||||
const int tid = r/2;
|
||||
const int is0 = r%2;
|
||||
const int l0 = 16*is0 + 4*(threadIdx.x%4);
|
||||
const int n = tid / 4;
|
||||
const int j = tid - 4*n;
|
||||
const int64_t r = threadIdx.x/4;
|
||||
const int64_t tid = r/2;
|
||||
const int64_t is0 = r%2;
|
||||
const int64_t l0 = 16*is0 + 4*(threadIdx.x%4);
|
||||
const int64_t n = tid / 4;
|
||||
const int64_t j = tid - 4*n;
|
||||
|
||||
uint8_t m = 1 << (4*n + j);
|
||||
int is = 8*n + 2*j + is0;
|
||||
int64_t is = 8*n + 2*j + is0;
|
||||
int shift = 2*j;
|
||||
|
||||
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
|
||||
|
@ -189,11 +189,11 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
|
|||
|
||||
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
||||
#else
|
||||
const int tid = threadIdx.x;
|
||||
const int is = tid/16; // 0 or 1
|
||||
const int il = tid%16; // 0...15
|
||||
const int im = il/8; // 0...1
|
||||
const int in = il%8; // 0...7
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t is = tid/16; // 0 or 1
|
||||
const int64_t il = tid%16; // 0...15
|
||||
const int64_t im = il/8; // 0...1
|
||||
const int64_t in = il%8; // 0...7
|
||||
|
||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
||||
|
||||
|
@ -227,15 +227,15 @@ template<typename dst_t>
|
|||
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q4_K * x = (const block_q4_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 32 threads
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8;
|
||||
const int ir = tid%8;
|
||||
const int is = 2*il;
|
||||
const int n = 4;
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8;
|
||||
const int64_t ir = tid%8;
|
||||
const int64_t is = 2*il;
|
||||
const int64_t n = 4;
|
||||
|
||||
dst_t * y = yy + i*QK_K + 64*il + n*ir;
|
||||
|
||||
|
@ -254,7 +254,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
|
|||
y[l +32] = d2 * (q[l] >> 4) - m2;
|
||||
}
|
||||
#else
|
||||
const int tid = threadIdx.x;
|
||||
const int64_t tid = threadIdx.x;
|
||||
const uint8_t * q = x[i].qs;
|
||||
dst_t * y = yy + i*QK_K;
|
||||
const float d = (float)x[i].dm[0];
|
||||
|
@ -268,14 +268,14 @@ template<typename dst_t>
|
|||
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q5_K * x = (const block_q5_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/16; // il is in 0...3
|
||||
const int ir = tid%16; // ir is in 0...15
|
||||
const int is = 2*il; // is is in 0...6
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/16; // il is in 0...3
|
||||
const int64_t ir = tid%16; // ir is in 0...15
|
||||
const int64_t is = 2*il; // is is in 0...6
|
||||
|
||||
dst_t * y = yy + i*QK_K + 64*il + 2*ir;
|
||||
|
||||
|
@ -298,11 +298,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
|
|||
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||
#else
|
||||
const int tid = threadIdx.x;
|
||||
const int64_t tid = threadIdx.x;
|
||||
const uint8_t q = x[i].qs[tid];
|
||||
const int im = tid/8; // 0...3
|
||||
const int in = tid%8; // 0...7
|
||||
const int is = tid/16; // 0 or 1
|
||||
const int64_t im = tid/8; // 0...3
|
||||
const int64_t in = tid%8; // 0...7
|
||||
const int64_t is = tid/16; // 0 or 1
|
||||
const uint8_t h = x[i].qh[in] >> im;
|
||||
const float d = x[i].d;
|
||||
dst_t * y = yy + i*QK_K + tid;
|
||||
|
@ -359,13 +359,13 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint16_t * q2 = x[i].qs + 4*ib;
|
||||
const uint8_t * aux8 = (const uint8_t *)q2;
|
||||
|
@ -383,13 +383,13 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint16_t * q2 = x[i].qs + 4*ib;
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
|
||||
|
@ -405,13 +405,13 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq2_s * x = (const block_iq2_s *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
|
||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
|
@ -426,13 +426,13 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * q3 = x[i].qs + 8*ib;
|
||||
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
|
||||
|
@ -454,13 +454,13 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * qs = x[i].qs + 8*ib;
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
|
||||
|
@ -480,13 +480,13 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq1_s * x = (const block_iq1_s *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
|
||||
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
|
||||
|
@ -506,18 +506,18 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq1_m * x = (const block_iq1_m *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
||||
iq1m_scale_t scale;
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
|
||||
const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
|
||||
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
|
||||
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
|
||||
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
||||
|
@ -537,12 +537,12 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[ib].qs + 4*il;
|
||||
const float d = (float)x[ib].d;
|
||||
|
@ -556,12 +556,12 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
|
|||
#if QK_K != 64
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
|
||||
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
|
||||
|
|
|
@ -28,7 +28,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
|
|||
extern __shared__ float data_soft_max_f32[];
|
||||
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
|
||||
// shared memory buffer to cache values between iterations:
|
||||
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
|
||||
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;
|
||||
|
||||
float max_val = -INFINITY;
|
||||
|
||||
|
@ -40,8 +40,8 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
|
|||
break;
|
||||
}
|
||||
|
||||
const int ix = rowx*ncols + col;
|
||||
const int iy = rowy*ncols + col;
|
||||
const int64_t ix = (int64_t)rowx*ncols + col;
|
||||
const int64_t iy = (int64_t)rowy*ncols + col;
|
||||
|
||||
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
|
||||
|
||||
|
@ -109,7 +109,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
|
|||
return;
|
||||
}
|
||||
|
||||
const int idst = rowx*ncols + col;
|
||||
const int64_t idst = (int64_t)rowx*ncols + col;
|
||||
dst[idst] = vals[col] * inv_sum;
|
||||
}
|
||||
}
|
||||
|
|
266
ggml-impl.h
266
ggml-impl.h
|
@ -11,6 +11,12 @@
|
|||
#include <string.h> // memcpy
|
||||
#include <math.h> // fabsf
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
@ -45,7 +51,7 @@ extern "C" {
|
|||
// 16-bit float
|
||||
// on Arm, we use __fp16
|
||||
// on x86, we use uint16_t
|
||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
|
@ -53,8 +59,262 @@ extern "C" {
|
|||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
typedef uint16_t ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
||||
|
||||
#else
|
||||
|
||||
typedef __fp16 ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
||||
|
||||
#endif // _MSC_VER
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
// 32-bit ARM compatibility
|
||||
|
||||
// vaddvq_s16
|
||||
// vpaddq_s16
|
||||
// vpaddq_s32
|
||||
// vaddvq_s32
|
||||
// vaddvq_f32
|
||||
// vmaxvq_f32
|
||||
// vcvtnq_s32_f32
|
||||
// vzip1_u8
|
||||
// vzip2_u8
|
||||
|
||||
inline static int32_t vaddvq_s16(int16x8_t v) {
|
||||
return
|
||||
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
||||
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
||||
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
||||
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
||||
}
|
||||
|
||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
||||
return vcombine_s32(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
int32x4_t res;
|
||||
|
||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
||||
res[1] = roundf(vgetq_lane_f32(v, 1));
|
||||
res[2] = roundf(vgetq_lane_f32(v, 2));
|
||||
res[3] = roundf(vgetq_lane_f32(v, 3));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[0]; res[1] = b[0];
|
||||
res[2] = a[1]; res[3] = b[1];
|
||||
res[4] = a[2]; res[5] = b[2];
|
||||
res[6] = a[3]; res[7] = b[3];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[4]; res[1] = b[4];
|
||||
res[2] = a[5]; res[3] = b[5];
|
||||
res[4] = a[6]; res[5] = b[6];
|
||||
res[6] = a[7]; res[7] = b[7];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// vld1q_s16_x2
|
||||
// vld1q_u8_x2
|
||||
// vld1q_u8_x4
|
||||
// vld1q_s8_x2
|
||||
// vld1q_s8_x4
|
||||
// TODO: double-check these work correctly
|
||||
|
||||
typedef struct ggml_int16x8x2_t {
|
||||
int16x8_t val[2];
|
||||
} ggml_int16x8x2_t;
|
||||
|
||||
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
||||
ggml_int16x8x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s16(ptr + 0);
|
||||
res.val[1] = vld1q_s16(ptr + 8);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x2_t {
|
||||
uint8x16_t val[2];
|
||||
} ggml_uint8x16x2_t;
|
||||
|
||||
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
||||
ggml_uint8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x4_t {
|
||||
uint8x16_t val[4];
|
||||
} ggml_uint8x16x4_t;
|
||||
|
||||
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
||||
ggml_uint8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
res.val[2] = vld1q_u8(ptr + 32);
|
||||
res.val[3] = vld1q_u8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x2_t {
|
||||
int8x16_t val[2];
|
||||
} ggml_int8x16x2_t;
|
||||
|
||||
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
||||
ggml_int8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x4_t {
|
||||
int8x16_t val[4];
|
||||
} ggml_int8x16x4_t;
|
||||
|
||||
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
||||
ggml_int8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
res.val[2] = vld1q_s8(ptr + 32);
|
||||
res.val[3] = vld1q_s8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
uint8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
#define ggml_uint8x16x2_t uint8x16x2_t
|
||||
#define ggml_uint8x16x4_t uint8x16x4_t
|
||||
#define ggml_int8x16x2_t int8x16x2_t
|
||||
#define ggml_int8x16x4_t int8x16x4_t
|
||||
|
||||
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
||||
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
||||
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif // !defined(__aarch64__)
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
||||
|
||||
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
#endif // defined(__ARM_NEON)
|
||||
|
||||
#if defined(__ARM_NEON) && !defined(__MSC_VER)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
|
@ -75,8 +335,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|||
|
||||
#else
|
||||
|
||||
typedef uint16_t ggml_fp16_internal_t;
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
|
@ -221,7 +479,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|||
|
||||
#endif // __F16C__
|
||||
|
||||
#endif // __ARM_NEON
|
||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
// defined in ggml.c, initialized in ggml_init()
|
||||
|
|
577
ggml-quants.c
577
ggml-quants.c
|
@ -14,47 +14,6 @@
|
|||
#include <stdlib.h> // for qsort
|
||||
#include <stdio.h> // for GGML_ASSERT
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
|
||||
#include <altivec.h>
|
||||
#undef bool
|
||||
#define bool _Bool
|
||||
#else
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
|
||||
#if !defined(__riscv)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
|
||||
|
@ -276,258 +235,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
|||
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
||||
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
// 64-bit compatibility
|
||||
|
||||
// vaddvq_s16
|
||||
// vpaddq_s16
|
||||
// vpaddq_s32
|
||||
// vaddvq_s32
|
||||
// vaddvq_f32
|
||||
// vmaxvq_f32
|
||||
// vcvtnq_s32_f32
|
||||
// vzip1_u8
|
||||
// vzip2_u8
|
||||
|
||||
inline static int32_t vaddvq_s16(int16x8_t v) {
|
||||
return
|
||||
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
||||
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
||||
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
||||
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
||||
}
|
||||
|
||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
||||
return vcombine_s32(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
int32x4_t res;
|
||||
|
||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
||||
res[1] = roundf(vgetq_lane_f32(v, 1));
|
||||
res[2] = roundf(vgetq_lane_f32(v, 2));
|
||||
res[3] = roundf(vgetq_lane_f32(v, 3));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[0]; res[1] = b[0];
|
||||
res[2] = a[1]; res[3] = b[1];
|
||||
res[4] = a[2]; res[5] = b[2];
|
||||
res[6] = a[3]; res[7] = b[3];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[4]; res[1] = b[4];
|
||||
res[2] = a[5]; res[3] = b[5];
|
||||
res[4] = a[6]; res[5] = b[6];
|
||||
res[6] = a[7]; res[7] = b[7];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// vld1q_s16_x2
|
||||
// vld1q_u8_x2
|
||||
// vld1q_u8_x4
|
||||
// vld1q_s8_x2
|
||||
// vld1q_s8_x4
|
||||
// TODO: double-check these work correctly
|
||||
|
||||
typedef struct ggml_int16x8x2_t {
|
||||
int16x8_t val[2];
|
||||
} ggml_int16x8x2_t;
|
||||
|
||||
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
||||
ggml_int16x8x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s16(ptr + 0);
|
||||
res.val[1] = vld1q_s16(ptr + 8);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x2_t {
|
||||
uint8x16_t val[2];
|
||||
} ggml_uint8x16x2_t;
|
||||
|
||||
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
||||
ggml_uint8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x4_t {
|
||||
uint8x16_t val[4];
|
||||
} ggml_uint8x16x4_t;
|
||||
|
||||
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
||||
ggml_uint8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
res.val[2] = vld1q_u8(ptr + 32);
|
||||
res.val[3] = vld1q_u8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x2_t {
|
||||
int8x16_t val[2];
|
||||
} ggml_int8x16x2_t;
|
||||
|
||||
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
||||
ggml_int8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x4_t {
|
||||
int8x16_t val[4];
|
||||
} ggml_int8x16x4_t;
|
||||
|
||||
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
||||
ggml_int8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
res.val[2] = vld1q_s8(ptr + 32);
|
||||
res.val[3] = vld1q_s8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
uint8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
#define ggml_uint8x16x2_t uint8x16x2_t
|
||||
#define ggml_uint8x16x4_t uint8x16x4_t
|
||||
#define ggml_int8x16x2_t int8x16x2_t
|
||||
#define ggml_int8x16x4_t int8x16x4_t
|
||||
|
||||
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
||||
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
||||
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
|
||||
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
|
||||
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
|
||||
|
@ -12676,3 +12383,287 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k)
|
|||
block_iq2_s * restrict y = vy;
|
||||
quantize_row_iq2_s_reference(x, y, k);
|
||||
}
|
||||
|
||||
static bool validate_float(float f, size_t i) {
|
||||
if (isinf(f)) {
|
||||
fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (isnan(f)) {
|
||||
fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool isinf_fp16(ggml_fp16_t f) {
|
||||
return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0;
|
||||
}
|
||||
|
||||
static bool isnan_fp16(ggml_fp16_t f) {
|
||||
return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0;
|
||||
}
|
||||
|
||||
static bool validate_fp16(ggml_fp16_t f, size_t i) {
|
||||
if (isinf_fp16(f)) {
|
||||
fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (isnan_fp16(f)) {
|
||||
fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
|
||||
const type * q = (const type *) (data); \
|
||||
for (size_t i = 0; i < (nb); ++i) { \
|
||||
if (!validate_fp16(q[i].d, i)) { \
|
||||
return false; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define VALIDATE_ROW_DATA_DM_F16_IMPL(type, data, nb, d, m) \
|
||||
const type * q = (const type *) (data); \
|
||||
for (size_t i = 0; i < (nb); ++i) { \
|
||||
if (!validate_fp16(q[i].d, i) || !validate_fp16(q[i].m, i)) { \
|
||||
return false; \
|
||||
} \
|
||||
}
|
||||
|
||||
bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
|
||||
if (type < 0 || type >= GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "%s: invalid type %d\n", __func__, type);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (nbytes % ggml_type_size(type) != 0) {
|
||||
fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t nb = nbytes/ggml_type_size(type);
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
const ggml_fp16_t * f = (const ggml_fp16_t *) data;
|
||||
size_t i = 0;
|
||||
#if defined(__AVX2__)
|
||||
for (; i + 15 < nb; i += 16) {
|
||||
__m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
|
||||
__m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00));
|
||||
__m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00));
|
||||
int mask = _mm256_movemask_epi8(cmp);
|
||||
if (mask) {
|
||||
for (size_t j = 0; j < 16; ++j) {
|
||||
if (!validate_fp16(f[i + j], i + j)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
GGML_UNREACHABLE();
|
||||
}
|
||||
}
|
||||
#elif defined(__ARM_NEON)
|
||||
for (; i + 7 < nb; i += 8) {
|
||||
uint16x8_t v = vld1q_u16(f + i);
|
||||
uint16x8_t vexp = vandq_u16(v, vdupq_n_u16(0x7c00));
|
||||
uint16x8_t cmp = vceqq_u16(vexp, vdupq_n_u16(0x7c00));
|
||||
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0);
|
||||
if (mask) {
|
||||
for (size_t j = 0; j < 8; ++j) {
|
||||
if (!validate_fp16(f[i + j], i + j)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
GGML_UNREACHABLE();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; i < nb; ++i) {
|
||||
if (!validate_fp16(f[i], i)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
const float * f = (const float *) data;
|
||||
size_t i = 0;
|
||||
#if defined(__AVX2__)
|
||||
for (; i + 7 < nb; i += 8) {
|
||||
__m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
|
||||
__m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000));
|
||||
__m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000));
|
||||
int mask = _mm256_movemask_epi8(cmp);
|
||||
if (mask) {
|
||||
for (size_t j = 0; j < 8; ++j) {
|
||||
if (!validate_float(f[i + j], i + j)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
GGML_UNREACHABLE();
|
||||
}
|
||||
}
|
||||
#elif defined(__ARM_NEON)
|
||||
for (; i + 3 < nb; i += 4) {
|
||||
uint32x4_t v = vld1q_u32((const uint32_t *)f + i);
|
||||
uint32x4_t vexp = vandq_u32(v, vdupq_n_u32(0x7f800000));
|
||||
uint32x4_t cmp = vceqq_u32(vexp, vdupq_n_u32(0x7f800000));
|
||||
uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(cmp, 8)), 0);
|
||||
if (mask) {
|
||||
for (size_t j = 0; j < 4; ++j) {
|
||||
if (!validate_float(f[i + j], i + j)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
GGML_UNREACHABLE();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; i < nb; ++i) {
|
||||
if (!validate_float(f[i], i)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_F64:
|
||||
{
|
||||
const double * f = (const double *) data;
|
||||
for (size_t i = 0; i < nb; ++i) {
|
||||
if (!validate_float(f[i], i)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
{
|
||||
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m);
|
||||
} break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_0, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
{
|
||||
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_1, data, nb, d, m);
|
||||
} break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_K, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
{
|
||||
#ifdef GGML_QKK_64
|
||||
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]);
|
||||
#else
|
||||
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin);
|
||||
#endif
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
{
|
||||
#ifdef GGML_QKK_64
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb);
|
||||
#else
|
||||
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin);
|
||||
#endif
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_K, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_Q8_K:
|
||||
{
|
||||
const block_q8_K * q = (const block_q8_K *) data;
|
||||
for (size_t i = 0; i < nb; ++i) {
|
||||
if (!validate_float(q[i].d, i)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
{
|
||||
const block_iq1_m * q = (const block_iq1_m *) data;
|
||||
for (size_t i = 0; i < nb; ++i) {
|
||||
#if QK_K == 64
|
||||
if (!validate_fp16(q[i].d, i)) {
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
iq1m_scale_t scale;
|
||||
const uint16_t * sc = (const uint16_t *)q[i].scales;
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
if (!validate_fp16(scale.f16, i)) {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xxs, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xs, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_s, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_xxs, data, nb);
|
||||
} break;
|
||||
|
||||
case GGML_TYPE_IQ3_S:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
#if QK_K != 64
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb);
|
||||
} break;
|
||||
#endif
|
||||
// with QK_K == 64, iq4_xs is iq4_nl
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_I64:
|
||||
// nothing to validate
|
||||
break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "%s: invalid type %d\n", __func__, type);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -13416,11 +13416,16 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
|
|||
version += std::to_string(prop.get_minor_version());
|
||||
|
||||
device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
|
||||
std::string name = std::string(prop.get_name());
|
||||
name = std::regex_replace(name, std::regex("\\(R\\)"), "");
|
||||
name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
|
||||
|
||||
fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
|
||||
prop.get_name(), version.c_str(), prop.get_max_compute_units(),
|
||||
auto global_mem_size = prop.get_global_mem_size()/1000000;
|
||||
|
||||
fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
|
||||
name.c_str(), version.c_str(), prop.get_max_compute_units(),
|
||||
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
|
||||
prop.get_global_mem_size());
|
||||
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
|
||||
}
|
||||
|
||||
void ggml_backend_sycl_print_sycl_devices() {
|
||||
|
@ -13428,9 +13433,10 @@ void ggml_backend_sycl_print_sycl_devices() {
|
|||
int device_count = dpct::dev_mgr::instance().device_count();
|
||||
std::map<std::string, size_t> DeviceNums;
|
||||
fprintf(stderr, "found %d SYCL devices:\n", device_count);
|
||||
fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
|
||||
fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
|
||||
fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
|
||||
fprintf(stderr, "| | | | |Max | |Max |Global | |\n");
|
||||
fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n");
|
||||
fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n");
|
||||
fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
|
||||
for (int id = 0; id < device_count; ++id) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
sycl::backend backend = device.get_backend();
|
||||
|
|
63
ggml.c
63
ggml.c
|
@ -858,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
|||
// simd mappings
|
||||
//
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
// 64-bit compatibility
|
||||
|
||||
inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// we define a common set of C macros which map to specific intrinsics based on the current architecture
|
||||
// we then implement the fundamental computation operations below using only these macros
|
||||
// adding support for new architectures requires to define the corresponding SIMD macros
|
||||
|
@ -10825,7 +10813,7 @@ static void ggml_compute_forward_mul_mat(
|
|||
#endif
|
||||
|
||||
#if GGML_USE_LLAMAFILE
|
||||
if (nb10 == ggml_type_size(src1->type)) {
|
||||
if (src1_cont) {
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
|
@ -10878,15 +10866,13 @@ UseGgmlGemm1:;
|
|||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
||||
#if GGML_USE_LLAMAFILE
|
||||
if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
|
||||
if (src1->type != vec_dot_type) {
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||
nb01/ggml_type_size(src0->type),
|
||||
(const char *)wdata + ggml_row_size(vec_dot_type,
|
||||
nb12/ggml_type_size(src1->type)*i12 +
|
||||
nb13/ggml_type_size(src1->type)*i13),
|
||||
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
|
||||
row_size/ggml_type_size(vec_dot_type),
|
||||
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||
nb1/ggml_type_size(dst->type),
|
||||
|
@ -20628,7 +20614,7 @@ static void gguf_free_kv(struct gguf_kv * kv) {
|
|||
}
|
||||
|
||||
struct gguf_context * gguf_init_empty(void) {
|
||||
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
||||
struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
|
||||
|
||||
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
|
||||
ctx->header.version = GGUF_VERSION;
|
||||
|
@ -20673,7 +20659,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
|
||||
bool ok = true;
|
||||
|
||||
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
||||
struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
|
||||
|
||||
// read the header
|
||||
{
|
||||
|
@ -20710,9 +20696,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
|
||||
// read the kv pairs
|
||||
{
|
||||
ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
const uint64_t n_kv = ctx->header.n_kv;
|
||||
|
||||
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||
// header.n_kv will hold the actual value of pairs that were successfully read in the loop below
|
||||
ctx->header.n_kv = 0;
|
||||
ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
|
||||
|
||||
for (uint64_t i = 0; i < n_kv; ++i) {
|
||||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
|
||||
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
||||
|
@ -20761,7 +20751,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
return NULL;
|
||||
}
|
||||
|
||||
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
|
||||
kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
|
||||
|
||||
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
|
||||
} break;
|
||||
|
@ -20775,7 +20765,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
return NULL;
|
||||
}
|
||||
|
||||
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
|
||||
kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));
|
||||
|
||||
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
||||
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
||||
|
@ -20791,6 +20781,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
if (!ok) {
|
||||
break;
|
||||
}
|
||||
|
||||
ctx->header.n_kv++;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
|
@ -20803,7 +20795,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
|
||||
// read the tensor infos
|
||||
{
|
||||
ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
||||
ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
|
||||
|
||||
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
|
@ -20824,8 +20816,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
||||
|
||||
// TODO: return an error instead of crashing with GGML_ASSERT
|
||||
gguf_tensor_info_sanitize(info);
|
||||
|
||||
// make sure there is no duplicated tensor names
|
||||
for (uint64_t j = 0; j < i; ++j) {
|
||||
if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
|
||||
fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
||||
fclose(file);
|
||||
|
@ -20994,7 +20995,7 @@ void gguf_free(struct gguf_context * ctx) {
|
|||
GGML_FREE(ctx->infos);
|
||||
}
|
||||
|
||||
GGML_ALIGNED_FREE(ctx);
|
||||
GGML_FREE(ctx);
|
||||
}
|
||||
|
||||
const char * gguf_type_name(enum gguf_type type) {
|
||||
|
@ -21305,7 +21306,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
|
|||
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||
ctx->kv[idx].value.arr.type = type;
|
||||
ctx->kv[idx].value.arr.n = n;
|
||||
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
|
||||
ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
|
||||
memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
|
||||
}
|
||||
|
||||
|
@ -21315,7 +21316,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
|
|||
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
|
||||
ctx->kv[idx].value.arr.n = n;
|
||||
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
|
||||
ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
|
||||
for (int i = 0; i < n; i++) {
|
||||
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
|
||||
str->n = strlen(data[i]);
|
||||
|
@ -21342,7 +21343,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
|||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
|
||||
const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
|
||||
const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
|
||||
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
|
||||
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
||||
}
|
||||
|
@ -21362,6 +21363,10 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
|||
void gguf_add_tensor(
|
||||
struct gguf_context * ctx,
|
||||
const struct ggml_tensor * tensor) {
|
||||
if (gguf_find_tensor(ctx, tensor->name) != -1) {
|
||||
GGML_ASSERT(false && "duplicated tensor name");
|
||||
}
|
||||
|
||||
const int idx = ctx->header.n_tensors;
|
||||
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
|
||||
|
||||
|
@ -21430,7 +21435,7 @@ struct gguf_buf {
|
|||
|
||||
static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
struct gguf_buf buf = {
|
||||
/*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
|
||||
/*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
|
||||
/*buf.size =*/ size,
|
||||
/*buf.offset =*/ 0,
|
||||
};
|
||||
|
|
2
ggml.h
2
ggml.h
|
@ -762,6 +762,8 @@ extern "C" {
|
|||
// use this to compute the memory overhead of a tensor
|
||||
GGML_API size_t ggml_tensor_overhead(void);
|
||||
|
||||
GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
|
||||
|
||||
// main
|
||||
|
||||
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
|
|
|
@ -124,6 +124,7 @@ class MODEL_ARCH(IntEnum):
|
|||
QWEN2 = auto()
|
||||
QWEN2MOE = auto()
|
||||
PHI2 = auto()
|
||||
PHI3 = auto()
|
||||
PLAMO = auto()
|
||||
CODESHELL = auto()
|
||||
ORION = auto()
|
||||
|
@ -200,6 +201,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||
MODEL_ARCH.QWEN2: "qwen2",
|
||||
MODEL_ARCH.QWEN2MOE: "qwen2moe",
|
||||
MODEL_ARCH.PHI2: "phi2",
|
||||
MODEL_ARCH.PHI3: "phi3",
|
||||
MODEL_ARCH.PLAMO: "plamo",
|
||||
MODEL_ARCH.CODESHELL: "codeshell",
|
||||
MODEL_ARCH.ORION: "orion",
|
||||
|
@ -550,6 +552,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.PHI3: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.CODESHELL: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.POS_EMBD,
|
||||
|
|
|
@ -234,8 +234,14 @@ class GGUFReader:
|
|||
|
||||
def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
|
||||
tensors = []
|
||||
tensor_names = set() # keep track of name to prevent duplicated tensors
|
||||
for field in fields:
|
||||
_name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts
|
||||
# check if there's any tensor having same name already in the list
|
||||
tensor_name = str(bytes(name_data), encoding = 'utf-8')
|
||||
if tensor_name in tensor_names:
|
||||
raise ValueError(f'Found duplicated tensor with name {tensor_name}')
|
||||
tensor_names.add(tensor_name)
|
||||
ggml_type = GGMLQuantizationType(raw_dtype[0])
|
||||
n_elems = np.prod(dims)
|
||||
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
||||
|
@ -267,7 +273,7 @@ class GGUFReader:
|
|||
item_count = n_bytes
|
||||
item_type = np.uint8
|
||||
tensors.append(ReaderTensor(
|
||||
name = str(bytes(name_data), encoding = 'utf-8'),
|
||||
name = tensor_name,
|
||||
tensor_type = ggml_type,
|
||||
shape = dims,
|
||||
n_elements = n_elems,
|
||||
|
|
|
@ -63,6 +63,7 @@ class GGUFWriter:
|
|||
self.kv_data_count = 0
|
||||
self.ti_data = bytearray()
|
||||
self.ti_data_count = 0
|
||||
self.ti_names = set()
|
||||
self.use_temp_file = use_temp_file
|
||||
self.temp_file = None
|
||||
self.tensors = []
|
||||
|
@ -197,6 +198,10 @@ class GGUFWriter:
|
|||
if self.state is not WriterState.EMPTY:
|
||||
raise ValueError(f'Expected output file to be empty, got {self.state}')
|
||||
|
||||
if name in self.ti_names:
|
||||
raise ValueError(f'Duplicated tensor name {name}')
|
||||
self.ti_names.add(name)
|
||||
|
||||
encoded_name = name.encode("utf8")
|
||||
self.ti_data += self._pack("Q", len(encoded_name))
|
||||
self.ti_data += encoded_name
|
||||
|
|
|
@ -117,6 +117,7 @@ class TensorNameMap:
|
|||
"h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
|
||||
"model.layers.{bid}.self_attn.qkv_proj" # phi3
|
||||
),
|
||||
|
||||
# Attention query
|
||||
|
@ -234,6 +235,7 @@ class TensorNameMap:
|
|||
"h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.{bid}.mlp.gate_up_proj", # phi3
|
||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w3", # internlm2
|
||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||
|
|
478
llama.cpp
478
llama.cpp
|
@ -75,6 +75,7 @@
|
|||
#include <forward_list>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <future>
|
||||
#include <initializer_list>
|
||||
#include <locale>
|
||||
#include <map>
|
||||
|
@ -211,6 +212,7 @@ enum llm_arch {
|
|||
LLM_ARCH_QWEN2,
|
||||
LLM_ARCH_QWEN2MOE,
|
||||
LLM_ARCH_PHI2,
|
||||
LLM_ARCH_PHI3,
|
||||
LLM_ARCH_PLAMO,
|
||||
LLM_ARCH_CODESHELL,
|
||||
LLM_ARCH_ORION,
|
||||
|
@ -246,6 +248,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_QWEN2, "qwen2" },
|
||||
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
|
||||
{ LLM_ARCH_PHI2, "phi2" },
|
||||
{ LLM_ARCH_PHI3, "phi3" },
|
||||
{ LLM_ARCH_PLAMO, "plamo" },
|
||||
{ LLM_ARCH_CODESHELL, "codeshell" },
|
||||
{ LLM_ARCH_ORION, "orion" },
|
||||
|
@ -793,6 +796,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PHI3,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PLAMO,
|
||||
{
|
||||
|
@ -2863,6 +2883,7 @@ namespace GGUFMeta {
|
|||
case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
|
||||
case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
|
||||
case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
|
||||
case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
|
||||
}
|
||||
return "unknown";
|
||||
}
|
||||
|
@ -2874,13 +2895,16 @@ namespace GGUFMeta {
|
|||
__func__, override_type_to_str(ovrd->tag), ovrd->key);
|
||||
switch (ovrd->tag) {
|
||||
case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
|
||||
LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
|
||||
LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
|
||||
} break;
|
||||
case LLAMA_KV_OVERRIDE_TYPE_INT: {
|
||||
LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
|
||||
LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
|
||||
} break;
|
||||
case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
|
||||
LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
|
||||
LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
|
||||
} break;
|
||||
case LLAMA_KV_OVERRIDE_TYPE_STR: {
|
||||
LLAMA_LOG_INFO("%s\n", ovrd->val_str);
|
||||
} break;
|
||||
default:
|
||||
// Shouldn't be possible to end up here, but just in case...
|
||||
|
@ -2899,7 +2923,7 @@ namespace GGUFMeta {
|
|||
static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
|
||||
try_override(OT & target, const struct llama_model_kv_override * ovrd) {
|
||||
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
|
||||
target = ovrd->bool_value;
|
||||
target = ovrd->val_bool;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
@ -2909,7 +2933,7 @@ namespace GGUFMeta {
|
|||
static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
|
||||
try_override(OT & target, const struct llama_model_kv_override * ovrd) {
|
||||
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
|
||||
target = ovrd->int_value;
|
||||
target = ovrd->val_i64;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
@ -2919,7 +2943,7 @@ namespace GGUFMeta {
|
|||
static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
|
||||
try_override(T & target, const struct llama_model_kv_override * ovrd) {
|
||||
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
|
||||
target = ovrd->float_value;
|
||||
target = ovrd->val_f64;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
@ -2928,12 +2952,11 @@ namespace GGUFMeta {
|
|||
template<typename OT>
|
||||
static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
|
||||
try_override(T & target, const struct llama_model_kv_override * ovrd) {
|
||||
(void)target;
|
||||
(void)ovrd;
|
||||
if (!ovrd) { return false; }
|
||||
// Currently, we should never end up here so it would be a bug if we do.
|
||||
throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
|
||||
ovrd ? ovrd->key : "NULL"));
|
||||
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
|
||||
target = ovrd->val_str;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
|
||||
|
@ -2966,6 +2989,7 @@ struct llama_model_loader {
|
|||
size_t n_bytes = 0;
|
||||
|
||||
bool use_mmap = false;
|
||||
bool check_tensors;
|
||||
|
||||
llama_files files;
|
||||
llama_ftype ftype;
|
||||
|
@ -2980,9 +3004,13 @@ struct llama_model_loader {
|
|||
|
||||
ggml_tensor * tensor;
|
||||
|
||||
llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
|
||||
llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
|
||||
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
|
||||
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
|
||||
|
||||
if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
|
||||
throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
|
||||
}
|
||||
}
|
||||
};
|
||||
std::vector<llama_tensor_weight> weights;
|
||||
|
@ -2995,7 +3023,7 @@ struct llama_model_loader {
|
|||
std::string arch_name;
|
||||
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
||||
|
||||
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
|
||||
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
|
||||
int trace = 0;
|
||||
if (getenv("LLAMA_TRACE")) {
|
||||
trace = atoi(getenv("LLAMA_TRACE"));
|
||||
|
@ -3021,15 +3049,15 @@ struct llama_model_loader {
|
|||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// Save tensors data offset of the main file.
|
||||
// For subsidiary files, `meta` tensor data offset must not be used,
|
||||
// so we build a unified tensors index for weights.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
weights.emplace_back(0, cur->name, meta, cur);
|
||||
weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
|
||||
}
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
uint16_t n_split = 0;
|
||||
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
||||
|
||||
|
@ -3063,13 +3091,14 @@ struct llama_model_loader {
|
|||
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
|
||||
}
|
||||
|
||||
// Save tensors data offset info of the shard.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
weights.emplace_back(idx, cur->name, ctx_gguf, cur);
|
||||
}
|
||||
files.emplace_back(new llama_file(split_path, "rb"));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// Save tensors data offset info of the shard.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
|
||||
}
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
}
|
||||
|
||||
|
@ -3091,9 +3120,17 @@ struct llama_model_loader {
|
|||
|
||||
fver = (enum llama_fver) gguf_get_version(meta);
|
||||
|
||||
std::set<std::string> tensor_names;
|
||||
for (auto & w : weights) {
|
||||
n_elements += ggml_nelements(w.tensor);
|
||||
n_bytes += ggml_nbytes(w.tensor);
|
||||
// make sure there is no duplicated tensor names
|
||||
const std::string name(w.tensor->name);
|
||||
auto found = tensor_names.find(name);
|
||||
if (found != tensor_names.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
|
||||
}
|
||||
tensor_names.insert(name);
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
||||
|
@ -3199,6 +3236,7 @@ struct llama_model_loader {
|
|||
}
|
||||
|
||||
this->use_mmap = use_mmap;
|
||||
this->check_tensors = check_tensors;
|
||||
}
|
||||
|
||||
~llama_model_loader() {
|
||||
|
@ -3278,6 +3316,10 @@ struct llama_model_loader {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
const llama_tensor_weight * get_weight(int i) const {
|
||||
return get_weight(get_tensor_name(i));
|
||||
}
|
||||
|
||||
const llama_tensor_weight & require_weight(const char * name) const {
|
||||
const llama_tensor_weight * weight = get_weight(name);
|
||||
if (!weight) {
|
||||
|
@ -3453,6 +3495,10 @@ struct llama_model_loader {
|
|||
file->seek(w.offs, SEEK_SET);
|
||||
file->read_raw(cur->data, ggml_nbytes(cur));
|
||||
}
|
||||
|
||||
if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
|
||||
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
||||
}
|
||||
}
|
||||
|
||||
size_t size_done = 0;
|
||||
|
@ -3469,6 +3515,8 @@ struct llama_model_loader {
|
|||
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
|
||||
|
||||
std::vector<no_init<uint8_t>> read_buf;
|
||||
std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
|
||||
|
||||
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
const auto * weight = get_weight(ggml_get_name(cur));
|
||||
if (weight == nullptr) {
|
||||
|
@ -3490,37 +3538,66 @@ struct llama_model_loader {
|
|||
if (bufs_mmap.count(weight->idx)) {
|
||||
buf_mmap = bufs_mmap.at(weight->idx);
|
||||
}
|
||||
uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
|
||||
|
||||
if (check_tensors) {
|
||||
validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
|
||||
return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
|
||||
}));
|
||||
}
|
||||
|
||||
GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
|
||||
if (buf_mmap && cur->data == nullptr) {
|
||||
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
|
||||
ggml_backend_tensor_alloc(buf_mmap, cur, data);
|
||||
if (lmlocks) {
|
||||
const auto & lmlock = lmlocks->at(weight->idx);
|
||||
lmlock->grow_to(weight->offs + ggml_nbytes(cur));
|
||||
lmlock->grow_to(weight->offs + n_size);
|
||||
}
|
||||
|
||||
auto & mmap_used = mmaps_used[weight->idx];
|
||||
mmap_used.first = std::min(mmap_used.first, weight->offs);
|
||||
mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
|
||||
} else {
|
||||
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
|
||||
ggml_backend_tensor_set(cur, data, 0, n_size);
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(weight->idx < files.size());
|
||||
const auto & file = files.at(weight->idx);
|
||||
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||
file->seek(weight->offs, SEEK_SET);
|
||||
file->read_raw(cur->data, ggml_nbytes(cur));
|
||||
file->read_raw(cur->data, n_size);
|
||||
if (check_tensors) {
|
||||
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
|
||||
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
|
||||
}));
|
||||
}
|
||||
} else {
|
||||
read_buf.resize(ggml_nbytes(cur));
|
||||
read_buf.resize(n_size);
|
||||
file->seek(weight->offs, SEEK_SET);
|
||||
file->read_raw(read_buf.data(), ggml_nbytes(cur));
|
||||
file->read_raw(read_buf.data(), n_size);
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
||||
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
|
||||
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_done += n_size;
|
||||
}
|
||||
|
||||
// check validation results
|
||||
bool validation_failed = false;
|
||||
for (auto & future : validation_result) {
|
||||
auto result = future.get();
|
||||
if (!result.second) {
|
||||
LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
|
||||
validation_failed = true;
|
||||
}
|
||||
}
|
||||
if (validation_failed) {
|
||||
throw std::runtime_error("found tensors with invalid data");
|
||||
}
|
||||
|
||||
// check if this is the last call and do final cleanup
|
||||
if (size_done >= size_data) {
|
||||
// unmap offloaded tensors and metadata
|
||||
|
@ -3955,6 +4032,16 @@ static void llm_load_hparams(
|
|||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 24: model.type = e_model::MODEL_1B; break;
|
||||
case 32: model.type = e_model::MODEL_3B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PHI3:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 24: model.type = e_model::MODEL_1B; break;
|
||||
case 32: model.type = e_model::MODEL_3B; break;
|
||||
|
@ -4352,6 +4439,7 @@ static void llm_load_vocab(
|
|||
//vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
|
||||
(t.first == "<|eot_id|>" ||
|
||||
t.first == "<|im_end|>" ||
|
||||
t.first == "<|end|>" ||
|
||||
t.first == "<end_of_turn>"
|
||||
)
|
||||
) {
|
||||
|
@ -5375,6 +5463,33 @@ static bool llm_load_tensors(
|
|||
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PHI3:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
|
||||
|
||||
// output
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context* ctx_layer = ctx_for_layer(i);
|
||||
ggml_context* ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto& layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PLAMO:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
@ -5909,7 +6024,7 @@ static bool llm_load_tensors(
|
|||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
||||
try {
|
||||
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
||||
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
|
||||
|
||||
model.hparams.vocab_only = params.vocab_only;
|
||||
|
||||
|
@ -6326,7 +6441,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
if (model.arch == LLM_ARCH_PHI2) {
|
||||
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
||||
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
||||
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
|
@ -8967,12 +9082,140 @@ struct llm_build_context {
|
|||
|
||||
cur = ggml_add(ctx0, cur, model.output_b);
|
||||
cb(cur, "result_output", -1);
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_phi3() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto residual = inpL;
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm,
|
||||
NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(attn_norm_output, "attn_norm", il);
|
||||
|
||||
struct ggml_tensor * Qcur = nullptr;
|
||||
struct ggml_tensor * Kcur = nullptr;
|
||||
struct ggml_tensor * Vcur = nullptr;
|
||||
|
||||
if (model.layers[il].wqkv) {
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
|
||||
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
|
||||
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
|
||||
}
|
||||
else {
|
||||
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
||||
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
||||
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor* inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, residual);
|
||||
residual = cur;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
// FF
|
||||
// special-case: the up and gate tensors are merged into a single tensor
|
||||
// TOOD: support into llm_build_ffn
|
||||
{
|
||||
struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
|
||||
cb(up, "ffn_up", il);
|
||||
|
||||
auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
|
||||
auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
|
||||
|
||||
y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
|
||||
cb(y, "ffn_gate", il);
|
||||
|
||||
auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
|
||||
cb(down, "ffn_down", il);
|
||||
|
||||
cur = down;
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, residual, cur);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.output_norm,
|
||||
NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_cgraph * build_plamo() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
|
@ -10474,6 +10717,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
{
|
||||
result = llm.build_phi2();
|
||||
} break;
|
||||
case LLM_ARCH_PHI3:
|
||||
{
|
||||
result = llm.build_phi3();
|
||||
} break;
|
||||
case LLM_ARCH_PLAMO:
|
||||
{
|
||||
result = llm.build_plamo();
|
||||
|
@ -11234,6 +11481,10 @@ static int llama_decode_internal(
|
|||
}
|
||||
}
|
||||
|
||||
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
||||
// overlap with device computation.
|
||||
ggml_backend_sched_reset(lctx.sched);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -13478,7 +13729,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
|
|||
return result;
|
||||
}
|
||||
|
||||
llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
|
||||
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
|
||||
GGML_ASSERT(ctx);
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
@ -13491,7 +13742,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|||
}
|
||||
|
||||
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
||||
auto & rng = ctx->rng;
|
||||
int idx = dist(rng);
|
||||
|
||||
llama_token result = candidates->data[idx].id;
|
||||
|
@ -13501,6 +13751,10 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|||
return result;
|
||||
}
|
||||
|
||||
llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
|
||||
return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
|
||||
}
|
||||
|
||||
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
|
@ -14159,14 +14413,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|||
}
|
||||
|
||||
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
if (nthread < 2) {
|
||||
// single-thread
|
||||
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
|
||||
if (!ggml_validate_row_data(new_type, new_data, new_size)) {
|
||||
throw std::runtime_error("quantized data validation failed");
|
||||
}
|
||||
return new_size;
|
||||
}
|
||||
|
||||
std::mutex mutex;
|
||||
int64_t counter = 0;
|
||||
size_t new_size = 0;
|
||||
if (nthread < 2) {
|
||||
// single-thread
|
||||
return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
|
||||
}
|
||||
auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
|
||||
bool valid = true;
|
||||
auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
|
||||
nrows, n_per_row, imatrix]() {
|
||||
const int64_t nrows_per_chunk = chunk_size / n_per_row;
|
||||
size_t local_size = 0;
|
||||
|
@ -14181,7 +14441,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|||
}
|
||||
lock.unlock();
|
||||
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
||||
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
|
||||
size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
|
||||
local_size += this_size;
|
||||
|
||||
// validate the quantized data
|
||||
const size_t row_size = ggml_row_size(new_type, n_per_row);
|
||||
void * this_data = (char *) new_data + first_row * row_size;
|
||||
if (!ggml_validate_row_data(new_type, this_data, this_size)) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
valid = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
for (int it = 0; it < nthread - 1; ++it) {
|
||||
|
@ -14190,6 +14460,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|||
compute();
|
||||
for (auto & w : workers) { w.join(); }
|
||||
workers.clear();
|
||||
if (!valid) {
|
||||
throw std::runtime_error("quantized data validation failed");
|
||||
}
|
||||
return new_size;
|
||||
}
|
||||
|
||||
|
@ -14252,7 +14525,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
||||
kv_overrides = v->data();
|
||||
}
|
||||
llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
|
||||
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model;
|
||||
|
@ -14290,11 +14563,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
for (auto & o : overrides) {
|
||||
if (o.key[0] == 0) break;
|
||||
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
||||
gguf_set_val_f32(ctx_out, o.key, o.float_value);
|
||||
gguf_set_val_f32(ctx_out, o.key, o.val_f64);
|
||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
||||
gguf_set_val_i32(ctx_out, o.key, o.int_value);
|
||||
gguf_set_val_i32(ctx_out, o.key, o.val_i64);
|
||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
||||
gguf_set_val_bool(ctx_out, o.key, o.bool_value);
|
||||
gguf_set_val_bool(ctx_out, o.key, o.val_bool);
|
||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
||||
gguf_set_val_str(ctx_out, o.key, o.val_str);
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
|
||||
}
|
||||
|
@ -14336,26 +14611,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
std::vector<no_init<uint8_t>> work;
|
||||
std::vector<no_init<float>> f32_conv_buf;
|
||||
|
||||
uint16_t n_split = 1;
|
||||
// Assume split index is continuous
|
||||
if (params->keep_split) {
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
|
||||
}
|
||||
}
|
||||
std::vector<gguf_context*> ctx_outs(n_split, NULL);
|
||||
ctx_outs[0] = ctx_out;
|
||||
|
||||
// populate the original tensors so we get an initial meta data
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
gguf_add_tensor(ctx_out, meta);
|
||||
auto weight = ml.get_weight(i);
|
||||
uint16_t i_split = params->keep_split ? weight->idx : 0;
|
||||
struct ggml_tensor * tensor = weight->tensor;
|
||||
if (ctx_outs[i_split] == NULL) {
|
||||
ctx_outs[i_split] = gguf_init_empty();
|
||||
}
|
||||
gguf_add_tensor(ctx_outs[i_split], tensor);
|
||||
}
|
||||
|
||||
std::ofstream fout(fname_out, std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
// Set split info if needed
|
||||
if (n_split > 1) {
|
||||
for (size_t i = 0; i < ctx_outs.size(); ++i) {
|
||||
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
|
||||
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
|
||||
gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t meta_size = gguf_get_meta_size(ctx_out);
|
||||
int cur_split = -1;
|
||||
std::ofstream fout;
|
||||
auto close_ofstream = [&]() {
|
||||
// Write metadata and close file handler
|
||||
if (fout.is_open()) {
|
||||
fout.seekp(0);
|
||||
std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
|
||||
gguf_get_meta_data(ctx_outs[cur_split], data.data());
|
||||
fout.write((const char *) data.data(), data.size());
|
||||
fout.close();
|
||||
}
|
||||
};
|
||||
auto new_ofstream = [&](int index) {
|
||||
cur_split = index;
|
||||
GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
|
||||
std::string fname = fname_out;
|
||||
if (params->keep_split) {
|
||||
char split_path[PATH_MAX] = {0};
|
||||
llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
|
||||
fname = std::string(split_path);
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
|
||||
|
||||
// placeholder for the meta data
|
||||
::zeros(fout, meta_size);
|
||||
fout = std::ofstream(fname, std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
|
||||
// placeholder for the meta data
|
||||
::zeros(fout, meta_size);
|
||||
};
|
||||
|
||||
const auto tn = LLM_TN(model.arch);
|
||||
|
||||
new_ofstream(0);
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
||||
auto weight = ml.get_weight(i);
|
||||
struct ggml_tensor * tensor = weight->tensor;
|
||||
if (weight->idx != cur_split && params->keep_split) {
|
||||
close_ofstream();
|
||||
new_ofstream(weight->idx);
|
||||
}
|
||||
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
|
@ -14510,26 +14833,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
total_size_new += new_size;
|
||||
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
|
||||
gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
|
||||
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
||||
}
|
||||
|
||||
// go back to beginning of file and write the updated meta data
|
||||
{
|
||||
fout.seekp(0);
|
||||
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
|
||||
gguf_get_meta_data(ctx_out, data.data());
|
||||
fout.write((const char *) data.data(), data.size());
|
||||
close_ofstream();
|
||||
for (auto & c:ctx_outs) {
|
||||
gguf_free(c);
|
||||
}
|
||||
|
||||
fout.close();
|
||||
|
||||
gguf_free(ctx_out);
|
||||
|
||||
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
||||
|
||||
|
@ -14573,7 +14888,7 @@ static int llama_apply_lora_from_file_internal(
|
|||
std::unique_ptr<llama_model_loader> ml;
|
||||
if (path_base_model) {
|
||||
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
||||
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
|
||||
ml->init_mappings(/*prefetch*/ false); // no prefetching
|
||||
}
|
||||
|
||||
|
@ -14832,6 +15147,7 @@ struct llama_model_params llama_model_default_params() {
|
|||
/*.vocab_only =*/ false,
|
||||
/*.use_mmap =*/ true,
|
||||
/*.use_mlock =*/ false,
|
||||
/*.check_tensors =*/ false,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
|
@ -14885,6 +15201,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|||
/*.quantize_output_tensor =*/ true,
|
||||
/*.only_copy =*/ false,
|
||||
/*.pure =*/ false,
|
||||
/*.keep_split =*/ false,
|
||||
/*.imatrix =*/ nullptr,
|
||||
/*.kv_overrides =*/ nullptr,
|
||||
};
|
||||
|
@ -15393,6 +15710,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|||
case LLM_ARCH_QWEN2:
|
||||
case LLM_ARCH_QWEN2MOE:
|
||||
case LLM_ARCH_PHI2:
|
||||
case LLM_ARCH_PHI3:
|
||||
case LLM_ARCH_GEMMA:
|
||||
case LLM_ARCH_STARCODER2:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
@ -15406,6 +15724,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|||
return LLAMA_ROPE_TYPE_NONE;
|
||||
}
|
||||
|
||||
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
|
||||
return ctx->cparams.pooling_type;
|
||||
}
|
||||
|
||||
int32_t llama_n_vocab(const struct llama_model * model) {
|
||||
return model->hparams.n_vocab;
|
||||
}
|
||||
|
@ -15884,6 +16206,8 @@ struct llama_data_file_context : llama_data_context {
|
|||
*
|
||||
*/
|
||||
static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
// copy rng
|
||||
{
|
||||
std::ostringstream rng_ss;
|
||||
|
@ -16036,6 +16360,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
|
|||
|
||||
// Sets the state reading from the specified source address
|
||||
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
const uint8_t * inp = src;
|
||||
|
||||
// set rng
|
||||
|
@ -16340,6 +16666,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
|
|||
}
|
||||
|
||||
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
const auto & kv_self = ctx->kv_self;
|
||||
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
||||
|
||||
|
@ -16457,6 +16785,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
|
|||
}
|
||||
|
||||
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
auto & kv_self = ctx->kv_self;
|
||||
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
||||
|
||||
|
@ -17257,6 +17587,15 @@ static int32_t llama_chat_apply_template_internal(
|
|||
if (add_ass) {
|
||||
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
||||
}
|
||||
} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
|
||||
// Phi 3
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>\n";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
|
@ -17389,6 +17728,11 @@ const char * llama_print_system_info(void) {
|
|||
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
||||
#ifdef GGML_USE_LLAMAFILE
|
||||
s += "LLAMAFILE = 1 | ";
|
||||
#else
|
||||
s += "LLAMAFILE = 0 | ";
|
||||
#endif
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
|
35
llama.h
35
llama.h
|
@ -195,15 +195,19 @@ extern "C" {
|
|||
LLAMA_KV_OVERRIDE_TYPE_INT,
|
||||
LLAMA_KV_OVERRIDE_TYPE_FLOAT,
|
||||
LLAMA_KV_OVERRIDE_TYPE_BOOL,
|
||||
LLAMA_KV_OVERRIDE_TYPE_STR,
|
||||
};
|
||||
|
||||
struct llama_model_kv_override {
|
||||
char key[128];
|
||||
enum llama_model_kv_override_type tag;
|
||||
|
||||
char key[128];
|
||||
|
||||
union {
|
||||
int64_t int_value;
|
||||
double float_value;
|
||||
bool bool_value;
|
||||
int64_t val_i64;
|
||||
double val_f64;
|
||||
bool val_bool;
|
||||
char val_str[128];
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -232,9 +236,10 @@ extern "C" {
|
|||
const struct llama_model_kv_override * kv_overrides;
|
||||
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool check_tensors; // validate model tensor data
|
||||
};
|
||||
|
||||
struct llama_context_params {
|
||||
|
@ -288,6 +293,7 @@ extern "C" {
|
|||
bool quantize_output_tensor; // quantize output.weight
|
||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
bool pure; // quantize all tensors to the default type
|
||||
bool keep_split; // quantize to the same number of shards
|
||||
void * imatrix; // pointer to importance matrix data
|
||||
void * kv_overrides; // pointer to vector containing overrides
|
||||
} llama_model_quantize_params;
|
||||
|
@ -390,8 +396,10 @@ extern "C" {
|
|||
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
|
||||
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||
|
@ -987,7 +995,7 @@ extern "C" {
|
|||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
|
||||
/// @details Randomly selects a token from the candidates based on their probabilities.
|
||||
/// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
|
||||
LLAMA_API llama_token llama_sample_token(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
|
@ -1074,8 +1082,9 @@ extern "C" {
|
|||
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
||||
#ifdef LLAMA_API_INTERNAL
|
||||
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct ggml_tensor;
|
||||
|
||||
|
@ -1112,6 +1121,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
|||
const std::string & src,
|
||||
llama_partial_utf8 partial_start);
|
||||
|
||||
// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
|
||||
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
|
||||
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
|
||||
|
||||
#endif // LLAMA_API_INTERNAL
|
||||
|
||||
#endif // LLAMA_H
|
||||
|
|
BIN
media/matmul.png
Normal file
BIN
media/matmul.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 260 KiB |
1238
media/matmul.svg
Normal file
1238
media/matmul.svg
Normal file
File diff suppressed because it is too large
Load diff
After Width: | Height: | Size: 51 KiB |
6
sgemm.h
6
sgemm.h
|
@ -1,11 +1,13 @@
|
|||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
bool llamafile_sgemm(int, int, int, const void *, int, const void *, int,
|
||||
void *, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
|
||||
const void *, int64_t, void *, int64_t, int, int,
|
||||
int, int, int, int);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -49,6 +49,8 @@ int main(void) {
|
|||
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
|
||||
// Llama-3
|
||||
"{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
|
||||
// Phi-3
|
||||
"{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}"
|
||||
};
|
||||
std::vector<std::string> expected_output = {
|
||||
// teknium/OpenHermes-2.5-Mistral-7B
|
||||
|
@ -77,6 +79,8 @@ int main(void) {
|
|||
"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
|
||||
// Llama 3
|
||||
"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
||||
// Phi 3
|
||||
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
||||
};
|
||||
std::vector<char> formatted_chat(1024);
|
||||
int32_t res;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue