Merge 'origin/master' into hipblas
commit 7b842170c4
53 changed files with 2624 additions and 1039 deletions

.devops/lamma-cpp-clblast.srpm.spec (new file, 58 lines)
@@ -0,0 +1,58 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name:           llama.cpp-clblast
Version:        master
Release:        1%{?dist}
Summary:        OpenCL Inference of LLaMA model in pure C/C++
License:        MIT
Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel
URL:            https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's LLaMA 2 models using default options.

%prep
%setup -n llama.cpp-master

%build
make -j LLAMA_CLBLAST=1

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppclblast
cp -p server %{buildroot}%{_bindir}/llamacppclblastserver
cp -p simple %{buildroot}%{_bindir}/llamacppclblastsimple

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llamacppclblast
%{_bindir}/llamacppclblastserver
%{_bindir}/llamacppclblastsimple

%pre

%post

%preun
%postun

%changelog
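Not part of the commit, but for context: a spec like the one above is normally fed to `rpmbuild`, which does not fetch anything by itself, so the Source0 tarball has to be downloaded first. A minimal sketch, assuming Fedora-style tooling (`rpm-build`, `rpmdevtools`) and the default `~/rpmbuild` layout:

```bash
# Sketch only: build the CLBlast flavour locally on an RPM-based distro.
sudo dnf install -y rpm-build rpmdevtools
rpmdev-setuptree                                               # creates ~/rpmbuild/{SPECS,SOURCES,...}
cp .devops/lamma-cpp-clblast.srpm.spec ~/rpmbuild/SPECS/
spectool -g -R ~/rpmbuild/SPECS/lamma-cpp-clblast.srpm.spec    # fetch Source0 (master.tar.gz) into SOURCES
rpmbuild -ba ~/rpmbuild/SPECS/lamma-cpp-clblast.srpm.spec      # emits both the .src.rpm and the binary RPM
```

The same steps apply to the cuBLAS and plain CPU specs below; only the spec file name changes.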
.devops/lamma-cpp-cublas.srpm.spec (new file, 59 lines)

@@ -0,0 +1,59 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name:           llama.cpp-cublas
Version:        master
Release:        1%{?dist}
Summary:        CUDA Inference of LLaMA model in pure C/C++
License:        MIT
Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
Requires:       cuda-toolkit
URL:            https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's LLaMA 2 models using default options.

%prep
%setup -n llama.cpp-master

%build
make -j LLAMA_CUBLAS=1

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcublas
cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llamacppcublas
%{_bindir}/llamacppcublasserver
%{_bindir}/llamacppcublassimple

%pre

%post

%preun
%postun

%changelog
.devops/llama-cpp.srpm.spec (new file, 58 lines)

@@ -0,0 +1,58 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name:           llama.cpp
Version:        master
Release:        1%{?dist}
Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License:        MIT
Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires:  coreutils make gcc-c++ git
URL:            https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's LLaMA 2 models using default options.

%prep
%autosetup

%build
make -j

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacpp
cp -p server %{buildroot}%{_bindir}/llamacppserver
cp -p simple %{buildroot}%{_bindir}/llamacppsimple

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llamacpp
%{_bindir}/llamacppserver
%{_bindir}/llamacppsimple

%pre

%post

%preun
%postun

%changelog
.gitignore (vendored, 3 changes)

@@ -3,6 +3,8 @@
 *.so
 *.gguf
 *.bin
+*.exe
+*.dll
 .DS_Store
 .build/
 .cache/
@@ -81,4 +83,3 @@ tests/test-quantize-fns
 tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0
-
README.md (162 changes)

@@ -11,7 +11,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
-A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
+- Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
+
+- A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
 
 Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
 
@@ -39,6 +41,7 @@ Last revision compatible with the old format: [dadbed9](https://github.com/ggerg
 <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
 <li><a href="#quantization">Quantization</a></li>
 <li><a href="#interactive-mode">Interactive mode</a></li>
+<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
 <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
 <li><a href="#using-openllama">Using OpenLLaMA</a></li>
 <li><a href="#using-gpt4all">Using GPT4All</a></li>
@@ -65,12 +68,11 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 - Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
-- 4-bit, 5-bit and 8-bit integer quantization support
-- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
-- cuBLAS and CLBlast support
+- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
+- CUDA, Metal and OpenCL GPU backend support
 
 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
-Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
+Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
 as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
 
 **Supported platforms:**
@@ -84,6 +86,7 @@ as the main playground for developing new features for the [ggml](https://github
 
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
+- [X] Falcon
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
@@ -114,90 +117,84 @@ as the main playground for developing new features for the [ggml](https://github
 ---
 
-Here is a typical run using LLaMA-7B:
+Here is a typical run using LLaMA v2 13B on M2 Ultra:
 
 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
 I llama.cpp build info:
 I UNAME_S: Darwin
 I UNAME_P: arm
 I UNAME_M: arm64
-I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -DGGML_USE_ACCELERATE
-I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
+I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
+I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
 I LDFLAGS: -framework Accelerate
-I CC: Apple clang version 14.0.0 (clang-1400.0.29.202)
-I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202)
+I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
+I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
 
 make: Nothing to be done for `default'.
-main: seed = 1678486056
-llama_model_load: loading model from './models/7B/ggml-model-q4_0.bin' - please wait ...
-llama_model_load: n_vocab = 32000
-llama_model_load: n_ctx = 512
-llama_model_load: n_embd = 4096
-llama_model_load: n_mult = 256
-llama_model_load: n_head = 32
-llama_model_load: n_layer = 32
-llama_model_load: n_rot = 128
-llama_model_load: f16 = 2
-llama_model_load: n_ff = 11008
-llama_model_load: ggml ctx size = 4529.34 MB
-llama_model_load: memory_size = 512.00 MB, n_mem = 16384
-llama_model_load: .................................... done
-llama_model_load: model size = 4017.27 MB / num tensors = 291
+main: build = 1041 (cf658ad)
+main: seed = 1692823051
+llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
+llama_model_loader: - type f32: 81 tensors
+llama_model_loader: - type q4_0: 281 tensors
+llama_model_loader: - type q6_K: 1 tensors
+llm_load_print_meta: format = GGUF V1 (latest)
+llm_load_print_meta: arch = llama
+llm_load_print_meta: vocab type = SPM
+llm_load_print_meta: n_vocab = 32000
+llm_load_print_meta: n_merges = 0
+llm_load_print_meta: n_ctx_train = 4096
+llm_load_print_meta: n_ctx = 512
+llm_load_print_meta: n_embd = 5120
+llm_load_print_meta: n_head = 40
+llm_load_print_meta: n_head_kv = 40
+llm_load_print_meta: n_layer = 40
+llm_load_print_meta: n_rot = 128
+llm_load_print_meta: n_gqa = 1
+llm_load_print_meta: f_norm_eps = 1.0e-05
+llm_load_print_meta: f_norm_rms_eps = 1.0e-05
+llm_load_print_meta: n_ff = 13824
+llm_load_print_meta: freq_base = 10000.0
+llm_load_print_meta: freq_scale = 1
+llm_load_print_meta: model type = 13B
+llm_load_print_meta: model ftype = mostly Q4_0
+llm_load_print_meta: model size = 13.02 B
+llm_load_print_meta: general.name = LLaMA v2
+llm_load_print_meta: BOS token = 1 '<s>'
+llm_load_print_meta: EOS token = 2 '</s>'
+llm_load_print_meta: UNK token = 0 '<unk>'
+llm_load_print_meta: LF token = 13 '<0x0A>'
+llm_load_tensors: ggml ctx size = 0.11 MB
+llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
+...................................................................................................
+llama_new_context_with_model: kv self size = 400.00 MB
+llama_new_context_with_model: compute buffer total size = 75.41 MB
 
-main: prompt: 'Building a website can be done in 10 simple steps:'
-main: number of tokens in prompt = 15
-1 -> ''
-8893 -> 'Build'
-292 -> 'ing'
-263 -> ' a'
-4700 -> ' website'
-508 -> ' can'
-367 -> ' be'
-2309 -> ' done'
-297 -> ' in'
-29871 -> ' '
-29896 -> '1'
-29900 -> '0'
-2560 -> ' simple'
-6576 -> ' steps'
-29901 -> ':'
-
-sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000
-
+system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
+sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
+generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
 
 Building a website can be done in 10 simple steps:
-1) Select a domain name and web hosting plan
-2) Complete a sitemap
-3) List your products
-4) Write product descriptions
-5) Create a user account
-6) Build the template
-7) Start building the website
-8) Advertise the website
-9) Provide email support
-10) Submit the website to search engines
-A website is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves.
-The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user's browser.
-The web pages are stored in a web server. The web server is also called a host. When the website is accessed, it is retrieved from the server and displayed on the user's computer.
-A website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server.
-A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user's screen.
-A website can also be viewed on different devices such as desktops, tablets and smartphones.
-Hence, to have a website displayed on a browser, the website must be hosted.
-A domain name is an address of a website. It is the name of the website.
-The website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server.
-A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user’s screen.
-A website can also be viewed on different devices such as desktops, tablets and smartphones. Hence, to have a website displayed on a browser, the website must be hosted.
-A domain name is an address of a website. It is the name of the website.
-A website is an address of a website. It is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves.
-The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user’s browser.
-A website is known as a website when it is hosted
-
-main: mem per token = 14434244 bytes
-main: load time = 1332.48 ms
-main: sample time = 1081.40 ms
-main: predict time = 31378.77 ms / 61.41 ms per token
-main: total time = 34036.74 ms
+Step 1: Find the right website platform.
+Step 2: Choose your domain name and hosting plan.
+Step 3: Design your website layout.
+Step 4: Write your website content and add images.
+Step 5: Install security features to protect your site from hackers or spammers
+Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
+Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
+Step 8: Start marketing and promoting the website via social media channels or paid ads
+Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
+Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
+How does a Website Work?
+A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
+The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
+How to
+llama_print_timings: load time = 576.45 ms
+llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
+llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
+llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
+llama_print_timings: total time = 25431.49 ms
 ```
 
 And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
@@ -571,6 +568,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
 
 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
 
+*(outdated)*
+
 | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
 |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
 | 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
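For reference (not part of the diff), the quantized files measured in the table above are produced with the bundled `quantize` tool after an initial GGUF conversion. A rough sketch; the paths are placeholders and the exact output file name written by `convert.py` may differ:

```bash
# Sketch: produce an F16 GGUF file, then quantize it to Q4_0.
python3 convert.py models/7B/          # assumes it writes models/7B/ggml-model-f16.gguf
./quantize models/7B/ggml-model-f16.gguf models/7B/ggml-model-q4_0.gguf q4_0
```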
@@ -633,6 +632,16 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
     CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
 ```
 
+### Constrained output with grammars
+
+`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
+
+```bash
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+```
+
+The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
+
 ### Instruction mode with Alpaca
 
 1. First, download the `ggml` Alpaca model into the `./models` folder
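Beyond the bundled JSON grammar referenced in the new section above, a grammar file can be a single rule. A purely illustrative sketch (the file name and grammar are invented for this example, and the model path is a placeholder):

```bash
# Hypothetical grammar that forces the model to answer only "yes" or "no".
cat > yesno.gbnf <<'EOF'
root ::= ("yes" | "no")
EOF
./main -m ./models/13B/ggml-model-q4_0.gguf -n 8 --grammar-file yesno.gbnf \
    -p 'Question: is the sky blue? Answer: '
```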
@@ -914,3 +923,4 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 - [BLIS](./docs/BLIS.md)
 - [Performance troubleshooting](./docs/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [GBNF grammars](./grammars/README.md)
ci/run.sh (0 content changes, Normal file → Executable file)
@@ -387,11 +387,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
-        } else if (arg == "--mul-mat-q" || arg == "-mmq") {
+        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
 #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = true;
+            params.mul_mat_q = false;
 #else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--low-vram" || arg == "-lv") {
 #ifdef GGML_USE_CUBLAS
@@ -417,6 +417,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.antiprompt.push_back(argv[i]);
         } else if (arg == "--perplexity") {
             params.perplexity = true;
+        } else if (arg == "--ppl-stride") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.ppl_stride = std::stoi(argv[i]);
+        } else if (arg == "--ppl-output-type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.ppl_output_type = std::stoi(argv[i]);
         } else if (arg == "--hellaswag") {
             params.hellaswag = true;
         } else if (arg == "--hellaswag-tasks") {
@@ -601,13 +613,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
     fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
-#if defined(GGML_USE_HIPBLAS)
-    fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q HIP kernels instead of hipBLAS. TEMP!!!\n" );
-#else
-    fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
-#endif
-    fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
-    fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
+    fprintf(stdout, " -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+    fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
 #endif
     fprintf(stdout, " --mtest compute maximum memory usage\n");
     fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
@@ -736,35 +744,3 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok
 
     return std::string(result.data(), result.size());
 }
-
-std::vector<llama_token> llama_tokenize_bpe(
-        struct llama_context * ctx,
-        const std::string & text,
-        bool add_bos) {
-    int n_tokens = text.length() + add_bos;
-    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-    return result;
-}
-
-std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
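The hunks above flip the default for the custom matrix-multiplication kernels: `mul_mat_q` is now on by default and the flag only turns it off. A purely illustrative invocation (model path and layer count are placeholders):

```bash
# mul_mat_q kernels are the default on CUDA/HIP builds; add -nommq to fall
# back to cuBLAS/hipBLAS for the quantized matrix multiplications instead.
./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -ngl 40 -p "Hello" -nommq
```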
@@ -64,11 +64,15 @@ struct gpt_params {
     std::string lora_adapter = ""; // lora adapter path
     std::string lora_base = ""; // base model path for the lora adapter
 
+    int ppl_stride = 0;       // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int ppl_output_type = 0;  // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+                              //       (which is more convenient to use for plotting)
+                              //
     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
 
     bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
-    bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
+    bool mul_mat_q = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
@@ -116,15 +120,6 @@ std::vector<llama_token> llama_tokenize(
         const std::string & text,
         bool add_bos);
 
-std::vector<llama_token> llama_tokenize_bpe(
-        struct llama_context * ctx,
-        const std::string & text,
-        bool add_bos);
-
 std::string llama_token_to_str(
         const struct llama_context * ctx,
         llama_token token);
 
-std::string llama_token_to_str_bpe(
-        const struct llama_context * ctx,
-        llama_token token);
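The new `ppl_stride` and `ppl_output_type` fields are wired to the `--ppl-stride` and `--ppl-output-type` flags added above. A hedged usage sketch (dataset path and values are illustrative):

```bash
# Emit "num_tokens ppl" pairs, one per line, for plotting; use a 32-token stride.
./perplexity -m models/llama-13b-v2/ggml-model-q4_0.gguf -f wikitext-2-raw/wiki.test.raw \
    --ppl-stride 32 --ppl-output-type 1 > ppl.txt
```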
convert-falcon-hf-to-gguf.py (52 changes, Normal file → Executable file)

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # HF falcon--> gguf conversion
 
 import gguf
@@ -94,14 +95,17 @@ print("gguf: get model metadata")
 
 block_count = hparams["n_layer"]
 
-gguf_writer.add_name(last_dir)
+gguf_writer.add_name("Falcon")
 gguf_writer.add_context_length(2048) # not in config.json
 gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+if "n_head_kv" in hparams:
+    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+else:
+    gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 
 # TOKENIZATION
@@ -109,6 +113,8 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 print("gguf: get tokenizer metadata")
 
 tokens: List[str] = []
+scores: List[float] = []
+toktypes: List[int] = []
 merges: List[str] = []
 
 
@@ -152,41 +158,30 @@ if Path(dir_model + "/tokenizer.json").is_file():
             text = bytearray(pad_token)
 
         tokens.append(text)
+        scores.append(0.0)  # dummy
+        toktypes.append(gguf.TokenType.NORMAL)  # dummy
 
 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
 
-if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
-    print("gguf: get special token ids")
+print("gguf: get special token ids")
+# Look for special tokens in config.json
 
-    with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-        tokenizer_config = json.load(f)
+if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+    gguf_writer.add_bos_token_id(hparams["bos_token_id"])
 
-    # find special token ids
+if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+    gguf_writer.add_eos_token_id(hparams["eos_token_id"])
 
-    if "bos_token" in tokenizer_config:
-        for key in tokenizer_json["added_tokens"]:
-            if key["content"] == tokenizer_config["bos_token"]:
-                gguf_writer.add_bos_token_id(key["id"])
+if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+    gguf_writer.add_unk_token_id(hparams["unk_token_id"])
 
-    if "eos_token" in tokenizer_config:
-        for key in tokenizer_json["added_tokens"]:
-            if key["content"] == tokenizer_config["eos_token"]:
-                gguf_writer.add_eos_token_id(key["id"])
+if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+    gguf_writer.add_sep_token_id(hparams["sep_token_id"])
 
-    if "unk_token" in tokenizer_config:
-        for key in tokenizer_json["added_tokens"]:
-            if key["content"] == tokenizer_config["unk_token"]:
-                gguf_writer.add_unk_token_id(key["id"])
+if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+    gguf_writer.add_pad_token_id(hparams["pad_token_id"])
 
-    if "sep_token" in tokenizer_config:
-        for key in tokenizer_json["added_tokens"]:
-            if key["content"] == tokenizer_config["sep_token"]:
-                gguf_writer.add_sep_token_id(key["id"])
-
-    if "pad_token" in tokenizer_config:
-        for key in tokenizer_json["added_tokens"]:
-            if key["content"] == tokenizer_config["pad_token"]:
-                gguf_writer.add_pad_token_id(key["id"])
 
 # TENSORS
@@ -196,6 +191,7 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 # params for qkv transform
 n_head = hparams["n_head"]
 n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+
 head_dim = hparams["hidden_size"] // n_head
 
 # tensor info
convert-gptneox-hf-to-gguf.py (1 change, Normal file → Executable file)

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # HF gptneox--> gguf conversion
 
 import gguf
convert-llama-7b-pth-to-gguf.py (1 change, Normal file → Executable file)

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # 7b pth llama --> gguf conversion
 # Only models with a single datafile are supported, like 7B
 # HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
convert-llama-ggmlv3-to-gguf.py (34 changes, Normal file → Executable file)

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys, struct, math, argparse
 from pathlib import Path
 
@@ -93,7 +94,7 @@ class Tensor:
         pad = ((offset + 31) & ~31) - offset
         offset += pad
         n_elems = np.prod(self.dims)
-        n_bytes = (n_elems * tysize) // blksize
+        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
         self.start_offset = offset
         self.len_bytes = n_bytes
         offset += n_bytes
@@ -215,15 +216,10 @@ class GGMLToGGUF:
         if self.vocab_override is not None:
             vo = self.vocab_override
             print('* Adding vocab item(s)')
-            for (idx, vitem) in enumerate(vo.all_tokens()):
-                if len(vitem) == 3:
-                    tokens.append(vitem[0])
-                    scores.append(vitem[1])
-                    toktypes.append(vitem[2])
-                else:
-                    # Maybe try to guess the token type here?
-                    tokens.append(vitem[0])
-                    scores.append(vitem[1])
+            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+                tokens.append(vbytes)
+                scores.append(score)
+                toktypes.append(ttype)
             assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
             gguf_writer.add_token_list(tokens)
             gguf_writer.add_token_scores(scores)
@@ -231,9 +227,21 @@ class GGMLToGGUF:
             gguf_writer.add_token_types(toktypes)
             return
         print(f'* Adding {hp.n_vocab} vocab item(s)')
+        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
         for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
             tt = 1 # Normal
-            if len(vbytes) == 0:
+            # Special handling for UNK, BOS, EOS tokens.
+            if tokid <= 2:
+                if tokid == 0:
+                    vbytes = b'<unk>'
+                    tt = 2
+                elif tokid == 1:
+                    vbytes = b'<s>'
+                    tt = 3
+                else:
+                    vbytes = b'</s>'
+                    tt = 3
+            elif len(vbytes) == 0:
                 tt = 3 # Control
             elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
                 vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
@@ -246,6 +254,9 @@ class GGMLToGGUF:
         gguf_writer.add_token_list(tokens)
         gguf_writer.add_token_scores(scores)
         gguf_writer.add_token_types(toktypes)
+        gguf_writer.add_unk_token_id(0)
+        gguf_writer.add_bos_token_id(1)
+        gguf_writer.add_eos_token_id(2)
 
     def add_tensors(self, gguf_writer):
         nm = self.name_map
@@ -330,4 +341,5 @@ def main():
     converter.save()
     print(f'* Successful completion. Output saved to: {cfg.output}')
 
+if __name__ == '__main__':
     main()
convert-llama-hf-to-gguf.py (1 change, Normal file → Executable file)

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # HF llama --> gguf conversion
 
 import gguf
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import json
 import os
 import re
@@ -6,23 +6,22 @@ import struct
 import sys
 from typing import Any, Dict, Sequence, TextIO
 
+import numpy as np
 import torch
 
-from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
+NUMPY_TYPE_TO_FTYPE: Dict[str, int] = {"float32": 0, "float16": 1}
+
 
 HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "attention.wq",
-    "self_attn.k_proj": "attention.wk",
-    "self_attn.v_proj": "attention.wv",
-    "self_attn.o_proj": "attention.wo",
-    "mlp.gate_proj": "feed_forward.w1",
-    "mlp.down_proj": "feed_forward.w2",
-    "mlp.up_proj": "feed_forward.w3",
-    "input_layernorm": "attention_norm",
+    "self_attn.q_proj": "attn_q",
+    "self_attn.k_proj": "attn_k",
+    "self_attn.v_proj": "attn_v",
+    "self_attn.o_proj": "attn_output",
+    "mlp.gate_proj": "ffn_gate",
+    "mlp.down_proj": "ffn_down",
+    "mlp.up_proj": "ffn_up",
+    "input_layernorm": "attn_norm",
     "post_attention_layernorm": "ffn_norm",
-    # "norm": "norm",
-    # "embed_tokens": "tok_embeddings",
-    # "lm_head": "output",
 }
 
 
@@ -39,7 +38,7 @@ def translate_tensor_name(t: str) -> str:
             sys.exit(1)
 
         output_string = (
-            f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
+            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
         )
         return output_string
     else:
@@ -54,12 +53,14 @@ def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
     # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
     # but some models ship a float value instead
    # let's convert to int, but fail if lossless conversion is not possible
-    assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly"
+    assert (
+        int(params["lora_alpha"]) == params["lora_alpha"]
+    ), "cannot convert float to int losslessly"
     fout.write(struct.pack("i", int(params["lora_alpha"])))
 
 
 def write_tensor_header(
-    self, name: str, shape: Sequence[int], data_type: DataType
+    self, name: str, shape: Sequence[int], data_type: np.dtype
 ) -> None:
     sname = name.encode("utf-8")
     fout.write(
@@ -67,7 +68,7 @@ def write_tensor_header(
             "iii",
             len(shape),
             len(sname),
-            DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
+            NUMPY_TYPE_TO_FTYPE[data_type.name],
         )
     )
     fout.write(struct.pack("i" * len(shape), *shape[::-1]))
convert.py (35 changes, Normal file → Executable file)

@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import gguf
 import argparse
@@ -69,7 +69,10 @@ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
     'I32': DT_I32,
 }
 
-class GGMLFileType(enum.Enum):
+# TODO: match this with `llama_ftype`
+# TODO: rename to LLAMAFileType
+# TODO: move to `gguf.py`
+class GGMLFileType(enum.IntEnum):
     AllF32 = 0
     MostlyF16 = 1  # except 1d tensors
 
@@ -101,6 +104,8 @@ class Params:
     n_head_kv: int
     f_norm_eps: float
 
+    ftype: Optional[GGMLFileType] = None
+
     @staticmethod
     def find_n_mult(n_ff: int, n_embd: int) -> int:
         # hardcoded magic range
@@ -728,7 +733,11 @@ class OutputFile:
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 
     def add_meta_arch(self, params: Params) -> None:
-        self.gguf.add_name ("LLaMA")
+        ver = None
+        if (params.n_ctx == 4096):
+            ver = "v2"
+
+        self.gguf.add_name ("LLaMA" if ver == None else "LLaMA " + ver)
         self.gguf.add_context_length (params.n_ctx)
         self.gguf.add_embedding_length (params.n_embd)
         self.gguf.add_block_count (params.n_layer)
@@ -738,6 +747,9 @@ class OutputFile:
         self.gguf.add_head_count_kv (params.n_head_kv)
         self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
 
+        if params.ftype:
+            self.gguf.add_file_type(params.ftype)
+
     def add_meta_vocab(self, vocab: Vocab) -> None:
         tokens = []
         scores = []
@@ -956,7 +968,7 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, Sentence
             path = path3
         else:
             raise FileNotFoundError(
-                f"Could not find tokenizer.model in {path} or its parent; "
+                f"Could not find {vocab_file} in {path} or its parent; "
                 "if it's in another directory, pass the directory as --vocab-dir")
 
     print(f"Loading vocab file '{path}', type '{vocabtype}'")
@@ -1020,6 +1032,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
             " - LLaMA v2: --ctx 4096\n")
         params.n_ctx = args.ctx
 
+    if args.outtype:
+        params.ftype = {
+            "f32": GGMLFileType.AllF32,
+            "f16": GGMLFileType.MostlyF16,
+        }[args.outtype]
+
     print(f"params = {params}")
 
     vocab: Vocab
@@ -1042,9 +1060,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
 
     model = model_plus.model
     model = convert_model_names(model, params)
-    output_type = pick_output_type(model, args.outtype)
-    model = convert_to_output_type(model, output_type)
-    outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+    ftype = pick_output_type(model, args.outtype)
+    model = convert_to_output_type(model, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+
+    params.ftype = ftype
+    print(f"Writing {outfile}, format {ftype}")
 
     OutputFile.write_all(outfile, params, model, vocab)
     print(f"Wrote {outfile}")
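With the `ftype` plumbing added above, the chosen output type is now recorded in the GGUF metadata and printed before writing. An illustrative invocation (the model directory is a placeholder; the flags are the ones this diff references):

```bash
# Convert HF weights to GGUF, forcing 16-bit tensors and the LLaMA v2 context size.
python3 convert.py models/llama-13b-v2/ --outtype f16 --ctx 4096
```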
@@ -12,15 +12,19 @@ usage: ./convert-llama2c-to-ggml [options]
 
 options:
   -h, --help                       show this help message and exit
-  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'models/ggml-vocab.bin')
+  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'tokenizer.bin')
   --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
   --llama2c-output-model FNAME     model path to save the converted llama2.c model (default ak_llama_model.bin')
 ```
 
-An example command is as follows:
+An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
 
-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
 
-Now you can use the model with command like:
+For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
 
-`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
+`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
+
+Now you can use the model with a command like:
+
+`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
@@ -17,6 +17,9 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_VERSION_GGJT_V3   3
+
 //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
 typedef struct {
     int dim; // transformer dimension
@@ -49,10 +52,10 @@ typedef struct {
     // float* freq_cis_real; // (seq_len, dim/2)
     // float* freq_cis_imag; // (seq_len, dim/2)
     // (optional) classifier weights for the logits, on the last layer
-    //float* wcls;
+    float* wcls;
 } TransformerWeights;

-void malloc_weights(TransformerWeights* w, Config* p) {
+void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
     // we calloc instead of malloc to keep valgrind happy
     w->token_embedding_table = new float[p->vocab_size * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@@ -86,9 +89,16 @@ void malloc_weights(TransformerWeights* w, Config* p) {
     w->rms_final_weight = new float[p->dim]();
     printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);

+    if (shared_weights) {
+        w->wcls = NULL;
+    } else {
+        w->wcls = new float[p->vocab_size * p->dim]();
+        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+    }
 }

-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
     if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
     if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@@ -100,6 +110,22 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
     if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
     if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
     if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;

+    // Skip freq_cis_real & freq_cis_imag
+    int head_size = p->dim / p->n_heads;
+    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
+
+    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+
+    // Check we didn't forget to read anything
+    auto curr = ftell(f);
+    fseek(f, 0, SEEK_END);
+    auto end = ftell(f);
+    if (curr != end) {
+        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
+        return 1;
+    }
+
     return 0;
 }
@@ -115,6 +141,7 @@ void free_weights(TransformerWeights* w) {
     delete w->w2;
     delete w->w3;
     delete w->rms_final_weight;
+    if (w->wcls) delete w->wcls;
 }

 void print_sample_weights(TransformerWeights *w){
@@ -131,6 +158,7 @@ void print_sample_weights(TransformerWeights *w){
     printf("%f\n", w->w2[0]);
     printf("%f\n", w->w3[0]);
     printf("%f\n", w->rms_att_weight[0]);
+    if (w->wcls) printf("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -509,26 +537,28 @@ bool is_ggml_file(const char *filename) {
 }

 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
-    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
-    if (is_ggml_file(filename)) {
-
-        struct llama_context_params llama_params = llama_context_default_params();
-        llama_params.vocab_only = true;
-
-        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
-        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
-
-        const int n_vocab = llama_n_vocab(lctx);
-        vocab->id_to_token.resize(n_vocab);
-        for (int i=0; i<n_vocab; ++i) {
-            vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
-            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
-            vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
-            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
-        }
-        llama_free(lctx);
-        llama_free_model(lmodel);
-    } else { // assume llama2.c vocabulary
+#pragma message("TODO: implement reading vocabulary using gguf")
+//    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+//    if (is_ggml_file(filename)) {
+//
+//        struct llama_context_params llama_params = llama_context_default_params();
+//        llama_params.vocab_only = true;
+//
+//        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+//        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+//
+//        const int n_vocab = llama_n_vocab(lctx);
+//        vocab->id_to_token.resize(n_vocab);
+//        for (int i=0; i<n_vocab; ++i) {
+//            vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
+//            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+//            vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
+//            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
+//        }
+//        llama_free(lctx);
+//        llama_free_model(lmodel);
+//    } else
+    { // assume llama2.c vocabulary
         printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
         llama_file file(filename, "rb");
         const int n_vocab = config->vocab_size;
@@ -538,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
         float_t score = file.read_f32();
         uint32_t len = file.read_u32();
         std::string text = file.read_string(len);
+        // Special-case handling of <0xXX> single byte tokens.
+        char byte_val;
+        if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
+            char cstr[2] = { byte_val, 0 };
+            text = cstr;
+        }
         vocab->id_to_token[i].text = text;
         vocab->id_to_token[i].score = score;
         vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
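
As an aside, the `<0xXX>` handling added above is easy to exercise in isolation. A minimal, stand-alone sketch of the same `sscanf` trick (not part of the converter; example values made up):

```cpp
// sentencepiece-style vocabularies store raw bytes as literal strings such as
// "<0x41>"; the converter folds these back into the single byte they encode.
#include <cstdio>
#include <string>

static std::string unescape_byte_token(const std::string & text) {
    char byte_val;
    if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
        return std::string(1, byte_val); // e.g. "<0x41>" -> "A"
    }
    return text;                         // ordinary tokens pass through unchanged
}

int main() {
    printf("'%s'\n", unescape_byte_token("<0x41>").c_str());
    printf("'%s'\n", unescape_byte_token("hello").c_str());
    return 0;
}
```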
@@ -589,83 +625,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
(The removed side of this hunk was the same write logic, but present only as `//`-commented code behind `(void) vocab; (void) model; (void) w;` stubs; that commented draft used `LLAMA_FILE_MAGIC` / `LLAMA_FILE_VERSION`, wrote `token_data.tok` rather than `token_data.text`, and stuffed `w->token_embedding_table` instead of `w->wcls` into `model->output`.)
 }

 #pragma message("TODO: implement file saving using gguf")
-    (void) vocab;
-    (void) model;
-    (void) w;
+    // write_magic
+    file.write_u32(LLAMA_FILE_MAGIC_GGJT);      // magic
+    file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
+    // write_hparams
+    file.write_u32(model->hparams.n_vocab);
+    file.write_u32(model->hparams.n_embd);
+    file.write_u32(model->hparams.n_mult);
+    file.write_u32(model->hparams.n_head);
+    file.write_u32(model->hparams.n_layer);
+    file.write_u32(model->hparams.n_rot);
+    file.write_u32(LLAMA_FTYPE_ALL_F32);
+
+    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+    uint32_t n_vocab = model->hparams.n_vocab;
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        const auto & token_data = vocab->id_to_token.at(i);
+        file.write_u32((uint32_t) token_data.text.size());
+        file.write_raw(token_data.text.data(), token_data.text.size());
+        file.write_raw(&token_data.score, sizeof(token_data.score));
+    }
+
+    // stuff AK weights into GG weights one by one.
+    // w->token_embedding_table -> model->tok_embeddings
+    // float*                   -> struct ggml_tensor
+    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+
+    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    //print_row(model->norm, 0);
+
+    // for rms-att-weight
+    int row_length = model->hparams.n_embd;
+    const auto & hparams = model->hparams;
+    //int n_ff = model->hparams.n_embd;
+    int n_ff = get_n_ff(&hparams);
+
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+        auto & layer = model->layers[i];
+        // 1d
+        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+
+        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+    }
+    // write tensors
+    write_tensor(&file, model->tok_embeddings);
+    write_tensor(&file, model->norm);
+    write_tensor(&file, model->output); // ?
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        write_tensor(&file, layer.attention_norm);
+        write_tensor(&file, layer.wq);
+        write_tensor(&file, layer.wk);
+        write_tensor(&file, layer.wv);
+        write_tensor(&file, layer.wo);
+        write_tensor(&file, layer.ffn_norm);
+        write_tensor(&file, layer.w1);
+        write_tensor(&file, layer.w2);
+        write_tensor(&file, layer.w3);
+    }
 }
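
A small, self-contained sketch of what the converter now emits: it reads back the GGJTv3 header in the same order as the `file.write_u32` calls above. The command-line handling and printed field names are illustrative only; nothing here is part of the commit.

```cpp
#include <cstdint>
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s model.ggmlv3.bin\n", argv[0]); return 1; }
    FILE * f = fopen(argv[1], "rb");
    if (!f) { fprintf(stderr, "cannot open %s\n", argv[1]); return 1; }

    uint32_t magic = 0, version = 0, hparams[7] = {0};
    if (fread(&magic,   sizeof(uint32_t), 1, f) != 1 ||
        fread(&version, sizeof(uint32_t), 1, f) != 1 ||
        fread(hparams,  sizeof(uint32_t), 7, f) != 7) { fclose(f); return 1; }

    // expect magic 0x67676a74 ('ggjt') and version 3, per the writes above
    printf("magic = 0x%08x, version = %u\n", magic, version);
    printf("n_vocab=%u n_embd=%u n_mult=%u n_head=%u n_layer=%u n_rot=%u ftype=%u\n",
           hparams[0], hparams[1], hparams[2], hparams[3], hparams[4], hparams[5], hparams[6]);
    fclose(f);
    return 0;
}
```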
 struct train_params get_default_train_params() {
     struct train_params params;
-    params.fn_vocab_model = "models/ggml-vocab.bin";
+    params.fn_vocab_model = "tokenizer.bin";
     params.fn_llama2c_output_model = "ak_llama_model.bin";
     params.fn_train_data = "shakespeare.txt";
     params.fn_checkpoint_in = "checkpoint.bin";
@@ -718,7 +751,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help                       show this help message and exit\n");
-    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
     fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
     fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
     fprintf(stderr, "\n");
@@ -791,9 +824,12 @@ int main(int argc, char ** argv) {
     if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
     // read in the config header
     if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+    auto shared_weights = config.vocab_size > 0;
+    config.vocab_size = abs(config.vocab_size);
+
     // read in the Transformer weights
-    malloc_weights(&weights, &config);
-    if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
+    malloc_weights(&weights, &config, shared_weights);
+    if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
     fclose(file);
     }
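
The `shared_weights` flag above relies on the llama2.c export convention: a negative `vocab_size` in the checkpoint header means the output classifier (`wcls`) is stored separately, a positive one means it is shared with the token embedding table. A minimal sketch of just that check — the `Config` field order is assumed to match llama2.c and is not taken from this diff:

```cpp
#include <cstdio>
#include <cstdlib>

// assumed llama2.c header layout (seven 32-bit ints)
struct Config {
    int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len;
};

int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s llama2c-model.bin\n", argv[0]); return 1; }
    FILE * f = fopen(argv[1], "rb");
    if (!f) { fprintf(stderr, "cannot open %s\n", argv[1]); return 1; }

    Config config;
    if (fread(&config, sizeof(Config), 1, f) != 1) { fclose(f); return 1; }

    const bool shared_weights = config.vocab_size > 0;   // same check as in main() above
    config.vocab_size = abs(config.vocab_size);

    printf("vocab_size = %d, shared classifier weights: %s\n",
           config.vocab_size, shared_weights ? "yes" : "no");
    fclose(f);
    return 0;
}
```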
1  examples/embd-input/embd_input.py  Normal file → Executable file
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import ctypes
 from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
 import numpy as np

1  examples/embd-input/llava.py  Normal file → Executable file
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))

1  examples/embd-input/minigpt4.py  Normal file → Executable file
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))

1  examples/embd-input/panda_gpt.py  Normal file → Executable file
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))

1  examples/jeopardy/graph.py  Normal file → Executable file
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import matplotlib.pyplot as plt
 import os
 import csv

0  examples/jeopardy/jeopardy.sh  Normal file → Executable file

1  examples/json-schema-to-grammar.py  Normal file → Executable file
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import argparse
 import json
 import re
@@ -288,6 +288,10 @@ These options help improve the performance and memory usage of the LLaMA models.

 - `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

+### Grammars
+
+- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
+
 ### Quantization

 For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
@@ -189,23 +189,30 @@ int main(int argc, char ** argv) {
         }
     }

+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+
     // tokenize the prompt
     std::vector<llama_token> embd_inp;
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
     } else {
         embd_inp = session_tokens;
     }

+    // Should not run without any tokens
+    if (embd_inp.empty()) {
+        embd_inp.push_back(llama_token_bos(ctx));
+    }
+
     // Tokenize negative prompt
     std::vector<llama_token> guidance_inp;
     int guidance_offset = 0;
     int original_prompt_len = 0;
     if (ctx_guidance) {
         params.cfg_negative_prompt.insert(0, 1, ' ');
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
         original_prompt_len = original_inp.size();
         guidance_offset = (int)guidance_inp.size() - original_prompt_len;
     }
@@ -252,7 +259,7 @@ int main(int argc, char ** argv) {
     }

     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
     const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);

     // in instruct mode, we inject a prefix and a suffix to each input by the user
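
The `is_spm` changes above reduce to one rule: prepend BOS only for SentencePiece vocabularies, and never start generation with an empty token list. A toy illustration — the tokenizer below is fake and only the BOS/empty-prompt logic mirrors the diff:

```cpp
#include <cstdio>
#include <string>
#include <vector>

enum vocab_type { VOCAB_SPM, VOCAB_BPE };

static std::vector<int> toy_tokenize(const std::string & text, bool add_bos) {
    std::vector<int> toks;
    if (add_bos) toks.push_back(1);                              // BOS id 1, as in SPM vocabs
    for (char c : text) toks.push_back(256 + (unsigned char) c); // made-up piece ids
    return toks;
}

int main() {
    const vocab_type vt = VOCAB_SPM;          // would come from llama_vocab_type(ctx)
    const bool is_spm = (vt == VOCAB_SPM);

    std::vector<int> embd_inp = toy_tokenize("Hello", is_spm);

    // Should not run without any tokens
    if (embd_inp.empty()) {
        embd_inp.push_back(1);                // fall back to a lone BOS
    }

    printf("%zu tokens, first id = %d\n", embd_inp.size(), embd_inp[0]);
    return 0;
}
```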

1  examples/make-ggml.py  Normal file → Executable file
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 """
 This script converts Hugging Face llama models to GGML and quantizes them.
@@ -27,12 +27,136 @@ std::vector<float> softmax(const std::vector<float>& logits) {
     return probs;
 }

-void perplexity(llama_context * ctx, const gpt_params & params) {
+void perplexity_v2(llama_context * ctx, const gpt_params & params) {
     // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+
+    if (params.ppl_stride <= 0) {
+        fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+        return;
+    }
+
+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = is_spm;
+
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+
+    auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+
+    const int calc_chunk = params.n_ctx;
+
+    fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+
+    if (int(tokens.size()) <= calc_chunk) {
+        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+                tokens.size(), params.n_ctx, params.ppl_stride);
+        return;
+    }
+
+    const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
+
+    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
+    const int n_vocab = llama_n_vocab(ctx);
+    const int n_batch = params.n_batch;
+
+    int count = 0;
+    double nll = 0.0;
+
+    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+
+    for (int i = 0; i < n_chunk; ++i) {
+        const int start =     i * params.ppl_stride;
+        const int end   = start + calc_chunk;
+
+        const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
+        //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+
+        std::vector<float> logits;
+
+        const auto t_start = std::chrono::high_resolution_clock::now();
+
+        for (int j = 0; j < num_batches; ++j) {
+            const int batch_start = start + j * n_batch;
+            const int batch_size  = std::min(end - batch_start, n_batch);
+
+            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
+                //fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+
+            // save original token and restore it after eval
+            const auto token_org = tokens[batch_start];
+
+            // add BOS token for the first batch of each chunk
+            if (add_bos && j == 0) {
+                tokens[batch_start] = llama_token_bos(ctx);
+            }
+
+            const auto batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+
+            if (j == 0) {
+                tokens[batch_start] = token_org;
+            }
+        }
+
+        const auto t_end = std::chrono::high_resolution_clock::now();
+
+        if (i == 0) {
+            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            int total_seconds = (int)(t_total * n_chunk);
+            if (total_seconds >= 60*60) {
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                total_seconds = total_seconds % (60*60);
+            }
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+        }
+
+        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+        for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) {
+
+            // Calculate probability of next token, given the previous ones.
+            const std::vector<float> tok_logits(
+                logits.begin() + (j + 0) * n_vocab,
+                logits.begin() + (j + 1) * n_vocab);
+
+            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+
+            nll += -std::log(prob);
+            ++count;
+        }
+        // perplexity is e^(average negative log-likelihood)
+        if (params.ppl_output_type == 0) {
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        } else {
+            printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+        }
+        fflush(stdout);
+    }
+    printf("\n");
+}
+
+void perplexity(llama_context * ctx, const gpt_params & params) {
+    if (params.ppl_stride > 0) {
+        perplexity_v2(ctx, params);
+        return;
+    }
+
+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Output: `perplexity: 13.5106 [114/114]`
+    // BOS tokens will be added for each chunk before eval
+
+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = is_spm;
+
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+
+    auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+
     const int n_chunk_max = tokens.size() / params.n_ctx;
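
The window bookkeeping in `perplexity_v2` is easier to see with concrete numbers. A stand-alone sketch of just the chunk/stride arithmetic (token count made up, nothing is evaluated):

```cpp
#include <cstdio>

int main() {
    const int n_tokens   = 5000;  // pretend tokenized corpus length
    const int n_ctx      = 512;   // calc_chunk in perplexity_v2
    const int ppl_stride = 128;

    // same formula as above: how many strided windows fit
    const int n_chunk_max = (n_tokens - n_ctx + ppl_stride - 1) / ppl_stride;
    printf("chunks: %d\n", n_chunk_max);

    for (int i = 0; i < 3; ++i) {
        const int start = i * ppl_stride;
        const int end   = start + n_ctx;
        // each window evaluates n_ctx tokens but only the last ppl_stride
        // positions (end - ppl_stride .. end - 1) are scored
        printf("chunk %d: eval [%d, %d), score positions [%d, %d)\n",
               i, start, end, end - ppl_stride, end);
    }
    return 0;
}
```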
@@ -63,7 +187,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
             const auto token_org = tokens[batch_start];

             // add BOS token for the first batch of each chunk
-            if (j == 0) {
+            if (add_bos && j == 0) {
                 tokens[batch_start] = llama_token_bos(ctx);
             }

@@ -116,7 +240,11 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
             ++count;
         }
         // perplexity is e^(average negative log-likelihood)
+        if (params.ppl_output_type == 0) {
             printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        } else {
+            printf("%8d %.4lf\n", i*params.n_ctx, std::exp(nll / count));
+        }
         fflush(stdout);
     }
     printf("\n");
@@ -177,8 +305,10 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     size_t hs_task_count = prompt_lines.size()/6;
     fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);

+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+
     // This is needed as usual for LLaMA models
-    bool prepend_bos = true;
+    const bool add_bos = is_spm;

     // Number of tasks to use when computing the score
     if ( params.hellaswag_tasks < hs_task_count  ) {
@@ -234,14 +364,13 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     std::vector<float> tok_logits(n_vocab);

     for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
-
         // Tokenize the context to count tokens
-        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
+        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
         size_t context_size = context_embd.size();

         // Do the 1st ending
         // In this case we include the context when evaluating
-        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
+        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
         auto query_size = query_embd.size();
         //printf("First query: %d\n",(int)query_size);

@@ -369,6 +498,12 @@ int main(int argc, char ** argv) {
     params.perplexity = true;
     params.n_batch = std::min(params.n_batch, params.n_ctx);

+    if (params.ppl_stride > 0) {
+        fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+                params.n_ctx, params.n_ctx + params.ppl_stride/2);
+        params.n_ctx += params.ppl_stride/2;
+    }
+
     if (params.n_ctx > 2048) {
         fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
@@ -14,25 +14,25 @@ struct quant_option {
 };

 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", },
-    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", },
-    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", },
-    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", },
+    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
+    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
+    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
+    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
 #ifdef GGML_USE_K_QUANTS
-    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", },
+    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
     { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
     { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
-    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
+    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
 #endif
-    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", },
+    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
     { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
 };
@@ -1,4 +1,3 @@
-
 #!/bin/bash

 cd `dirname $0`

0  examples/server-llama2-13B.sh  Normal file → Executable file
|
||||||
|
|
||||||
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
|
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
|
||||||
|
|
||||||
`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.
|
`prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.
|
||||||
|
|
||||||
`stop`: Specify a JSON array of stopping strings.
|
`stop`: Specify a JSON array of stopping strings.
|
||||||
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
|
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
|
||||||
|
|
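
Both of the following request bodies are now valid for `prompt`; the sketch just builds them with nlohmann/json (the JSON library the server already uses) and leaves the HTTP call out. The token ids in the array are placeholders:

```cpp
#include <nlohmann/json.hpp>
#include <cstdio>

int main() {
    using json = nlohmann::json;

    // prompt as a plain string (a leading space is inserted server-side)
    const json body1 = { {"prompt", "Once upon a time"}, {"n_predict", 64} };

    // prompt as an array mixing a string prefix with pre-tokenized ids
    const json body2 = { {"prompt", json::array({"System:", 13, 13, "User:"})},
                         {"n_predict", 64} };

    printf("%s\n%s\n", body1.dump().c_str(), body2.dump(2).c_str());
    return 0;
}
```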

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import argparse
 from flask import Flask, jsonify, request, Response
 import urllib.parse

0  examples/server/chat-llama2.sh  Normal file → Executable file
0  examples/server/chat.sh  Normal file → Executable file
@@ -190,6 +190,7 @@ struct llama_server_context
     size_t n_past = 0;
     size_t n_remain = 0;

+    json prompt;
     std::vector<llama_token> embd;
     std::vector<llama_token> last_n_tokens;

@@ -267,6 +268,53 @@ struct llama_server_context
         return true;
     }

+    std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
+    {
+        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+        // or the first element of the json_prompt array is a string.
+        std::vector<llama_token> prompt_tokens;
+
+        if (json_prompt.is_array())
+        {
+            bool first = true;
+            for (const auto& p : json_prompt)
+            {
+                if (p.is_string())
+                {
+                    auto s = p.template get<std::string>();
+                    std::vector<llama_token> p;
+                    if (first)
+                    {
+                        s.insert(0, 1, ' '); // add a space if it's the first
+                        p = ::llama_tokenize(ctx, s, add_bos);
+                        first = false;
+                    }
+                    else
+                    {
+                        p = ::llama_tokenize(ctx, s, false);
+                    }
+                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+                }
+                else
+                {
+                    if (first)
+                    {
+                        first = false;
+                    }
+                    prompt_tokens.push_back(p.template get<llama_token>());
+                }
+            }
+        }
+        else
+        {
+            auto s = json_prompt.template get<std::string>();
+            s.insert(0, 1, ' '); // always add a first space
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+        }
+
+        return prompt_tokens;
+    }
+
     bool loadGrammar()
     {
         if (!params.grammar.empty()) {
@@ -294,8 +342,8 @@ struct llama_server_context

     void loadPrompt()
     {
-        params.prompt.insert(0, 1, ' '); // always add a first space
-        std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
+        auto prompt_tokens = tokenize(prompt, true);  // always add BOS
+
         num_prompt_tokens = prompt_tokens.size();

         if (params.n_keep < 0)
@@ -671,12 +719,11 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, " number of layers to store in VRAM\n");
     fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
     fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
-    fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
-    fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
+    fprintf(stdout, " -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
 #endif
     fprintf(stdout, " -m FNAME, --model FNAME\n");
     fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
@@ -867,12 +914,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
 #endif // GGML_USE_CUBLAS
         }
-        else if (arg == "--mul-mat-q" || arg == "-mmq")
+        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
         {
 #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = true;
+            params.mul_mat_q = false;
 #else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
         }
         else if (arg == "--main-gpu" || arg == "-mg")
@@ -1017,7 +1064,7 @@ static json format_final_response(llama_server_context &llama, const std::string
         {"tokens_predicted", llama.num_tokens_predicted},
         {"tokens_evaluated", llama.num_prompt_tokens},
         {"generation_settings", format_generation_settings(llama)},
-        {"prompt", llama.params.prompt},
+        {"prompt", llama.prompt},
         {"truncated", llama.truncated},
         {"stopped_eos", llama.stopped_eos},
         {"stopped_word", llama.stopped_word},
@@ -1086,10 +1133,18 @@ static void parse_options_completion(const json &body, llama_server_context &lla
     llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
     llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
     llama.params.seed = json_value(body, "seed", default_params.seed);
-    llama.params.prompt = json_value(body, "prompt", default_params.prompt);
     llama.params.grammar = json_value(body, "grammar", default_params.grammar);
     llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);

+    if (body.count("prompt") != 0)
+    {
+        llama.prompt = body["prompt"];
+    }
+    else
+    {
+        llama.prompt = "";
+    }
+
     llama.params.logit_bias.clear();
     if (json_value(body, "ignore_eos", false))
     {
@@ -1346,8 +1401,11 @@ int main(int argc, char **argv)
             auto lock = llama.lock();

             const json body = json::parse(req.body);
-            const std::string content = json_value<std::string>(body, "content", "");
-            const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+            std::vector<llama_token> tokens;
+            if (body.count("content") != 0)
+            {
+                tokens = llama.tokenize(body["content"], false);
+            }
             const json data = format_tokenizer_response(tokens);
             return res.set_content(data.dump(), "application/json"); });
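
A hypothetical client-side sketch for the updated `/tokenize` route, assuming the default port 8080 and using cpp-httplib plus nlohmann/json (both already server dependencies). The exact shape of the reply comes from `format_tokenizer_response()` and is not shown in this diff, so it is only printed verbatim here:

```cpp
#include "httplib.h"            // cpp-httplib
#include <nlohmann/json.hpp>
#include <cstdio>

int main() {
    using json = nlohmann::json;

    httplib::Client cli("localhost", 8080);

    const json body = { {"content", "One day, Lily met a Shoggoth"} };  // "content" is optional now
    auto res = cli.Post("/tokenize", body.dump(), "application/json");
    if (!res || res->status != 200) {
        fprintf(stderr, "request failed\n");
        return 1;
    }
    printf("%s\n", res->body.c_str());
    return 0;
}
```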
@@ -1359,7 +1417,14 @@ int main(int argc, char **argv)

             llama.rewind();
             llama_reset_timings(llama.ctx);
-            llama.params.prompt = json_value<std::string>(body, "content", "");
+            if (body.count("content") != 0)
+            {
+                llama.prompt = body["content"];
+            }
+            else
+            {
+                llama.prompt = "";
+            }
             llama.params.n_predict = 0;
             llama.loadPrompt();
             llama.beginCompletion();
@@ -238,7 +238,7 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }

-void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
     int pos = 0;
     for (int i = 0; i < n; i++) {
         if (list[i] != -1) {
@@ -547,7 +547,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 struct ggml_tensor * view_src = get_view_source(parent);
                 struct hash_node * view_src_hn = hash_get(ht, view_src);
                 view_src_hn->n_views -= 1;
-                AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
+                AT_PRINTF("view_src %s\n", view_src->name);
                 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
                     ggml_allocator_free_tensor(alloc, view_src);
                 }
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);

 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);

 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
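
A small usage sketch for the now const-correct `ggml_allocr_set_parse_seq()`: the parse order can be handed over from a `const` array. The measure allocator is used only to keep the snippet self-contained; the order values are arbitrary:

```cpp
#include "ggml-alloc.h"

int main() {
    struct ggml_allocr * alloc = ggml_allocr_new_measure(/*alignment =*/ 32);

    static const int order[] = { 0, 2, 1, -1, 3 };   // -1 entries are skipped
    ggml_allocr_set_parse_seq(alloc, order, 5);      // accepts const int * after this change

    ggml_allocr_free(alloc);
    return 0;
}
```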
62
ggml-cuda.cu
62
ggml-cuda.cu
|
@ -388,7 +388,7 @@ static int g_device_count = -1;
|
||||||
static int g_main_device = 0;
|
static int g_main_device = 0;
|
||||||
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
||||||
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
static bool g_mul_mat_q = false;
|
static bool g_mul_mat_q = true;
|
||||||
|
|
||||||
static void * g_scratch_buffer = nullptr;
|
static void * g_scratch_buffer = nullptr;
|
||||||
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
|
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
|
||||||
|
@ -4008,6 +4008,29 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
||||||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: this implementation is wrong!
|
||||||
|
//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
||||||
|
// const float p_delta, const int p_delta_rows, const float theta_scale) {
|
||||||
|
// const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||||
|
//
|
||||||
|
// if (col >= ncols) {
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
// const int i = row*ncols + col/2;
|
||||||
|
//
|
||||||
|
// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
||||||
|
// const float sin_theta = sinf(theta);
|
||||||
|
// const float cos_theta = cosf(theta);
|
||||||
|
//
|
||||||
|
// const float x0 = x[i + 0];
|
||||||
|
// const float x1 = x[i + ncols/2];
|
||||||
|
//
|
||||||
|
// dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||||
|
// dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
||||||
|
//}
|
||||||
|
|
||||||
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
||||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
const int half_n_dims = ncols/4;
|
const int half_n_dims = ncols/4;
|
||||||
|
@@ -4080,24 +4103,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int

// the CUDA soft max implementation differs from the CPU implementation
// instead of doubles floats are used
-// values are also not normalized to the maximum value by subtracting it in the exponential function
-// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
    const int row = blockDim.x*blockIdx.x + threadIdx.x;
    const int block_size = blockDim.y;
    const int tid = threadIdx.y;

-    float tmp = 0.0;
+    float max_val = -INFINITY;

-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
-        if (col >= ncols) {
-            break;
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = row*ncols + col;
+        max_val = max(max_val, x[i]);
    }

+    // find the max value in the block
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    }
+
+    float tmp = 0.f;
+
+    for (int col = tid; col < ncols; col += block_size) {
        const int i = row*ncols + col;
-        const float val = expf(x[i]);
+        const float val = expf(x[i] - max_val);
        tmp += val;
        dst[i] = val;
    }
@@ -4108,15 +4136,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }

-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
-
-        if (col >= ncols) {
-            break;
-        }
-
+    const float inv_tmp = 1.f / tmp;
+
+    for (int col = tid; col < ncols; col += block_size) {
        const int i = row*ncols + col;
-        dst[i] /= tmp;
+        dst[i] *= inv_tmp;
    }
}
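Editor's note: the change above is the standard max-subtraction trick. The kernel now finds the row maximum first (reduced across the warp with `__shfl_xor_sync`), computes `expf(x[i] - max_val)` so large logits cannot overflow, and replaces the final division with one multiplication by the reciprocal. A minimal CPU-side sketch of the same idea (plain C, no CUDA specifics; illustrative only, not the kernel itself):

```c
#include <math.h>

// Numerically stable softmax over one row of n floats.
// Subtracting the row maximum before expf() keeps every exponent <= 0,
// so expf() cannot overflow; the normalized result is unchanged.
static void softmax_row(const float * x, float * dst, int n) {
    float max_val = -INFINITY;
    for (int i = 0; i < n; ++i) {
        if (x[i] > max_val) max_val = x[i];
    }

    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        dst[i] = expf(x[i] - max_val);
        sum += dst[i];
    }

    const float inv_sum = 1.0f / sum;   // mirrors the inv_tmp change above
    for (int i = 0; i < n; ++i) {
        dst[i] *= inv_sum;
    }
}
```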
@@ -5623,6 +5647,7 @@ inline void ggml_cuda_op_rope(

    const float theta_scale = powf(freq_base, -2.0f/n_dims);

+    const bool is_neox = mode & 2;
    const bool is_glm = mode & 4;

    // compute
@@ -5631,6 +5656,9 @@ inline void ggml_cuda_op_rope(
        const float id_p = min(p, n_ctx - 2.f);
        const float block_p = max(p - (n_ctx - 2.f), 0.f);
        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else if (is_neox) {
+        GGML_ASSERT(false && "RoPE NeoX not implemented yet");
+#pragma message("TODO: implement RoPE NeoX for CUDA")
    } else {
        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
@@ -4,8 +4,10 @@

#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
#endif

#ifdef __cplusplus
14
ggml-metal.m
@@ -167,7 +167,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#define GGML_METAL_ADD_KERNEL(name) \
        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
-        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \
+        fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
+                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
+                (int) ctx->pipeline_##name.threadExecutionWidth); \
        if (error) { \
            fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
            return NULL; \
@@ -538,7 +540,7 @@ void ggml_metal_graph_compute(
        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];

        const int node_start = (cb_idx + 0) * n_nodes_per_cb;
-        const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+        const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);

        for (int ind = node_start; ind < node_end; ++ind) {
            const int i = has_concur ? ctx->concur_list[ind] : ind;
@@ -768,8 +770,7 @@ void ggml_metal_graph_compute(
            [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
            [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-        }
-        else {
+        } else {
            int nth0 = 32;
            int nth1 = 1;

@@ -938,7 +939,8 @@ void ggml_metal_graph_compute(
        } break;
    case GGML_OP_NORM:
        {
-            const float eps = 1e-5f;
+            float eps;
+            memcpy(&eps, dst->op_params, sizeof(float));

            const int nth = 256;

@@ -990,7 +992,9 @@ void ggml_metal_graph_compute(
            [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
            [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
            [encoder setBytes:&m0 length:sizeof( float) atIndex:18];

            const int nth = 32;

            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
        } break;
    case GGML_OP_ROPE:
@@ -87,7 +87,12 @@ kernel void kernel_gelu(
    device float * dst,
    uint tpig[[thread_position_in_grid]]) {
    float x = src0[tpig];
-    dst[tpig] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+
+    // BEWARE !!!
+    // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs!
+    // This was observed with Falcon 7B and 40B models
+    //
+    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

kernel void kernel_soft_max(
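Editor's note: `kernel_gelu` evaluates the usual tanh approximation of GELU; the change only swaps `tanh` for Metal's `precise::tanh`, it does not alter the formula. With `SQRT_2_OVER_PI` and `GELU_COEF_A` as defined earlier in the shader source (the exact constant values are not shown in this hunk), the expression corresponds to:

$$
\mathrm{GELU}(x) \approx \tfrac{1}{2}\,x\,\Bigl(1 + \tanh\!\bigl(\sqrt{2/\pi}\;x\,(1 + a\,x^{2})\bigr)\Bigr), \qquad a = \texttt{GELU\_COEF\_A}.
$$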
@@ -571,7 +576,25 @@ kernel void kernel_rope(
            dst_data[1] = x0*sin_theta + x1*cos_theta;
        }
    } else {
-        // TODO: implement
+        for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                const float cos_theta = cos(theta);
+                const float sin_theta = sin(theta);
+
+                theta *= theta_scale;
+
+                const int64_t i0 = ib*n_dims + ic/2;
+
+                device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                const float x0 = src[0];
+                const float x1 = src[n_dims/2];
+
+                dst_data[0] = x0*cos_theta - x1*sin_theta;
+                dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+            }
+        }
    }
}
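Editor's note: the new `else` branch fills in NeoX-style RoPE for the Metal kernel, mirroring the CPU path in `ggml.c` below. Instead of rotating adjacent elements, it pairs position `i0` with `i0 + n_dims/2` and applies a plain 2-D rotation, advancing `theta` by `theta_scale` between pairs. For each pair $(x_0, x_1)$:

$$
y_0 = x_0\cos\theta - x_1\sin\theta, \qquad y_1 = x_0\sin\theta + x_1\cos\theta .
$$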
24
ggml.c
@@ -5555,10 +5555,6 @@ struct ggml_tensor * ggml_repeat(
        is_node = true;
    }

-    if (ggml_are_same_shape(a, b) && !is_node) {
-        return a;
-    }
-
    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);

    result->op = GGML_OP_REPEAT;
@@ -5789,6 +5785,7 @@ struct ggml_tensor * ggml_silu_back(
static struct ggml_tensor * ggml_norm_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
+        float eps,
        bool inplace) {
    bool is_node = false;

@@ -5799,7 +5796,7 @@ static struct ggml_tensor * ggml_norm_impl(

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-    // TODO: maybe store epsilon here?
+    ggml_set_op_params(result, &eps, sizeof(eps));

    result->op = GGML_OP_NORM;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5810,14 +5807,16 @@ static struct ggml_tensor * ggml_norm_impl(

struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_norm_impl(ctx, a, false);
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_norm_impl(ctx, a, eps, false);
}

struct ggml_tensor * ggml_norm_inplace(
        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_norm_impl(ctx, a, true);
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_norm_impl(ctx, a, eps, true);
}

// ggml_rms_norm
@@ -10619,7 +10618,8 @@ static void ggml_compute_forward_norm_f32(

    GGML_TENSOR_UNARY_OP_LOCALS;

-    const float eps = 1e-5f; // TODO: make this a parameter
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));

    // TODO: optimize
    for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -12537,7 +12537,7 @@ static void ggml_compute_forward_rope_f32(
            dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
        }
    } else {
-        // TODO: this is probably wrong, but I can't figure it out ..
+        // TODO: this might be wrong for ne0 != n_dims - need double check
        // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
        for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
            for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12666,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
            dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
        }
    } else {
-        // TODO: this is probably wrong, but I can't figure it out ..
+        // TODO: this might be wrong for ne0 != n_dims - need double check
        // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
        for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
            for (int64_t ic = 0; ic < n_dims; ic += 2) {
7
ggml.h
@@ -909,14 +909,15 @@ extern "C" {
            struct ggml_tensor * b);

    // normalize along rows
-    // TODO: eps is hardcoded to 1e-5 for now
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
-            struct ggml_tensor * a);
+            struct ggml_tensor * a,
+            float eps);

    GGML_API struct ggml_tensor * ggml_norm_inplace(
            struct ggml_context * ctx,
-            struct ggml_tensor * a);
+            struct ggml_tensor * a,
+            float eps);

    GGML_API struct ggml_tensor * ggml_rms_norm(
            struct ggml_context * ctx,
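Editor's note: with this API change, callers of `ggml_norm`/`ggml_norm_inplace` pass the epsilon explicitly; it is stored in the tensor's `op_params` and read back by the compute kernels (see the `ggml.c` and `ggml-metal.m` hunks above). A hedged sketch of the caller-side update, where the wrapper name and the 1e-5 value are purely illustrative and not taken from this commit:

```c
#include "ggml.h"

// Illustrative helper only: layer norm with an explicit epsilon, using the
// updated ggml_norm() signature. Before this commit the call was
// ggml_norm(ctx, inp) and eps was fixed at 1e-5f inside ggml.
static struct ggml_tensor * apply_norm(struct ggml_context * ctx,
                                       struct ggml_tensor  * inp,
                                       float                 eps /* e.g. 1e-5f */) {
    return ggml_norm(ctx, inp, eps);
}
```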
31
gguf.py
Normal file → Executable file
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
import shutil
import sys
import struct
@@ -26,14 +27,15 @@ KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
+KEY_GENERAL_FILE_TYPE = "general.file_type"

# LLM
-KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
-KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
-KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
-KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
-KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
-KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
+KEY_CONTEXT_LENGTH = "{arch}.context_length"
+KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
+KEY_BLOCK_COUNT = "{arch}.block_count"
+KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
+KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"

# attention
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
@@ -581,7 +583,7 @@ class GGUFWriter:
        self.add_string(KEY_GENERAL_AUTHOR, author)

    def add_tensor_data_layout(self, layout: str):
-        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+        self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_url(self, url: str):
        self.add_string(KEY_GENERAL_URL, url)
@@ -595,6 +597,9 @@ class GGUFWriter:
    def add_source_hf_repo(self, repo: str):
        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)

+    def add_file_type(self, ftype: int):
+        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)
+
    def add_name(self, name: str):
        self.add_string(KEY_GENERAL_NAME, name)

@@ -608,27 +613,27 @@ class GGUFWriter:

    def add_context_length(self, length: int):
        self.add_uint32(
-            KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
+            KEY_CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int):
        self.add_uint32(
-            KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
+            KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int):
        self.add_uint32(
-            KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
+            KEY_BLOCK_COUNT.format(arch=self.arch), length)

    def add_feed_forward_length(self, length: int):
        self.add_uint32(
-            KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+            KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_parallel_residual(self, use: bool):
        self.add_bool(
-            KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+            KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

    def add_tensor_data_layout(self, layout: str):
        self.add_string(
-            KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+            KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_head_count(self, count: int):
        self.add_uint32(
91
grammars/README.md
Normal file
@@ -0,0 +1,91 @@
# GBNF Guide

GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.

## Background

[Backus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.

## Basics

In GBNF, we define *production rules* that specify how a *non-terminal* (rule name) can be replaced with sequences of *terminals* (characters, specifically Unicode [code points](https://en.wikipedia.org/wiki/Code_point)) and other non-terminals. The basic format of a production rule is `nonterminal ::= sequence...`.

## Example

Before going deeper, let's look at some of the features demonstrated in `grammars/chess.gbnf`, a small chess notation grammar:
```
# `root` specifies the pattern for the overall output
root ::= (
    # it must start with the characters "1. " followed by a sequence
    # of characters that match the `move` rule, followed by a space, followed
    # by another move, and then a newline
    "1. " move " " move "\n"

    # it's followed by one or more subsequent moves, numbered with one or two digits
    ([1-9] [0-9]? ". " move " " move "\n")+
)

# `move` is an abstract representation, which can be a pawn, nonpawn, or castle.
# The `[+#]?` denotes the possibility of checking or mate signs after moves
move ::= (pawn | nonpawn | castle) [+#]?

pawn ::= ...
nonpawn ::= ...
castle ::= ...
```

## Non-Terminals and Terminals

Non-terminal symbols (rule names) stand for a pattern of terminals and other non-terminals. They are required to be a dashed lowercase word, like `move`, `castle`, or `check-mate`.

Terminals are actual characters ([code points](https://en.wikipedia.org/wiki/Code_point)). They can be specified as a sequence like `"1"` or `"O-O"` or as ranges like `[1-9]` or `[NBKQR]`.

## Characters and character ranges

Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example `hiragana ::= [ぁ-ゟ]`, or with escapes: 8-bit (`\xXX`), 16-bit (`\uXXXX`) or 32-bit (`\UXXXXXXXX`).

Character ranges can be negated with `^`:
```
single-line ::= [^\n]+ "\n"
```

## Sequences and Alternatives

The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.

Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.

Parentheses `()` can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optional symbols (below) to a sequence.

## Repetition and Optional Symbols

- `*` after a symbol or sequence means that it can be repeated zero or more times.
- `+` denotes that the symbol or sequence should appear one or more times.
- `?` makes the preceding symbol or sequence optional.

## Comments and newlines

Comments can be specified with `#`:
```
# defines optional whitespace
ws ::= [ \t\n]+
```

Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker `|` will continue the current rule, even outside of parentheses.

## The root rule

In a full grammar, the `root` rule always defines the starting point of the grammar. In other words, it specifies what the entire output must match.

```
# a grammar for lists
root ::= ("- " item)+
item ::= [^\n]+ "\n"
```

## Next steps

This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with:
```
./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
```
164
k_quants.c
@@ -77,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
        }
        return 1/iscale;
    }
+    bool return_early = false;
+    if (rmse_type < 0) {
+        rmse_type = -rmse_type;
+        return_early = true;
+    }
    int weight_type = rmse_type%2;
    float sumlx = 0;
    float suml2 = 0;
@@ -89,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
        suml2 += w*l*l;
    }
    float scale = sumlx/suml2;
+    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
    float best = scale * sumlx;
-    for (int itry = 0; itry < 3; ++itry) {
-        iscale = 1/scale;
-        float slx = 0;
-        float sl2 = 0;
-        bool changed = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            l = MAX(-nmax, MIN(nmax-1, l));
-            if (l + nmax != L[i]) { changed = true; }
-            float w = weight_type == 1 ? x[i] * x[i] : 1.f;
-            slx += w*x[i]*l;
-            sl2 += w*l*l;
-        }
-        if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; }
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
-        }
-        sumlx = slx; suml2 = sl2;
-        scale = sumlx/suml2;
-        best = scale * sumlx;
-    }
-    for (int itry = 0; itry < 5; ++itry) {
-        int n_changed = 0;
-        for (int i = 0; i < n; ++i) {
-            float w = weight_type == 1 ? x[i]*x[i] : 1;
-            int l = L[i] - nmax;
-            float slx = sumlx - w*x[i]*l;
-            if (slx > 0) {
-                float sl2 = suml2 - w*l*l;
-                int new_l = nearest_int(x[i] * sl2 / slx);
-                new_l = MAX(-nmax, MIN(nmax-1, new_l));
-                if (new_l != l) {
-                    slx += w*x[i]*new_l;
-                    sl2 += w*new_l*new_l;
-                    if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
-                        L[i] = nmax + new_l; sumlx = slx; suml2 = sl2;
-                        scale = sumlx / suml2; best = scale * sumlx;
-                        ++n_changed;
-                    }
-                }
-            }
-        }
-        if (!n_changed) { break; }
-    }
-    if (rmse_type < 3) {
-        return scale;
-    }
-    for (int is = -4; is <= 4; ++is) {
+    for (int is = -9; is <= 9; ++is) {
        if (is == 0) {
            continue;
        }
@@ -221,12 +179,17 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
    return 1/iscale;
}

-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) {
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+        int ntry, float alpha) {
    float min = x[0];
    float max = x[0];
+    float sum_x = 0;
+    float sum_x2 = 0;
    for (int i = 1; i < n; ++i) {
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
+        sum_x += x[i];
+        sum_x2 += x[i]*x[i];
    }
    if (max == min) {
        for (int i = 0; i < n; ++i) L[i] = 0;
@@ -254,7 +217,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
        for (int i = 0; i < n; ++i) {
            sum += x[i] - scale*L[i];
        }
-        min = sum/n;
+        min = alpha*min + (1 - alpha)*sum/n;
        if (min > 0) min = 0;
        iscale = 1/scale;
        if (!did_change) break;
@@ -263,6 +226,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
    return scale;
}

+static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights[0];
+    float sum_x = sum_w * x[0];
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) min = 0;
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff * diff;
+        float w = weights[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights[i];
+            sum_l += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff * diff;
+                float w = weights[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
#if QK_K == 256
static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
    if (j < 4) {
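Editor's note: for each candidate step, `make_qkx2_quants` above fixes an assignment $l_i$ (`Laux`) and then solves a weighted least-squares fit of $x_i \approx s\,l_i + m$ with weights $w_i$; `this_scale` and `this_min` are exactly the closed-form solution of that regression:

$$
s = \frac{\left(\sum_i w_i\right)\sum_i w_i l_i x_i - \left(\sum_i w_i x_i\right)\sum_i w_i l_i}{D},
\qquad
m = \frac{\left(\sum_i w_i l_i^2\right)\sum_i w_i x_i - \left(\sum_i w_i l_i\right)\sum_i w_i l_i x_i}{D},
$$

$$
D = \left(\sum_i w_i\right)\sum_i w_i l_i^2 - \left(\sum_i w_i l_i\right)^{2}.
$$

When the fitted $m$ comes out positive it is clamped to zero and the scale is refit as $s = \sum_i w_i l_i x_i \,/\, \sum_i w_i l_i^2$, which is what the `this_min > 0` branch does; a candidate is kept only if it lowers the weighted (squared or absolute) error accumulated in `mad`.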
@@ -281,6 +320,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
    const int nb = k / QK_K;

    uint8_t L[QK_K];
+    uint8_t Laux[16];
+    float weights[16];
    float mins[QK_K/16];
    float scales[QK_K/16];

@@ -291,7 +332,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5);
+            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
+            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
@@ -637,6 +679,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
    const int nb = k / QK_K;

    uint8_t L[QK_K];
+    uint8_t Laux[32];
+    float weights[32];
    float mins[QK_K/32];
    float scales[QK_K/32];

@@ -645,7 +689,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/32; ++j) {
-            scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5);
+            //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
@@ -798,6 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
    uint8_t L[QK_K];
    float mins[QK_K/32];
    float scales[QK_K/32];
+    float weights[32];
+    uint8_t Laux[32];
#else
    int8_t L[QK_K];
    float scales[QK_K/16];
@@ -810,7 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/32; ++j) {
-            scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5);
+            //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
17
llama.h
@@ -103,6 +103,8 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };

    typedef struct llama_token_data {
@@ -245,6 +247,8 @@ extern "C" {
    LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
    LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+
    LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
    LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
@@ -366,13 +370,6 @@ extern "C" {
            int n_max_tokens,
            bool add_bos);

-    LLAMA_API int llama_tokenize_bpe(
-            struct llama_context * ctx,
-            const char * text,
-            llama_token * tokens,
-            int n_max_tokens,
-            bool add_bos);
-
    LLAMA_API int llama_tokenize_with_model(
            const struct llama_model * model,
            const char * text,
@@ -388,12 +385,6 @@ extern "C" {
            char * buf,
            int length);

-    LLAMA_API int llama_token_to_str_bpe(
-            const struct llama_context * ctx,
-            llama_token token,
-            char * buf,
-            int length);
-
    LLAMA_API int llama_token_to_str_with_model(
            const struct llama_model * model,
            llama_token token,
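Editor's note: the net effect of the header changes is that the separate `_bpe` tokenization entry points are gone; callers use the regular `llama_tokenize`/`llama_token_to_str` together with the new `llama_vocab_type()` query when they need to branch on the vocabulary kind, as the updated tokenizer test does further below. A minimal hedged sketch, assuming an already-initialized `ctx`:

```c
#include "llama.h"

// Illustrative only: gate BPE-specific logic on the new vocab-type query
// instead of calling the removed llama_tokenize_bpe()/llama_token_to_str_bpe().
static int vocab_is_bpe(const struct llama_context * ctx) {
    return llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE;
}
```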
0
scripts/get-wikitext-2.sh
Normal file → Executable file

@@ -28,7 +28,8 @@ llama_build_and_test_executable(test-sampling.cpp)
llama_build_executable(test-tokenizer-0.cpp)
llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_build_executable(test-tokenizer-1.cpp)
-llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
+#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp)
@@ -17,6 +17,8 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector<lla
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
    static std::map<std::string, std::vector<llama_token>> _k_tests = {
        { " ", {1, 259, }, },
+        { " ", { 1, 1678, }, },
+        { " ", { 1, 268, }, },
        { "\t", { 1, 29871, 12, }, },
        { "\n", { 1, 29871, 13, }, },
        { "\t\n", { 1, 29871, 12, 13, }, },
@@ -38,6 +40,12 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
            243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
            313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
            313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
+        { "Hello", { 1, 15043 }, },
+        { " Hello", { 1, 29871, 15043 }, },
+        { " Hello", { 1, 259, 15043 }, },
+        { " Hello", { 1, 1678, 15043 }, },
+        { " Hello", { 1, 268, 15043 }, },
+        { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, },
    };

    return _k_tests;
@@ -106,7 +114,8 @@ int main(int argc, char **argv) {

        if (!correct) {
            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());
+            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d, ", t);
@@ -11,18 +11,11 @@
#include <locale>

static std::string escape_whitespace(const std::string& text) {
-    std::string result;
-    bool escaping = false;
-    result += "\xe2\x96\x81";
+    std::string result = "\xe2\x96\x81";
    for (size_t offs = 0; offs < text.length(); ++offs) {
        if (text[offs] == ' ') {
-            if (!escaping) {
            result += "\xe2\x96\x81";
-                escaping = true;
-            }
-        }
-        else {
-            escaping = false;
+        } else {
            result += text[offs];
        }
    }
@@ -74,11 +67,13 @@ int main(int argc, char **argv) {
        }
    }

+    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
+
    const int n_vocab = llama_n_vocab(ctx);

    for (int i = 0; i < n_vocab; ++i) {
-        std::string forward = llama_token_to_str_bpe(ctx, i);
-        std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
+        std::string forward = llama_token_to_str(ctx, i);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
        if (tokens.size() == 1) {
            if (i != tokens[0]) {
                std::string backward = llama_token_to_str(ctx, tokens[0]);
@@ -86,16 +81,6 @@ int main(int argc, char **argv) {
                    __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
                return 2;
            }
-        } else {
-            llama_token_type type = llama_token_get_type(ctx, i);
-            if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
-                fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
-                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
-            } else {
-                fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
-                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
-                return 2;
-            }
        }
    }