Compare commits
205 commits
master
...
gguf-write
Author | SHA1 | Date | |
---|---|---|---|
|
6a9e6375b5 | ||
|
307e09cd85 | ||
|
e426b3cfc8 | ||
|
5484737d58 | ||
|
57eaadb853 | ||
|
b3cc182990 | ||
|
acaa98234a | ||
|
78e1e57862 | ||
|
fb11dd3f92 | ||
|
e72c8c2124 | ||
|
4dbce7d009 | ||
|
1d93d04ce2 | ||
|
899f9a5350 | ||
|
93f285bdf1 | ||
|
f71704177f | ||
|
81a2c2a6f4 | ||
|
9f02694c91 | ||
|
dd9e2fc988 | ||
|
c3b739374e | ||
|
22c61c5b45 | ||
|
6d66ef96eb | ||
|
11bf4366c2 | ||
|
2f8fc92d86 | ||
|
8ace03ad3d | ||
|
d646c4efce | ||
|
dd016cc246 | ||
|
2ddd9681d6 | ||
|
e0429d38e4 | ||
|
5f97a48fc1 | ||
|
dce07c3121 | ||
|
d6fd53afd6 | ||
|
5a0a2c5685 | ||
|
f31e9230ad | ||
|
42f8fe1927 | ||
|
5ec18934ad | ||
|
c8ee87f141 | ||
|
88b5769487 | ||
|
758ff1bbb5 | ||
|
ea5615a03a | ||
|
4a1741aa2d | ||
|
2ae0e985b3 | ||
|
66756c82af | ||
|
b6056c3db8 | ||
|
2dd5d2c92c | ||
|
ca4758290c | ||
|
ab2cbd03ca | ||
|
cedb4870c6 | ||
|
5d518d421f | ||
|
7ec125b1dc | ||
|
6c63550f63 | ||
|
7494c78428 | ||
|
afc4ca2889 | ||
|
ec1b100720 | ||
|
8af3a99ff1 | ||
|
6f14854880 | ||
|
f00780b2ee | ||
|
6f64b6c0f8 | ||
|
62490f1380 | ||
|
0c19ae70d5 | ||
|
5c5a95ba2d | ||
|
a7d226f871 | ||
|
d753dfbcc8 | ||
|
806a15749d | ||
|
51939d7d1b | ||
|
5d22a9db13 | ||
|
56a1f32072 | ||
|
196b50fee7 | ||
|
24f48833ab | ||
|
6beebf3fd9 | ||
|
2827b840e4 | ||
|
bf2dad3100 | ||
|
1d60468eee | ||
|
91d4bfd536 | ||
|
17800cd80f | ||
|
e3d1f07eb1 | ||
|
9bf5a7efcb | ||
|
c7bd8c147c | ||
|
e91a2224e4 | ||
|
489616e126 | ||
|
d2ce9cfe8d | ||
|
8b5f0c5067 | ||
|
5e58ffa1ed | ||
|
e606ffeaee | ||
|
f8218477b3 | ||
|
4cef57c81a | ||
|
8f09157ec9 | ||
|
5d81a715d4 | ||
|
60d540831b | ||
|
202eab04d3 | ||
|
1fc3d30b71 | ||
|
fa7c39540c | ||
|
b2571af255 | ||
|
c4f02b4f74 | ||
|
0e1a3c7e7d | ||
|
4fa017a1f9 | ||
|
186c496fdf | ||
|
2f52008b20 | ||
|
e76c59d524 | ||
|
2a5ac7af44 | ||
|
e732423280 | ||
|
f44bbd3d88 | ||
|
7009cf581c | ||
|
61919c1a8f | ||
|
d09fd10713 | ||
|
781b9ec3f5 | ||
|
28abfc90fa | ||
|
e3a4960953 | ||
|
eb8ca6996f | ||
|
b2440f1943 | ||
|
a356b0e228 | ||
|
e7d346c37c | ||
|
f316b94c7c | ||
|
cfb8e35b73 | ||
|
42cc04d11d | ||
|
22de6c5c4c | ||
|
4c0f64e302 | ||
|
4f865181aa | ||
|
1c4d8bf981 | ||
|
0246d0dd6f | ||
|
7d5f4522dd | ||
|
f4d137d98c | ||
|
ece4fc185e | ||
|
65559a23c8 | ||
|
8083ae347a | ||
|
1da82c551f | ||
|
4357e692ac | ||
|
db5618ad99 | ||
|
278ada9572 | ||
|
fb0b243705 | ||
|
5d98989cf6 | ||
|
e6f19ba240 | ||
|
2922280a1a | ||
|
6691aa8797 | ||
|
23abbe8e00 | ||
|
c5ba5efda2 | ||
|
e1e9b28547 | ||
|
c3a65c4bbe | ||
|
cf365fbc20 | ||
|
1b4f9c8eb9 | ||
|
49380a23a3 | ||
|
ff1cb02397 | ||
|
36a36c32a3 | ||
|
c77fabb1f9 | ||
|
e7a741695c | ||
|
da4900e835 | ||
|
f3de876a12 | ||
|
bb42aefaeb | ||
|
b26f5b2e43 | ||
|
7aa0a0e7f7 | ||
|
6b3a7b9f4f | ||
|
4f5b6224be | ||
|
2a0914673c | ||
|
068a8e0fbe | ||
|
30c4ea47e6 | ||
|
2fabc176ce | ||
|
f175b05872 | ||
|
e9192b0135 | ||
|
4ed98bf1ab | ||
|
b19c11750b | ||
|
b4676ee447 | ||
|
ccd81a751b | ||
|
0790c121aa | ||
|
87c34e4dd4 | ||
|
32e037ffbe | ||
|
06c3e4a1a7 | ||
|
9577821487 | ||
|
2c22e3bcdb | ||
|
34469b9ea7 | ||
|
0f5e57f01d | ||
|
8ad7cd49fb | ||
|
0317c41d98 | ||
|
cc3dd7f042 | ||
|
8a76dd8a85 | ||
|
c861e234f4 | ||
|
0c219fb5b5 | ||
|
93f7f7aef7 | ||
|
aa99562d70 | ||
|
ea5f9ad2ca | ||
|
999431c4b6 | ||
|
d54f53ca51 | ||
|
06f423a8e1 | ||
|
08dc8fd884 | ||
|
9475cdb7a3 | ||
|
1495735aac | ||
|
3492f848d7 | ||
|
11ef380c2a | ||
|
d2bb3ac10b | ||
|
68f53485e4 | ||
|
158be8f7f4 | ||
|
d2b6ca13ad | ||
|
d89533dff6 | ||
|
c85d3178b3 | ||
|
d8491fc7e3 | ||
|
5628ec7163 | ||
|
e46870f5af | ||
|
d313c0fa33 | ||
|
cb871fa022 | ||
|
860c9c63ce | ||
|
78b226a959 | ||
|
d91b985d2d | ||
|
8d6acfec12 | ||
|
6873148771 | ||
|
7e82d25f40 | ||
|
bae6b125f6 | ||
|
4d698495ea |
40 changed files with 7626 additions and 2672 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -1,6 +1,7 @@
|
|||
*.o
|
||||
*.a
|
||||
*.so
|
||||
*.gguf
|
||||
*.bin
|
||||
.DS_Store
|
||||
.build/
|
||||
|
@ -47,6 +48,8 @@ models-mnt
|
|||
/server
|
||||
/Pipfile
|
||||
/embd-input-test
|
||||
/gguf
|
||||
/gguf-llama-simple
|
||||
/libllama.so
|
||||
build-info.h
|
||||
arm_neon.h
|
||||
|
@ -64,7 +67,6 @@ perf-*.txt
|
|||
|
||||
examples/jeopardy/results.txt
|
||||
|
||||
|
||||
pyproject.toml
|
||||
poetry.lock
|
||||
poetry.toml
|
||||
|
|
|
@ -527,7 +527,6 @@ endif()
|
|||
add_library(llama
|
||||
llama.cpp
|
||||
llama.h
|
||||
llama-util.h
|
||||
)
|
||||
|
||||
target_include_directories(llama PUBLIC .)
|
||||
|
|
9
Makefile
9
Makefile
|
@ -1,5 +1,5 @@
|
|||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
|
||||
|
@ -329,7 +329,7 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
|||
|
||||
OBJS += ggml-alloc.o
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
|
@ -345,7 +345,7 @@ libllama.so: llama.o ggml.o $(OBJS)
|
|||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
clean:
|
||||
rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test build-info.h $(TEST_TARGETS)
|
||||
rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf build-info.h $(TEST_TARGETS)
|
||||
|
||||
#
|
||||
# Examples
|
||||
|
@ -385,6 +385,9 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
|
|||
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
|
||||
|
||||
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
|
|
22
README.md
22
README.md
|
@ -284,7 +284,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
|
|||
Any value larger than 0 will offload the computation to the GPU. For example:
|
||||
|
||||
```bash
|
||||
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
|
||||
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
|
||||
```
|
||||
|
||||
### MPI Build
|
||||
|
@ -323,7 +323,7 @@ The above will distribute the computation across 2 processes on the first host a
|
|||
Finally, you're ready to run a computation using `mpirun`:
|
||||
|
||||
```bash
|
||||
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
|
||||
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
|
||||
```
|
||||
|
||||
### BLAS Build
|
||||
|
@ -506,10 +506,10 @@ python3 convert.py models/7B/
|
|||
python convert.py models/7B/ --vocabtype bpe
|
||||
|
||||
# quantize the model to 4-bits (using q4_0 method)
|
||||
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
|
||||
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
|
||||
|
||||
# run the inference
|
||||
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
|
||||
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
|
||||
```
|
||||
|
||||
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||
|
@ -565,7 +565,7 @@ Here is an example of a few-shot interaction, invoked with the command
|
|||
./examples/chat-13B.sh
|
||||
|
||||
# custom arguments using a 13B model
|
||||
./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
||||
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
||||
```
|
||||
|
||||
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
|
||||
|
@ -628,6 +628,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It
|
|||
|
||||
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
|
||||
|
||||
*Note: these instructions are likely obsoleted by the GGUF update*
|
||||
|
||||
- Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
|
||||
- Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
|
||||
- Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
|
||||
|
@ -703,7 +705,7 @@ If your issue is with model generation quality, then please at least scan the fo
|
|||
#### How to run
|
||||
|
||||
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
|
||||
3. Output:
|
||||
```
|
||||
perplexity : calculating perplexity over 655 chunks
|
||||
|
@ -802,13 +804,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
|
|||
On completion, you are ready to play!
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with a light image:
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
### Docker With CUDA
|
||||
|
@ -839,8 +841,8 @@ The resulting images, are essentially the same as the non-CUDA images:
|
|||
After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
|
||||
|
||||
```bash
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
```
|
||||
|
||||
### Contributing
|
||||
|
|
44
ci/run.sh
44
ci/run.sh
|
@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {
|
|||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.bin"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.bin"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.bin"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.bin"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.bin"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.bin"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.bin"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.bin"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.bin"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.bin"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.bin"
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
|
@ -285,17 +285,17 @@ function gg_run_open_llama_7b_v2 {
|
|||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.bin"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.bin"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.bin"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.bin"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.bin"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.bin"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.bin"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.bin"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.bin"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.bin"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.bin"
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
|
|
268
convert-gptneox-hf-to-gguf.py
Normal file
268
convert-gptneox-hf-to-gguf.py
Normal file
|
@ -0,0 +1,268 @@
|
|||
# HF gptneox--> gguf conversion
|
||||
|
||||
import gguf
|
||||
import os
|
||||
import sys
|
||||
import struct
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
cs = [chr(n) for n in cs]
|
||||
return dict(zip(bs, cs))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "GPTNeoXForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
llm_arch = "gptneox"
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, arch=llm_arch)
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
gguf_writer.add_architecture()
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
|
||||
gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[str] = []
|
||||
merges: List[str] = []
|
||||
|
||||
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
print("gguf: get gpt2 tokenizer merges")
|
||||
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
merges = tokenizer_json["model"]["merges"]
|
||||
|
||||
gguf_writer.add_token_merges(merges)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
|
||||
if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
print("gguf: get special token ids")
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
# find special token ids
|
||||
|
||||
if "bos_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
|
||||
if "eos_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
|
||||
if "unk_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = ("pytorch_model.bin",)
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
# we don't need these
|
||||
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
old_dtype = data_dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data.dtype == np.float16:
|
||||
data_dtype = np.float32
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data.dtype == np.float16 and n_dims == 1:
|
||||
data_dtype = np.float32
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data_dtype = np.float16
|
||||
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data_dtype))
|
||||
|
||||
data = data.astype(data_dtype)
|
||||
|
||||
gguf_writer.add_tensor(name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print("")
|
265
convert-llama-7b-pth-to-gguf.py
Normal file
265
convert-llama-7b-pth-to-gguf.py
Normal file
|
@ -0,0 +1,265 @@
|
|||
# 7b pth llama --> gguf conversion, GQA/70b not supported
|
||||
# Only models with a single datafile are supported, like 7B
|
||||
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
|
||||
|
||||
import gguf
|
||||
import os
|
||||
import sys
|
||||
import struct
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
#NDArray = np.ndarray[Any, Any]
|
||||
# compatible with python < 3.9
|
||||
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
|
||||
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("consolidated."):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "LlamaForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
if num_parts > 1:
|
||||
print("gguf: Only models with a single datafile are supported.")
|
||||
|
||||
sys.exit()
|
||||
llm_arch = "llama"
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, arch=llm_arch)
|
||||
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
head_count = hparams["num_attention_heads"]
|
||||
|
||||
if "num_key_value_heads" in hparams:
|
||||
head_count_kv = hparams["num_key_value_heads"]
|
||||
else:
|
||||
head_count_kv = head_count
|
||||
|
||||
if "_name_or_path" in hparams:
|
||||
hf_repo = hparams["_name_or_path"]
|
||||
else:
|
||||
hf_repo = ""
|
||||
|
||||
gguf_writer.add_architecture()
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_source_hf_repo(hf_repo)
|
||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[bytes] = []
|
||||
scores: List[float] = []
|
||||
toktypes: List[int] = []
|
||||
|
||||
if Path(dir_model + "/tokenizer.model").is_file():
|
||||
# vocab type sentencepiece
|
||||
print("gguf: get sentencepiece tokenizer vocab and scores")
|
||||
|
||||
tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1 # defualt to normal token type
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
|
||||
# TODO: How to determinate if a token is user defined?
|
||||
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
|
||||
# if tokenizer.is_user_defined(i): toktype = 4
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
gguf_writer.add_tokenizer_model("llama")
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer = json.load(f)
|
||||
|
||||
if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
print("gguf: get special token ids")
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
# find special token ids
|
||||
|
||||
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]["content"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
|
||||
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]["content"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
|
||||
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]["content"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]["content"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]["content"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
|
||||
|
||||
for part_name in part_names:
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
# we don't need these
|
||||
if name == "rope.freqs":
|
||||
continue
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
old_dtype = data_dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data.dtype == np.float16:
|
||||
data_dtype = np.float32
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data_dtype = np.float32
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data_dtype = np.float16
|
||||
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data_dtype))
|
||||
|
||||
data = data.astype(data_dtype)
|
||||
|
||||
gguf_writer.add_tensor(name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print("")
|
294
convert-llama-hf-to-gguf.py
Normal file
294
convert-llama-hf-to-gguf.py
Normal file
|
@ -0,0 +1,294 @@
|
|||
# HF llama --> gguf conversion
|
||||
|
||||
import gguf
|
||||
import os
|
||||
import sys
|
||||
import struct
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from typing import Any, List, Optional
|
||||
from pathlib import Path
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
#NDArray = np.ndarray[Any, Any]
|
||||
# compatible with python < 3.9
|
||||
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
|
||||
|
||||
# reverse HF permute back to original pth layout
|
||||
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
|
||||
|
||||
|
||||
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
|
||||
if n_kv_head is not None and n_head != n_kv_head:
|
||||
n_head //= n_kv_head
|
||||
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
|
||||
return num_parts
|
||||
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "LlamaForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, arch="llama")
|
||||
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
head_count = hparams["num_attention_heads"]
|
||||
|
||||
if "num_key_value_heads" in hparams:
|
||||
head_count_kv = hparams["num_key_value_heads"]
|
||||
else:
|
||||
head_count_kv = head_count
|
||||
|
||||
if "_name_or_path" in hparams:
|
||||
hf_repo = hparams["_name_or_path"]
|
||||
else:
|
||||
hf_repo = ""
|
||||
|
||||
if "max_sequence_length" in hparams:
|
||||
ctx_length = hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in hparams:
|
||||
ctx_length = hparams["max_position_embeddings"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
gguf_writer.add_architecture()
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_source_hf_repo(hf_repo)
|
||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[bytes] = []
|
||||
scores: List[float] = []
|
||||
toktypes: List[int] = []
|
||||
|
||||
if Path(dir_model + "/tokenizer.model").is_file():
|
||||
# vocab type sentencepiece
|
||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
||||
|
||||
tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1 # defualt to normal token type
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
|
||||
# TODO: How to determinate if a token is user defined?
|
||||
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
|
||||
# if tokenizer.is_user_defined(i): toktype = 4
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
gguf_writer.add_tokenizer_model("llama")
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer = json.load(f)
|
||||
|
||||
if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
print("gguf: get special token ids")
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
# find special token ids
|
||||
|
||||
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]["content"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
|
||||
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]["content"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
|
||||
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]["content"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]["content"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]["content"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = ("pytorch_model.bin",)
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# reverse permute these
|
||||
if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
|
||||
data = reverse_hf_permute(data, head_count, head_count_kv)
|
||||
|
||||
# map tensor names
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
old_dtype = data_dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data.dtype == np.float16:
|
||||
data_dtype = np.float32
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data_dtype = np.float32
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data_dtype = np.float16
|
||||
|
||||
data = data.astype(data_dtype)
|
||||
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print("")
|
989
convert.py
Normal file → Executable file
989
convert.py
Normal file → Executable file
File diff suppressed because it is too large
Load diff
|
@ -3,7 +3,7 @@
|
|||
## Verifying that the model is running on the GPU with cuBLAS
|
||||
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||
```shell
|
||||
./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
|
||||
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||
```
|
||||
|
||||
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
||||
|
@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
|
|||
CPU: 7 physical cores
|
||||
RAM: 32GB
|
||||
|
||||
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
|
||||
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
|
||||
|
||||
Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||
Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||
|
||||
Result:
|
||||
|
||||
|
|
|
@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
} else if (arg == "-gqa" || arg == "--gqa") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_gqa = std::stoi(argv[i]);
|
||||
} else if (arg == "-eps" || arg == "--rms-norm-eps") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rms_norm_eps = std::stof(argv[i]);
|
||||
} else if (arg == "--rope-freq-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -561,8 +549,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
|
||||
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
|
||||
fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
|
||||
fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
|
||||
fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
|
||||
|
@ -650,24 +636,11 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
|
|||
return "The";
|
||||
}
|
||||
|
||||
// TODO: not great allocating this every time
|
||||
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
|
||||
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
|
||||
std::vector<llama_token> res(text.size() + (int) add_bos);
|
||||
const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
|
||||
assert(n >= 0);
|
||||
res.resize(n);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_batch = params.n_batch;
|
||||
lparams.n_gqa = params.n_gqa;
|
||||
lparams.rms_norm_eps = params.rms_norm_eps;
|
||||
lparams.n_gpu_layers = params.n_gpu_layers;
|
||||
lparams.main_gpu = params.main_gpu;
|
||||
lparams.tensor_split = params.tensor_split;
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#define LLAMA_API_CPP // TODO: eliminate me
|
||||
#include "llama.h"
|
||||
|
||||
#include <string>
|
||||
|
@ -22,14 +23,12 @@ struct gpt_params {
|
|||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
|
||||
float rope_freq_base = 10000.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
||||
|
||||
|
@ -53,7 +52,7 @@ struct gpt_params {
|
|||
std::string cfg_negative_prompt; // string to help guidance
|
||||
float cfg_scale = 1.f; // How strong is guidance
|
||||
|
||||
std::string model = "models/7B/ggml-model.bin"; // model path
|
||||
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string prompt = "";
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
|
@ -100,12 +99,6 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
|||
|
||||
std::string gpt_random_prompt(std::mt19937 & rng);
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
|
||||
|
||||
//
|
||||
// Model utils
|
||||
//
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
|
@ -502,7 +503,7 @@ bool is_ggml_file(const char *filename) {
|
|||
return false;
|
||||
}
|
||||
uint32_t magic = file.read_u32();
|
||||
return magic == LLAMA_FILE_MAGIC;
|
||||
return magic == GGUF_MAGIC;
|
||||
}
|
||||
|
||||
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||
|
@ -590,75 +591,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
|
|||
if (file.fp == NULL) {
|
||||
return;
|
||||
}
|
||||
// write_magic
|
||||
file.write_u32(LLAMA_FILE_MAGIC); // magic
|
||||
file.write_u32(LLAMA_FILE_VERSION); // version
|
||||
// write_hparams
|
||||
file.write_u32(model->hparams.n_vocab);
|
||||
file.write_u32(model->hparams.n_embd);
|
||||
file.write_u32(model->hparams.n_mult);
|
||||
file.write_u32(model->hparams.n_head);
|
||||
file.write_u32(model->hparams.n_layer);
|
||||
file.write_u32(model->hparams.n_rot);
|
||||
file.write_u32(LLAMA_FTYPE_ALL_F32);
|
||||
|
||||
// write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
|
||||
uint32_t n_vocab = model->hparams.n_vocab;
|
||||
for (uint32_t i = 0; i < n_vocab; i++) {
|
||||
const auto & token_score = vocab->id_to_token.at(i);
|
||||
file.write_u32((uint32_t) token_score.tok.size());
|
||||
file.write_raw(token_score.tok.data(), token_score.tok.size());
|
||||
file.write_raw(&token_score.score, sizeof(token_score.score));
|
||||
}
|
||||
|
||||
// stuff AK weights into GG weights one by one.
|
||||
// w->token_embedding_table -> model->tok_embeddings
|
||||
// float* -> struct ggml_tensor
|
||||
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
|
||||
|
||||
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
|
||||
//print_row(model->norm, 0);
|
||||
|
||||
// for rms-att-weight
|
||||
int row_length = model->hparams.n_embd;
|
||||
const auto & hparams = model->hparams;
|
||||
//int n_ff = model->hparams.n_embd;
|
||||
int n_ff = get_n_ff(&hparams);
|
||||
|
||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
|
||||
auto & layer = model->layers[i];
|
||||
// 1d
|
||||
stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
|
||||
|
||||
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
|
||||
stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
|
||||
stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||
}
|
||||
// write tensors
|
||||
write_tensor(&file, model->tok_embeddings);
|
||||
write_tensor(&file, model->norm);
|
||||
write_tensor(&file, model->output); // ?
|
||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
||||
auto & layer = model->layers[i];
|
||||
|
||||
write_tensor(&file, layer.attention_norm);
|
||||
write_tensor(&file, layer.wq);
|
||||
write_tensor(&file, layer.wk);
|
||||
write_tensor(&file, layer.wv);
|
||||
write_tensor(&file, layer.wo);
|
||||
write_tensor(&file, layer.ffn_norm);
|
||||
write_tensor(&file, layer.w1);
|
||||
write_tensor(&file, layer.w2);
|
||||
write_tensor(&file, layer.w3);
|
||||
}
|
||||
#pragma message("TODO: implement file saving using gguf")
|
||||
(void) vocab;
|
||||
(void) model;
|
||||
(void) w;
|
||||
// // write_magic
|
||||
// file.write_u32(LLAMA_FILE_MAGIC); // magic
|
||||
// file.write_u32(LLAMA_FILE_VERSION); // version
|
||||
// // write_hparams
|
||||
// file.write_u32(model->hparams.n_vocab);
|
||||
// file.write_u32(model->hparams.n_embd);
|
||||
// file.write_u32(model->hparams.n_mult);
|
||||
// file.write_u32(model->hparams.n_head);
|
||||
// file.write_u32(model->hparams.n_layer);
|
||||
// file.write_u32(model->hparams.n_rot);
|
||||
// file.write_u32(LLAMA_FTYPE_ALL_F32);
|
||||
//
|
||||
// // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
|
||||
// uint32_t n_vocab = model->hparams.n_vocab;
|
||||
// for (uint32_t i = 0; i < n_vocab; i++) {
|
||||
// const auto & token_score = vocab->id_to_token.at(i);
|
||||
// file.write_u32((uint32_t) token_score.tok.size());
|
||||
// file.write_raw(token_score.tok.data(), token_score.tok.size());
|
||||
// file.write_raw(&token_score.score, sizeof(token_score.score));
|
||||
// }
|
||||
//
|
||||
// // stuff AK weights into GG weights one by one.
|
||||
// // w->token_embedding_table -> model->tok_embeddings
|
||||
// // float* -> struct ggml_tensor
|
||||
// stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
// stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
|
||||
//
|
||||
// stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
|
||||
// //print_row(model->norm, 0);
|
||||
//
|
||||
// // for rms-att-weight
|
||||
// int row_length = model->hparams.n_embd;
|
||||
// const auto & hparams = model->hparams;
|
||||
// //int n_ff = model->hparams.n_embd;
|
||||
// int n_ff = get_n_ff(&hparams);
|
||||
//
|
||||
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
|
||||
// auto & layer = model->layers[i];
|
||||
// // 1d
|
||||
// stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
|
||||
// stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
|
||||
//
|
||||
// // from 3d matrix layer x dim x dim to 2d matrix dim x dim
|
||||
// stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
// stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
// stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
// stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
//
|
||||
// stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
// stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
// stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||
// }
|
||||
// // write tensors
|
||||
// write_tensor(&file, model->tok_embeddings);
|
||||
// write_tensor(&file, model->norm);
|
||||
// write_tensor(&file, model->output); // ?
|
||||
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
||||
// auto & layer = model->layers[i];
|
||||
//
|
||||
// write_tensor(&file, layer.attention_norm);
|
||||
// write_tensor(&file, layer.wq);
|
||||
// write_tensor(&file, layer.wk);
|
||||
// write_tensor(&file, layer.wv);
|
||||
// write_tensor(&file, layer.wo);
|
||||
// write_tensor(&file, layer.ffn_norm);
|
||||
// write_tensor(&file, layer.w1);
|
||||
// write_tensor(&file, layer.w2);
|
||||
// write_tensor(&file, layer.w3);
|
||||
// }
|
||||
}
|
||||
|
||||
struct train_params get_default_train_params() {
|
||||
|
|
|
@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
|
|||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
|
246
examples/gguf/gguf.cpp
Normal file
246
examples/gguf/gguf.cpp
Normal file
|
@ -0,0 +1,246 @@
|
|||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cinttypes>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
template<typename T>
|
||||
static std::string to_string(const T & val) {
|
||||
std::stringstream ss;
|
||||
ss << val;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
bool gguf_ex_write(const std::string & fname) {
|
||||
struct gguf_context * ctx = gguf_init_empty();
|
||||
|
||||
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
|
||||
gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
|
||||
gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
|
||||
gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
|
||||
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
|
||||
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
|
||||
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
|
||||
gguf_set_val_bool(ctx, "some.parameter.bool", true);
|
||||
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
|
||||
|
||||
gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
|
||||
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
|
||||
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ 128ull*1024ull*1024ull,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
struct ggml_context * ctx_data = ggml_init(params);
|
||||
|
||||
const int n_tensors = 10;
|
||||
|
||||
// tensor infos
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const std::string name = "tensor_" + to_string(i);
|
||||
|
||||
int64_t ne[GGML_MAX_DIMS] = { 1 };
|
||||
int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
|
||||
|
||||
for (int j = 0; j < n_dims; ++j) {
|
||||
ne[j] = rand() % 10 + 1;
|
||||
}
|
||||
|
||||
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
|
||||
ggml_set_name(cur, name.c_str());
|
||||
|
||||
{
|
||||
float * data = (float *) cur->data;
|
||||
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
||||
data[j] = 100 + i;
|
||||
}
|
||||
}
|
||||
|
||||
gguf_add_tensor(ctx, cur);
|
||||
}
|
||||
|
||||
gguf_write_to_file(ctx, fname.c_str(), false);
|
||||
|
||||
fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
|
||||
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// just read tensor info
|
||||
bool gguf_ex_read_0(const std::string & fname) {
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
||||
|
||||
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
|
||||
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
|
||||
|
||||
// kv
|
||||
{
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
|
||||
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
const char * key = gguf_get_key(ctx, i);
|
||||
|
||||
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
}
|
||||
}
|
||||
|
||||
// find kv string
|
||||
{
|
||||
const char * findkey = "some.parameter.string";
|
||||
|
||||
const int keyidx = gguf_find_key(ctx, findkey);
|
||||
if (keyidx == -1) {
|
||||
fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
|
||||
} else {
|
||||
const char * key_value = gguf_get_val_str(ctx, keyidx);
|
||||
fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
|
||||
}
|
||||
}
|
||||
|
||||
// tensor info
|
||||
{
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name (ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
}
|
||||
}
|
||||
|
||||
gguf_free(ctx);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
bool gguf_ex_read_1(const std::string & fname) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ &ctx_data,
|
||||
};
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
||||
|
||||
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
|
||||
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
|
||||
|
||||
// kv
|
||||
{
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
|
||||
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
const char * key = gguf_get_key(ctx, i);
|
||||
|
||||
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
}
|
||||
}
|
||||
|
||||
// tensor info
|
||||
{
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name (ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
}
|
||||
}
|
||||
|
||||
// data
|
||||
{
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
|
||||
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||
|
||||
fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
|
||||
|
||||
// print first 10 elements
|
||||
const float * data = (const float *) cur->data;
|
||||
|
||||
printf("%s data[:10] : ", name);
|
||||
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
|
||||
printf("%f ", data[j]);
|
||||
}
|
||||
printf("\n\n");
|
||||
|
||||
// check data
|
||||
{
|
||||
const float * data = (const float *) cur->data;
|
||||
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
||||
if (data[j] != 100 + i) {
|
||||
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
|
||||
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3) {
|
||||
fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
const std::string fname(argv[1]);
|
||||
const std::string mode (argv[2]);
|
||||
|
||||
GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
|
||||
|
||||
if (mode == "w") {
|
||||
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
|
||||
} else if (mode == "r") {
|
||||
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
|
||||
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
1014
examples/gptneox-wip/cmpnct_gpt2bpe.hpp
Normal file
1014
examples/gptneox-wip/cmpnct_gpt2bpe.hpp
Normal file
File diff suppressed because one or more lines are too long
1076
examples/gptneox-wip/gptneox-main.cpp
Normal file
1076
examples/gptneox-wip/gptneox-main.cpp
Normal file
File diff suppressed because it is too large
Load diff
|
@ -191,10 +191,6 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
|
||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
} else {
|
||||
|
@ -270,15 +266,12 @@ int main(int argc, char ** argv) {
|
|||
params.interactive = true;
|
||||
}
|
||||
|
||||
// determine newline token
|
||||
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (ctx_guidance) {
|
||||
|
@ -286,14 +279,14 @@ int main(int argc, char ** argv) {
|
|||
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
|
||||
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
|
||||
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
fprintf(stderr, "'\n");
|
||||
}
|
||||
|
@ -311,7 +304,7 @@ int main(int argc, char ** argv) {
|
|||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "%s: interactive mode on.\n", __func__);
|
||||
|
@ -662,7 +655,7 @@ int main(int argc, char ** argv) {
|
|||
// display text
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
printf("%s", llama_token_to_str(ctx, id));
|
||||
printf("%s", llama_token_to_str(ctx, id).c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
@ -782,8 +775,7 @@ int main(int argc, char ** argv) {
|
|||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(
|
||||
parsed_grammar.c_rules());
|
||||
std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(),
|
||||
parsed_grammar.symbol_ids.at("root"));
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
//
|
||||
// - First, export a LLaMA graph:
|
||||
//
|
||||
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
|
||||
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
|
||||
//
|
||||
// - Run this tool to evaluate the exported graph:
|
||||
//
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#include "ggml.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#define LLAMA_API_CPP // TODO: eliminate me
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "llama.h"
|
||||
|
||||
|
@ -24,7 +25,7 @@
|
|||
#endif
|
||||
|
||||
struct quantize_stats_params {
|
||||
std::string model = "models/7B/ggml-model-f16.bin";
|
||||
std::string model = "models/7B/ggml-model-f16.gguf";
|
||||
bool verbose = false;
|
||||
bool per_layer_stats = false;
|
||||
bool print_histogram = false;
|
||||
|
|
|
@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
|
|||
}
|
||||
|
||||
// usage:
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
//
|
||||
void usage(const char * executable) {
|
||||
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
|
||||
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
fprintf(stderr, "\nAllowed quantization types:\n");
|
||||
|
@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
|
|||
if (pos != std::string::npos) {
|
||||
fpath = fname_inp.substr(0, pos + 1);
|
||||
}
|
||||
// export as [inp path]/ggml-model-[ftype].bin
|
||||
fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
|
||||
// export as [inp path]/ggml-model-[ftype].gguf
|
||||
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
|
||||
arg_idx++;
|
||||
}
|
||||
else {
|
||||
|
|
|
@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
|
|||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_gqa = params.n_gqa;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
|
@ -45,9 +44,8 @@ int main(int argc, char ** argv) {
|
|||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
auto tokens = std::vector<llama_token>(params.n_ctx);
|
||||
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
|
||||
|
||||
auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
|
||||
auto n_prompt_tokens = tokens.size();
|
||||
if (n_prompt_tokens < 1) {
|
||||
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
||||
llama_free(ctx);
|
||||
|
@ -92,7 +90,7 @@ int main(int argc, char ** argv) {
|
|||
auto next_token_str = llama_token_to_str(ctx, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
|
||||
printf("%s", next_token_str);
|
||||
printf("%s", next_token_str.c_str());
|
||||
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_free(ctx);
|
||||
|
@ -152,7 +150,7 @@ int main(int argc, char ** argv) {
|
|||
auto next_token_str = llama_token_to_str(ctx2, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
|
||||
printf("%s", next_token_str);
|
||||
printf("%s", next_token_str.c_str());
|
||||
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_free(ctx2);
|
||||
|
|
|
@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
|
|||
Command line options:
|
||||
|
||||
- `--threads N`, `-t N`: Set the number of threads to use during computation.
|
||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
||||
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
||||
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
|
||||
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
|
@ -48,14 +48,12 @@ To get started right away, run the following command, making sure to use the cor
|
|||
### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./server -m models/7B/ggml-model.bin -c 2048
|
||||
./server -m models/7B/ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
### Windows:
|
||||
|
||||
```powershell
|
||||
server.exe -m models\7B\ggml-model.bin -c 2048
|
||||
```
|
||||
|
||||
The above command will start a server that by default listens on `127.0.0.1:8080`.
|
||||
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
|
||||
|
|
|
@ -652,8 +652,6 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
|
||||
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
|
||||
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
|
@ -774,23 +772,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-gqa" || arg == "--gqa")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_gqa = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-eps" || arg == "--rms-norm-eps") {
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rms_norm_eps = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--rope-freq-base")
|
||||
{
|
||||
if (++i >= argc)
|
||||
|
|
|
@ -2,180 +2,129 @@
|
|||
#define _GNU_SOURCE
|
||||
#endif
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
//---------------------------------
|
||||
// Print help :
|
||||
//---------------------------------
|
||||
|
||||
if ( argc == 1 || argv[1][0] == '-' )
|
||||
{
|
||||
printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
//---------------------------------
|
||||
// Load parameters :
|
||||
//---------------------------------
|
||||
|
||||
if ( argc >= 2 )
|
||||
{
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
|
||||
if ( argc >= 3 )
|
||||
{
|
||||
if (argc >= 3) {
|
||||
params.prompt = argv[2];
|
||||
}
|
||||
|
||||
if ( params.prompt.empty() )
|
||||
{
|
||||
if (params.prompt.empty()) {
|
||||
params.prompt = "Hello my name is";
|
||||
}
|
||||
|
||||
//---------------------------------
|
||||
// Init LLM :
|
||||
//---------------------------------
|
||||
// init LLM
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
|
||||
|
||||
if ( model == NULL )
|
||||
{
|
||||
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
//---------------------------------
|
||||
// Tokenize the prompt :
|
||||
//---------------------------------
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
// tokenize the prompt
|
||||
|
||||
std::vector<llama_token> tokens_list;
|
||||
tokens_list = ::llama_tokenize( ctx , params.prompt , true );
|
||||
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const int max_context_size = llama_n_ctx( ctx );
|
||||
const int max_tokens_list_size = max_context_size - 4 ;
|
||||
const int max_context_size = llama_n_ctx(ctx);
|
||||
const int max_tokens_list_size = max_context_size - 4;
|
||||
|
||||
if ( (int)tokens_list.size() > max_tokens_list_size )
|
||||
{
|
||||
fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
|
||||
__func__ , (int)tokens_list.size() , max_tokens_list_size );
|
||||
if ((int) tokens_list.size() > max_tokens_list_size) {
|
||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf( stderr, "\n\n" );
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
// Print the tokens from the prompt :
|
||||
|
||||
for( auto id : tokens_list )
|
||||
{
|
||||
printf( "%s" , llama_token_to_str( ctx , id ) );
|
||||
for (auto id : tokens_list) {
|
||||
fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
fflush(stderr);
|
||||
|
||||
|
||||
//---------------------------------
|
||||
// Main prediction loop :
|
||||
//---------------------------------
|
||||
// main loop
|
||||
|
||||
// The LLM keeps a contextual cache memory of previous token evaluation.
|
||||
// Usually, once this cache is full, it is required to recompute a compressed context based on previous
|
||||
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
|
||||
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
|
||||
|
||||
while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
|
||||
{
|
||||
//---------------------------------
|
||||
// Evaluate the tokens :
|
||||
//---------------------------------
|
||||
const int n_gen = std::min(32, max_context_size);
|
||||
|
||||
if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
|
||||
{
|
||||
fprintf( stderr, "%s : failed to eval\n" , __func__ );
|
||||
while (llama_get_kv_cache_token_count(ctx) < n_gen) {
|
||||
// evaluate the transformer
|
||||
|
||||
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
tokens_list.clear();
|
||||
|
||||
//---------------------------------
|
||||
// Select the best prediction :
|
||||
//---------------------------------
|
||||
// sample the next token
|
||||
|
||||
llama_token new_token_id = 0;
|
||||
|
||||
auto logits = llama_get_logits( ctx );
|
||||
auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve( n_vocab );
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
|
||||
{
|
||||
candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// Select it using the "Greedy sampling" method :
|
||||
new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
|
||||
|
||||
new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
|
||||
|
||||
// is it an end of stream ?
|
||||
if ( new_token_id == llama_token_eos() )
|
||||
{
|
||||
if (new_token_id == llama_token_eos()) {
|
||||
fprintf(stderr, " [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
||||
// Print the new token :
|
||||
printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
|
||||
fflush( stdout );
|
||||
// print the new token :
|
||||
printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// Push this new token for next evaluation :
|
||||
tokens_list.push_back( new_token_id );
|
||||
// push this new token for next evaluation
|
||||
tokens_list.push_back(new_token_id);
|
||||
}
|
||||
|
||||
} // wend of main loop
|
||||
|
||||
llama_free( ctx );
|
||||
llama_free_model( model );
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// EOF
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#include "ggml.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
@ -16,7 +17,7 @@
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
||||
static const float rms_norm_eps = 1e-5f;
|
||||
|
||||
struct random_normal_distribution {
|
||||
std::mt19937 gen;
|
||||
|
@ -1961,7 +1962,7 @@ void print_matrix(struct ggml_tensor * probs) {
|
|||
|
||||
|
||||
void print_token(struct llama_context * ctx, llama_token token) {
|
||||
printf("%s", llama_token_to_str(ctx, token));
|
||||
printf("%s", llama_token_to_str(ctx, token).c_str());
|
||||
}
|
||||
|
||||
void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
|
||||
|
@ -2188,11 +2189,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
|
|||
f.read_raw(buf.data(), f.size);
|
||||
buf[f.size] = '\0';
|
||||
|
||||
out.resize(buf.size());
|
||||
|
||||
int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
|
||||
if (n_tokens >= 0) {
|
||||
out.resize(n_tokens);
|
||||
int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
|
||||
if (n_tokens < 0) {
|
||||
out.resize(-n_tokens);
|
||||
llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
|
||||
}
|
||||
|
||||
bool verify = false;
|
||||
|
@ -2200,17 +2200,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
|
|||
const char * in = buf.data();
|
||||
const char * end = buf.data() + buf.size();
|
||||
for (int i = 0; i < (int) out.size(); ++i) {
|
||||
const char * s = llama_token_to_str(lctx, out[i]);
|
||||
int len = strlen(s);
|
||||
std::string s = llama_token_to_str(lctx, out[i]);
|
||||
int len = s.length();
|
||||
if (in >= end) {
|
||||
printf("%s: unexpected end of original text.\n", __func__);
|
||||
break;
|
||||
}
|
||||
const bool matches = (strncmp(in, s, len) == 0);
|
||||
const bool matches = (strncmp(in, s.c_str(), len) == 0);
|
||||
if (matches) {
|
||||
in += len;
|
||||
} else {
|
||||
printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
|
||||
printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2612,42 +2612,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
|
|||
return;
|
||||
}
|
||||
|
||||
// write_magic
|
||||
file.write_u32(LLAMA_FILE_MAGIC); // magic
|
||||
file.write_u32(LLAMA_FILE_VERSION); // version
|
||||
// write_hparams
|
||||
file.write_u32(model->hparams.n_vocab);
|
||||
file.write_u32(model->hparams.n_embd);
|
||||
file.write_u32(model->hparams.n_mult);
|
||||
file.write_u32(model->hparams.n_head);
|
||||
file.write_u32(model->hparams.n_layer);
|
||||
file.write_u32(model->hparams.n_rot);
|
||||
file.write_u32(LLAMA_FTYPE_ALL_F32);
|
||||
// write_vocab
|
||||
uint32_t n_vocab = model->hparams.n_vocab;
|
||||
for (uint32_t i = 0; i < n_vocab; i++) {
|
||||
const auto & token_score = vocab->id_to_token.at(i);
|
||||
file.write_u32((uint32_t) token_score.tok.size());
|
||||
file.write_raw(token_score.tok.data(), token_score.tok.size());
|
||||
file.write_raw(&token_score.score, sizeof(token_score.score));
|
||||
}
|
||||
// write tensors
|
||||
write_tensor(&file, model->tok_embeddings);
|
||||
write_tensor(&file, model->norm);
|
||||
write_tensor(&file, model->output);
|
||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
||||
auto & layer = model->layers[i];
|
||||
|
||||
write_tensor(&file, layer.attention_norm);
|
||||
write_tensor(&file, layer.wq);
|
||||
write_tensor(&file, layer.wk);
|
||||
write_tensor(&file, layer.wv);
|
||||
write_tensor(&file, layer.wo);
|
||||
write_tensor(&file, layer.ffn_norm);
|
||||
write_tensor(&file, layer.w1);
|
||||
write_tensor(&file, layer.w2);
|
||||
write_tensor(&file, layer.w3);
|
||||
}
|
||||
#pragma message("TODO: implement file saving using gguf")
|
||||
(void) vocab;
|
||||
(void) model;
|
||||
// // write_magic
|
||||
// file.write_u32(LLAMA_FILE_MAGIC); // magic
|
||||
// file.write_u32(LLAMA_FILE_VERSION); // version
|
||||
// // write_hparams
|
||||
// file.write_u32(model->hparams.n_vocab);
|
||||
// file.write_u32(model->hparams.n_embd);
|
||||
// file.write_u32(model->hparams.n_mult);
|
||||
// file.write_u32(model->hparams.n_head);
|
||||
// file.write_u32(model->hparams.n_layer);
|
||||
// file.write_u32(model->hparams.n_rot);
|
||||
// file.write_u32(LLAMA_FTYPE_ALL_F32);
|
||||
// // write_vocab
|
||||
// uint32_t n_vocab = model->hparams.n_vocab;
|
||||
// for (uint32_t i = 0; i < n_vocab; i++) {
|
||||
// const auto & token_score = vocab->id_to_token.at(i);
|
||||
// file.write_u32((uint32_t) token_score.tok.size());
|
||||
// file.write_raw(token_score.tok.data(), token_score.tok.size());
|
||||
// file.write_raw(&token_score.score, sizeof(token_score.score));
|
||||
// }
|
||||
// // write tensors
|
||||
// write_tensor(&file, model->tok_embeddings);
|
||||
// write_tensor(&file, model->norm);
|
||||
// write_tensor(&file, model->output);
|
||||
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
||||
// auto & layer = model->layers[i];
|
||||
//
|
||||
// write_tensor(&file, layer.attention_norm);
|
||||
// write_tensor(&file, layer.wq);
|
||||
// write_tensor(&file, layer.wk);
|
||||
// write_tensor(&file, layer.wv);
|
||||
// write_tensor(&file, layer.wo);
|
||||
// write_tensor(&file, layer.ffn_norm);
|
||||
// write_tensor(&file, layer.w1);
|
||||
// write_tensor(&file, layer.w2);
|
||||
// write_tensor(&file, layer.w3);
|
||||
// }
|
||||
}
|
||||
|
||||
float cosine_decay(const int decay_steps, const float alpha, int step) {
|
||||
|
|
|
@ -38,6 +38,9 @@ struct ggml_metal_context;
|
|||
struct ggml_metal_context * ggml_metal_init(int n_cb);
|
||||
void ggml_metal_free(struct ggml_metal_context * ctx);
|
||||
|
||||
void * ggml_metal_host_malloc(size_t n);
|
||||
void ggml_metal_host_free (void * data);
|
||||
|
||||
// set the number of command buffers to use
|
||||
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
|
||||
|
||||
|
|
15
ggml-metal.m
15
ggml-metal.m
|
@ -237,6 +237,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
free(ctx);
|
||||
}
|
||||
|
||||
void * ggml_metal_host_malloc(size_t n) {
|
||||
void * data = NULL;
|
||||
const int result = posix_memalign((void **) &data, getpagesize(), n);
|
||||
if (result != 0) {
|
||||
fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
void ggml_metal_host_free(void * data) {
|
||||
free(data);
|
||||
}
|
||||
|
||||
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
|
||||
ctx->n_cb = n_cb;
|
||||
}
|
||||
|
|
122
ggml.h
122
ggml.h
|
@ -207,14 +207,18 @@
|
|||
#define GGML_MAX_PARAMS 256
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_SRC 6
|
||||
#define GGML_MAX_NAME 48
|
||||
#define GGML_MAX_NAME 64
|
||||
#define GGML_MAX_OP_PARAMS 32
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
|
||||
#define GGML_EXIT_SUCCESS 0
|
||||
#define GGML_EXIT_ABORTED 1
|
||||
|
||||
#define GGUF_MAGIC 0x47475546 // "GGUF"
|
||||
#define GGUF_VERSION 1
|
||||
|
||||
#define GGUF_DEFAULT_ALIGNMENT 32
|
||||
|
||||
#define GGML_UNUSED(x) (void)(x)
|
||||
|
||||
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
|
||||
|
@ -562,6 +566,7 @@ extern "C" {
|
|||
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
||||
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
||||
|
||||
GGML_API int ggml_blck_size (enum ggml_type type);
|
||||
|
@ -1494,7 +1499,6 @@ extern "C" {
|
|||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||
|
@ -1703,6 +1707,118 @@ extern "C" {
|
|||
|
||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
||||
|
||||
//
|
||||
// gguf
|
||||
//
|
||||
|
||||
enum gguf_type {
|
||||
GGUF_TYPE_UINT8 = 0,
|
||||
GGUF_TYPE_INT8 = 1,
|
||||
GGUF_TYPE_UINT16 = 2,
|
||||
GGUF_TYPE_INT16 = 3,
|
||||
GGUF_TYPE_UINT32 = 4,
|
||||
GGUF_TYPE_INT32 = 5,
|
||||
GGUF_TYPE_FLOAT32 = 6,
|
||||
GGUF_TYPE_BOOL = 7,
|
||||
GGUF_TYPE_STRING = 8,
|
||||
GGUF_TYPE_ARRAY = 9,
|
||||
GGUF_TYPE_COUNT, // marks the end of the enum
|
||||
};
|
||||
|
||||
struct gguf_context;
|
||||
|
||||
struct gguf_init_params {
|
||||
bool no_alloc;
|
||||
|
||||
// if not NULL, create a ggml_context and allocate the tensor data in it
|
||||
struct ggml_context ** ctx;
|
||||
};
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
||||
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
||||
|
||||
GGML_API void gguf_free(struct gguf_context * ctx);
|
||||
|
||||
GGML_API const char * gguf_type_name(enum gguf_type type);
|
||||
|
||||
GGML_API int gguf_get_version (struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
|
||||
GGML_API void * gguf_get_data (struct gguf_context * ctx);
|
||||
|
||||
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
|
||||
|
||||
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
|
||||
|
||||
// results are undefined if the wrong type is used for the key
|
||||
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
|
||||
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
|
||||
GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
||||
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
||||
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
||||
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
||||
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
|
||||
|
||||
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
|
||||
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
|
||||
|
||||
// overrides existing values or adds a new one
|
||||
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
||||
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
|
||||
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
|
||||
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
|
||||
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
||||
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
||||
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
||||
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
||||
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
||||
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
||||
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
|
||||
|
||||
// set or add KV pairs from another context
|
||||
GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
|
||||
|
||||
// manage tensor info
|
||||
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
|
||||
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
|
||||
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
|
||||
|
||||
// writing gguf files can be done in 2 ways:
|
||||
//
|
||||
// - write the entire gguf_context to a binary file in a single pass:
|
||||
//
|
||||
// gguf_write_to_file(ctx, fname);
|
||||
//
|
||||
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
|
||||
//
|
||||
// FILE * f = fopen(fname, "wb");
|
||||
// fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
|
||||
// fwrite(f, ...);
|
||||
// void * data = gguf_meta_get_meta_data(ctx);
|
||||
// fseek(f, 0, SEEK_SET);
|
||||
// fwrite(f, data, gguf_get_meta_size(ctx));
|
||||
// free(data);
|
||||
// fclose(f);
|
||||
//
|
||||
|
||||
// write the entire context to a binary file
|
||||
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
|
||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
|
||||
|
||||
//
|
||||
// system info
|
||||
//
|
||||
|
|
669
gguf.py
Normal file
669
gguf.py
Normal file
|
@ -0,0 +1,669 @@
|
|||
import shutil
|
||||
import sys
|
||||
import struct
|
||||
import tempfile
|
||||
import numpy as np
|
||||
|
||||
from enum import IntEnum, auto
|
||||
from typing import Any, IO, List
|
||||
|
||||
#
|
||||
# constants
|
||||
#
|
||||
|
||||
GGUF_MAGIC = 0x47475546
|
||||
GGUF_VERSION = 1
|
||||
GGUF_DEFAULT_ALIGNMENT = 32
|
||||
|
||||
# general
|
||||
KEY_GENERAL_ARCHITECTURE = "general.architecture"
|
||||
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
|
||||
KEY_GENERAL_ALIGNMENT = "general.alignment"
|
||||
KEY_GENERAL_NAME = "general.name"
|
||||
KEY_GENERAL_AUTHOR = "general.author"
|
||||
KEY_GENERAL_URL = "general.url"
|
||||
KEY_GENERAL_DESCRIPTION = "general.description"
|
||||
KEY_GENERAL_LICENSE = "general.license"
|
||||
KEY_GENERAL_SOURCE_URL = "general.source.url"
|
||||
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
|
||||
|
||||
# LLM
|
||||
KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
|
||||
KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||
KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
|
||||
KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
||||
KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
||||
KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
|
||||
# attention
|
||||
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
|
||||
KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
|
||||
KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
|
||||
KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv"
|
||||
KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
||||
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
||||
|
||||
# RoPE
|
||||
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
KEY_ROPE_SCALE = "{arch}.rope.scale"
|
||||
|
||||
# tokenization
|
||||
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
|
||||
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
|
||||
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
|
||||
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
|
||||
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
|
||||
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
|
||||
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
|
||||
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
|
||||
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
|
||||
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
|
||||
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
|
||||
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
|
||||
|
||||
#
|
||||
# recommended mapping of model tensor names for storage in gguf
|
||||
#
|
||||
|
||||
|
||||
class MODEL_ARCH(IntEnum):
|
||||
LLAMA = auto()
|
||||
FALCON = auto()
|
||||
GPT2 = auto()
|
||||
GPTJ = auto()
|
||||
GPTNEOX = auto()
|
||||
MPT = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
TOKEN_EMBD = auto()
|
||||
POS_EMBD = auto()
|
||||
OUTPUT = auto()
|
||||
OUTPUT_NORM = auto()
|
||||
ROPE_FREQS = auto()
|
||||
ATTN_Q = auto()
|
||||
ATTN_K = auto()
|
||||
ATTN_V = auto()
|
||||
ATTN_QKV = auto()
|
||||
ATTN_OUT = auto()
|
||||
ATTN_NORM = auto()
|
||||
ATTN_NORM_2 = auto()
|
||||
ATTN_ROT_EMBD = auto()
|
||||
FFN_GATE = auto()
|
||||
FFN_DOWN = auto()
|
||||
FFN_UP = auto()
|
||||
FFN_NORM = auto()
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES = {
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.GPT2: "gpt2",
|
||||
MODEL_ARCH.GPTJ: "gptj",
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
}
|
||||
|
||||
MODEL_TENSOR_NAMES = {
|
||||
MODEL_ARCH.LLAMA: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.FALCON: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.GPT2: {
|
||||
# TODO
|
||||
},
|
||||
# TODO
|
||||
}
|
||||
|
||||
# tensors that will not be serialized
|
||||
MODEL_TENSOR_SKIP = {
|
||||
MODEL_ARCH.LLAMA: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
# TODO: the following helper functions should be removed
|
||||
# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
|
||||
# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
|
||||
# REMOVE
|
||||
def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
|
||||
for skip in MODEL_TENSOR_SKIP.get(arch, []):
|
||||
for i in range(n_blocks):
|
||||
if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
|
||||
tensor_map = {}
|
||||
|
||||
# Token embeddings
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
|
||||
|
||||
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
|
||||
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
|
||||
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
|
||||
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
|
||||
tensor_map["tok_embeddings"] = mapped_to # llama-pth
|
||||
|
||||
# Position embeddings
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
|
||||
|
||||
tensor_map["transformer.wpe"] = mapped_to # gpt2
|
||||
|
||||
# Output
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
|
||||
|
||||
tensor_map["embed_out"] = mapped_to # gptneox
|
||||
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
|
||||
tensor_map["output"] = mapped_to # llama-pth
|
||||
|
||||
# Output norm
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
|
||||
|
||||
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
|
||||
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
|
||||
tensor_map["transformer.norm_f"] = mapped_to # mpt
|
||||
tensor_map["model.norm"] = mapped_to # llama-hf
|
||||
tensor_map["norm"] = mapped_to # llama-pth
|
||||
|
||||
# Rope frequencies
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
|
||||
|
||||
tensor_map["rope.freqs"] = mapped_to # llama-pth
|
||||
|
||||
# Attention and feed-forward blocks
|
||||
for i in range(0, n_blocks):
|
||||
# Attention norm
|
||||
# TODO: is there are simpler way to write these 2 lines in Python?
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
|
||||
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
|
||||
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
|
||||
|
||||
# Attention norm 2
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
|
||||
|
||||
# Attention query-key-value
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
|
||||
|
||||
# Attention query
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
|
||||
|
||||
# Attention key
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
|
||||
|
||||
# Attention value
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
|
||||
|
||||
# Attention output
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
|
||||
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
|
||||
|
||||
# Rotary embeddings
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to # llama-pth
|
||||
|
||||
# Feed-forward norm
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
|
||||
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
|
||||
|
||||
# Feed-forward up
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
|
||||
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
|
||||
|
||||
# Feed-forward gate
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
|
||||
|
||||
# Feed-forward down
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
|
||||
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
|
||||
|
||||
return tensor_map
|
||||
|
||||
#
|
||||
# implementation
|
||||
#
|
||||
|
||||
|
||||
class GGMLQuantizationType(IntEnum):
|
||||
F32 = 0
|
||||
F16 = 1
|
||||
|
||||
|
||||
class GGUFValueType(IntEnum):
|
||||
UINT8 = 0
|
||||
INT8 = 1
|
||||
UINT16 = 2
|
||||
INT16 = 3
|
||||
UINT32 = 4
|
||||
INT32 = 5
|
||||
FLOAT32 = 6
|
||||
BOOL = 7
|
||||
STRING = 8
|
||||
ARRAY = 9
|
||||
|
||||
@staticmethod
|
||||
def get_type(val):
|
||||
if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
|
||||
return GGUFValueType.STRING
|
||||
elif isinstance(val, list):
|
||||
return GGUFValueType.ARRAY
|
||||
elif isinstance(val, float):
|
||||
return GGUFValueType.FLOAT32
|
||||
elif isinstance(val, bool):
|
||||
return GGUFValueType.BOOL
|
||||
elif isinstance(val, int):
|
||||
return GGUFValueType.INT32
|
||||
else:
|
||||
print("Unknown type: "+str(type(val)))
|
||||
sys.exit()
|
||||
|
||||
|
||||
class GGUFWriter:
|
||||
def __init__(self, path: str, arch: str):
|
||||
self.fout = open(path, "wb")
|
||||
self.arch = arch
|
||||
self.offset_tensor = 0
|
||||
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
|
||||
self.kv_data = b""
|
||||
self.kv_data_count = 0
|
||||
self.ti_data = b""
|
||||
self.ti_data_count = 0
|
||||
self.add_architecture()
|
||||
|
||||
def write_header_to_file(self):
|
||||
self.fout.write(struct.pack("<I", GGUF_MAGIC))
|
||||
self.fout.write(struct.pack("<I", GGUF_VERSION))
|
||||
self.fout.write(struct.pack("<I", self.ti_data_count))
|
||||
self.fout.write(struct.pack("<I", self.kv_data_count))
|
||||
self.flush()
|
||||
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
|
||||
|
||||
def write_kv_data_to_file(self):
|
||||
self.fout.write(self.kv_data)
|
||||
self.flush()
|
||||
|
||||
def write_ti_data_to_file(self):
|
||||
self.fout.write(self.ti_data)
|
||||
self.flush()
|
||||
|
||||
def add_key(self, key: str):
|
||||
self.add_val(key, GGUFValueType.STRING, add_vtype=False)
|
||||
|
||||
def add_uint8(self, key: str, val: int):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.UINT8)
|
||||
|
||||
def add_int8(self, key: str, val: int):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.INT8)
|
||||
|
||||
def add_uint16(self, key: str, val: int):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.UINT16)
|
||||
|
||||
def add_int16(self, key: str, val: int):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.INT16)
|
||||
|
||||
def add_uint32(self, key: str, val: int):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.UINT32)
|
||||
|
||||
def add_int32(self, key: str, val: int):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.INT32)
|
||||
|
||||
def add_float32(self, key: str, val: float):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.FLOAT32)
|
||||
|
||||
def add_bool(self, key: str, val: bool):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.BOOL)
|
||||
|
||||
def add_string(self, key: str, val: str):
|
||||
if len(val) == 0:
|
||||
return
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.STRING)
|
||||
|
||||
def add_array(self, key: str, val: list):
|
||||
if not isinstance(val, list):
|
||||
raise ValueError("Value must be a list for array type")
|
||||
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.ARRAY)
|
||||
|
||||
def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True):
|
||||
if vtype is None:
|
||||
vtype = GGUFValueType.get_type(val)
|
||||
|
||||
if add_vtype:
|
||||
self.kv_data += struct.pack("<I", vtype)
|
||||
self.kv_data_count += 1
|
||||
|
||||
if vtype == GGUFValueType.UINT8:
|
||||
self.kv_data += struct.pack("<B", val)
|
||||
elif vtype == GGUFValueType.INT8:
|
||||
self.kv_data += struct.pack("<b", val)
|
||||
elif vtype == GGUFValueType.UINT16:
|
||||
self.kv_data += struct.pack("<H", val)
|
||||
elif vtype == GGUFValueType.INT16:
|
||||
self.kv_data += struct.pack("<h", val)
|
||||
elif vtype == GGUFValueType.UINT32:
|
||||
self.kv_data += struct.pack("<I", val)
|
||||
elif vtype == GGUFValueType.INT32:
|
||||
self.kv_data += struct.pack("<i", val)
|
||||
elif vtype == GGUFValueType.FLOAT32:
|
||||
self.kv_data += struct.pack("<f", val)
|
||||
elif vtype == GGUFValueType.BOOL:
|
||||
self.kv_data += struct.pack("?", val)
|
||||
elif vtype == GGUFValueType.STRING:
|
||||
encoded_val = val.encode("utf8") if isinstance(val, str) else val
|
||||
self.kv_data += struct.pack("<I", len(encoded_val))
|
||||
self.kv_data += encoded_val
|
||||
elif vtype == GGUFValueType.ARRAY:
|
||||
ltype = set([GGUFValueType.get_type(item) for item in val])
|
||||
assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
|
||||
self.kv_data += struct.pack("<I", list(ltype)[0])
|
||||
self.kv_data += struct.pack("<I", len(val))
|
||||
for item in val:
|
||||
self.add_val(item, add_vtype=False)
|
||||
else:
|
||||
raise ValueError("Invalid GGUF metadata value type")
|
||||
|
||||
@staticmethod
|
||||
def ggml_pad(x: int, n: int) -> int:
|
||||
return ((x + n - 1) // n) * n
|
||||
|
||||
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
|
||||
assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
|
||||
|
||||
encoded_name = name.encode("utf8")
|
||||
self.ti_data += struct.pack("<I", len(encoded_name))
|
||||
self.ti_data += encoded_name
|
||||
n_dims = len(tensor_shape)
|
||||
self.ti_data += struct.pack("<I", n_dims)
|
||||
for i in range(n_dims):
|
||||
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
|
||||
|
||||
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
|
||||
self.ti_data += struct.pack("<I", dtype)
|
||||
self.ti_data += struct.pack("<Q", self.offset_tensor)
|
||||
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
|
||||
self.ti_data_count += 1
|
||||
|
||||
def add_tensor(self, name: str, tensor: np.ndarray):
|
||||
if not hasattr(self, "temp_file"):
|
||||
self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
|
||||
self.temp_file.seek(0)
|
||||
|
||||
self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes)
|
||||
|
||||
tensor.tofile(self.temp_file)
|
||||
|
||||
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
|
||||
if pad != 0:
|
||||
self.temp_file.write(bytes([0] * pad))
|
||||
|
||||
def write_tensor_data(self, tensor: np.ndarray):
|
||||
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
|
||||
if pad != 0:
|
||||
self.fout.write(bytes([0] * pad))
|
||||
|
||||
tensor.tofile(self.fout)
|
||||
|
||||
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
|
||||
if pad != 0:
|
||||
self.fout.write(bytes([0] * pad))
|
||||
|
||||
def write_tensors_to_file(self):
|
||||
self.write_ti_data_to_file()
|
||||
|
||||
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
|
||||
if pad != 0:
|
||||
self.fout.write(bytes([0] * pad))
|
||||
|
||||
self.temp_file.seek(0)
|
||||
|
||||
shutil.copyfileobj(self.temp_file, self.fout)
|
||||
self.flush()
|
||||
self.temp_file.close()
|
||||
|
||||
def flush(self):
|
||||
self.fout.flush()
|
||||
|
||||
def close(self):
|
||||
self.fout.close()
|
||||
|
||||
def add_architecture(self):
|
||||
self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)
|
||||
|
||||
def add_author(self, author: str):
|
||||
self.add_string(KEY_GENERAL_AUTHOR, author)
|
||||
|
||||
def add_tensor_data_layout(self, layout: str):
|
||||
self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
|
||||
|
||||
def add_url(self, url: str):
|
||||
self.add_string(KEY_GENERAL_URL, url)
|
||||
|
||||
def add_description(self, description: str):
|
||||
self.add_string(KEY_GENERAL_DESCRIPTION, description)
|
||||
|
||||
def add_source_url(self, url: str):
|
||||
self.add_string(KEY_GENERAL_SOURCE_URL, url)
|
||||
|
||||
def add_source_hf_repo(self, repo: str):
|
||||
self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
|
||||
|
||||
def add_name(self, name: str):
|
||||
self.add_string(KEY_GENERAL_NAME, name)
|
||||
|
||||
def add_quantization_version(self, quantization_version: GGMLQuantizationType):
|
||||
self.add_uint32(
|
||||
KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
|
||||
|
||||
def add_custom_alignment(self, alignment: int):
|
||||
self.data_alignment = alignment
|
||||
self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
|
||||
|
||||
def add_context_length(self, length: int):
|
||||
self.add_uint32(
|
||||
KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_embedding_length(self, length: int):
|
||||
self.add_uint32(
|
||||
KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_block_count(self, length: int):
|
||||
self.add_uint32(
|
||||
KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
|
||||
|
||||
def add_feed_forward_length(self, length: int):
|
||||
self.add_uint32(
|
||||
KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_parallel_residual(self, use: bool):
|
||||
self.add_bool(
|
||||
KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
|
||||
|
||||
def add_tensor_data_layout(self, layout: str):
|
||||
self.add_string(
|
||||
KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
|
||||
|
||||
def add_head_count(self, count: int):
|
||||
self.add_uint32(
|
||||
KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
|
||||
|
||||
def add_head_count_kv(self, count: int):
|
||||
self.add_uint32(
|
||||
KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)
|
||||
|
||||
def add_max_alibi_bias(self, bias: float):
|
||||
self.add_float32(
|
||||
KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)
|
||||
|
||||
def add_clamp_kqv(self, value: float):
|
||||
self.add_float32(
|
||||
KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)
|
||||
|
||||
def add_layer_norm_eps(self, value: float):
|
||||
self.add_float32(
|
||||
KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)
|
||||
|
||||
def add_layer_norm_rms_eps(self, value: float):
|
||||
self.add_float32(
|
||||
KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)
|
||||
|
||||
def add_rope_dimension_count(self, count: int):
|
||||
self.add_uint32(
|
||||
KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
|
||||
|
||||
def add_rope_scale(self, value: float):
|
||||
self.add_float32(KEY_ROPE_SCALE.format(arch=self.arch), value)
|
||||
|
||||
def add_tokenizer_model(self, model: str):
|
||||
self.add_string(KEY_TOKENIZER_MODEL, model)
|
||||
|
||||
def add_token_list(self, tokens: List):
|
||||
self.add_array(KEY_TOKENIZER_LIST, tokens)
|
||||
|
||||
def add_token_merges(self, merges: List):
|
||||
self.add_array(KEY_TOKENIZER_MERGES, merges)
|
||||
|
||||
def add_token_types(self, types: List[int]):
|
||||
self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
|
||||
|
||||
def add_token_scores(self, scores: List[float]):
|
||||
self.add_array(KEY_TOKENIZER_SCORES, scores)
|
||||
|
||||
def add_bos_token_id(self, id: int):
|
||||
self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
|
||||
|
||||
def add_eos_token_id(self, id: int):
|
||||
self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
|
||||
|
||||
def add_unk_token_id(self, id: int):
|
||||
self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
|
||||
|
||||
def add_sep_token_id(self, id: int):
|
||||
self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
|
||||
|
||||
def add_pad_token_id(self, id: int):
|
||||
self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
|
||||
|
||||
|
||||
# Example usage:
|
||||
if __name__ == "__main__":
|
||||
# Example usage with a file
|
||||
gguf_writer = GGUFWriter("example.gguf", "llama")
|
||||
|
||||
gguf_writer.add_architecture()
|
||||
gguf_writer.add_block_count(12)
|
||||
gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer
|
||||
gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float
|
||||
gguf_writer.add_custom_alignment(64)
|
||||
|
||||
tensor1 = np.ones((32,), dtype=np.float32) * 100.0
|
||||
tensor2 = np.ones((64,), dtype=np.float32) * 101.0
|
||||
tensor3 = np.ones((96,), dtype=np.float32) * 102.0
|
||||
|
||||
gguf_writer.add_tensor("tensor1", tensor1)
|
||||
gguf_writer.add_tensor("tensor2", tensor2)
|
||||
gguf_writer.add_tensor("tensor3", tensor3)
|
||||
|
||||
gguf_writer.write_header_to_file()
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
553
llama-util.h
553
llama-util.h
|
@ -1,553 +0,0 @@
|
|||
// Internal header to be included only by llama.cpp.
|
||||
// Contains wrappers around OS interfaces.
|
||||
|
||||
#ifndef LLAMA_UTIL_H
|
||||
#define LLAMA_UTIL_H
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
#include <cstdarg>
|
||||
#include <cstdlib>
|
||||
#include <climits>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#if defined(_POSIX_MEMLOCK_RANGE)
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
#include <stdio.h> // for _fseeki64
|
||||
#endif
|
||||
|
||||
#define LLAMA_ASSERT(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
__attribute__((format(gnu_printf, 1, 2)))
|
||||
#else
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif
|
||||
#endif
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap, ap2;
|
||||
va_start(ap, fmt);
|
||||
va_copy(ap2, ap);
|
||||
int size = vsnprintf(NULL, 0, fmt, ap);
|
||||
LLAMA_ASSERT(size >= 0 && size < INT_MAX);
|
||||
std::vector<char> buf(size + 1);
|
||||
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
||||
LLAMA_ASSERT(size2 == size);
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
return std::string(buf.data(), size);
|
||||
}
|
||||
|
||||
struct llama_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
#ifdef _WIN32
|
||||
__int64 ret = _ftelli64(fp);
|
||||
#else
|
||||
long ret = std::ftell(fp);
|
||||
#endif
|
||||
LLAMA_ASSERT(ret != -1); // this really shouldn't fail
|
||||
return (size_t) ret;
|
||||
}
|
||||
|
||||
void seek(size_t offset, int whence) {
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long) offset, whence);
|
||||
#endif
|
||||
LLAMA_ASSERT(ret == 0); // same
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, len, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
||||
}
|
||||
}
|
||||
|
||||
std::uint32_t read_u32() {
|
||||
std::uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string read_string(std::uint32_t len) {
|
||||
std::vector<char> chars(len);
|
||||
read_raw(chars.data(), len);
|
||||
return std::string(chars.data(), len);
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, len, 1, fp);
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
||||
}
|
||||
}
|
||||
|
||||
void write_u32(std::uint32_t val) {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// llama_context_data
|
||||
struct llama_data_context {
|
||||
virtual void write(const void * src, size_t size) = 0;
|
||||
virtual size_t get_size_written() = 0;
|
||||
virtual ~llama_data_context() = default;
|
||||
};
|
||||
|
||||
struct llama_data_buffer_context : llama_data_context {
|
||||
uint8_t* ptr;
|
||||
size_t size_written = 0;
|
||||
|
||||
llama_data_buffer_context(uint8_t * p) : ptr(p) {}
|
||||
|
||||
void write(const void * src, size_t size) override {
|
||||
memcpy(ptr, src, size);
|
||||
ptr += size;
|
||||
size_written += size;
|
||||
}
|
||||
|
||||
size_t get_size_written() override {
|
||||
return size_written;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_data_file_context : llama_data_context {
|
||||
llama_file* file;
|
||||
size_t size_written = 0;
|
||||
|
||||
llama_data_file_context(llama_file * f) : file(f) {}
|
||||
|
||||
void write(const void * src, size_t size) override {
|
||||
file->write_raw(src, size);
|
||||
size_written += size;
|
||||
}
|
||||
|
||||
size_t get_size_written() override {
|
||||
return size_written;
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(_WIN32)
|
||||
static std::string llama_format_win_err(DWORD err) {
|
||||
LPSTR buf;
|
||||
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
|
||||
if (!size) {
|
||||
return "FormatMessageA failed";
|
||||
}
|
||||
std::string ret(buf, size);
|
||||
LocalFree(buf);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
struct llama_mmap {
|
||||
void * addr;
|
||||
size_t size;
|
||||
|
||||
llama_mmap(const llama_mmap &) = delete;
|
||||
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_SHARED;
|
||||
// prefetch/readahead impairs performance on NUMA systems
|
||||
if (numa) { prefetch = 0; }
|
||||
#ifdef __linux__
|
||||
if (prefetch >= file->size) { flags |= MAP_POPULATE; }
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
if (addr == MAP_FAILED) {
|
||||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
||||
}
|
||||
|
||||
if (prefetch > 0) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
if (numa) {
|
||||
// advise the kernel not to use readahead
|
||||
// (because the next page might not belong on the same node)
|
||||
if (madvise(addr, file->size, MADV_RANDOM)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~llama_mmap() {
|
||||
munmap(addr, size);
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
|
||||
(void) numa;
|
||||
|
||||
size = file->size;
|
||||
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
||||
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
DWORD error = GetLastError();
|
||||
|
||||
if (hMapping == NULL) {
|
||||
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
||||
error = GetLastError();
|
||||
CloseHandle(hMapping);
|
||||
|
||||
if (addr == NULL) {
|
||||
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
if (prefetch) {
|
||||
// The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
|
||||
// will dynamically load it using GetProcAddress.
|
||||
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
|
||||
HMODULE hKernel32;
|
||||
|
||||
// This call is guaranteed to succeed.
|
||||
hKernel32 = GetModuleHandleW(L"kernel32.dll");
|
||||
|
||||
// This call may fail if on a pre-Win8 system.
|
||||
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
|
||||
|
||||
if (pPrefetchVirtualMemory) {
|
||||
// Advise the kernel to preload the mapped memory.
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~llama_mmap() {
|
||||
if (!UnmapViewOfFile(addr)) {
|
||||
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
|
||||
(void) prefetch;
|
||||
(void) numa;
|
||||
|
||||
throw std::runtime_error(std::string("mmap not supported"));
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
// Represents some region of memory being locked using mlock or VirtualLock;
|
||||
// will automatically unlock on destruction.
|
||||
struct llama_mlock {
|
||||
void * addr = NULL;
|
||||
size_t size = 0;
|
||||
bool failed_already = false;
|
||||
|
||||
llama_mlock() {}
|
||||
llama_mlock(const llama_mlock &) = delete;
|
||||
|
||||
~llama_mlock() {
|
||||
if (size) {
|
||||
raw_unlock(addr, size);
|
||||
}
|
||||
}
|
||||
|
||||
void init(void * ptr) {
|
||||
LLAMA_ASSERT(addr == NULL && size == 0);
|
||||
addr = ptr;
|
||||
}
|
||||
|
||||
void grow_to(size_t target_size) {
|
||||
LLAMA_ASSERT(addr);
|
||||
if (failed_already) {
|
||||
return;
|
||||
}
|
||||
size_t granularity = lock_granularity();
|
||||
target_size = (target_size + granularity - 1) & ~(granularity - 1);
|
||||
if (target_size > size) {
|
||||
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
|
||||
size = target_size;
|
||||
} else {
|
||||
failed_already = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _POSIX_MEMLOCK_RANGE
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) sysconf(_SC_PAGESIZE);
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||
#else
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||
#endif
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) {
|
||||
if (!mlock(addr, size)) {
|
||||
return true;
|
||||
} else {
|
||||
char* errmsg = std::strerror(errno);
|
||||
bool suggest = (errno == ENOMEM);
|
||||
|
||||
// Check if the resource limit is fine after all
|
||||
struct rlimit lock_limit;
|
||||
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
|
||||
suggest = false;
|
||||
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
|
||||
suggest = false;
|
||||
|
||||
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
||||
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#undef MLOCK_SUGGESTION
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (munlock(addr, size)) {
|
||||
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
|
||||
}
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si);
|
||||
return (size_t) si.dwPageSize;
|
||||
}
|
||||
|
||||
bool raw_lock(void * ptr, size_t len) {
|
||||
for (int tries = 1; ; tries++) {
|
||||
if (VirtualLock(ptr, len)) {
|
||||
return true;
|
||||
}
|
||||
if (tries == 2) {
|
||||
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
||||
len, size, llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// It failed but this was only the first try; increase the working
|
||||
// set size and try again.
|
||||
SIZE_T min_ws_size, max_ws_size;
|
||||
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
|
||||
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
// Per MSDN: "The maximum number of pages that a process can lock
|
||||
// is equal to the number of pages in its minimum working set minus
|
||||
// a small overhead."
|
||||
// Hopefully a megabyte is enough overhead:
|
||||
size_t increment = len + 1048576;
|
||||
// The minimum must be <= the maximum, so we need to increase both:
|
||||
min_ws_size += increment;
|
||||
max_ws_size += increment;
|
||||
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
|
||||
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void raw_unlock(void * ptr, size_t len) {
|
||||
if (!VirtualUnlock(ptr, len)) {
|
||||
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) 65536;
|
||||
}
|
||||
|
||||
bool raw_lock(const void * addr, size_t len) {
|
||||
fprintf(stderr, "warning: mlock not supported on this system\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
void raw_unlock(const void * addr, size_t len) {}
|
||||
#endif
|
||||
};
|
||||
|
||||
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
|
||||
struct llama_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
llama_buffer() = default;
|
||||
|
||||
void resize(size_t len) {
|
||||
#ifdef GGML_USE_METAL
|
||||
free(addr);
|
||||
int result = posix_memalign((void **) &addr, getpagesize(), len);
|
||||
if (result == 0) {
|
||||
memset(addr, 0, len);
|
||||
}
|
||||
else {
|
||||
addr = NULL;
|
||||
}
|
||||
#else
|
||||
delete[] addr;
|
||||
addr = new uint8_t[len];
|
||||
#endif
|
||||
size = len;
|
||||
}
|
||||
|
||||
~llama_buffer() {
|
||||
#ifdef GGML_USE_METAL
|
||||
free(addr);
|
||||
#else
|
||||
delete[] addr;
|
||||
#endif
|
||||
addr = NULL;
|
||||
}
|
||||
|
||||
// disable copy and move
|
||||
llama_buffer(const llama_buffer&) = delete;
|
||||
llama_buffer(llama_buffer&&) = delete;
|
||||
llama_buffer& operator=(const llama_buffer&) = delete;
|
||||
llama_buffer& operator=(llama_buffer&&) = delete;
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
struct llama_ctx_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
bool is_cuda;
|
||||
size_t size = 0;
|
||||
|
||||
llama_ctx_buffer() = default;
|
||||
|
||||
void resize(size_t size) {
|
||||
free();
|
||||
|
||||
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
||||
if (addr) {
|
||||
is_cuda = true;
|
||||
}
|
||||
else {
|
||||
// fall back to pageable memory
|
||||
addr = new uint8_t[size];
|
||||
is_cuda = false;
|
||||
}
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
void free() {
|
||||
if (addr) {
|
||||
if (is_cuda) {
|
||||
ggml_cuda_host_free(addr);
|
||||
}
|
||||
else {
|
||||
delete[] addr;
|
||||
}
|
||||
}
|
||||
addr = NULL;
|
||||
}
|
||||
|
||||
~llama_ctx_buffer() {
|
||||
free();
|
||||
}
|
||||
|
||||
// disable copy and move
|
||||
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
|
||||
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
|
||||
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
|
||||
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
|
||||
};
|
||||
#else
|
||||
typedef llama_buffer llama_ctx_buffer;
|
||||
#endif
|
||||
|
||||
#endif
|
121
llama.h
121
llama.h
|
@ -34,29 +34,18 @@
|
|||
# define DEPRECATED(func, hint) func
|
||||
#endif
|
||||
|
||||
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
||||
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
||||
#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
|
||||
|
||||
#define LLAMA_FILE_VERSION 3
|
||||
#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 1
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
|
||||
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 1
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
#endif
|
||||
|
||||
#ifndef LLAMA_DEFAULT_RMS_EPS
|
||||
#define LLAMA_DEFAULT_RMS_EPS 5e-6f
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
@ -103,8 +92,6 @@ extern "C" {
|
|||
uint32_t seed; // RNG seed, -1 for random
|
||||
int32_t n_ctx; // text context
|
||||
int32_t n_batch; // prompt processing batch size
|
||||
int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
|
||||
float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
||||
|
||||
|
@ -129,6 +116,7 @@ extern "C" {
|
|||
bool use_mlock; // force system to keep model in RAM
|
||||
bool embedding; // embedding mode only
|
||||
};
|
||||
|
||||
// model file types
|
||||
enum llama_ftype {
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
|
@ -155,7 +143,7 @@ extern "C" {
|
|||
// model quantization parameters
|
||||
typedef struct llama_model_quantize_params {
|
||||
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||
bool quantize_output_tensor; // quantize output.weight
|
||||
} llama_model_quantize_params;
|
||||
|
@ -208,17 +196,12 @@ extern "C" {
|
|||
int32_t n_eval;
|
||||
};
|
||||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
||||
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
||||
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
|
||||
|
||||
LLAMA_API int llama_max_devices();
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
|
||||
|
||||
LLAMA_API bool llama_mmap_supported();
|
||||
LLAMA_API bool llama_mlock_supported();
|
||||
LLAMA_API int llama_max_devices(void);
|
||||
LLAMA_API bool llama_mmap_supported(void);
|
||||
LLAMA_API bool llama_mlock_supported(void);
|
||||
|
||||
// TODO: not great API - very likely to change
|
||||
// Initialize the llama + ggml backend
|
||||
|
@ -226,9 +209,9 @@ extern "C" {
|
|||
// Call once at the start of the program
|
||||
LLAMA_API void llama_backend_init(bool numa);
|
||||
// Call once at the end of the program - currently only used for MPI
|
||||
LLAMA_API void llama_backend_free();
|
||||
LLAMA_API void llama_backend_free(void);
|
||||
|
||||
LLAMA_API int64_t llama_time_us();
|
||||
LLAMA_API int64_t llama_time_us(void);
|
||||
|
||||
LLAMA_API struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
|
@ -240,13 +223,6 @@ extern "C" {
|
|||
struct llama_model * model,
|
||||
struct llama_context_params params);
|
||||
|
||||
// Various functions for loading a ggml llama model.
|
||||
// Allocate (almost) all memory needed for the model.
|
||||
// Return NULL on failure
|
||||
LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
|
||||
const char * path_model,
|
||||
struct llama_context_params params),
|
||||
"please use llama_load_model_from_file combined with llama_new_context_with_model instead");
|
||||
|
||||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
@ -336,6 +312,13 @@ extern "C" {
|
|||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
|
||||
LLAMA_API int llama_tokenize_bpe(
|
||||
struct llama_context * ctx,
|
||||
const char * text,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
|
||||
LLAMA_API int llama_tokenize_with_model(
|
||||
const struct llama_model * model,
|
||||
const char * text,
|
||||
|
@ -377,18 +360,28 @@ extern "C" {
|
|||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
LLAMA_API const char * llama_token_to_str(
|
||||
// Does not write null terminator to the buffer
|
||||
LLAMA_API int llama_token_to_str(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token);
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int length);
|
||||
|
||||
LLAMA_API const char * llama_token_to_str_with_model(
|
||||
LLAMA_API int llama_token_to_str_bpe(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int length);
|
||||
|
||||
LLAMA_API int llama_token_to_str_with_model(
|
||||
const struct llama_model * model,
|
||||
llama_token token);
|
||||
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int length);
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_nl(); // next-line
|
||||
LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(void); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_nl(void); // next-line
|
||||
|
||||
// Grammar
|
||||
//
|
||||
|
@ -468,19 +461,51 @@ extern "C" {
|
|||
// Print system information
|
||||
LLAMA_API const char * llama_print_system_info(void);
|
||||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
||||
#ifdef LLAMA_API_INTERNAL
|
||||
// C++ API, will be moving to common.h soon (TM)
|
||||
#ifdef LLAMA_API_CPP
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
std::vector<llama_token> llama_tokenize(
|
||||
struct llama_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_bos);
|
||||
|
||||
std::vector<llama_token> llama_tokenize_bpe(
|
||||
struct llama_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_bos);
|
||||
|
||||
std::string llama_token_to_str(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token);
|
||||
|
||||
std::string llama_token_to_str_bpe(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token);
|
||||
|
||||
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
||||
#ifdef LLAMA_API_INTERNAL
|
||||
|
||||
struct ggml_tensor;
|
||||
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
|
||||
#endif
|
||||
#endif // LLAMA_API_CPP
|
||||
|
||||
#endif // LLAMA_API_INTERNAL
|
||||
|
||||
#endif // LLAMA_H
|
||||
|
|
1
models/.editorconfig
Normal file
1
models/.editorconfig
Normal file
|
@ -0,0 +1 @@
|
|||
root = true
|
BIN
models/ggml-vocab-llama.gguf
Normal file
BIN
models/ggml-vocab-llama.gguf
Normal file
Binary file not shown.
Binary file not shown.
|
@ -1,4 +1,19 @@
|
|||
function(llama_add_test source)
|
||||
function(llama_build_executable source)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE llama)
|
||||
endfunction()
|
||||
|
||||
function(llama_test_executable name source)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
# add_executable(${TEST_TARGET} ${source})
|
||||
# install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
# target_link_libraries(${TEST_TARGET} PRIVATE llama)
|
||||
add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
endfunction()
|
||||
|
||||
function(llama_build_and_test_executable source)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
|
@ -6,12 +21,16 @@ function(llama_add_test source)
|
|||
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
endfunction()
|
||||
|
||||
# llama_add_test(test-double-float.cpp) # SLOW
|
||||
llama_add_test(test-quantize-fns.cpp)
|
||||
llama_add_test(test-quantize-perf.cpp)
|
||||
llama_add_test(test-sampling.cpp)
|
||||
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
|
||||
llama_add_test(test-grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp)
|
||||
llama_add_test(test-llama-grammar.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/common.cpp)
|
||||
llama_add_test(test-grad0.cpp) # SLOW
|
||||
# llama_add_test(test-opt.cpp) # SLOW
|
||||
# llama_build_and_test_executable(test-double-float.cpp) # SLOW
|
||||
llama_build_and_test_executable(test-quantize-fns.cpp)
|
||||
llama_build_and_test_executable(test-quantize-perf.cpp)
|
||||
llama_build_and_test_executable(test-sampling.cpp)
|
||||
llama_build_executable(test-tokenizer-0.cpp)
|
||||
llama_test_executable(test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_build_executable(test-tokenizer-1.cpp)
|
||||
llama_test_executable(test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp)
|
||||
llama_build_and_test_executable(test-llama-grammar.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/common.cpp)
|
||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||
# llama_build_and_test_executable(test-opt.cpp) # SLOW
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#define LLAMA_API_CPP // TODO: eliminate me
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
|
@ -5,16 +6,40 @@
|
|||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
static const std::map<std::string, std::vector<llama_token>> & k_tests()
|
||||
{
|
||||
static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
|
||||
std::string result;
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
result += llama_token_to_str(ctx, tokens[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||
{ "Hello World", { 1, 10994, 2787, }, },
|
||||
{ " Hello World", { 1, 15043, 2787, }, },
|
||||
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
|
||||
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
||||
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
||||
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
|
||||
{ " ", {1, 259, }, },
|
||||
{ "\t", { 1, 29871, 12, }, },
|
||||
{ "\n", { 1, 29871, 13, }, },
|
||||
{ "\t\n", { 1, 29871, 12, 13, }, },
|
||||
{ "Hello world", { 1, 15043, 3186, }, },
|
||||
{ " Hello world", { 1, 29871, 15043, 3186, }, },
|
||||
{ "Hello World", { 1, 15043, 2787, }, },
|
||||
{ " Hello World", { 1, 29871, 15043, 2787, }, },
|
||||
{ " Hello World!", { 1, 29871, 15043, 2787, 29991, }, },
|
||||
{ " this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
||||
{ "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
||||
{ "нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
|
||||
{ "កាន់តែពិសេសអាចខលចេញ", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161,
|
||||
146, 228, 162, 133, 228, 161, 153, 228, 161, 186,
|
||||
31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228,
|
||||
161, 136, 228, 161, 132, 228, 161, 158, 228, 161,
|
||||
136, 228, 162, 132, 228, 161, 140, }, },
|
||||
{ "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
||||
{ 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871,
|
||||
243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
|
||||
313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
|
||||
313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
|
||||
};
|
||||
|
||||
return _k_tests;
|
||||
};
|
||||
|
||||
|
@ -64,10 +89,12 @@ int main(int argc, char **argv) {
|
|||
return 2;
|
||||
}
|
||||
|
||||
bool success = true;
|
||||
|
||||
for (const auto & test_kv : k_tests()) {
|
||||
std::vector<llama_token> res(test_kv.first.size());
|
||||
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
|
||||
res.resize(n);
|
||||
std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
|
||||
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
|
||||
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
|
||||
|
||||
bool correct = res.size() == test_kv.second.size();
|
||||
|
||||
|
@ -78,7 +105,8 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
|
||||
if (!correct) {
|
||||
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||
fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());
|
||||
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||
for (const auto & t : test_kv.second) {
|
||||
fprintf(stderr, "%6d, ", t);
|
||||
|
@ -90,9 +118,7 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
return 3;
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -101,5 +127,5 @@ int main(int argc, char **argv) {
|
|||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
return success ? 0 : 3;
|
||||
}
|
||||
|
|
128
tests/test-tokenizer-1.cpp
Normal file
128
tests/test-tokenizer-1.cpp
Normal file
|
@ -0,0 +1,128 @@
|
|||
#define LLAMA_API_CPP // TODO: eliminate me
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <codecvt>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
|
||||
static std::string vocab_type(llama_context * ctx) {
|
||||
return llama_n_vocab(ctx) == 32000 ? "spm": "bpe";
|
||||
}
|
||||
|
||||
static std::string escape_whitespace(const std::string& text) {
|
||||
std::string result;
|
||||
bool escaping = false;
|
||||
result += "\xe2\x96\x81";
|
||||
for (size_t offs = 0; offs < text.length(); ++offs) {
|
||||
if (text[offs] == ' ') {
|
||||
if (!escaping) {
|
||||
result += "\xe2\x96\x81";
|
||||
escaping = true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
escaping = false;
|
||||
result += text[offs];
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
std::string result;
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
result += llama_token_to_str(ctx, tokens[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string fname = argv[1];
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
llama_backend_init(false);
|
||||
|
||||
// load the vocab
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.vocab_only = true;
|
||||
|
||||
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx = llama_new_context_with_model(model, lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
std::string forward = llama_token_to_str_bpe(ctx, i);
|
||||
std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
|
||||
if (tokens.size() == 1) {
|
||||
if (i != tokens[0]) {
|
||||
std::string backward = llama_token_to_str(ctx, tokens[0]);
|
||||
fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
|
||||
__func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
|
||||
return 2;
|
||||
}
|
||||
} else {
|
||||
if ((vocab_type(ctx) == "spm" && i <= 258) ||
|
||||
(vocab_type(ctx) == "bpe" && (i == 0 || i >= 100000))) {
|
||||
fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
|
||||
__func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
|
||||
} else {
|
||||
fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
|
||||
__func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::wstring_convert<typename std::codecvt_utf8<wchar_t>, wchar_t> converter;
|
||||
for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
|
||||
std::wstring wstr(1, ch);
|
||||
std::string str;
|
||||
try {
|
||||
str = converter.to_bytes(wstr);
|
||||
} catch (std::exception & e) {
|
||||
continue;
|
||||
}
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str), false);
|
||||
if (tokens.size() == 1) {
|
||||
fprintf(stderr, "%s : info: %s tokenized to %d \n",
|
||||
__func__, str.c_str(), tokens[0]);
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue