Merge remote-tracking branch 'upstream/master' into eval-thread-count
commit c770e0145f
43 changed files with 2683 additions and 1891 deletions
@@ -5,9 +5,10 @@ FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
     apt-get install -y build-essential python3 python3-pip
 
+COPY requirements.txt requirements.txt
+
 RUN pip install --upgrade pip setuptools wheel \
-    && pip install numpy requests sentencepiece tqdm \
-    && pip install torch --index-url https://download.pytorch.org/whl/cpu
+    && pip install -r requirements.txt
 
 WORKDIR /app
 
@@ -15,4 +15,4 @@ FROM ubuntu:$UBUNTU_VERSION as runtime
 
 COPY --from=build /app/main /main
 
 ENTRYPOINT [ "/main" ]
@@ -21,4 +21,4 @@ models/*
 
 arm_neon.h
 compile_commands.json
 Dockerfile
.ecrc (new file, 5 changes)
@@ -0,0 +1,5 @@
+{
+    "Disable": {
+        "IndentSize": true
+    }
+}
.editorconfig (new file, 19 changes)
@@ -0,0 +1,19 @@
+# https://EditorConfig.org
+
+# Top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file, utf-8 charset
+[*]
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+
+[Makefile]
+indent_style = tab
+
+[prompts/*.txt]
+insert_final_newline = unset
.github/ISSUE_TEMPLATE/custom.md (vendored, 16 changes)
@@ -22,9 +22,9 @@ Please provide a detailed written description of what you were trying to do, and
 
 # Current Behavior
 
 Please provide a detailed written description of what `llama.cpp` did, instead.
 
 # Environment and Context
 
 Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
 
@@ -133,7 +133,7 @@ llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.
 llama_model_load: .......................................................................................... done
 llama_model_load: model size = 4869.09 MB / num tensors = 723
 
 system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
 
 main: prompt: 'Please close your issue when it has been answered.'
 main: number of tokens in prompt = 11
@@ -166,14 +166,14 @@ main: total time = 246406.42 ms
 
 Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
 
 3636882.89 msec task-clock # 14.677 CPUs utilized
 13509 context-switches # 3.714 /sec
 2436 cpu-migrations # 0.670 /sec
 10476679 page-faults # 2.881 K/sec
 13133115082869 cycles # 3.611 GHz (16.77%)
 29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%)
 10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%)
 23479217109614 instructions # 1.79 insn per cycle
 # 0.44 stalled cycles per insn (16.76%)
 2353072268027 branches # 647.002 M/sec (16.77%)
 1998682780 branch-misses # 0.08% of all branches (16.76%)
.github/workflows/docker.yml (vendored, 2 changes)
@@ -60,4 +60,4 @@ jobs:
           push: ${{ github.event_name == 'push' }}
           platforms: linux/amd64,linux/arm64
           tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
           file: ${{ matrix.config.dockerfile }}
.github/workflows/editorconfig.yml (vendored, new file, 17 changes)
@@ -0,0 +1,17 @@
+name: EditorConfig Checker
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  editorconfig:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: editorconfig-checker/action-editorconfig-checker@main
+      - run: editorconfig-checker
.gitignore (vendored, 1 change)
@@ -23,6 +23,7 @@ models/*
 /result
 /perplexity
 /embedding
+/benchmark-q4_0-matmult
 /Pipfile
 
 arm_neon.h
@@ -56,6 +56,10 @@ option(LLAMA_AVX "llama: enable AVX"
 option(LLAMA_AVX2 "llama: enable AVX2" ON)
 option(LLAMA_AVX512 "llama: enable AVX512" OFF)
 option(LLAMA_FMA "llama: enable FMA" ON)
+# in MSVC F16C is implied with AVX2/AVX512
+if (NOT MSVC)
+    option(LLAMA_F16C "llama: enable F16C" ON)
+endif()
 
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
@@ -116,6 +120,21 @@ if (LLAMA_OPENBLAS)
         add_compile_definitions(GGML_USE_OPENBLAS)
         add_link_options(${BLAS_LIBRARIES})
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
+
+        # find header file
+        set(OPENBLAS_INCLUDE_SEARCH_PATHS
+        /usr/include
+        /usr/include/openblas
+        /usr/include/openblas-base
+        /usr/local/include
+        /usr/local/include/openblas
+        /usr/local/include/openblas-base
+        /opt/OpenBLAS/include
+        $ENV{OpenBLAS_HOME}
+        $ENV{OpenBLAS_HOME}/include
+        )
+        find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+        add_compile_options(-I${OPENBLAS_INC})
     else()
         message(WARNING "OpenBLAS not found")
     endif()
@@ -207,7 +226,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
         add_compile_options(/arch:AVX)
     endif()
 else()
-    add_compile_options(-mf16c)
+    if (LLAMA_F16C)
+        add_compile_options(-mf16c)
+    endif()
     if (LLAMA_FMA)
         add_compile_options(-mfma)
     endif()
@@ -247,7 +268,6 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
-            llama_internal.h
             llama_util.h)
 
 target_include_directories(llama PUBLIC .)
Makefile (29 changes)
@@ -133,48 +133,53 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
-default: main quantize perplexity embedding
+default: main quantize quantize-stats perplexity embedding
 
 #
 # Build library
 #
 
 ggml.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) -c ggml.c -o ggml.o
+	$(CC) $(CFLAGS) -c $< -o $@
 
-llama.o: llama.cpp llama.h llama_util.h llama_internal.h
-	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
+llama.o: llama.cpp ggml.h llama.h llama_util.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: examples/common.cpp examples/common.h
-	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
 
 main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
 quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 libllama.so: llama.o ggml.o
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 #
 # Tests
 #
 
+benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
+	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
+	./benchmark-q4_0-matmult
+
 .PHONY: tests
 tests:
 	bash ./tests/run-tests.sh
README.md (60 changes)
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 **Hot topics:**
 
+- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
 - [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
 
 ## Description
@@ -48,6 +49,7 @@ New features will probably be added mostly through community contributions.
 
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 
 **UI:**
 
@@ -148,30 +150,52 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
 
 ## Usage
 
-Here are the step for the LLaMA-7B model:
+Here are the step for the LLaMA-7B model.
 
+### Get the Code
+
 ```bash
-# build this repo
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-make
+```
 
-#For Windows and CMake, use the following command instead:
-cd <path_to_llama_folder>
-mkdir build
-cd build
-cmake ..
-cmake --build . --config Release
+### Build
 
+Note: For Windows, CMake or Zig can be used.
+
+1. Use `make`
+
+    ```bash
+    make
+    ```
+
+1. Use CMake
+
+    ```bash
+    mkdir build
+    cd build
+    cmake ..
+    cmake --build . --config Release
+    ```
+
+1. Use Zig
+
+    ```bash
+    zig build -Drelease-fast
+    ```
+
+### Prepare Data & Run
+
+```bash
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
 
 # install Python dependencies
-python3 -m pip install torch numpy sentencepiece
+python3 -m pip install -r requirements.txt
 
 # convert the 7B model to ggml FP16 format
-python3 convert-pth-to-ggml.py models/7B/ 1
+python3 convert.py models/7B/
 
 # quantize the model to 4-bits (using method 2 = q4_0)
 ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
@@ -180,8 +204,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```
 
-Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
-
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
 
 ### Memory/Disk Requirements
@@ -243,7 +265,7 @@ There 26 letters in the English Alphabet
 The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
 > List 5 words that start with "ca".
 cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 >
 ```
 
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
@@ -254,19 +276,19 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
 
 ```bash
 python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
 python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
 ```
 
 - You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
 - The original model is saved in the same folder with a suffix `.orig`
 
 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
 
 - **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.**
 - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
 - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
-- Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
+- Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
 - The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory:
 
     `sha256sum --ignore-missing -c SHA256SUMS` on Linux
@@ -284,7 +306,7 @@ convert the model from the old format to the new format with [./migrate-ggml-202
 - GPT-3.5 / InstructGPT / ChatGPT:
   - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
   - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
 
 ### Perplexity (Measuring model quality)
 
 You can use the `perplexity` example to measure perplexity over the given prompt. For more background,
build.zig (22 changes)
@@ -1,16 +1,14 @@
 const std = @import("std");
 
-pub fn build(b: *std.Build) void {
+pub fn build(b: *std.build.Builder) void {
     const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
+    const optimize = b.standardReleaseOptions();
     const want_lto = b.option(bool, "lto", "Want -fLTO");
 
-    const lib = b.addStaticLibrary(.{
-        .name = "llama",
-        .target = target,
-        .optimize = optimize,
-    });
+    const lib = b.addStaticLibrary("llama", null);
     lib.want_lto = want_lto;
+    lib.setTarget(target);
+    lib.setBuildMode(optimize);
     lib.linkLibCpp();
     lib.addIncludePath(".");
     lib.addIncludePath("examples");
@@ -44,16 +42,12 @@ pub fn build(b: *std.Build) void {
 fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
     const b = args.b;
     const lib = args.lib;
-    const target = args.target;
-    const optimize = args.optimize;
     const want_lto = args.want_lto;
 
-    const exe = b.addExecutable(.{
-        .name = name,
-        .target = target,
-        .optimize = optimize,
-    });
+    const exe = b.addExecutable(name, null);
     exe.want_lto = want_lto;
+    lib.setTarget(args.target);
+    lib.setBuildMode(args.optimize);
     exe.addIncludePath(".");
     exe.addIncludePath("examples");
     exe.addCSourceFiles(&.{
@@ -1,299 +0,0 @@
-# Author: github.com/ductai199x
-import argparse
-import os
-import struct
-
-import numpy as np
-import torch
-from numba import njit
-from tqdm.auto import tqdm
-
-
-def read_header(fin):
-    values = struct.unpack("i" * 9, fin.read(4 * 9))
-    _, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
-    return {
-        "vocab_size": vocab_size,
-        "dim": dim,
-        "multiple_of": multiple_of,
-        "n_heads": n_heads,
-        "n_layers": n_layers,
-    }, ftype
-
-
-def read_tokens(fin, vocab_size):
-    tokens = []
-    for _ in range(vocab_size):
-        text_len = struct.unpack("i", fin.read(4))[0]
-        text_bytes = fin.read(text_len)
-        try:
-            text = text_bytes.decode()
-        except UnicodeDecodeError:
-            text = text_bytes.decode(errors="replace")
-        score = struct.unpack("f", fin.read(4))[0]
-        tokens.append((text, score))
-    return tokens
-
-
-@njit
-def dequantize_weights_numba(fin_data, n_rows, n_cols):
-    qk = 32
-    nb = n_cols // qk
-    bs = 4 + (qk // 2)
-
-    weights = np.zeros((n_rows, n_cols), dtype=np.float32)
-    data_pos = 0
-
-    for row in range(n_rows):
-        for block in range(nb):
-            d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
-            data_pos += 4
-            packed_values = fin_data[data_pos : data_pos + (qk // 2)]
-            data_pos += qk // 2
-
-            for i in range(qk // 2):
-                packed_value = packed_values[i]
-                v0 = np.float32((packed_value & 0b00001111) - 8) * d
-                v1 = np.float32((packed_value >> 4) - 8) * d
-
-                weights[row, block * qk + 2 * i] = v0
-                weights[row, block * qk + 2 * i + 1] = v1
-
-    return weights
-
-
-def dequantize_weights(fin, n_rows, n_cols):
-    qk = 32
-    nb = n_cols // qk
-    data_size = n_rows * n_cols // 2 + n_rows * nb * 4
-    fin_data = fin.read(data_size)
-    return dequantize_weights_numba(fin_data, n_rows, n_cols)
-
-
-def read_variables(fin):
-    model = {}
-    pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
-    while True:
-        start_pos = fin.tell()
-        try:
-            n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
-        except struct.error:
-            break
-
-        shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
-        shape = shape[::-1]
-        name = fin.read(name_length).decode()
-
-        # ensure tensor data is aligned
-        tensor_data_offset = fin.tell()
-        tensor_data_offset = (tensor_data_offset + 31) & -32
-        fin.seek(tensor_data_offset)
-
-        if ftype_cur == 2:
-            # 4-bit quantized weights
-            dtype = np.uint8
-            data = dequantize_weights(fin, shape[0], shape[1])
-            data = data.reshape(shape)
-        elif ftype_cur == 0:
-            dtype = np.float32
-            data_size = np.prod(shape)
-            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
-        elif ftype_cur == 1:
-            dtype = np.float16
-            data_size = np.prod(shape)
-            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
-
-        model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
-
-        pbar.update(fin.tell() - start_pos)
-
-    return model
-
-
-def convert_to_hf_format(model, hparams):
-    # This works for llama 7B, need to test with other models
-    n_layers = hparams["n_layers"]
-    n_heads = hparams["n_heads"]
-    dim = hparams["dim"]
-    dims_per_head = dim // n_heads
-    base = 10000.0
-    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
-
-    # permute for sliced rotary
-    def permute(w):
-        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
-
-    state_dict = {}
-    for layer_i in range(n_layers):
-        state_dict.update(
-            {
-                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
-                    model[f"layers.{layer_i}.attention.wq.weight"]
-                ),
-                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
-                    model[f"layers.{layer_i}.attention.wk.weight"]
-                ),
-                f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
-                    f"layers.{layer_i}.attention.wv.weight"
-                ],
-                f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
-                    f"layers.{layer_i}.attention.wo.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w1.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.down_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w2.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.up_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w3.weight"
-                ],
-                f"model.layers.{layer_i}.input_layernorm.weight": model[
-                    f"layers.{layer_i}.attention_norm.weight"
-                ],
-                f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
-                    f"layers.{layer_i}.ffn_norm.weight"
-                ],
-            }
-        )
-        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
-    state_dict.update(
-        {
-            "model.embed_tokens.weight": model["tok_embeddings.weight"],
-            "model.norm.weight": model["norm.weight"],
-            "lm_head.weight": model["output.weight"],
-        }
-    )
-
-    return state_dict
-
-
-def chat(model, hparams, llama_dir):
-    from transformers import (GenerationConfig, LlamaForCausalLM,
-                              LlamaTokenizer, StoppingCriteria,
-                              StoppingCriteriaList)
-    from transformers.models.llama.configuration_llama import LlamaConfig
-
-    class StoppingCriteriaSub(StoppingCriteria):
-        def __init__(self):
-            super().__init__()
-
-        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
-            print(tokenizer.decode(input_ids[0]), end="", flush=True)
-            if input_ids[0][-1] == 13:
-                return True
-
-            return False
-
-    config = LlamaConfig(
-        vocab_size=hparams["vocab_size"],
-        dim=hparams["dim"],
-        num_hidden_layers=hparams["n_layers"],
-        num_attention_heads=hparams["n_heads"],
-    )
-
-    llama = LlamaForCausalLM(config=config)
-    llama.load_state_dict(state_dict=model, strict=True)
-    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
-
-    device = torch.device("cpu")
-    llama = llama.to(device)
-
-    ctx = """You are AI.
-This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
-User: Hello, AI.
-AI: Hello! How can I assist you today?
-"""
-    print(ctx.rstrip("\n"))
-    while True:
-        print("-" * 60)
-        prompt = input("User: ")
-        if ctx != "":
-            ctx = f"{ctx}User: {prompt}\n"
-        else:
-            ctx = f"{prompt}\nAI:"
-
-        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
-
-        print("-" * 60)
-        if len(ctx.strip()) > 0:
-            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
-            generation_config = GenerationConfig(
-                temperature=0.8,
-                top_p=0.95,
-                top_k=50,
-                repetition_penalty=1.1764,
-            )
-            with torch.no_grad():
-                generation_output = llama.generate(
-                    input_ids=input_ids,
-                    generation_config=generation_config,
-                    return_dict_in_generate=True,
-                    output_scores=True,
-                    max_length=2048,
-                    do_sample=True,
-                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
-                )
-            s = generation_output.sequences[0]
-            decoded = tokenizer.decode(s)
-            ctx = f"{decoded}\n"
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
-    )
-    parser.add_argument(
-        "--prefix",
-        "-p",
-        type=str,
-        required=True,
-        help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
-    )
-    parser.add_argument(
-        "--hf",
-        action="store_true",
-        help="Whether to save the model in the Hugging Face format. (default: False)",
-    )
-    parser.add_argument(
-        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
-    )
-    args = parser.parse_args()
-
-    llama_dir = os.path.abspath(f"{args.input_dir}/../")
-
-    ggml_files = sorted(
-        [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
-    )
-
-    fin = open(ggml_files[0], "rb")
-    hparams, ftype = read_header(fin)
-    tokens = read_tokens(fin, hparams["vocab_size"])
-    model = read_variables(fin)
-
-    for f in tqdm(ggml_files[1:]):
-        fin = open(f, "rb")
-        read_header(fin)
-        read_tokens(fin, hparams["vocab_size"])
-        model.update(read_variables(fin))
-
-    if args.hf:
-        model = convert_to_hf_format(model, hparams)
-
-    pth_ckpt = {
-        "state_dict": model,
-        "hparams": hparams,
-        "tokens": tokens,
-    }
-
-    torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
-
-    if args.chat:
-        if not args.hf:
-            model = convert_to_hf_format(model, hparams)
-        chat(model, hparams, llama_dir)
-
-
-if __name__ == "__main__":
-    main()
@@ -1,107 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
-#
-
-# Original by https://github.com/eiz
-# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
-import argparse
-import glob
-import os
-import struct
-import sys
-from sentencepiece import SentencePieceProcessor
-
-HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
-    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
-    return parser.parse_args()
-
-def read_header(f_in):
-    struct_fmt = "i" * (3 + len(HPARAMS))
-    struct_size = struct.calcsize(struct_fmt)
-    buf = f_in.read(struct_size)
-    return struct.unpack(struct_fmt, buf)
-
-def write_header(f_out, header):
-    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
-
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
-
-    values = [
-        0x67676d66, # magic: ggml in hex
-        1, # file version
-        vocab_size,
-        dim,
-        multiple_of,
-        n_heads,
-        n_layers,
-        rot,
-        ftype
-    ]
-    f_out.write(struct.pack("i" * len(values), *values))
-
-def write_tokens(fout, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode()
-        elif tokenizer.is_control(i):
-            text = b""
-        elif tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                print(f"Invalid token: {piece}")
-                sys.exit(1)
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
-        fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-    # TODO: GPT4All - add extra <pad> token
-    text = "<pad>".encode()
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    fout.write(struct.pack("f", 0.0))
-
-def read_tokens(f_in, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        len_b = f_in.read(4)
-        (length,) = struct.unpack("i", len_b)
-        f_in.read(length)
-
-def copy_all_data(f_out, f_in):
-    while True:
-        buf = f_in.read(1024 * 1024)
-        if not buf:
-            break
-        f_out.write(buf)
-
-def convert_one_file(path_in, tokenizer):
-    path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
-    print(f"converting {path_in}")
-    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
-        write_header(f_out, read_header(f_in))
-        read_tokens(f_in, tokenizer)
-        write_tokens(f_out, tokenizer)
-        copy_all_data(f_out, f_in)
-    os.rename(path_in, path_orig)
-    os.rename(path_tmp, path_in)
-
-def main():
-    args = parse_args()
-
-    tokenizer = SentencePieceProcessor(args.tokenizer_model)
-
-    convert_one_file(args.gpt4all_model, tokenizer)
-
-if __name__ == "__main__":
-    main()
@@ -1,172 +0,0 @@
-# Convert a GPTQ quantized LLaMA model to a ggml compatible file
-# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
-#
-import os
-import re
-import sys
-import json
-import struct
-import numpy as np
-import torch
-from sentencepiece import SentencePieceProcessor
-
-if len(sys.argv) != 4:
-    print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
-    sys.exit(1)
-
-fname_model = sys.argv[1]
-fname_tokenizer = sys.argv[2]
-dir_out = sys.argv[3]
-
-model = torch.load(fname_model, map_location="cpu")
-
-n_vocab, n_embd = model['model.embed_tokens.weight'].shape
-n_layer = 1 + max(int(m.group(1)) for name in model
-                  if (m := re.match(r'model\.layers\.([0-9]+)', name)))
-
-# hardcoded:
-n_mult = 256
-n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
-
-tokenizer = SentencePieceProcessor(fname_tokenizer)
-
-assert tokenizer.vocab_size() == n_vocab
-
-fname_out = sys.argv[3]
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
-fout.write(struct.pack("i", 1)) # file version
-fout.write(struct.pack("i", n_vocab))
-fout.write(struct.pack("i", n_embd))
-fout.write(struct.pack("i", n_mult))
-fout.write(struct.pack("i", n_head))
-fout.write(struct.pack("i", n_layer))
-fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
-fout.write(struct.pack("i", 4))
-
-
-# This loop unchanged from convert-pth-to-ggml.py:
-for i in range(tokenizer.vocab_size()):
-    if tokenizer.is_unknown(i):
-        text = " \u2047 ".encode()
-    elif tokenizer.is_control(i):
-        text = b""
-    elif tokenizer.is_byte(i):
-        piece = tokenizer.id_to_piece(i)
-        if len(piece) != 6:
-            print(f"Invalid token: {piece}")
-            sys.exit(1)
-        byte_value = int(piece[3:-1], 16)
-        text = struct.pack("B", byte_value)
-    else:
-        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-def write_header(shape, dst_name, ftype_cur):
-    sname = dst_name.encode()
-    fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
-    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
-    fout.write(sname)
-
-    # ensure tensor data is aligned
-    tensor_data_offset = fout.tell()
-    tensor_data_offset = (tensor_data_offset + 31) & -32
-    fout.seek(tensor_data_offset)
-
-def convert_non_q4(src_name, dst_name):
-    v = model[src_name]
-    shape = v.shape
-    print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
-    if len(shape) == 1:
-        print("  Converting to float32")
-        v = v.to(torch.float32)
-
-    ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype]
-
-    # header
-    write_header(shape, dst_name, ftype_cur)
-
-    # data
-    v.numpy().tofile(fout)
-
-def convert_q4(src_name, dst_name, permute=False):
-    zeros = model[f"{src_name}.zeros"].numpy()
-    scales = model[f"{src_name}.scales"].numpy()
-    bias = model[f"{src_name}.bias"].numpy()
-    qweight = model[f"{src_name}.qweight"].numpy().T # transpose
-
-    # Q4_1 does not support bias; good thing the bias is always all zeros.
-    assert not np.any(bias)
-
-    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
-    shape = (qweight.shape[0], qweight.shape[1] * 8)
-
-    print(f"Processing Q4 variable: {src_name} with shape: {shape}")
-
-    # The output format has the int4 weights in groups of 32 rather than 8.
-    # It looks like this:
-    # For each row:
-    #   For each group of 32 columns:
-    #     - addend (float32, 4 bytes)
-    #     - scale (float32, 4 bytes)
-    #     - weights (int4 * 32, 16 bytes)
-    # Note that in the input, the scales and addends are shared between all
-    # the columns in a row, so we end up wasting quite a bit of memory with
-    # repeated scales and addends.
-
-    addends = -zeros # flip sign
-
-    # Since the output format is mixed between integers and floats, we have
-    # to hackily view the floats as int32s just so numpy will let us
-    # concatenate them.
-    addends_view = addends.view(dtype=np.int32)
-    scales_view = scales.view(dtype=np.int32)
-
-    # Split into groups of 4 columns (i.e. 32 columns of quantized data):
-    grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
-
-    # Repeat addends and scales:
-    addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
-    scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
-
-    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
-
-    if permute:
-        # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
-        # This can be done after the above conversion because it doesn't affect column order/layout.
-        blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
-                    .swapaxes(1, 2)
-                    .reshape(blob.shape))
-
-    # header
-    write_header(shape, dst_name, 3) # ftype = Q4_1
-
-    # data
-    blob.tofile(fout)
-
-convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
-convert_non_q4("model.norm.weight", "norm.weight")
-convert_non_q4("lm_head.weight", "output.weight")
-
-for i in range(n_layer):
-    convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
-    convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
-    convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
-    convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
-
-    convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
-    convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
-    convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")
-
-    convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
-    convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
-
-
-fout.close()
-
-print(f"Done. Output file: {fname_out}")
-print()
@@ -1,274 +1,11 @@
-# Convert a LLaMA model checkpoint to a ggjt compatible file
+# Compatibility stub
-#
-# Load the model using Torch
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-# - Number of dimensions (int)
-# - Name length (int)
-# - Dimensions (int[n_dims])
-# - Name (char[name_length])
-# - Data (float[n_dims])
-#
-# At the start of the ggml file we write the model parameters
-# and vocabulary.
-#
 
 import argparse
-import os
-import sys
-import json
-import struct
-import numpy as np
-import torch
 
-from sentencepiece import SentencePieceProcessor
+import convert
 
-QK = 32
+parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
+parser.add_argument('dir_model', help='directory containing the model checkpoint')
-GGML_TYPE_Q4_0 = 0
+parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-GGML_TYPE_Q4_1 = 1
+args = parser.parse_args()
-GGML_TYPE_I8 = 2
+convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
-GGML_TYPE_I16 = 3
-GGML_TYPE_I32 = 4
-GGML_TYPE_F16 = 5
-GGML_TYPE_F32 = 6
-
-WTYPES = {
-    0: GGML_TYPE_F32,
-    1: GGML_TYPE_F16,
-    2: GGML_TYPE_Q4_0,
-    3: GGML_TYPE_Q4_1,
-}
-
-GGML_BLCK_SIZE = {
-    GGML_TYPE_Q4_0: QK,
-    GGML_TYPE_Q4_1: QK,
-    GGML_TYPE_I8: 1,
-    GGML_TYPE_I16: 1,
-    GGML_TYPE_I32: 1,
-    GGML_TYPE_F16: 1,
-    GGML_TYPE_F32: 1,
-}
-
-GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4 + QK//2,
-    GGML_TYPE_Q4_1: 4*2 + QK//2,
-    GGML_TYPE_I8: 1,
-    GGML_TYPE_I16: 2,
-    GGML_TYPE_I32: 4,
-    GGML_TYPE_F16: 2,
-    GGML_TYPE_F32: 4,
-}
-
-def ggml_nelements(shape):
-    r = 1
-    for i in shape:
-        r *= i
-    return r
-
-def ggml_nbytes(shape, ftype):
-    x = ggml_nelements(shape)
-    t = WTYPES[ftype]
-    x *= GGML_TYPE_SIZE[t]
-    x //= GGML_BLCK_SIZE[t]
-    return x
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
-    parser.add_argument('dir_model', help='directory containing the model checkpoint')
-    parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-    parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
-    return parser.parse_args()
-
-def get_n_parts(dim):
-    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
-    n_parts = mappings.get(dim)
-    if n_parts is None:
-        print(f"Invalid dim: {dim}")
-        sys.exit(1)
-
-    print(f"n_parts = {n_parts}\n")
-    return n_parts
-
-def load_hparams_and_tokenizer(dir_model):
-    # `dir_model` is something like `models/7B` or `models/7B/`.
-    # "tokenizer.model" is expected under model's parent dir.
-    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
-    # Let's use the model's parent dir directly.
-    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
-    fname_hparams = f"{dir_model}/params.json"
-    fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
-    with open(fname_hparams, "r") as f:
-        hparams = json.load(f)
-        print(hparams)
-    tokenizer = SentencePieceProcessor(fname_tokenizer)
-    hparams.update({"vocab_size": tokenizer.vocab_size()})
-    return hparams, tokenizer
-
-def write_header(fout, hparams, ftype):
-    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
-    values = [
-        0x67676a74, # magic: ggjt in hex
-        1, # file version
-        *[hparams[key] for key in keys],
-        hparams["dim"] // hparams["n_heads"], # rot (obsolete)
-        ftype
-    ]
-    fout.write(struct.pack("i" * len(values), *values))
-
-def write_tokens(fout, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode()
-        elif tokenizer.is_control(i):
-            text = b""
-        elif tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                print(f"Invalid token: {piece}")
-                sys.exit(1)
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
-        fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-def process_and_write_variables(fout, model, ftype, part_id, n_parts):
-    for name, datao in model.items():
-        if name.endswith("freqs"):
-            continue
-
-        # remove dimensions with a single element
-        data = datao.numpy().squeeze()
-        partshape = data.shape
-        n_dims = len(data.shape)
-        assert n_dims in (1, 2)
-
-        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
-
-        # coerce single-dimensional tensors from float16 to float32
-        ftype_cur = 1
-        if ftype == 0 or n_dims == 1:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
-        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
-
-        # determine dimension along which multipart tensor is sharded
-        #
-        # split_dim 0 regex:
-        # - output.*
-        # - layers.*.attention.wq.weight
-        # - layers.*.attention.wk.weight
-        # - layers.*.attention.wv.weight
-        # - layers.*.feed_forward.w1.weight
-        # - layers.*.feed_forward.w3.weight
-        #
-        # split_dim 1 regex:
-        # - tok_embeddings.*
-        # - layers.*.attention.wo.weight
-        # - layers.*.feed_forward.w2.weight
-        #
-        if n_dims > 1:
-            split_dim = 1
-            if "tok_embeddings" in name:
-                split_dim = 1
-            elif "layers" in name:
-                if "attention.wo.weight" in name:
-                    split_dim = 1
-                elif "feed_forward.w2.weight" in name:
-                    split_dim = 1
-                else:
-                    split_dim = 0
-            elif "output" in name:
-                split_dim = 0
-
-        # output tensor header
-        fullshape = list(partshape)
-        if n_dims > 1:
-            fullshape[split_dim] *= n_parts
-        sname = name.encode()
-        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
-        for dim in reversed(fullshape):
-            fout.write(struct.pack("i", dim))
-        fout.write(sname)
-
-        # ensure tensor data is aligned
-        tensor_data_offset = fout.tell()
-        while tensor_data_offset % QK != 0:
-            fout.write(struct.pack("B", 0))
-            tensor_data_offset += 1
-
-        # output unified mappable tensor data
-        if n_dims == 1 or n_parts == 1:
-            # copy tensor which we thankfully received in one piece
-            if part_id == 0:
-                data.tofile(fout)
-        elif split_dim == 0:
-            # reassemble multifile tensor containing some of the rows
-            rows_per_chunk = partshape[0]
-            current_row = part_id * rows_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset = current_row * bytes_per_row
-            fout.seek(tensor_data_offset + offset)
-            data.tofile(fout)
-        elif split_dim == 1:
-            # reassemble multifile tensor containing some of the cols
-            cols_per_chunk = partshape[1]
-            current_col = part_id * cols_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset_current_col = current_col // blck_size * type_size
-            for row in range(partshape[0]):
-                offset_row = row * bytes_per_row
-                offset = offset_row + offset_current_col
-                fout.seek(tensor_data_offset + offset)
-                data[row].tofile(fout)
-
-        # advance file position to next tensor
-        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
-
-def main():
-    args = parse_args()
-    dir_model = args.dir_model
-    ftype = args.ftype
-    ftype_str = ["f32", "f16"]
-    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
-
-    print(args)
-
-    # if only writing vocab to file
-    if args.vocab_only:
-        fname_model = f"{dir_model}/consolidated.00.pth"
-        fname_out = f"{dir_model}/ggml-vocab.bin"
-        print(f"Extracting only the vocab from '{fname_model}'\n")
-        with open(fname_out, "wb") as fout:
-            write_header(fout, hparams, ftype)
-            write_tokens(fout, tokenizer)
-        print(f"Done. Output file: {fname_out}\n")
-        return
-
-    n_parts = get_n_parts(hparams["dim"])
-    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
-
-    # we output a single file for ggml
-    with open(fname_out, "wb") as fout:
-        write_header(fout, hparams, ftype)
-        write_tokens(fout, tokenizer)
-        offset_of_tensors = fout.tell()
-        # the tensors we load could be split across multiple files
-        for part_id in range(n_parts):
-            fout.seek(offset_of_tensors)
-            print(f"Processing part {part_id+1} of {n_parts}\n")
-            fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
-            model = torch.load(fname_model, map_location="cpu")
-            process_and_write_variables(fout, model, ftype, part_id, n_parts)
-            del model
-
-    print(f"Done. Output file: {fname_out}\n")
-
-if __name__ == "__main__":
-    main()
@ -1,100 +0,0 @@
#!/usr/bin/env python3
# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
import argparse
import glob
import os
import struct
import sys
from sentencepiece import SentencePieceProcessor

HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]

def parse_args():
    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
    parser.add_argument('dir_model', help='directory containing ggml .bin files')
    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
    return parser.parse_args()

def read_header(f_in):
    struct_fmt = "i" * (3 + len(HPARAMS))
    struct_size = struct.calcsize(struct_fmt)
    buf = f_in.read(struct_size)
    return struct.unpack(struct_fmt, buf)

def write_header(f_out, header):
    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header

    if magic != 0x67676d6c:
        raise Exception('Invalid file magic. Must be an old style ggml file.')

    values = [
        0x67676d66,  # magic: ggmf in hex
        1,           # file version
        vocab_size,
        dim,
        multiple_of,
        n_heads,
        n_layers,
        rot,
        ftype
    ]
    f_out.write(struct.pack("i" * len(values), *values))

def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

def read_tokens(f_in, tokenizer):
    for i in range(tokenizer.vocab_size()):
        len_b = f_in.read(4)
        (length,) = struct.unpack("i", len_b)
        f_in.read(length)

def copy_all_data(f_out, f_in):
    while True:
        buf = f_in.read(1024 * 1024)
        if not buf:
            break
        f_out.write(buf)

def convert_one_file(path_in, tokenizer):
    path_tmp = f"{path_in}.tmp"
    path_orig = f"{path_in}.orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
        write_header(f_out, read_header(f_in))
        read_tokens(f_in, tokenizer)
        write_tokens(f_out, tokenizer)
        copy_all_data(f_out, f_in)
    os.rename(path_in, path_orig)
    os.rename(path_tmp, path_in)

def main():
    args = parse_args()
    files = []
    files.extend(glob.glob(f"{args.dir_model}/*.bin"))
    files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))

    tokenizer = SentencePieceProcessor(args.tokenizer_model)

    for file in files:
        convert_one_file(file, tokenizer)

if __name__ == "__main__":
    main()
1148
convert.py
Normal file
File diff suppressed because it is too large
@ -19,15 +19,15 @@ GEN_OPTIONS=(--batch_size 1024
--top_p 0.5)

if [ -n "$N_THREAD" ]; then
    GEN_OPTIONS+=(--threads "$N_THREAD")
fi

./main "${GEN_OPTIONS[@]}" \
    --model "$MODEL" \
    --n_predict "$N_PREDICTS" \
    --color --interactive \
    --reverse-prompt "${USER_NAME}:" \
    --prompt "
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
@ -7,4 +7,4 @@
cd `dirname $0`
cd ..

./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
270
examples/benchmark/benchmark-q4_0-matmult.c
Normal file
@ -0,0 +1,270 @@
|
||||||
|
/*
|
||||||
|
License: MIT License
|
||||||
|
|
||||||
|
Changelog:
|
||||||
|
- 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <locale.h>
|
||||||
|
#include "ggml.h"
|
||||||
|
#include <assert.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <cstring>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cinttypes>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <queue>
|
||||||
|
#include <string.h>
|
||||||
|
#include <cassert>
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
#include <iterator>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
float tensor_sum_elements(struct ggml_tensor * tensor) {
|
||||||
|
float sum = 0;
|
||||||
|
if (tensor->type==GGML_TYPE_F32) {
|
||||||
|
for (int j = 0; j < tensor->ne[1]; j++) {
|
||||||
|
for (int k = 0; k < tensor->ne[0]; k++) {
|
||||||
|
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
These are mapping to unknown
|
||||||
|
GGML_TYPE_I8,
|
||||||
|
GGML_TYPE_I16,
|
||||||
|
GGML_TYPE_I32,
|
||||||
|
GGML_TYPE_COUNT,
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
|
||||||
|
|
||||||
|
#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
|
||||||
|
TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
|
||||||
|
TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
|
||||||
|
{ float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
|
||||||
|
|
||||||
|
struct benchmark_params_struct {
|
||||||
|
int32_t n_threads = 1;
|
||||||
|
int32_t n_iterations = 10;
|
||||||
|
};
|
||||||
|
|
||||||
|
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
|
||||||
|
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "options:\n");
|
||||||
|
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||||
|
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||||
|
fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
|
||||||
|
|
||||||
|
struct benchmark_params_struct benchmark_params;
|
||||||
|
|
||||||
|
bool invalid_param = false;
|
||||||
|
std::string arg;
|
||||||
|
for (int i = 1; i < argc; i++) {
|
||||||
|
arg = argv[i];
|
||||||
|
|
||||||
|
if (arg == "-t" || arg == "--threads") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
benchmark_params.n_threads = std::stoi(argv[i]);
|
||||||
|
} else if (arg == "-i" || arg == "--iter") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
benchmark_params.n_iterations = std::stoi(argv[i]);
|
||||||
|
} else if (arg == "-h" || arg == "--help") {
|
||||||
|
print_usage(argc, argv, benchmark_params);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
if (invalid_param) {
|
||||||
|
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||||
|
print_usage(argc, argv, benchmark_params);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// create the ggml context
|
||||||
|
printf("Starting Test\n");
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
struct ggml_context * ctx;
|
||||||
|
//const int sizex = 4096;
|
||||||
|
//const int sizey = 11008;
|
||||||
|
|
||||||
|
#undef VERBOSE_DEBUGGING
|
||||||
|
#ifndef VERBOSE_DEBUGGING
|
||||||
|
const int sizey = 4096;
|
||||||
|
const int sizex = 11008;
|
||||||
|
const int sizez = 128;
|
||||||
|
#else
|
||||||
|
/* Working - let's increase size */
|
||||||
|
const int sizey = 1;
|
||||||
|
const int sizex = (8*32);
|
||||||
|
const int sizez = 1;
|
||||||
|
|
||||||
|
/*const int sizey = 1;
|
||||||
|
const int sizex = 3*(8*32);
|
||||||
|
const int sizez = 1;*/
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//printf("Memsize required = %i\n", sizex*sizex);
|
||||||
|
ggml_type wtype = GGML_TYPE_F32;
|
||||||
|
|
||||||
|
size_t ctx_size = 0;
|
||||||
|
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
|
||||||
|
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
|
||||||
|
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
||||||
|
ctx_size += sizex*sizeof(float);
|
||||||
|
ctx_size += 1024*1024*100;
|
||||||
|
|
||||||
|
printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
|
||||||
|
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
/*.mem_size =*/ ctx_size,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/* no_alloc =*/ 0
|
||||||
|
};
|
||||||
|
|
||||||
|
ctx = ggml_init(params);
|
||||||
|
if (!ctx) {
|
||||||
|
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
printf("Creating new tensors\n");
|
||||||
|
// printf("Creating new tensor m1\n");
|
||||||
|
struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
|
||||||
|
ggml_set_f32(m11, 1.0f);
|
||||||
|
|
||||||
|
// printf("Creating new tensor m1\n");
|
||||||
|
struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
|
||||||
|
ggml_set_f32(m12, 1.5f);
|
||||||
|
|
||||||
|
// printf("Creating new tensor m2\n");
|
||||||
|
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
|
||||||
|
ggml_set_f32(m2, 2.0f);
|
||||||
|
|
||||||
|
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
|
||||||
|
// printf("Creating new tensor m11xm2\n");
|
||||||
|
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
||||||
|
|
||||||
|
// printf("Creating compute graph\n");
|
||||||
|
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
|
||||||
|
|
||||||
|
gf.n_threads=benchmark_params.n_threads;
|
||||||
|
printf("cgraph->n_threads=%i\n",gf.n_threads);
|
||||||
|
|
||||||
|
TENSOR_DUMP(m11);
|
||||||
|
TENSOR_DUMP(m2);
|
||||||
|
|
||||||
|
ggml_graph_compute(ctx, &gf);
|
||||||
|
|
||||||
|
TENSOR_DUMP(gf.nodes[0]);
|
||||||
|
|
||||||
|
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
|
||||||
|
|
||||||
|
int32_t nelements = sizex*sizey;
|
||||||
|
int32_t ne[2] = { sizex, sizey };
|
||||||
|
|
||||||
|
std::vector<int64_t> hist_cur(1 << 4, 0);
|
||||||
|
|
||||||
|
// Set up a the benchmark matrices
|
||||||
|
// printf("Creating new tensor q11 & Running quantize\n");
|
||||||
|
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
|
||||||
|
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
|
||||||
|
|
||||||
|
// Set up a the compute graph
|
||||||
|
// printf("Creating new tensor q31\n");
|
||||||
|
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
|
||||||
|
|
||||||
|
// printf("Creating compute graph\n");
|
||||||
|
struct ggml_cgraph gf31 = ggml_build_forward(q31);
|
||||||
|
gf31.n_threads=benchmark_params.n_threads;
|
||||||
|
|
||||||
|
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||||
|
// printf("Creating new tensor q12 & Running quantize\n");
|
||||||
|
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
|
||||||
|
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
|
||||||
|
|
||||||
|
// printf("Creating new tensor q32\n");
|
||||||
|
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
||||||
|
|
||||||
|
//printf("Creating compute graph\n");
|
||||||
|
struct ggml_cgraph gf32 = ggml_build_forward(q32);
|
||||||
|
gf32.n_threads=benchmark_params.n_threads;
|
||||||
|
printf("cgraph->n_threads=%i\n",gf31.n_threads);
|
||||||
|
|
||||||
|
const int dimx = sizex;
|
||||||
|
const int dimy = sizey;
|
||||||
|
const int dimz = sizez;
|
||||||
|
long long int flops_per_dot_product = dimy + dimy;
|
||||||
|
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
|
||||||
|
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
|
||||||
|
|
||||||
|
|
||||||
|
// Let's use the F32 result from above as a reference for the q4_0 multiplication
|
||||||
|
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
|
||||||
|
|
||||||
|
|
||||||
|
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
|
||||||
|
printf("==============================================================================================\n");
|
||||||
|
|
||||||
|
for (int i=0;i<benchmark_params.n_iterations ;i++) {
|
||||||
|
|
||||||
|
long long int start = ggml_time_us();
|
||||||
|
//printf("Running ggml_graph_compute\n");
|
||||||
|
ggml_graph_compute(ctx, &gf31);
|
||||||
|
long long int stop = ggml_time_us();
|
||||||
|
long long int usec = stop-start;
|
||||||
|
float sec = usec/1000000;
|
||||||
|
float flops_per_usec = (1.0f*flops_per_matrix)/usec;
|
||||||
|
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
|
||||||
|
i,
|
||||||
|
gf31.n_threads,
|
||||||
|
sizex, sizey, sizez, flops_per_matrix,
|
||||||
|
usec,flops_per_usec);
|
||||||
|
|
||||||
|
#ifdef VERBOSE_DEBUGGING
|
||||||
|
TENSOR_DUMP("res",gf31.nodes[0])
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Check that the matrix multiplication result is in the right ballpark
|
||||||
|
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
||||||
|
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
|
||||||
|
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||||
|
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
||||||
|
|
||||||
|
if (delta > allowed_delta) {
|
||||||
|
printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
|
||||||
|
sum_of_F32_reference,
|
||||||
|
sum_of_Q4_result,
|
||||||
|
delta,
|
||||||
|
allowed_delta
|
||||||
|
);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Running a different graph computation to make sure we override the CPU cache lines
|
||||||
|
ggml_graph_compute(ctx, &gf32);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -7,12 +7,6 @@
#include <iterator>
#include <algorithm>

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h>
#endif

#if defined (_WIN32)
#include <fcntl.h>
#include <io.h>
@ -22,9 +16,9 @@ extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHand
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
                                                                   const wchar_t * lpWideCharStr, int cchWideChar,
                                                                   char * lpMultiByteStr, int cbMultiByte,
                                                                   const char * lpDefaultChar, bool * lpUsedDefaultChar);
#define CP_UTF8 65001
#endif
@ -342,9 +336,9 @@ void win32_console_init(bool enable_color) {

// Convert a wide Unicode string to an UTF8 string
void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
    int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
    std::string strTo(size_needed, 0);
    WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
    str = strTo;
}
#endif
@ -1,3 +1,3 @@
# embedding

TODO
@ -1,6 +1,8 @@
#include "common.h"
#include "llama.h"

#include <ctime>

int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";
@ -10,6 +10,6 @@ cd ..
./main --color --instruct --threads 4 \
    --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
    --file ./prompts/alpaca.txt \
    --batch_size 8 --ctx_size 2048 \
    --batch_size 8 --ctx_size 2048 -n -1 \
    --repeat_last_n 64 --repeat_penalty 1.3 \
    --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
@ -1,3 +1,3 @@
# main

TODO
@ -1,3 +1,8 @@
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "common.h"
#include "llama.h"

@ -6,6 +11,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
@ -163,7 +169,7 @@ int main(int argc, char ** argv) {
    }

    // enable interactive mode if reverse prompt or interactive start is specified
    if (params.antiprompt.size() != 0 || params.interactive_start) {
        params.interactive = true;
    }

@ -1,3 +1,3 @@
# perplexity

TODO
@ -2,6 +2,7 @@
#include "llama.h"

#include <cmath>
#include <ctime>

std::vector<float> softmax(const std::vector<float>& logits) {
    std::vector<float> probs(logits.size());
@ -27,20 +28,28 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

    int count = 0;
    int seq_count = tokens.size() / params.n_ctx;
    int n_vocab = llama_n_vocab(ctx);

    double nll = 0.0;
    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);

    for (int i = 0; i < seq_count; ++i) {
        int start = i * params.n_ctx;
        int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
        // it is better to always be power of 2 for better performance
        int end = start + params.n_ctx;
        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
        std::vector<float> logits;
        int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
        auto start_t = std::chrono::high_resolution_clock::now();
        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads, params.n_ethreads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return;
        }
        for (int j = 0; j < num_batches; ++j) {
            int batch_start = start + j * params.n_batch;
            int batch_size = std::min(end - batch_start, params.n_batch);
            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads, params.n_ethreads)) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }
            auto batch_logits = llama_get_logits(ctx);
            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
        }
        auto end_t = std::chrono::high_resolution_clock::now();
        if (i == 0) {
@ -59,15 +68,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
        auto logits = llama_get_logits(ctx);
        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
            // Calculate probability of next token, given the previous ones.
            int n_vocab = llama_n_vocab(ctx);
            std::vector<float> tok_logits(
                logits + j * n_vocab,
                logits.begin() + j * n_vocab,
                logits + (j + 1) * n_vocab);
                logits.begin() + (j + 1) * n_vocab);
            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
            float prob = softmax(tok_logits)[tokens[start + j + 1]];
            nll += -std::log(prob);
            ++count;
        }
@ -82,11 +88,13 @@ int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";

    params.n_batch = 512;
    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    params.perplexity = true;
    params.n_batch = std::min(params.n_batch, params.n_ctx);

    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
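The loop above packs tokenization, batched evaluation and scoring into one pass per chunk. As a rough illustration of that control flow (not the project's code), here is a minimal Python sketch of the same chunk-and-score scheme; the `logprob` stand-in is hypothetical (a uniform model), so the script runs without a model file, and the default chunk/batch sizes are assumptions for the example:

import math

# Hypothetical stand-in for llama_eval + softmax: returns log P(token | context).
def logprob(context, token, n_vocab=32000):
    return -math.log(n_vocab)

def perplexity(tokens, n_ctx=512, n_batch=256):
    nll, count = 0.0, 0
    seq_count = len(tokens) // n_ctx
    for i in range(seq_count):
        chunk = tokens[i * n_ctx:(i + 1) * n_ctx]
        # the C++ code evaluates each chunk in ceil(n_ctx / n_batch) batches
        num_batches = (n_ctx + n_batch - 1) // n_batch   # 2 for 512 / 256
        # only the second half of each chunk is scored, so every scored token
        # has at least n_ctx / 2 tokens of context
        for j in range(n_ctx // 2, n_ctx - 1):
            nll += -logprob(chunk[:j + 1], chunk[j + 1])
            count += 1
    return math.exp(nll / count)

print(perplexity(list(range(1024))))  # uniform model -> 32000.0 (up to float rounding)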
@ -1,6 +1,7 @@
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#define LLAMA_API_INTERNAL
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "llama_internal.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
@ -15,9 +16,6 @@
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
|
|
||||||
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
|
|
||||||
|
|
||||||
struct quantize_stats_params {
|
struct quantize_stats_params {
|
||||||
std::string model = "models/7B/ggml-model-f16.bin";
|
std::string model = "models/7B/ggml-model-f16.bin";
|
||||||
bool verbose = false;
|
bool verbose = false;
|
||||||
|
@ -223,7 +221,7 @@ int main(int argc, char ** argv) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
int j;
|
int j;
|
||||||
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
|
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) i)) != 0; j++) {
|
||||||
// find match
|
// find match
|
||||||
}
|
}
|
||||||
if (j < GGML_TYPE_COUNT) {
|
if (j < GGML_TYPE_COUNT) {
|
||||||
|
@ -278,7 +276,7 @@ int main(int argc, char ** argv) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (params.verbose) {
|
if (params.verbose) {
|
||||||
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
|
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
|
||||||
}
|
}
|
||||||
if (kv_tensor.second->type == GGML_TYPE_F16) {
|
if (kv_tensor.second->type == GGML_TYPE_F16) {
|
||||||
is_f16 = true;
|
is_f16 = true;
|
||||||
|
@ -303,13 +301,14 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// loop throught quantization types
|
// loop throught quantization types
|
||||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||||
|
const ggml_type type = (ggml_type) i;
|
||||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
||||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||||
if (params.verbose) {
|
if (params.verbose) {
|
||||||
printf("testing %s ...\n", type_strs[i]);
|
printf("testing %s ...\n", ggml_type_name(type));
|
||||||
}
|
}
|
||||||
|
|
||||||
error_stats global_stats {};
|
error_stats global_stats {};
|
||||||
|
@ -321,7 +320,7 @@ int main(int argc, char ** argv) {
|
||||||
if (params.verbose) {
|
if (params.verbose) {
|
||||||
printf(" %s ...\n", kv_tensor.first.c_str());
|
printf(" %s ...\n", kv_tensor.first.c_str());
|
||||||
}
|
}
|
||||||
std::string layer_name { type_strs[i] };
|
std::string layer_name { ggml_type_name(type) };
|
||||||
layer_name += "::" + kv_tensor.first;
|
layer_name += "::" + kv_tensor.first;
|
||||||
test_roundtrip_on_layer(
|
test_roundtrip_on_layer(
|
||||||
layer_name,
|
layer_name,
|
||||||
|
@ -336,7 +335,7 @@ int main(int argc, char ** argv) {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
print_error_stats(type_strs[i], global_stats, params.print_histogram);
|
print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5,15 +5,15 @@
#include <string>

// usage:
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
//
int main(int argc, char ** argv) {
    ggml_time_init();

    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
        fprintf(stderr, " type = 2 - q4_0\n");
        fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
        fprintf(stderr, " type = 3 - q4_1\n");
        fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
        return 1;
    }

@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
    const std::string fname_inp = argv[1];
    const std::string fname_out = argv[2];

    const int itype = atoi(argv[3]);
    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);

    const int64_t t_main_start_us = ggml_time_us();

@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
    {
        const int64_t t_start_us = ggml_time_us();

        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }
@ -10,7 +10,6 @@
          inherit system;
        };
        llama-python = pkgs.python310.withPackages (ps: with ps; [
          torch
          numpy
          sentencepiece
        ]);
@ -28,10 +27,8 @@
        ];
        installPhase = ''
          mkdir -p $out/bin
          mv bin/main $out/bin/llama
          mv bin/* $out/bin/
          mv bin/quantize $out/bin/quantize
          mv $out/bin/main $out/bin/llama
          mv bin/embedding $out/bin/embedding
          mv bin/perplexity $out/bin/perplexity

          echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
          cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
42
ggml.h
@ -177,11 +177,12 @@ extern "C" {
#include <stddef.h>
#include <stdbool.h>

#define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 16
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
#define GGML_DEFAULT_N_THREADS 4

#ifdef __ARM_NEON
// we use the built-in 16-bit float type
@ -198,13 +199,15 @@ struct ggml_object;
struct ggml_context;

enum ggml_type {
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    // explicitly numbered values are used in llama.cpp files
    GGML_TYPE_F32  = 0,
    GGML_TYPE_F16  = 1,
    GGML_TYPE_Q4_0 = 2,
    GGML_TYPE_Q4_1 = 3,
    GGML_TYPE_Q8_0 = 4,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_F16,
    GGML_TYPE_F32,
    GGML_TYPE_COUNT,
};

@ -251,6 +254,9 @@ enum ggml_op {
    GGML_OP_FLASH_ATTN,
    GGML_OP_FLASH_FF,

    GGML_OP_MAP_UNARY,
    GGML_OP_MAP_BINARY,

    GGML_OP_COUNT,
};

@ -349,6 +355,8 @@ int ggml_blck_size (enum ggml_type type);
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

const char * ggml_type_name(enum ggml_type type);

size_t ggml_element_size(const struct ggml_tensor * tensor);

struct ggml_context * ggml_init(struct ggml_init_params params);

@ -650,6 +658,21 @@ struct ggml_tensor * ggml_flash_ff(
        struct ggml_tensor * c0,
        struct ggml_tensor * c1);

// Mapping operations
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

struct ggml_tensor * ggml_map_unary_f32(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        const ggml_unary_op_f32_t fun);

struct ggml_tensor * ggml_map_binary_f32(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
        const ggml_binary_op_f32_t fun);

//
// automatic differentiation
//

@ -814,6 +837,7 @@ typedef struct {
    dequantize_row_q_t dequantize_row_q;
    quantize_row_q_t quantize_row_q;
    quantize_row_q_t quantize_row_q_reference;
    quantize_row_q_t quantize_row_q_dot;
    vec_dot_q_t vec_dot_q;
} quantize_fns_t;
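The new ggml_map_unary_f32 / ggml_map_binary_f32 hooks above wrap a plain C function pointer of the form (n, dst, src[, src1]) into a graph op. A rough Python stand-in for that calling convention, purely illustrative (the gelu_ish example op and the map_unary_f32 wrapper are hypothetical, not ggml code, which operates on tensors inside a cgraph):

import math

def gelu_ish(n, dst, src):
    # an arbitrary example unary op (tanh-based GELU approximation)
    for i in range(n):
        x = src[i]
        dst[i] = 0.5 * x * (1.0 + math.tanh(0.7978845608 * (x + 0.044715 * x ** 3)))

def map_unary_f32(fun, src):
    # mirrors the (n, dst, src) signature of ggml_unary_op_f32_t
    dst = [0.0] * len(src)
    fun(len(src), dst, src)
    return dst

print(map_unary_f32(gelu_ish, [-1.0, 0.0, 1.0]))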
104
llama.cpp
@ -1,10 +1,15 @@
|
||||||
|
// Defines fileno on msys:
|
||||||
|
#ifndef _GNU_SOURCE
|
||||||
|
#define _GNU_SOURCE
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "llama_util.h"
|
#include "llama_util.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "llama_internal.h"
|
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <array>
|
#include <array>
|
||||||
|
#include <ctime>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <random>
|
#include <random>
|
||||||
|
@ -77,7 +82,7 @@ struct llama_hparams {
|
||||||
uint32_t n_head = 32;
|
uint32_t n_head = 32;
|
||||||
uint32_t n_layer = 32;
|
uint32_t n_layer = 32;
|
||||||
uint32_t n_rot = 64;
|
uint32_t n_rot = 64;
|
||||||
uint32_t f16 = 1;
|
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
||||||
|
|
||||||
bool operator!=(const llama_hparams & other) const {
|
bool operator!=(const llama_hparams & other) const {
|
||||||
return memcmp(this, &other, sizeof(llama_hparams));
|
return memcmp(this, &other, sizeof(llama_hparams));
|
||||||
|
@ -257,22 +262,12 @@ static size_t checked_div(size_t a, size_t b) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
|
static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
|
||||||
std::string ret = "[" + std::to_string(ne.at(0));
|
char buf[256];
|
||||||
|
snprintf(buf, sizeof(buf), "%5u", ne.at(0));
|
||||||
for (size_t i = 1; i < ne.size(); i++) {
|
for (size_t i = 1; i < ne.size(); i++) {
|
||||||
ret += " x " + std::to_string(ne.at(i));
|
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
|
||||||
}
|
|
||||||
ret += "]";
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char * llama_format_type(enum ggml_type type) {
|
|
||||||
switch (type) {
|
|
||||||
case GGML_TYPE_F32: return "f32";
|
|
||||||
case GGML_TYPE_F16: return "f16";
|
|
||||||
case GGML_TYPE_Q4_0: return "q4_0";
|
|
||||||
case GGML_TYPE_Q4_1: return "q4_1";
|
|
||||||
default: LLAMA_ASSERT(false);
|
|
||||||
}
|
}
|
||||||
|
return buf;
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
|
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
|
||||||
|
@ -427,7 +422,7 @@ struct llama_file_loader {
|
||||||
hparams.n_head = file.read_u32();
|
hparams.n_head = file.read_u32();
|
||||||
hparams.n_layer = file.read_u32();
|
hparams.n_layer = file.read_u32();
|
||||||
hparams.n_rot = file.read_u32();
|
hparams.n_rot = file.read_u32();
|
||||||
hparams.f16 = file.read_u32();
|
hparams.ftype = (enum llama_ftype) file.read_u32();
|
||||||
}
|
}
|
||||||
void read_vocab() {
|
void read_vocab() {
|
||||||
vocab.id_to_token.resize(hparams.n_vocab);
|
vocab.id_to_token.resize(hparams.n_vocab);
|
||||||
|
@ -453,20 +448,21 @@ struct llama_file_loader {
|
||||||
llama_load_tensor_shard shard;
|
llama_load_tensor_shard shard;
|
||||||
uint32_t n_dims = file.read_u32();
|
uint32_t n_dims = file.read_u32();
|
||||||
uint32_t name_len = file.read_u32();
|
uint32_t name_len = file.read_u32();
|
||||||
uint32_t ftype = file.read_u32();
|
shard.type = (enum ggml_type) file.read_u32();
|
||||||
shard.ne.resize(n_dims);
|
shard.ne.resize(n_dims);
|
||||||
file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
|
file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
|
||||||
std::string name = file.read_string(name_len);
|
std::string name = file.read_string(name_len);
|
||||||
if (n_dims < 1 || n_dims > 2) {
|
if (n_dims < 1 || n_dims > 2) {
|
||||||
throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
|
throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
|
||||||
}
|
}
|
||||||
switch (ftype) {
|
switch (shard.type) {
|
||||||
case 0: shard.type = GGML_TYPE_F32; break;
|
case GGML_TYPE_F32:
|
||||||
case 1: shard.type = GGML_TYPE_F16; break;
|
case GGML_TYPE_F16:
|
||||||
case 2: shard.type = GGML_TYPE_Q4_0; break;
|
case GGML_TYPE_Q4_0:
|
||||||
case 3: shard.type = GGML_TYPE_Q4_1; break;
|
case GGML_TYPE_Q4_1:
|
||||||
|
break;
|
||||||
default: {
|
default: {
|
||||||
throw format("unrecognized ftype %u\n", ftype);
|
throw format("unrecognized tensor type %u\n", shard.type);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -497,18 +493,18 @@ struct llama_file_loader {
|
||||||
struct llama_file_saver {
|
struct llama_file_saver {
|
||||||
llama_file file;
|
llama_file file;
|
||||||
llama_file_loader * any_file_loader;
|
llama_file_loader * any_file_loader;
|
||||||
llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16)
|
llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
|
||||||
: file(fname, "wb"), any_file_loader(any_file_loader) {
|
: file(fname, "wb"), any_file_loader(any_file_loader) {
|
||||||
fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
|
fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
|
||||||
write_magic();
|
write_magic();
|
||||||
write_hparams(new_f16);
|
write_hparams(new_ftype);
|
||||||
write_vocab();
|
write_vocab();
|
||||||
}
|
}
|
||||||
void write_magic() {
|
void write_magic() {
|
||||||
file.write_u32('ggjt'); // magic
|
file.write_u32('ggjt'); // magic
|
||||||
file.write_u32(1); // version
|
file.write_u32(1); // version
|
||||||
}
|
}
|
||||||
void write_hparams(uint32_t new_f16) {
|
void write_hparams(enum llama_ftype new_ftype) {
|
||||||
const llama_hparams & hparams = any_file_loader->hparams;
|
const llama_hparams & hparams = any_file_loader->hparams;
|
||||||
file.write_u32(hparams.n_vocab);
|
file.write_u32(hparams.n_vocab);
|
||||||
file.write_u32(hparams.n_embd);
|
file.write_u32(hparams.n_embd);
|
||||||
|
@ -516,7 +512,7 @@ struct llama_file_saver {
|
||||||
file.write_u32(hparams.n_head);
|
file.write_u32(hparams.n_head);
|
||||||
file.write_u32(hparams.n_layer);
|
file.write_u32(hparams.n_layer);
|
||||||
file.write_u32(hparams.n_rot);
|
file.write_u32(hparams.n_rot);
|
||||||
file.write_u32(new_f16);
|
file.write_u32(new_ftype);
|
||||||
}
|
}
|
||||||
void write_vocab() {
|
void write_vocab() {
|
||||||
if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
|
if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
|
||||||
|
@ -531,17 +527,17 @@ struct llama_file_saver {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
|
void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
|
||||||
uint32_t ftype;
|
|
||||||
switch (new_type) {
|
switch (new_type) {
|
||||||
case GGML_TYPE_F32: ftype = 0; break;
|
case GGML_TYPE_F32:
|
||||||
case GGML_TYPE_F16: ftype = 1; break;
|
case GGML_TYPE_F16:
|
||||||
case GGML_TYPE_Q4_0: ftype = 2; break;
|
case GGML_TYPE_Q4_0:
|
||||||
case GGML_TYPE_Q4_1: ftype = 3; break;
|
case GGML_TYPE_Q4_1:
|
||||||
|
break;
|
||||||
default: LLAMA_ASSERT(false);
|
default: LLAMA_ASSERT(false);
|
||||||
}
|
}
|
||||||
file.write_u32((uint32_t) tensor.ne.size());
|
file.write_u32((uint32_t) tensor.ne.size());
|
||||||
file.write_u32((uint32_t) tensor.name.size());
|
file.write_u32((uint32_t) tensor.name.size());
|
||||||
file.write_u32(ftype);
|
file.write_u32(new_type);
|
||||||
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
|
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
|
||||||
file.write_raw(tensor.name.data(), tensor.name.size());
|
file.write_raw(tensor.name.data(), tensor.name.size());
|
||||||
file.seek(-file.tell() & 31, SEEK_CUR);
|
file.seek(-file.tell() & 31, SEEK_CUR);
|
||||||
|
@ -815,6 +811,18 @@ static const char *llama_file_version_name(llama_file_version version) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const char *llama_ftype_name(enum llama_ftype ftype) {
|
||||||
|
switch (ftype) {
|
||||||
|
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||||
|
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||||
|
return "mostly Q4_1, some F16";
|
||||||
|
default: return "unknown, may not work";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static const char *llama_model_type_name(e_model type) {
|
static const char *llama_model_type_name(e_model type) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case MODEL_7B: return "7B";
|
case MODEL_7B: return "7B";
|
||||||
|
@ -867,7 +875,7 @@ static void llama_model_load_internal(
|
||||||
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
||||||
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
||||||
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
||||||
fprintf(stderr, "%s: f16 = %u\n", __func__, hparams.f16);
|
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
|
||||||
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
||||||
fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
|
fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
|
||||||
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
||||||
|
@ -934,8 +942,8 @@ static void llama_model_load_internal(
|
||||||
ml->ggml_ctx = ctx;
|
ml->ggml_ctx = ctx;
|
||||||
|
|
||||||
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
|
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
|
||||||
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
||||||
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
|
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
|
||||||
|
|
||||||
model.layers.resize(n_layer);
|
model.layers.resize(n_layer);
|
||||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||||
|
@ -1541,17 +1549,17 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||||
// quantization
|
// quantization
|
||||||
//
|
//
|
||||||
|
|
||||||
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
|
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
|
||||||
ggml_type quantized_type;
|
ggml_type quantized_type;
|
||||||
switch (itype) {
|
switch (ftype) {
|
||||||
case 2: quantized_type = GGML_TYPE_Q4_0; break;
|
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
||||||
case 3: quantized_type = GGML_TYPE_Q4_1; break;
|
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
||||||
default: throw format("invalid quantization type %d\n", itype);
|
default: throw format("invalid output file type %d\n", ftype);
|
||||||
};
|
};
|
||||||
|
|
||||||
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
|
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
|
||||||
/*vocab_only*/ false));
|
/*vocab_only*/ false));
|
||||||
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype);
|
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
|
||||||
|
|
||||||
size_t total_size_org = 0;
|
size_t total_size_org = 0;
|
||||||
size_t total_size_new = 0;
|
size_t total_size_new = 0;
|
||||||
|
@ -1564,10 +1572,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
tensor.data = read_data.addr;
|
tensor.data = read_data.addr;
|
||||||
model_loader->load_data_for(tensor);
|
model_loader->load_data_for(tensor);
|
||||||
|
|
||||||
printf("[%zu/%zu] %36s - %s, type = %6s, ",
|
printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
|
||||||
++idx, model_loader->tensors_map.tensors.size(),
|
++idx, model_loader->tensors_map.tensors.size(),
|
||||||
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
||||||
llama_format_type(tensor.type));
|
ggml_type_name(tensor.type));
|
||||||
|
|
||||||
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
||||||
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
|
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
|
||||||
|
@ -1600,7 +1608,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type));
|
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("quantizing .. ");
|
printf("quantizing .. ");
|
||||||
|
@ -1742,9 +1750,9 @@ void llama_free(struct llama_context * ctx) {
|
||||||
int llama_model_quantize(
|
int llama_model_quantize(
|
||||||
const char * fname_inp,
|
const char * fname_inp,
|
||||||
const char * fname_out,
|
const char * fname_out,
|
||||||
int itype) {
|
enum llama_ftype ftype) {
|
||||||
try {
|
try {
|
||||||
llama_model_quantize_internal(fname_inp, fname_out, itype);
|
llama_model_quantize_internal(fname_inp, fname_out, ftype);
|
||||||
return 0;
|
return 0;
|
||||||
} catch (const std::string & err) {
|
} catch (const std::string & err) {
|
||||||
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
|
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
|
||||||
|
|
22
llama.h
22
llama.h
|
@ -65,6 +65,15 @@ extern "C" {
|
||||||
void * progress_callback_user_data;
|
void * progress_callback_user_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// model file types
|
||||||
|
enum llama_ftype {
|
||||||
|
LLAMA_FTYPE_ALL_F32 = 0,
|
||||||
|
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||||
|
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||||
|
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||||
|
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||||
|
};
|
||||||
|
|
||||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||||
|
|
||||||
LLAMA_API bool llama_mmap_supported();
|
LLAMA_API bool llama_mmap_supported();
|
||||||
|
@ -85,7 +94,7 @@ extern "C" {
|
||||||
LLAMA_API int llama_model_quantize(
|
LLAMA_API int llama_model_quantize(
|
||||||
const char * fname_inp,
|
const char * fname_inp,
|
||||||
const char * fname_out,
|
const char * fname_out,
|
||||||
int itype);
|
enum llama_ftype ftype);
|
||||||
|
|
||||||
// Returns the KV cache that will contain the context for the
|
// Returns the KV cache that will contain the context for the
|
||||||
// ongoing prediction with the model.
|
// ongoing prediction with the model.
|
||||||
|
@ -171,4 +180,15 @@ extern "C" {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
||||||
|
#ifdef LLAMA_API_INTERNAL
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
struct ggml_tensor;
|
||||||
|
|
||||||
|
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif // LLAMA_H
|
#endif // LLAMA_H
|
||||||
|
|
|
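The llama_ftype values above are what the quantize tool now prints and what gets stored in the model header. A small illustrative lookup for reading that field by hand (the dict and the describe_ftype helper are hypothetical, loosely mirroring llama_ftype_name in llama.cpp):

# llama file types as pinned in the enum above (illustrative copy)
LLAMA_FTYPE_NAMES = {
    0: "all F32",
    1: "mostly F16",
    2: "mostly Q4_0",
    3: "mostly Q4_1",
    4: "mostly Q4_1, some F16",
}

def describe_ftype(value: int) -> str:
    return LLAMA_FTYPE_NAMES.get(value, "unknown, may not work")

# e.g. the value passed on the quantize command line:
print(describe_ftype(2))  # mostly Q4_0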
@ -1,12 +0,0 @@
// Internal header to be included by llama.cpp and tests/benchmarks only.

#ifndef LLAMA_INTERNAL_H
#define LLAMA_INTERNAL_H

#include <vector>
#include <string>
struct ggml_tensor;

std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);

#endif // LLAMA_INTERNAL_H
19
llama_util.h
19
llama_util.h
|
@ -26,7 +26,9 @@
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
#define WIN32_LEAN_AND_MEAN
|
#define WIN32_LEAN_AND_MEAN
|
||||||
#define NOMINMAX
|
#ifndef NOMINMAX
|
||||||
|
#define NOMINMAX
|
||||||
|
#endif
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#include <io.h>
|
#include <io.h>
|
||||||
#include <stdio.h> // for _fseeki64
|
#include <stdio.h> // for _fseeki64
|
||||||
|
@@ -41,8 +43,12 @@
 } while (0)

 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
     va_list ap, ap2;
     va_start(ap, fmt);

@@ -55,7 +61,7 @@ static std::string format(const char * fmt, ...) {
     va_end(ap2);
     va_end(ap);
     return std::string(buf.data(), size);
-};
+}

 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
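A hedged sketch of why the extra MinGW branch matters (the helper below mirrors the shape of a two-pass printf formatter but is not a copy of the library's format() function): on MinGW, the plain printf archetype checks arguments against the Microsoft CRT's format rules, so GNU-style conversions get flagged incorrectly; gnu_printf asks the compiler to check against GNU printf semantics instead, while still catching real mismatches at compile time.

    // Sketch of a printf-style helper with compile-time format checking; the
    // attribute selection mirrors the diff, the function body is illustrative.
    #include <cstdarg>
    #include <cstdio>
    #include <string>
    #include <vector>

    #ifdef __GNUC__
    #ifdef __MINGW32__
    __attribute__((format(gnu_printf, 1, 2)))  // check against GNU printf rules on MinGW
    #else
    __attribute__((format(printf, 1, 2)))
    #endif
    #endif
    static std::string format_checked(const char * fmt, ...) {
        va_list ap;
        va_start(ap, fmt);
        va_list ap2;
        va_copy(ap2, ap);
        int size = vsnprintf(nullptr, 0, fmt, ap);      // first pass: measure
        std::vector<char> buf(size + 1);
        vsnprintf(buf.data(), buf.size(), fmt, ap2);    // second pass: format
        va_end(ap2);
        va_end(ap);
        return std::string(buf.data(), size);
    }

    // format_checked("%d tensors", "7B");  // with the attribute, this mismatch is a compile-time warning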
@@ -170,7 +176,6 @@ struct llama_mmap {
         flags |= MAP_POPULATE;
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
         if (addr == MAP_FAILED) {
             throw format("mmap failed: %s", strerror(errno));
         }
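A plausible reading of the removed close(fd), sketched with a hypothetical helper rather than the loader's own code: if, as in typical mmap-over-FILE* code, the descriptor comes from fileno() of a FILE* that is still owned and later closed elsewhere, closing it at mapping time would invalidate that stream and set up a double close. The read-only mapping itself does not need the descriptor to stay open either way.

    // Hypothetical helper, not from the tree: map a file that is managed
    // through a FILE* owned elsewhere. The fd from fileno() is borrowed,
    // so it must not be closed here; the mapping stays valid regardless.
    #include <cstdio>
    #include <sys/mman.h>

    void * map_readonly(FILE * fp, size_t size) {
        int fd = fileno(fp);                                      // borrowed descriptor
        void * addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
        // no close(fd): the FILE* still owns it and will be closed by its owner
        return addr == MAP_FAILED ? nullptr : addr;
    }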
@@ -209,6 +214,7 @@ struct llama_mmap {
             throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
         }

+        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         // Advise the kernel to preload the mapped memory
         WIN32_MEMORY_RANGE_ENTRY range;
         range.VirtualAddress = addr;

@@ -217,6 +223,9 @@ struct llama_mmap {
             fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
+        #else
+        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
     }

     ~llama_mmap() {
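For context, a self-contained Windows-only sketch of the guarded prefetch (the wrapper function is made up; the API calls are the documented Win32 ones): PrefetchVirtualMemory only exists from Windows 8 on, so the call is compiled only when the build targets _WIN32_WINNT_WIN8 or newer, and older targets get a build-time message instead.

    // Windows-only sketch; prefetch_mapping is a hypothetical wrapper around
    // the documented PrefetchVirtualMemory call, compiled for Windows 8+ targets.
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
    #define NOMINMAX
    #endif
    #include <windows.h>
    #include <cstdio>

    void prefetch_mapping(void * addr, size_t len) {
    #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
        WIN32_MEMORY_RANGE_ENTRY range;
        range.VirtualAddress = addr;
        range.NumberOfBytes  = (SIZE_T) len;
        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
            fprintf(stderr, "warning: PrefetchVirtualMemory failed (error %lu)\n", GetLastError());
        }
    #else
        (void) addr; (void) len; // pre-Windows 8 target: no prefetch available
    #endif
    }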
@@ -338,8 +347,8 @@ struct llama_mlock {
             // Hopefully a megabyte is enough overhead:
             size_t increment = size + 1048576;
             // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += size;
+            min_ws_size += increment;
-            max_ws_size += size;
+            max_ws_size += increment;
             if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
                 fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
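The intent of the two-line change, restated as a hedged sketch (the helper name is invented; the Win32 calls and the padding constant match the diff): the padded increment was being computed but the un-padded size was what got added to the limits, and since the minimum working-set size may never exceed the maximum, both limits have to grow by the same padded amount.

    // Sketch of the corrected working-set growth; grow_working_set is a
    // hypothetical helper around the documented Get/SetProcessWorkingSetSize calls.
    #define WIN32_LEAN_AND_MEAN
    #include <windows.h>
    #include <cstdio>

    bool grow_working_set(size_t size) {
        SIZE_T min_ws_size, max_ws_size;
        if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
            return false;
        }
        size_t increment = size + 1048576;   // hopefully a megabyte is enough overhead
        min_ws_size += increment;            // the minimum must stay <= the maximum,
        max_ws_size += increment;            // so both grow by the same padded amount
        if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
            fprintf(stderr, "warning: SetProcessWorkingSetSize failed (error %lu)\n", GetLastError());
            return false;
        }
        return true;
    }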
migrate-ggml-2023-03-30-pr613.py (file deleted)

@@ -1,311 +0,0 @@
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
#
# We caused a breaking change to the file format on 2023-03-30 in:
# https://github.com/ggerganov/llama.cpp/pull/613
#
# (1) If you still have the Meta LLaMA .pth files, then close this
#     file now; you can just run `convert-pth-to-ggml.py` again to
#     migrate to the new format. The tool is easier to use too. It
#     isn't necessary anymore to manage split output files because
#     the new format always combines things into a single file.
#
# (2) If you deleted the Meta LLaMA .pth files due to save on disk
#     space, then this tool is intended to help you. Please check
#     out the instructions below.
#
# USAGE
#
#   python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
#
# PREREQUISITES
#
#   pip install numpy
#   cd llama.cpp
#   make -j4
#
# EXAMPLE (7B MODEL)
#
#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#   python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
#
#   # check that it works
#   ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#   # you can delete the old files
#   rm -f models/7B/ggml-model-f16.bin
#   mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
#
# EXAMPLE (13B MODEL)
#
#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#   python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
#
#   # check that it works
#   ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#   # you can delete the old files
#   rm -f models/13B/ggml-model-f16.bin*
#   mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
#

import argparse
import os
import sys
import json
import struct
import numpy as np

QK = 32

GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6

WTYPE_NAMES = {
    0: "F32",
    1: "F16",
    2: "Q4_0",
    3: "Q4_1",
}

WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
}

GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0: QK,
    GGML_TYPE_Q4_1: QK,
    GGML_TYPE_I8:   1,
    GGML_TYPE_I16:  1,
    GGML_TYPE_I32:  1,
    GGML_TYPE_F16:  1,
    GGML_TYPE_F32:  1,
}

GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4   + QK//2,
    GGML_TYPE_Q4_1: 4*2 + QK//2,
    GGML_TYPE_I8:   1,
    GGML_TYPE_I16:  2,
    GGML_TYPE_I32:  4,
    GGML_TYPE_F16:  2,
    GGML_TYPE_F32:  4,
}

HPARAMS = [
    'magic',    # int32
    'version',  # int32
    'n_vocab',  # int32
    'n_embd',   # int32
    'n_mult',   # int32
    'n_head',   # int32
    'n_layer',  # int32
    'n_rot',    # int32
    'f16',      # int32
]

def read_hparams(fin):
    struct_fmt = "i" * len(HPARAMS)
    struct_size = struct.calcsize(struct_fmt)
    buf = fin.read(struct_size)
    ints = struct.unpack(struct_fmt, buf)
    hparams = dict(zip(HPARAMS, ints))
    return hparams

def write_hparams(fout, hparams):
    struct_fmt = "i" * len(HPARAMS)
    struct_size = struct.calcsize(struct_fmt)
    ints = [hparams[h] for h in HPARAMS]
    fout.write(struct.pack(struct_fmt, *ints))

def read_tokens(fin, hparams):
    tokens = []
    for i in range(hparams['n_vocab']):
        len_b = fin.read(4)
        (length,) = struct.unpack("i", len_b)
        word = fin.read(length)
        score_b = fin.read(4)
        (score,) = struct.unpack("f", score_b)
        tokens.append((word, score))
    return tokens

def write_tokens(fout, tokens):
    for word, score in tokens:
        fout.write(struct.pack("i", len(word)))
        fout.write(word)
        fout.write(struct.pack("f", score))

def ggml_nelements(shape):
    r = 1
    for i in shape:
        r *= i
    return r

def ggml_nbytes(shape, ftype):
    x = ggml_nelements(shape)
    t = WTYPES[ftype]
    x *= GGML_TYPE_SIZE[t]
    x //= GGML_BLCK_SIZE[t]
    return x

def copy_tensors(fin, fout, part_id, n_parts):
    while True:

        b = fin.read(4)
        if not b: break
        (n_dims,) = struct.unpack("i", b)
        b = fin.read(4)
        (length,) = struct.unpack("i", b)
        b = fin.read(4)
        (ftype,) = struct.unpack("i", b)

        assert n_dims in (1, 2)

        partshape = list(range(n_dims))
        for i in range(n_dims):
            b = fin.read(4)
            partshape[i] = struct.unpack("i", b)[0]
        partshape = list(reversed(partshape))

        name = fin.read(length)
        data = fin.read(ggml_nbytes(partshape, ftype))

        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]

        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")

        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0 regex:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 regex:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        if n_dims > 1:
            split_dim = 1
            if b"tok_embeddings" in name:
                split_dim = 1
            elif b"layers" in name:
                if b"attention.wo.weight" in name:
                    split_dim = 1
                elif b"feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif b"output" in name:
                split_dim = 0

        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        fout.write(struct.pack("iii", n_dims, len(name), ftype))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(name)

        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        while tensor_data_offset % QK != 0:
            fout.write(struct.pack("B", 0))
            tensor_data_offset += 1

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            if part_id == 0:
                fout.write(data)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            fout.write(data)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bpr = partshape[1] // blck_size * type_size
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                fout.write(data[row * bpr:row * bpr + bpr])

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))

def parse_args():
    parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
    parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
    parser.add_argument('fout_path', help='your new ggjt file name')
    return parser.parse_args()

def main():
    args = parse_args()
    assert args.fin_path
    assert args.fout_path
    assert args.fin_path != args.fout_path

    with open(args.fin_path, "rb") as fin:
        hparams = read_hparams(fin)
        tokens = read_tokens(fin, hparams)

    if hparams['magic'] == 0x67676a74: # ggjt
        print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
        sys.exit(1)

    if hparams['magic'] != 0x67676d66: # ggmf
        print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
        sys.exit(1)

    hparams['magic'] = 0x67676a74 # ggjt

    # count number of multipart files by convention
    n_parts = 1
    while True:
        if os.path.exists(f"{args.fin_path}.{n_parts}"):
            n_parts += 1
        else:
            break

    # we output a single file for ggml
    with open(args.fout_path, "wb") as fout:
        write_hparams(fout, hparams)
        write_tokens(fout, tokens)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
                fin_path += f".{part_id}"
            with open(fin_path, "rb") as fin:
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)

    print(f"Done. Output file: {args.fout_path}\n")

if __name__ == "__main__":
    main()
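Since the trickiest part of the removed script is the byte arithmetic for reassembling sharded tensors, here is a small C++ restatement of that math, not part of the diff: only the QK and Q4_0 block-size constants come from the script itself; the function names and the example numbers are illustrative. A Q4_0 block packs QK = 32 weights into a 4-byte scale plus 16 bytes of packed nibbles, so a row of n columns occupies n/32 * 20 bytes, and part part_id of a tensor split along dimension 0 starts part_id * rows_per_part * bytes_per_row into the unified tensor.

    // Illustrative restatement of the removed script's size/offset math.
    #include <cstdint>
    #include <cstdio>

    constexpr int64_t QK         = 32;          // elements per quantization block
    constexpr int64_t Q4_0_BLOCK = 4 + QK / 2;  // bytes per block: f32 scale + 16 packed nibbles

    int64_t q4_0_row_bytes(int64_t n_cols) {
        return n_cols / QK * Q4_0_BLOCK;
    }

    int64_t part_offset_dim0(int64_t part_id, int64_t rows_per_part, int64_t n_cols) {
        // byte offset of this part's first row inside the reassembled tensor
        return part_id * rows_per_part * q4_0_row_bytes(n_cols);
    }

    int main() {
        // made-up example: a 4096-wide tensor split into parts of 2048 rows
        printf("bytes per row    : %lld\n", (long long) q4_0_row_bytes(4096));
        printf("offset of part 1 : %lld\n", (long long) part_offset_dim0(1, 2048, 4096));
        return 0;
    }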
@@ -4,4 +4,4 @@ User: Hello, Bob.
Bob: Hello. How may I help you today?
User: Please tell me the largest city in Europe.
Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
User:

@@ -15,4 +15,4 @@ Answer: The calculate tool says it is 9.3333333333
Question: What is capital of france?
Thought: Do I need to use an action? No, I know the answer
Answer: Paris is the capital of France
Question:
requirements.txt (new file, 2 lines)

@@ -0,0 +1,2 @@
numpy==1.24
sentencepiece==0.1.98