Merge remote-tracking branch 'upstream/master' into fix-warnings

This commit is contained in:
Cebtenzzre 2023-08-30 13:40:58 -04:00
commit ecf9af80ac
39 changed files with 1917 additions and 1120 deletions

View file

@ -7,15 +7,12 @@ arg1="$1"
# Shift the arguments to remove the first one # Shift the arguments to remove the first one
shift shift
# Join the remaining arguments into a single string
arg2="$@"
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert.py "$arg2" python3 ./convert.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$arg2" ./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./main "$arg2" ./main "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..." echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -27,7 +24,7 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
fi fi
done done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./server "$arg2" ./server "$@"
else else
echo "Unknown command: $arg1" echo "Unknown command: $arg1"
echo "Available commands: " echo "Available commands: "

View file

@ -41,6 +41,12 @@ jobs:
run: | run: |
CC=gcc-8 make CC=gcc-8 make
- name: Test
id: make_test
run: |
CC=gcc-8 make tests
make test
ubuntu-latest-cmake: ubuntu-latest-cmake:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -157,6 +163,12 @@ jobs:
run: | run: |
make make
- name: Test
id: make_test
run: |
make tests
make test
macOS-latest-cmake: macOS-latest-cmake:
runs-on: macos-latest runs-on: macos-latest

43
.github/workflows/gguf-publish.yml vendored Normal file
View file

@ -0,0 +1,43 @@
# This workflow will upload a Python Package using Twine when a GGUF release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
# See `gguf-py/README.md` for how to make a release.
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
name: Upload Python Package
on:
workflow_dispatch:
push:
# Pattern matched against refs/tags
tags:
- 'gguf-v*' # Push events to every version tag
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.9.x'
- name: Install dependencies
run: |
cd gguf-py
python -m pip install poetry
poetry install
- name: Build package
run: poetry build
- name: Publish package
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}

4
.gitignore vendored
View file

@ -5,6 +5,7 @@
*.bin *.bin
*.exe *.exe
*.dll *.dll
*.log
.DS_Store .DS_Store
.build/ .build/
.cache/ .cache/
@ -41,6 +42,9 @@ models-mnt
/gguf-llama-simple /gguf-llama-simple
/libllama.so /libllama.so
/llama-bench /llama-bench
/baby-llama
/beam-search
/save-load-state
build-info.h build-info.h
arm_neon.h arm_neon.h
compile_commands.json compile_commands.json

View file

@ -301,7 +301,7 @@ if (LLAMA_METAL)
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
add_compile_definitions(GGML_USE_METAL) add_compile_definitions(GGML_USE_METAL)
add_compile_definitions(GGML_METAL_NDEBUG) #add_compile_definitions(GGML_METAL_NDEBUG)
# get full path to the file # get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
@ -402,6 +402,7 @@ if (LLAMA_ALL_WARNINGS)
-Wstrict-prototypes -Wstrict-prototypes
-Wpointer-arith -Wpointer-arith
-Wmissing-prototypes -Wmissing-prototypes
-Werror=implicit-int
-Wno-unused-function -Wno-unused-function
) )
set(cxx_flags set(cxx_flags

View file

@ -1,11 +1,28 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search tests/test-c.o
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
default: $(BUILD_TARGETS) default: $(BUILD_TARGETS)
test:
@echo "Running tests..."
@for test_target in $(TEST_TARGETS); do \
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
continue; \
else \
./$$test_target; \
fi; \
done
@echo "All tests have been run."
all: $(BUILD_TARGETS) $(TEST_TARGETS)
ifndef UNAME_S ifndef UNAME_S
UNAME_S := $(shell uname -s) UNAME_S := $(shell uname -s)
endif endif
@ -64,7 +81,7 @@ endif
# warnings # warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-Wmissing-prototypes -Wno-unused-function -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
ifeq '' '$(findstring clang++,$(CXX))' ifeq '' '$(findstring clang++,$(CXX))'
@ -310,7 +327,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
endif # LLAMA_HIPBLAS endif # LLAMA_HIPBLAS
ifdef LLAMA_METAL ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG CFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJS += ggml-metal.o OBJS += ggml-metal.o
@ -331,6 +348,11 @@ k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS endif # LLAMA_NO_K_QUANTS
ifdef LLAMA_DISABLE_LOGS
CFLAGS += -DLOG_DISABLE_LOGS
CXXFLAGS += -DLOG_DISABLE_LOGS
endif # LLAMA_DISABLE_LOGS
# #
# Print build information # Print build information
# #
@ -361,7 +383,7 @@ OBJS += ggml-alloc.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
common.o: common/common.cpp common/common.h common.o: common/common.cpp common/common.h build-info.h common/log.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
console.o: common/console.cpp common/console.h console.o: common/console.cpp common/console.h
@ -374,7 +396,7 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean: clean:
rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS) rm -vf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h $(BUILD_TARGETS) $(TEST_TARGETS)
# #
# Examples # Examples
@ -414,18 +436,33 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS) gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o common.o $(OBJS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
BUILD_TARGETS += metal
endif
ifdef LLAMA_METAL
metal: examples/metal/metal.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif
build-info.h: $(wildcard .git/index) scripts/build-info.sh build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp @sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \ @if ! cmp -s $@.tmp $@; then \
@ -448,34 +485,37 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS) tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@

View file

@ -107,12 +107,13 @@ as the main playground for developing new features for the [ggml](https://github
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
**UI:** **UI:**
@ -728,8 +729,6 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML) - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML) - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML) - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
- Specify `-eps 1e-5` for best generation quality
- Specify `-gqa 8` for 70B models to work
### Verifying the model files ### Verifying the model files

View file

@ -480,6 +480,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
} }
} else if (arg == "-h" || arg == "--help") { } else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, default_params); gpt_print_usage(argc, argv, default_params);
#ifndef LOG_DISABLE_LOGS
log_print_usage();
#endif // LOG_DISABLE_LOGS
exit(0); exit(0);
} else if (arg == "--random-prompt") { } else if (arg == "--random-prompt") {
params.random_prompt = true; params.random_prompt = true;
@ -519,6 +522,25 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
std::istreambuf_iterator<char>(), std::istreambuf_iterator<char>(),
std::back_inserter(params.grammar) std::back_inserter(params.grammar)
); );
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
} else if ( log_param_single_parse( argv[i] ) ) {
// Do nothing, log_param_single_parse automatically does its thing
// and returns if a match was found and parsed.
} else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) {
// We have a matching known parameter requiring an argument,
// now we need to check if there is anything after this argv
// and flag invalid_param or parse it.
if (++i >= argc) {
invalid_param = true;
break;
}
if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) {
invalid_param = true;
break;
}
// End of Parse args for logging parameters
#endif // LOG_DISABLE_LOGS
} else { } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params); gpt_print_usage(argc, argv, default_params);

View file

@ -4,6 +4,9 @@
#include "llama.h" #include "llama.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
#include <string> #include <string>
#include <vector> #include <vector>
#include <random> #include <random>

643
common/log.h Normal file
View file

@ -0,0 +1,643 @@
#pragma once
#include <chrono>
#include <cstring>
#include <sstream>
#include <iostream>
#include <thread>
#include <vector>
#include <algorithm>
#include <cinttypes>
// --------------------------------
//
// Basic usage:
//
// --------
//
// The LOG() and LOG_TEE() macros are ready to go by default
// they do not require any initialization.
//
// LOGLN() and LOG_TEELN() are variants which automatically
// include \n character at the end of the log string.
//
// LOG() behaves exactly like printf, by default writing to a logfile.
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
//
// Default logfile is named
// "llama.<threadID>.log"
// Default LOG_TEE() secondary output target is
// stderr
//
// Logs can be dynamically disabled or enabled using functions:
// log_disable()
// and
// log_enable()
//
// A log target can be changed with:
// log_set_target( string )
// creating and opening, or re-opening a file by string filename
// or
// log_set_target( FILE* )
// allowing to point at stderr, stdout, or any valid FILE* file handler.
//
// --------
//
// End of Basic usage.
//
// --------------------------------
// Specifies a log target.
// default uses log_handler() with the "llama.<threadID>.log" log file
// this can be changed, by defining LOG_TARGET
// like so:
//
// #define LOG_TARGET (a valid FILE*)
// #include "log.h"
//
// or it can be simply redirected to stdout or stderr
// like so:
//
// #define LOG_TARGET stderr
// #include "log.h"
//
// The log target can also be redirected to a different function
// like so:
//
// #define LOG_TARGET log_handler_diffrent()
// #include "log.h"
//
// FILE* log_handler_diffrent()
// {
// return stderr;
// }
//
// or:
//
// #define LOG_TARGET log_handler_another_one("somelog.log")
// #include "log.h"
//
// FILE* log_handler_another_one(char*filename)
// {
// static FILE* logfile = nullptr;
// (...)
// if( !logfile )
// {
// fopen(...)
// }
// (...)
// return logfile
// }
//
#ifndef LOG_TARGET
#define LOG_TARGET log_handler()
#endif
#ifndef LOG_TEE_TARGET
#define LOG_TEE_TARGET stderr
#endif
// Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid()
{
    // The identifier is computed on the first call and reused afterwards,
    // so every later call returns the id of the thread that called first.
    static std::string cached_id;
    if (!cached_id.empty())
    {
        return cached_id;
    }

    // std::this_thread::get_id() is the most portable way of obtaining a "process id";
    // it is not a real pid, but it is unique enough to keep multiple instances
    // from writing to the same log.
    std::ostringstream id_text;
    id_text << std::this_thread::get_id();
    cached_id = id_text.str();
    return cached_id;
}
// Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
// where the number is a runtime id of the current thread.
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
// INTERNAL, DO NOT USE
// Builds "<basename>.<id>.<extension>", where <id> is the value returned by log_get_pid().
inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
{
    const std::string dot = ".";
    return log_file_basename + dot + log_get_pid() + dot + log_file_extension;
}
#ifndef LOG_DEFAULT_FILE_NAME
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
#endif
// Utility for turning #define values into string literals
// so we can have a define for stderr and
// we can print "stderr" instead of literal stderr, etc.
#define LOG_STRINGIZE1(s) #s
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
// Allows disabling timestamps.
// in order to disable, define LOG_NO_TIMESTAMPS
// like so:
//
// #define LOG_NO_TIMESTAMPS
// #include "log.h"
//
// Timestamp prefix for the primary log target.
//   LOG_TIMESTAMP_FMT - printf format fragment for the prefix
//   LOG_TIMESTAMP_VAL - the matching argument, written with a leading comma so it can be
//                       pasted straight into an argument list
// The value is system_clock's time since epoch converted to a duration with a
// std::uint64_t representation and the default period (whole seconds).
// The definition is the same on every platform.
#ifdef LOG_NO_TIMESTAMPS
    // Timestamps switched off: an empty string fed to "%s" keeps format and arguments paired.
    #define LOG_TIMESTAMP_FMT "%s"
    #define LOG_TIMESTAMP_VAL ,""
#else
    #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
    #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
// Timestamp prefix for the secondary (tee) target.
// Off unless LOG_TEE_TIMESTAMPS is defined before this header is included.
// When on, the format and value are the same on every platform:
// whole seconds since the system_clock epoch as a std::uint64_t.
#ifndef LOG_TEE_TIMESTAMPS
    // Default: no timestamp; an empty string fed to "%s" keeps format and arguments paired.
    #define LOG_TEE_TIMESTAMP_FMT "%s"
    #define LOG_TEE_TIMESTAMP_VAL ,""
#else
    #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
    #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
// Allows disabling file/line/function prefix
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
// like so:
//
// #define LOG_NO_FILE_LINE_FUNCTION
// #include "log.h"
//
// File/line/function prefix for the primary log target.
//   LOG_FLF_FMT - printf format fragment: "[<file>:<line>][<function>] "
//   LOG_FLF_VAL - the matching arguments, written with a leading comma so they can be
//                 pasted straight into an argument list
// When LOG_NO_FILE_LINE_FUNCTION is defined the prefix collapses to "%s" fed with "".
#ifndef LOG_NO_FILE_LINE_FUNCTION
#ifndef _WIN32
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
// NOTE(review): the Windows branch prints __LINE__ with %ld instead of %d — presumably
// because of how the MSVC toolchain types __LINE__; confirm before unifying the branches.
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#endif
#else
#define LOG_FLF_FMT "%s"
#define LOG_FLF_VAL ,""
#endif
// Same prefix for the secondary (tee) target.
// Off unless LOG_TEE_FILE_LINE_FUNCTION is defined before this header is included.
#ifdef LOG_TEE_FILE_LINE_FUNCTION
#ifndef _WIN32
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#endif
#else
#define LOG_TEE_FLF_FMT "%s"
#define LOG_TEE_FLF_VAL ,""
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
// Three-valued flag passed as the `disable` argument of log_handler1_impl().
enum LogTriState
{
// Default value of the `disable` argument: the enabled/disabled state is left untouched.
LogTriStateSame,
// Passed by log_enable_impl(): clears the disabled flag.
LogTriStateFalse,
// Passed by log_disable_impl(): sets the disabled flag.
LogTriStateTrue
};
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
#ifndef _WIN32
// Writes one formatted message to LOG_TARGET and flushes it.
// The format string is assembled at compile time from
//   LOG_TIMESTAMP_FMT + LOG_FLF_FMT + str + "%s"
// and the arguments are LOG_TIMESTAMP_VAL, LOG_FLF_VAL, then __VA_ARGS__.
// The last variadic argument is the string consumed by the trailing "%s"
// ("" from LOG(), "\n" from LOGLN()).
// Nothing is written when LOG_TARGET evaluates to nullptr.
// LOG_TARGET is evaluated three times per invocation (check, fprintf, fflush).
// The macro expands to a braced block, so call sites do not need a trailing semicolon.
#define LOG_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
}
#else
// Windows variant of the macro above: same output, but the variadic arguments are
// pasted with `, ##__VA_ARGS__` and an extra "" literal is appended after LOG_FLF_VAL.
// NOTE(review): presumably this works around MSVC's handling of empty __VA_ARGS__ — confirm.
#define LOG_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
}
#endif
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#ifndef _WIN32
// Writes one formatted message to LOG_TARGET and, additionally, to LOG_TEE_TARGET.
// First write: identical to LOG_IMPL (primary target, primary timestamp/FLF prefixes).
// Second write: uses the LOG_TEE_* prefixes and goes to LOG_TEE_TARGET, but only when
//   - LOG_TARGET is not nullptr (so a disabled log also silences the tee output),
//   - LOG_TARGET is neither stdout nor stderr (avoids printing the message twice
//     on a standard stream), and
//   - LOG_TEE_TARGET is not nullptr.
// Each target is flushed right after its write.
// As with LOG_IMPL, the last variadic argument feeds the trailing "%s".
#define LOG_TEE_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
}
#else
// Windows variant of the macro above: same two writes, with the variadic arguments
// pasted via `, ##__VA_ARGS__` and an extra "" literal appended after the FLF values.
#define LOG_TEE_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
}
#endif
// The trailing "" (or "\n") passed as the last argument is a trick to bypass the silly
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
// so we can have a single macro which can be called just like printf.
// Main LOG macro.
// behaves like printf, and supports arguments the exact same way.
//
#ifndef _WIN32
// Forwards everything to LOG_IMPL and appends "" as the final argument,
// which is consumed by the "%s" that LOG_IMPL adds to the end of the format.
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
// Windows form: additionally prefixes the format with "%s" and supplies "" for it,
// so the forwarded argument list starts with an argument even when the caller passes none.
#define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
// Main TEE macro.
// does the same as LOG
// and
// simultaneously writes stderr.
//
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
#ifndef _WIN32
// Forwards everything to LOG_TEE_IMPL and appends "" as the final argument,
// which is consumed by the "%s" that LOG_TEE_IMPL adds to the end of the format.
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
// Windows form: additionally prefixes the format with "%s" and supplies "" for it.
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
#ifndef _WIN32
// Same as LOG()/LOG_TEE(), except that the final argument is "\n" instead of "",
// so the "%s" at the end of the format emits a line break after the message.
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
// Windows forms: additionally prefix the format with "%s" and supply "" for it.
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
#endif
// INTERNAL, DO NOT USE
// Holds the state of the primary log target and returns the stream to write to.
//
// Parameters:
//   change   - false: only query the current stream (this is what LOG_TARGET does by default).
//              true:  apply `disable` or, if `disable` is LogTriStateSame, select a new target.
//   disable  - LogTriStateTrue disables logging, LogTriStateFalse re-enables it and keeps
//              the previously selected target, LogTriStateSame leaves the flag untouched.
//   filename - log file to open (mode "w") when `target` is nullptr.
//   target   - an already open stream to log to; takes precedence over `filename`.
//
// Returns:
//   nullptr while logging is disabled, otherwise the active stream. If the log file
//   cannot be opened, an error is printed and stderr becomes the active stream.
//
// Notes:
//   - All state lives in function-local statics initialised from the arguments of the
//     very first call.
//   - When the selected target changes, the previously active stream is closed
//     unless it is stdout or stderr.
//   - The file-name branch records the selected name and clears the remembered FILE*
//     target. Without this, selecting the default file name again after a custom file
//     was ignored, and repeating a file-name selection after a FILE* target re-opened
//     the file without closing the previous handle.
//   - A (re)initialisation performed while logging is disabled returns nullptr, so the
//     LOG macros never see a non-null stream on one evaluation of LOG_TARGET and a
//     nullptr on the next.
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{
    static bool _initialized{false};
    static bool _disabled{(filename.empty() && target == nullptr)};
    static std::string log_current_filename{filename};
    static FILE *log_current_target{target};
    static FILE *logfile = nullptr;

    if (change)
    {
        if (disable == LogTriStateTrue)
        {
            // Disable primary target
            _disabled = true;
        }
        // If previously disabled, only enable, and keep previous target
        else if (disable == LogTriStateFalse)
        {
            _disabled = false;
        }
        // Otherwise, process the arguments
        else if (log_current_filename != filename || log_current_target != target)
        {
            _initialized = false;
        }
    }

    if (_initialized)
    {
        if (_disabled)
        {
            // Log is disabled
            return nullptr;
        }

        // with fallback in case something went wrong
        return logfile ? logfile : stderr;
    }

    // do the (re)initialization
    if (target != nullptr)
    {
        if (logfile != nullptr && logfile != stdout && logfile != stderr)
        {
            fclose(logfile);
        }

        log_current_filename = LOG_DEFAULT_FILE_NAME;
        log_current_target = target;

        logfile = target;
    }
    else
    {
        if (log_current_filename != filename)
        {
            if (logfile != nullptr && logfile != stdout && logfile != stderr)
            {
                fclose(logfile);
            }
        }

        // Record the new selection so that later change requests are compared
        // against what is actually in use.
        log_current_filename = filename;
        log_current_target = nullptr;

        logfile = fopen(filename.c_str(), "w");
    }

    if (!logfile)
    {
        // Verify whether the file was opened, otherwise fallback to stderr
        logfile = stderr;

        fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
        fflush(stderr);

        // At this point we let the init flag be to true below, and let the target fallback to stderr
        // otherwise we would repeatedly fopen() which was already unsuccessful
    }

    _initialized = true;

    if (_disabled)
    {
        // The target was (re)selected while logging is off: stay silent until re-enabled.
        return nullptr;
    }

    return logfile ? logfile : stderr;
}
// INTERNAL, DO NOT USE
// Same as log_handler1_impl(), but takes the FILE* target before the file name,
// so a stream can be passed without spelling out a file name.
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{
    // Only the order of the last two arguments differs between the two handlers.
    FILE * const active_stream = log_handler1_impl(change, disable, filename, target);
    return active_stream;
}
// Disables logs entirely at runtime.
// Makes LOG() and LOG_TEE() produce no output,
// until they are enabled again.
#define log_disable() log_disable_impl()
// INTERNAL, DO NOT USE
// Sets the disabled flag of the primary log target; returns whatever the handler returns.
inline FILE *log_disable_impl()
{
    const bool apply_change = true;
    return log_handler1_impl(apply_change, LogTriStateTrue);
}
// Enables logs at runtime.
#define log_enable() log_enable_impl()
// INTERNAL, DO NOT USE
// Clears the disabled flag of the primary log target; returns whatever the handler returns.
inline FILE *log_enable_impl()
{
    const bool apply_change = true;
    return log_handler1_impl(apply_change, LogTriStateFalse);
}
// Sets target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE
// Selects a log file by name; the enabled/disabled flag is left as it is.
inline FILE *log_set_target_impl(const std::string & filename)
{
    const bool apply_change = true;
    return log_handler1_impl(apply_change, LogTriStateSame, filename);
}
// Selects an already open stream; the enabled/disabled flag is left as it is.
inline FILE *log_set_target_impl(FILE *target)
{
    const bool apply_change = true;
    return log_handler2_impl(apply_change, LogTriStateSame, target);
}
// INTERNAL, DO NOT USE
// Default LOG_TARGET: queries the handler with all default arguments (no state change requested).
inline FILE *log_handler()
{
    return log_handler1_impl();
}
// Self-test for the logging helpers; run by log_param_single_parse() for "--log-test".
// Walks through disabling/enabling the log and switching between file and stream
// targets, emitting one numbered message per step. The statements are strictly
// order-dependent: each message describes the state set up by the calls before it.
// The LOG*/LOG_TEE* calls carry no trailing semicolon because those macros expand
// to a braced block.
inline void log_test()
{
// Disabled log: message 01 must not appear anywhere.
log_disable();
LOG("01 Hello World to nobody, because logs are disabled!\n")
log_enable();
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
// Stream targets: LOG_TEE skips its second write while the primary target is stdout or stderr.
log_set_target(stderr);
LOG("04 Hello World to stderr!\n")
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("06 Hello World to default log file!\n")
log_set_target(stdout);
LOG("07 Hello World to stdout!\n")
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("08 Hello World to default log file again!\n")
// Disable / enable round trip on the same target.
log_disable();
LOG("09 Hello World _1_ into the void!\n")
log_enable();
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
// Selecting a new file while disabled: the selection must take effect once re-enabled.
log_disable();
log_set_target("llama.anotherlog.log");
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
log_enable();
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
log_set_target("llama.yetanotherlog.log");
LOG("13 Hello World this time in yet new file?\n")
log_set_target(log_filename_generator("llama_autonamed", "log"));
LOG("14 Hello World in log with generated filename!\n")
#ifdef _WIN32
// Windows only: exercise every macro both with and without extra arguments.
LOG_TEE("15 Hello msvc TEE without arguments\n")
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
LOG_TEELN("17 Hello msvc TEELN without arguments\n")
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
LOG("19 Hello msvc LOG without arguments\n")
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
LOGLN("21 Hello msvc LOGLN without arguments\n")
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
#endif
}
// Handle a stand-alone (valueless) logging command line option.
//   "--log-test"    runs log_test()
//   "--log-disable" calls log_disable()
//   "--log-enable"  calls log_enable()
// Returns true when `param` was one of the options above (and has been acted
// upon), false when it is not a single-argument logging option.
inline bool log_param_single_parse(const std::string & param)
{
    bool recognized = true;

    if (param == "--log-test")
    {
        log_test();
    }
    else if (param == "--log-disable")
    {
        log_disable();
    }
    else if (param == "--log-enable")
    {
        log_enable();
    }
    else
    {
        recognized = false;
    }

    return recognized;
}
// Handle a logging command line option that takes a value ("--log-file <name>").
//   check_but_dont_parse  when true, only report whether `param` is such an
//                         option; do not touch the log target
//   param                 the option being examined
//   next                  the option's value; an empty value falls back to
//                         the base name "unnamed"
// When applied, the log target becomes the file name produced by
// log_filename_generator(<base name>, "log").
// Returns true when `param` is "--log-file", false otherwise.
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
{
    if (param != "--log-file")
    {
        return false;
    }

    if (check_but_dont_parse)
    {
        // Caller only wanted to know that this option consumes a value.
        return true;
    }

    const std::string basename = next.empty() ? "unnamed" : next;
    log_set_target(log_filename_generator(basename, "log"));
    return true;
}
// Print the help text for the logging command line options to stdout.
inline void log_print_usage()
{
    /* Reference for adding entries (keep the columns aligned with these):
     *   format : " -h, --help show this help message and exit\n"
     *   spacing: "__-param----------------Description\n"
     */
    const char * const usage_lines[] = {
        "log options:\n",
        " --log-test Run simple logging test\n",
        " --log-disable Disable trace logs\n",
        " --log-enable Enable trace logs\n",
        " --log-file Specify a log filename (without extension)\n",
        " Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n",
    };

    // None of the lines contains a conversion specifier, so they are written verbatim.
    for (const char * line : usage_lines)
    {
        fputs(line, stdout);
    }
}
// Log the command line the program was started with (see log_dump_cmdline_impl).
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
// INTERNAL, DO NOT USE
// Joins argv[0..argc) into one string - every argument preceded by a space, and
// arguments containing a space wrapped in double quotes - then logs it through
// LOGLN with the prefix "Cmd:". Embedded quotes are not escaped.
inline void log_dump_cmdline_impl(int argc, char **argv)
{
    std::stringstream cmdline;

    for (int idx = 0; idx < argc; ++idx)
    {
        const std::string arg(argv[idx]);
        const bool has_space = arg.find(' ') != std::string::npos;
        const char * quote = has_space ? "\"" : "";

        cmdline << " " << quote << arg << quote;
    }

    LOGLN("Cmd:%s", cmdline.str().c_str())
}
// Convert a value to a C string suitable for a "%s" log argument.
// The pointer refers to a temporary std::string, so it is only valid within the
// full expression in which log_tostr(...) appears.
#define log_tostr(var) log_var_to_string_impl(var).c_str()
// bool overload: yields the text "true" or "false".
inline std::string log_var_to_string_impl(bool var)
{
    if (var)
    {
        return "true";
    }
    return "false";
}
// std::string overload: strings are already printable, so the argument is
// returned unchanged. Exists so log_tostr() can be applied uniformly.
inline std::string log_var_to_string_impl(std::string var) { return var; }
// std::vector<int> overload: renders the elements as "[ a, b, c ]".
// An empty vector yields "[  ]" (opening "[ " immediately followed by " ]").
inline std::string log_var_to_string_impl(const std::vector<int> & var)
{
    std::stringstream out;
    out << "[ ";

    for (std::vector<int>::size_type idx = 0; idx < var.size(); ++idx)
    {
        // Separator goes in front of every element except the first one.
        if (idx != 0)
        {
            out << ", ";
        }
        out << std::to_string(var[idx]);
    }

    out << " ]";
    return out.str();
}
// Format a token sequence as a human-readable C string for "%s" log arguments.
//
// Output shape: "[ 'piece':id, 'piece':id, ... ]", where `piece` is the result of
// llama_token_to_piece(ctx, token) with every character for which std::isprint
// is false removed, and `id` is std::to_string(token).
//
//   ctx     passed as first argument to llama_token_to_piece
//   tokens  any range usable in a range-based for loop
//
// Usage notes:
//  - Both arguments are placed directly in the lambda capture list
//    ([&tokens, &ctx]), so they must be plain variable names, not expressions.
//  - The macro evaluates to .c_str() of a temporary std::string; the pointer is
//    only valid until the end of the full expression containing the macro, so
//    use it directly as a call argument and do not store it.
//  - Do not add `//` comments inside the definition: the line continuations
//    would pull the following lines into the comment.
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
[&tokens, &ctx]() \
{ \
std::stringstream buf; \
buf << "[ "; \
\
bool first = true; \
for (const auto &token : tokens) \
{ \
if (!first) \
buf << ", "; \
else \
first = false; \
\
auto detokenized = llama_token_to_piece(ctx, token); \
\
detokenized.erase( \
std::remove_if( \
detokenized.begin(), \
detokenized.end(), \
[](const unsigned char c) { return !std::isprint(c); }), \
detokenized.end()); \
\
buf \
<< "'" << detokenized << "'" \
<< ":" << std::to_string(token); \
} \
buf << " ]"; \
\
return buf.str(); \
}() \
.c_str()
// Compile-time kill switch for logging.
// When LOG_DISABLE_LOGS is defined, the macros below are redefined so that:
//   - LOG / LOGLN expand to nothing;
//   - LOG_TEE / LOG_TEELN degrade to a plain fprintf on stderr;
//   - LOG_DISABLE / LOG_ENABLE / LOG_SET_TARGET / LOG_DUMP_CMDLINE expand to nothing.
// The trailing ';' inside the LOG_TEE / LOG_TEELN replacements is deliberate:
// call sites in this file invoke the logging macros without a terminating ';'.
// Each #undef is harmless if the name was never defined.
#ifdef LOG_DISABLE_LOGS
#undef LOG
#define LOG(...) // dummy stub
#undef LOGLN
#define LOGLN(...) // dummy stub
#undef LOG_TEE
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
#undef LOG_TEELN
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
#undef LOG_DISABLE
#define LOG_DISABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_SET_TARGET
#define LOG_SET_TARGET(...) // dummy stub
#undef LOG_DUMP_CMDLINE
#define LOG_DUMP_CMDLINE(...) // dummy stub
#endif // LOG_DISABLE_LOGS

View file

@ -8,6 +8,7 @@ import struct
import json import json
import numpy as np import numpy as np
import torch import torch
import argparse
from typing import Any, List from typing import Any, List
from pathlib import Path from pathlib import Path
@ -32,11 +33,10 @@ def bytes_to_unicode():
bs.append(b) bs.append(b)
cs.append(2**8+n) cs.append(2**8+n)
n += 1 n += 1
cs = [chr(n) for n in cs] return dict(zip(bs, (chr(n) for n in cs)))
return dict(zip(bs, cs))
def count_model_parts(dir_model: str) -> int: def count_model_parts(dir_model: Path) -> int:
num_parts = 0 num_parts = 0
for filename in os.listdir(dir_model): for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"): if filename.startswith("pytorch_model-"):
@ -47,17 +47,22 @@ def count_model_parts(dir_model: str) -> int:
return num_parts return num_parts
if len(sys.argv) < 3: def parse_args() -> argparse.Namespace:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n") parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
print(" ftype == 0 -> float32") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
print(" ftype == 1 -> float16") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1) sys.exit(1)
# output in the same directory as the model
dir_model = sys.argv[1]
last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types # possible tensor data types
# ftype == 0 -> float32 # ftype == 0 -> float32
# ftype == 1 -> float16 # ftype == 1 -> float16
@ -65,25 +70,21 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
# map from ftype to string # map from ftype to string
ftype_str = ["f32", "f16"] ftype_str = ["f32", "f16"]
ftype = 1 if args.outfile is not None:
if len(sys.argv) > 2: fname_out = args.outfile
ftype = int(sys.argv[2]) else:
if ftype < 0 or ftype > 1: # output in the same directory as the model by default
print("Invalid ftype: " + str(ftype)) fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
sys.exit(1) print("gguf: loading model "+dir_model.name)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" with open(dir_model / "config.json", "r", encoding="utf-8") as f:
print("gguf: loading model "+last_dir)
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f) hparams = json.load(f)
if hparams["architectures"][0] != "RWForCausalLM": if hparams["architectures"][0] != "RWForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0]) print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit() sys.exit(1)
# get number of model parts # get number of model parts
num_parts = count_model_parts(dir_model) num_parts = count_model_parts(dir_model)
@ -113,23 +114,20 @@ gguf_writer.add_file_type(ftype)
print("gguf: get tokenizer metadata") print("gguf: get tokenizer metadata")
tokens: List[str] = [] tokens: List[bytearray] = []
scores: List[float] = [] scores: List[float] = []
toktypes: List[int] = [] toktypes: List[int] = []
merges: List[str] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
if Path(dir_model + "/tokenizer.json").is_file():
# gpt2 tokenizer # gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2") gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer merges") with open(tokenizer_json_file, "r", encoding="utf-8") as f:
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
tokenizer_json = json.load(f) tokenizer_json = json.load(f)
merges = tokenizer_json["model"]["merges"]
gguf_writer.add_token_merges(merges)
print("gguf: get gpt2 tokenizer vocab") print("gguf: get gpt2 tokenizer vocab")
@ -166,24 +164,8 @@ if Path(dir_model + "/tokenizer.json").is_file():
gguf_writer.add_token_scores(scores) gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes) gguf_writer.add_token_types(toktypes)
print("gguf: get special token ids") special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
# Look for special tokens in config.json special_vocab.add_to_gguf(gguf_writer)
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
# TENSORS # TENSORS
@ -199,15 +181,17 @@ head_dim = hparams["hidden_size"] // n_head
print("gguf: get tensor metadata") print("gguf: get tensor metadata")
if num_parts == 0: if num_parts == 0:
part_names = ("pytorch_model.bin",) part_names = iter(("pytorch_model.bin",))
else: else:
part_names = ( part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
) )
for part_name in part_names: for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'") print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys(): for name in model_part.keys():
data = model_part[name] data = model_part[name]
@ -238,11 +222,8 @@ for part_name in part_names:
data = data.squeeze().numpy() data = data.squeeze().numpy()
# map tensor names # map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map: new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
name = tensor_map[name[:-7]] + ".weight" if new_name is None:
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print("Can not map tensor '" + name + "'") print("Can not map tensor '" + name + "'")
sys.exit() sys.exit()
@ -261,19 +242,20 @@ for part_name in part_names:
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(name, data) gguf_writer.add_tensor(new_name, data)
print("gguf: write header") print("gguf: write header")
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
print("gguf: write metadata") print("gguf: write metadata")
gguf_writer.write_kv_data_to_file() gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors") print("gguf: write tensors")
gguf_writer.write_tensors_to_file() gguf_writer.write_tensors_to_file()
gguf_writer.close() gguf_writer.close()
print("gguf: model successfully exported to '" + fname_out + "'") print(f"gguf: model successfully exported to '{fname_out}'")
print("") print("")

View file

@ -8,6 +8,7 @@ import struct
import json import json
import numpy as np import numpy as np
import torch import torch
import argparse
from typing import Any, List from typing import Any, List
from pathlib import Path from pathlib import Path
@ -34,11 +35,10 @@ def bytes_to_unicode():
bs.append(b) bs.append(b)
cs.append(2**8+n) cs.append(2**8+n)
n += 1 n += 1
cs = [chr(n) for n in cs] return dict(zip(bs, (chr(n) for n in cs)))
return dict(zip(bs, cs))
def count_model_parts(dir_model: str) -> int: def count_model_parts(dir_model: Path) -> int:
num_parts = 0 num_parts = 0
for filename in os.listdir(dir_model): for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"): if filename.startswith("pytorch_model-"):
@ -49,17 +49,22 @@ def count_model_parts(dir_model: str) -> int:
return num_parts return num_parts
if len(sys.argv) < 3: def parse_args() -> argparse.Namespace:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n") parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
print(" ftype == 0 -> float32") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
print(" ftype == 1 -> float16") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1) sys.exit(1)
# output in the same directory as the model
dir_model = sys.argv[1]
last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types # possible tensor data types
# ftype == 0 -> float32 # ftype == 0 -> float32
# ftype == 1 -> float16 # ftype == 1 -> float16
@ -67,19 +72,15 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
# map from ftype to string # map from ftype to string
ftype_str = ["f32", "f16"] ftype_str = ["f32", "f16"]
ftype = 1 if args.outfile is not None:
if len(sys.argv) > 2: fname_out = args.outfile
ftype = int(sys.argv[2]) else:
if ftype < 0 or ftype > 1: # output in the same directory as the model by default
print("Invalid ftype: " + str(ftype)) fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
sys.exit(1) print("gguf: loading model "+dir_model.name)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" with open(dir_model / "config.json", "r", encoding="utf-8") as f:
print("gguf: loading model "+last_dir)
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f) hparams = json.load(f)
if hparams["architectures"][0] != "GPTNeoXForCausalLM": if hparams["architectures"][0] != "GPTNeoXForCausalLM":
@ -97,7 +98,7 @@ print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
gguf_writer.add_name(last_dir) gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_position_embeddings"]) gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(hparams["hidden_size"]) gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count) gguf_writer.add_block_count(block_count)
@ -111,21 +112,18 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
print("gguf: get tokenizer metadata") print("gguf: get tokenizer metadata")
tokens: List[str] = [] tokens: List[bytearray] = []
merges: List[str] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
if Path(dir_model + "/tokenizer.json").is_file():
# gpt2 tokenizer # gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2") gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer merges") with open(tokenizer_json_file, "r", encoding="utf-8") as f:
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
tokenizer_json = json.load(f) tokenizer_json = json.load(f)
merges = tokenizer_json["model"]["merges"]
gguf_writer.add_token_merges(merges)
print("gguf: get gpt2 tokenizer vocab") print("gguf: get gpt2 tokenizer vocab")
@ -158,39 +156,8 @@ if Path(dir_model + "/tokenizer.json").is_file():
gguf_writer.add_token_list(tokens) gguf_writer.add_token_list(tokens)
if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file(): special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
print("gguf: get special token ids") special_vocab.add_to_gguf(gguf_writer)
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_config = json.load(f)
# find special token ids
if "bos_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["bos_token"]:
gguf_writer.add_bos_token_id(key["id"])
if "eos_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["eos_token"]:
gguf_writer.add_eos_token_id(key["id"])
if "unk_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["unk_token"]:
gguf_writer.add_unk_token_id(key["id"])
if "sep_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["sep_token"]:
gguf_writer.add_sep_token_id(key["id"])
if "pad_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["pad_token"]:
gguf_writer.add_pad_token_id(key["id"])
# TENSORS # TENSORS
@ -200,13 +167,15 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
print("gguf: get tensor metadata") print("gguf: get tensor metadata")
if num_parts == 0: if num_parts == 0:
part_names = ("pytorch_model.bin",) part_names = iter(("pytorch_model.bin",))
else: else:
part_names = ( part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
) )
for part_name in part_names: for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'") print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
@ -226,11 +195,8 @@ for part_name in part_names:
data = data.squeeze().numpy() data = data.squeeze().numpy()
# map tensor names # map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map: new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
name = tensor_map[name[:-7]] + ".weight" if new_name is None:
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print("Can not map tensor '" + name + "'") print("Can not map tensor '" + name + "'")
sys.exit() sys.exit()
@ -249,19 +215,20 @@ for part_name in part_names:
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(name, data) gguf_writer.add_tensor(new_name, data)
print("gguf: write header") print("gguf: write header")
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
print("gguf: write metadata") print("gguf: write metadata")
gguf_writer.write_kv_data_to_file() gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors") print("gguf: write tensors")
gguf_writer.write_tensors_to_file() gguf_writer.write_tensors_to_file()
gguf_writer.close() gguf_writer.close()
print("gguf: model successfully exported to '" + fname_out + "'") print(f"gguf: model successfully exported to '{fname_out}'")
print("") print("")

View file

@ -10,8 +10,9 @@ import struct
import json import json
import numpy as np import numpy as np
import torch import torch
import argparse
from typing import Any, List from typing import Any, List, TypeAlias
from pathlib import Path from pathlib import Path
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
@ -20,7 +21,7 @@ from sentencepiece import SentencePieceProcessor
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
def count_model_parts(dir_model: str) -> int: def count_model_parts(dir_model: Path) -> int:
num_parts = 0 num_parts = 0
for filename in os.listdir(dir_model): for filename in os.listdir(dir_model):
if filename.startswith("consolidated."): if filename.startswith("consolidated."):
@ -31,19 +32,22 @@ def count_model_parts(dir_model: str) -> int:
return num_parts return num_parts
if len(sys.argv) < 3: def parse_args() -> argparse.Namespace:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n") parser = argparse.ArgumentParser(description="Convert a PyTorch 7B LLaMA model to a GGML compatible file")
print(" ftype == 0 -> float32") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
print(" ftype == 1 -> float16") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1) sys.exit(1)
# output in the same directory as the model
dir_model = sys.argv[1]
last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types # possible tensor data types
# ftype == 0 -> float32 # ftype == 0 -> float32
# ftype == 1 -> float16 # ftype == 1 -> float16
@ -51,19 +55,15 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
# map from ftype to string # map from ftype to string
ftype_str = ["f32", "f16"] ftype_str = ["f32", "f16"]
ftype = 1 if args.outfile is not None:
if len(sys.argv) > 2: fname_out = args.outfile
ftype = int(sys.argv[2]) else:
if ftype < 0 or ftype > 1: # output in the same directory as the model by default
print("Invalid ftype: " + str(ftype)) fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
sys.exit(1) print("gguf: loading model "+dir_model.name)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" with open(dir_model / "config.json", "r", encoding="utf-8") as f:
print("gguf: loading model "+last_dir)
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f) hparams = json.load(f)
if hparams["architectures"][0] != "LlamaForCausalLM": if hparams["architectures"][0] != "LlamaForCausalLM":
@ -107,7 +107,7 @@ else:
sys.exit() sys.exit()
gguf_writer.add_name(last_dir) gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo) gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth") gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length) gguf_writer.add_context_length(ctx_length)
@ -133,11 +133,15 @@ tokens: List[bytes] = []
scores: List[float] = [] scores: List[float] = []
toktypes: List[int] = [] toktypes: List[int] = []
if Path(dir_model + "/tokenizer.model").is_file(): tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece # vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab and scores") print("gguf: get sentencepiece tokenizer vocab and scores")
tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model") tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()): for i in range(tokenizer.vocab_size()):
text: bytes text: bytes
@ -164,8 +168,9 @@ if Path(dir_model + "/tokenizer.model").is_file():
scores.append(score) scores.append(score)
toktypes.append(toktype) toktypes.append(toktype)
if Path(dir_model + "/added_tokens.json").is_file(): added_tokens_file = dir_model / 'added_tokens.json'
with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f) addtokens_json = json.load(f)
print("gguf: get added tokens") print("gguf: get added tokens")
@ -180,62 +185,8 @@ if Path(dir_model + "/tokenizer.model").is_file():
gguf_writer.add_token_scores(scores) gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes) gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
print("gguf: get special token ids") special_vocab.add_to_gguf(gguf_writer)
if Path(dir_model + "/tokenizer.json").is_file():
# Look for special tokens in tokenizer.json if it exists
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
tokenizer = json.load(f)
if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_config = json.load(f)
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["bos_token"]["content"]:
gguf_writer.add_bos_token_id(key["id"])
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["eos_token"]["content"]:
gguf_writer.add_eos_token_id(key["id"])
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["unk_token"]["content"]:
gguf_writer.add_unk_token_id(key["id"])
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["sep_token"]["content"]:
gguf_writer.add_sep_token_id(key["id"])
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["pad_token"]["content"]:
gguf_writer.add_pad_token_id(key["id"])
else:
# If no tokenizer.json: Look for special tokens in config.json
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
# TENSORS # TENSORS
@ -247,6 +198,8 @@ print("gguf: get tensor metadata")
part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts)) part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
for part_name in part_names: for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'") print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
@ -266,11 +219,8 @@ for part_name in part_names:
data = data.squeeze().numpy() data = data.squeeze().numpy()
# map tensor names # map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map: new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
name = tensor_map[name[:-7]] + ".weight" if new_name is None:
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print("Can not map tensor '" + name + "'") print("Can not map tensor '" + name + "'")
sys.exit() sys.exit()
@ -289,20 +239,20 @@ for part_name in part_names:
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(name, data) gguf_writer.add_tensor(new_name, data)
print("gguf: write header") print("gguf: write header")
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
print("gguf: write metadata") print("gguf: write metadata")
gguf_writer.write_kv_data_to_file() gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors") print("gguf: write tensors")
gguf_writer.write_tensors_to_file() gguf_writer.write_tensors_to_file()
gguf_writer.close() gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("gguf: model successfully exported to '" + fname_out + "'")
print("") print("")

View file

@ -75,7 +75,7 @@ class Tensor:
self.dims = () self.dims = ()
self.dtype = None self.dtype = None
self.start_offset = 0 self.start_offset = 0
self.len_bytes = 0 self.len_bytes = np.int64(0)
def load(self, data, offset): def load(self, data, offset):
orig_offset = offset orig_offset = offset
@ -134,13 +134,14 @@ class GGMLV3Model:
return offset return offset
class GGMLToGGUF: class GGMLToGGUF:
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None): def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
hp = ggml_model.hyperparameters hp = ggml_model.hyperparameters
self.model = ggml_model self.model = ggml_model
self.data = data self.data = data
self.cfg = cfg self.cfg = cfg
self.params_override = params_override self.params_override = params_override
self.vocab_override = vocab_override self.vocab_override = vocab_override
self.special_vocab = special_vocab
if params_override is not None: if params_override is not None:
n_kv_head = params_override.n_head_kv n_kv_head = params_override.n_head_kv
else: else:
@ -162,6 +163,8 @@ class GGMLToGGUF:
gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False) gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
self.add_params(gguf_writer) self.add_params(gguf_writer)
self.add_vocab(gguf_writer) self.add_vocab(gguf_writer)
if self.special_vocab is not None:
self.special_vocab.add_to_gguf(gguf_writer)
self.add_tensors(gguf_writer) self.add_tensors(gguf_writer)
print(" gguf: write header") print(" gguf: write header")
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
@ -259,20 +262,13 @@ class GGMLToGGUF:
gguf_writer.add_eos_token_id(2) gguf_writer.add_eos_token_id(2)
def add_tensors(self, gguf_writer): def add_tensors(self, gguf_writer):
nm = self.name_map tensor_map = self.name_map
data = self.data data = self.data
print(f'* Adding {len(self.model.tensors)} tensor(s)') print(f'* Adding {len(self.model.tensors)} tensor(s)')
for tensor in self.model.tensors: for tensor in self.model.tensors:
name = str(tensor.name, 'UTF-8') name = str(tensor.name, 'UTF-8')
if name.endswith('.weight'): mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
name = name[:-7]
suffix = '.weight'
elif name.endswith('.bias'):
name = name[:-5]
suffix = '.bias'
mapped_name = nm.get(name)
assert mapped_name is not None, f'Bad name {name}' assert mapped_name is not None, f'Bad name {name}'
mapped_name += suffix
tempdims = list(tensor.dims[:]) tempdims = list(tensor.dims[:])
if len(tempdims) > 1: if len(tempdims) > 1:
temp = tempdims[1] temp = tempdims[1]
@ -302,8 +298,10 @@ def handle_metadata(cfg, hp):
else: else:
raise ValueError('Unable to load metadata') raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype) vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab) convert.check_vocab_size(params, vocab)
return (params, vocab) return (params, vocab, svocab)
def handle_args(): def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF') parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
@ -330,14 +328,16 @@ def main():
print(f'* GGML model hyperparameters: {model.hyperparameters}') print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None vocab_override = None
params_override = None params_override = None
special_vocab = None
if cfg.model_metadata_dir is not None: if cfg.model_metadata_dir is not None:
(params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters) (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
print(f'* Overriding params: {params_override}') print(f'* Overriding params: {params_override}')
print(f'* Overriding vocab: {vocab_override}') print(f'* Overriding vocab: {vocab_override}')
print(f'* Special vocab: {special_vocab}')
else: else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override) converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
converter.save() converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}') print(f'* Successful completion. Output saved to: {cfg.output}')

View file

@ -8,8 +8,9 @@ import struct
import json import json
import numpy as np import numpy as np
import torch import torch
import argparse
from typing import Any, List, Optional from typing import Any, List, Optional, TypeAlias
from pathlib import Path from pathlib import Path
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
@ -43,40 +44,38 @@ def count_model_parts(dir_model: str) -> int:
return num_parts return num_parts
if len(sys.argv) < 3: def parse_args() -> argparse.Namespace:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n") parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
print(" ftype == 0 -> float32") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
print(" ftype == 1 -> float16") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1) sys.exit(1)
# output in the same directory as the model
dir_model = sys.argv[1]
last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types # possible tensor data types
# ftype == 0 -> float32 # ftype == 0 -> float32
# ftype == 1 -> float16 # ftype == 1 -> float16
# map from ftype to string # map from ftype to string
ftype_str = ["f32", "f16"] ftype_str = ["f32", "f16"]
ftype = 1 if args.outfile is not None:
if len(sys.argv) > 2: fname_out = args.outfile
ftype = int(sys.argv[2]) else:
if ftype < 0 or ftype > 1: # output in the same directory as the model by default
print("Invalid ftype: " + str(ftype)) fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
sys.exit(1) print("gguf: loading model "+dir_model.name)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" with open(dir_model / "config.json", "r", encoding="utf-8") as f:
print("gguf: loading model "+last_dir)
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f) hparams = json.load(f)
if hparams["architectures"][0] != "LlamaForCausalLM": if hparams["architectures"][0] != "LlamaForCausalLM":
@ -115,7 +114,7 @@ else:
sys.exit() sys.exit()
gguf_writer.add_name(last_dir) gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo) gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth") gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length) gguf_writer.add_context_length(ctx_length)
@ -141,11 +140,15 @@ tokens: List[bytes] = []
scores: List[float] = [] scores: List[float] = []
toktypes: List[int] = [] toktypes: List[int] = []
if Path(dir_model + "/tokenizer.model").is_file(): tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece # vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types") print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model") tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()): for i in range(tokenizer.vocab_size()):
text: bytes text: bytes
@ -172,8 +175,9 @@ if Path(dir_model + "/tokenizer.model").is_file():
scores.append(score) scores.append(score)
toktypes.append(toktype) toktypes.append(toktype)
if Path(dir_model + "/added_tokens.json").is_file(): added_tokens_file = dir_model / 'added_tokens.json'
with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f) addtokens_json = json.load(f)
print("gguf: get added tokens") print("gguf: get added tokens")
@ -189,62 +193,8 @@ if Path(dir_model + "/tokenizer.model").is_file():
gguf_writer.add_token_scores(scores) gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes) gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
print("gguf: get special token ids") special_vocab.add_to_gguf(gguf_writer)
if Path(dir_model + "/tokenizer.json").is_file():
# Look for special tokens in tokenizer.json if it exists
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
tokenizer = json.load(f)
if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_config = json.load(f)
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["bos_token"]["content"]:
gguf_writer.add_bos_token_id(key["id"])
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["eos_token"]["content"]:
gguf_writer.add_eos_token_id(key["id"])
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["unk_token"]["content"]:
gguf_writer.add_unk_token_id(key["id"])
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["sep_token"]["content"]:
gguf_writer.add_sep_token_id(key["id"])
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["pad_token"]["content"]:
gguf_writer.add_pad_token_id(key["id"])
else:
# If no tokenizer.json: Look for special tokens in config.json
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
# TENSORS # TENSORS
@ -254,13 +204,15 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
print("gguf: get tensor metadata") print("gguf: get tensor metadata")
if num_parts == 0: if num_parts == 0:
part_names = ("pytorch_model.bin",) part_names = iter(("pytorch_model.bin",))
else: else:
part_names = ( part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
) )
for part_name in part_names: for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'") print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
@ -286,11 +238,8 @@ for part_name in part_names:
data = reverse_hf_permute(data, head_count, head_count_kv) data = reverse_hf_permute(data, head_count, head_count_kv)
# map tensor names # map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map: new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
name = tensor_map[name[:-7]] + ".weight" if new_name is None:
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print("Can not map tensor '" + name + "'") print("Can not map tensor '" + name + "'")
sys.exit() sys.exit()
@ -309,20 +258,20 @@ for part_name in part_names:
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(name, data) gguf_writer.add_tensor(new_name, data)
print("gguf: write header") print("gguf: write header")
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
print("gguf: write metadata") print("gguf: write metadata")
gguf_writer.write_kv_data_to_file() gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors") print("gguf: write tensors")
gguf_writer.write_tensors_to_file() gguf_writer.write_tensors_to_file()
gguf_writer.close() gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("gguf: model successfully exported to '" + fname_out + "'")
print("") print("")

View file

@ -4,7 +4,7 @@ import os
import re import re
import struct import struct
import sys import sys
from typing import Any, Dict, Sequence, TextIO from typing import Any, Dict, Sequence, BinaryIO
import numpy as np import numpy as np
import torch import torch
@ -46,7 +46,7 @@ def translate_tensor_name(t: str) -> str:
sys.exit(1) sys.exit(1)
def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None: def write_file_header(fout: BinaryIO, params: Dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora) fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", params["r"])) fout.write(struct.pack("i", params["r"]))
@ -60,7 +60,7 @@ def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
def write_tensor_header( def write_tensor_header(
self, name: str, shape: Sequence[int], data_type: np.dtype self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
) -> None: ) -> None:
sname = name.encode("utf-8") sname = name.encode("utf-8")
fout.write( fout.write(

View file

@ -25,7 +25,7 @@ import numpy as np
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, TypeVar, Union) from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, Type, TypeVar, Union)
from sentencepiece import SentencePieceProcessor # type: ignore from sentencepiece import SentencePieceProcessor # type: ignore
if TYPE_CHECKING: if TYPE_CHECKING:
@ -299,8 +299,10 @@ class Params:
params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
elif orig_config_path.exists(): elif orig_config_path.exists():
params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
else: elif model_plus.format != 'none':
params = Params.guessed(model_plus.model) params = Params.guessed(model_plus.model)
else:
raise ValueError('Cannot guess params when model format is none')
params.path_model = model_plus.paths[0].parent params.path_model = model_plus.paths[0].parent
@ -353,7 +355,7 @@ class BpeVocab:
yield from self.added_tokens() yield from self.added_tokens()
def __repr__(self) -> str: def __repr__(self) -> str:
return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>" return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab: class SentencePieceVocab:
@ -416,7 +418,6 @@ class SentencePieceVocab:
Vocab = Union[BpeVocab, SentencePieceVocab] Vocab = Union[BpeVocab, SentencePieceVocab]
# #
# data loading # data loading
# TODO: reuse (probably move to gguf.py?) # TODO: reuse (probably move to gguf.py?)
@ -439,14 +440,14 @@ class Tensor(metaclass=ABCMeta):
@abstractmethod @abstractmethod
def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ... def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ...
@abstractmethod @abstractmethod
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ... def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> 'UnquantizedTensor': ...
@abstractmethod @abstractmethod
def part(self, n_part: int) -> 'UnquantizedTensor': ... def part(self, n_part: int) -> 'UnquantizedTensor': ...
@abstractmethod @abstractmethod
def to_ggml(self) -> 'GGMLCompatibleTensor': ... def to_ggml(self) -> 'GGMLCompatibleTensor': ...
def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray: def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}" assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
fp32_arr = bf16_arr.astype(np.uint32) << 16 fp32_arr = bf16_arr.astype(np.uint32) << 16
return fp32_arr.view(np.float32) return fp32_arr.view(np.float32)
@ -467,9 +468,9 @@ class UnquantizedTensor(Tensor):
def to_ggml(self) -> 'UnquantizedTensor': def to_ggml(self) -> 'UnquantizedTensor':
return self return self
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> 'UnquantizedTensor':
r = self.ndarray.shape[0] // 3 r = self.ndarray.shape[0] // 3
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head)) return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
def part(self, n_part: int) -> 'UnquantizedTensor': def part(self, n_part: int) -> 'UnquantizedTensor':
r = self.ndarray.shape[0] // 3 r = self.ndarray.shape[0] // 3
@ -531,7 +532,7 @@ LazyModel = Dict[str, LazyTensor]
class ModelPlus: class ModelPlus:
model: LazyModel model: LazyModel
paths: List[Path] # Where this was read from. paths: List[Path] # Where this was read from.
format: Literal['ggml', 'torch', 'safetensors'] format: Literal['ggml', 'torch', 'safetensors', 'none']
vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab. vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab.
@ -597,12 +598,12 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
return lazy_tensor.load().permute(n_head, n_head_kv) return lazy_tensor.load().permute(n_head, n_head_kv)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor: def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
def load() -> Tensor: def load() -> Tensor:
return lazy_tensor.load().permute_part(n_part, n_head) return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
s = lazy_tensor.shape.copy() s = lazy_tensor.shape.copy()
s[0] = s[0] // 3 s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
def load() -> Tensor: def load() -> Tensor:
@ -657,7 +658,7 @@ class LazyUnpickler(pickle.Unpickler):
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
return LazyStorage(load=load, kind=pid[1], description=description) return LazyStorage(load=load, kind=pid[1], description=description)
# @staticmethod @staticmethod
def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
# pyright: ignore[reportSelfClsParameterName] # pyright: ignore[reportSelfClsParameterName]
requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
@ -669,13 +670,15 @@ class LazyUnpickler(pickle.Unpickler):
description = f'pickled storage_offset={storage_offset} in {storage.description}' description = f'pickled storage_offset={storage_offset} in {storage.description}'
return LazyTensor(load, list(size), storage.kind.data_type, description) return LazyTensor(load, list(size), storage.kind.data_type, description)
# @staticmethod @staticmethod
def rebuild_from_type_v2(func, new_type, args, state): def rebuild_from_type_v2(func, new_type, args, state):
return func(*args) return func(*args)
CLASSES: Dict[Any, Any] = { CLASSES: Dict[Tuple[str, str], Any] = {
('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2, # getattr used here as a workaround for mypy not being smart enough to determine
('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2, # the staticmethods have a __func__ attribute.
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
('torch', 'HalfStorage'): LazyStorageKind(DT_F16), ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
('torch', 'FloatStorage'): LazyStorageKind(DT_F32), ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
@ -751,7 +754,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
In = TypeVar('In') In = TypeVar('In')
Out = TypeVar('Out') Out = TypeVar('Out')
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, factory: Callable = ThreadPoolExecutor) -> Iterable[Out]: def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, use_processpool_executor: bool = False) -> Iterable[Out]:
'''Parallel map, but with backpressure. If the caller doesn't call `next` '''Parallel map, but with backpressure. If the caller doesn't call `next`
fast enough, this will stop calling `func` at some point rather than fast enough, this will stop calling `func` at some point rather than
letting results pile up in memory. Specifically, there is a max of one letting results pile up in memory. Specifically, there is a max of one
@ -760,7 +763,12 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
yield from map(func, iterable) yield from map(func, iterable)
# Not reached. # Not reached.
iterable = iter(iterable) iterable = iter(iterable)
with factory(max_workers = max_workers) as executor: executor_class: Union[Type[ThreadPoolExecutor], Type[ProcessPoolExecutor]]
if use_processpool_executor:
executor_class = ProcessPoolExecutor
else:
executor_class = ThreadPoolExecutor
with executor_class(max_workers = max_workers) as executor:
futures: List[concurrent.futures.Future[Out]] = [] futures: List[concurrent.futures.Future[Out]] = []
done = False done = False
for _ in range(concurrency): for _ in range(concurrency):
@ -803,9 +811,11 @@ class OutputFile:
def add_meta_arch(self, params: Params) -> None: def add_meta_arch(self, params: Params) -> None:
name = "LLaMA" name = "LLaMA"
# TODO: better logic to determine model name
if (params.n_ctx == 4096): if (params.n_ctx == 4096):
name = "LLaMA v2" name = "LLaMA v2"
if params.path_model: elif params.path_model:
name = str(params.path_model.parent).split('/')[-1] name = str(params.path_model.parent).split('/')[-1]
self.gguf.add_name (name) self.gguf.add_name (name)
@ -831,18 +841,25 @@ class OutputFile:
tokens = [] tokens = []
scores = [] scores = []
toktypes = [] toktypes = []
# NOTE: `all_tokens` returns the the base vocabulary and added tokens # NOTE: `all_tokens` returns the base vocabulary and added tokens
# TODO: add special tokens?
for text, score, toktype in vocab.all_tokens(): for text, score, toktype in vocab.all_tokens():
tokens.append(text) tokens.append(text)
scores.append(score) scores.append(score)
toktypes.append(toktype) toktypes.append(toktype)
if isinstance(vocab, SentencePieceVocab):
self.gguf.add_tokenizer_model("llama") self.gguf.add_tokenizer_model("llama")
elif isinstance(vocab, BpeVocab):
self.gguf.add_tokenizer_model("gpt2")
else:
raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
self.gguf.add_token_list(tokens) self.gguf.add_token_list(tokens)
self.gguf.add_token_scores(scores) self.gguf.add_token_scores(scores)
self.gguf.add_token_types(toktypes) self.gguf.add_token_types(toktypes)
def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
svocab.add_to_gguf(self.gguf)
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
n_elements = int(np.prod(tensor.shape)) n_elements = int(np.prod(tensor.shape))
raw_dtype = getattr(tensor.data_type, 'ggml_type', None) raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
@ -861,7 +878,7 @@ class OutputFile:
self.gguf.close() self.gguf.close()
@staticmethod @staticmethod
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None: def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
check_vocab_size(params, vocab) check_vocab_size(params, vocab)
of = OutputFile(fname_out) of = OutputFile(fname_out)
@ -869,6 +886,8 @@ class OutputFile:
# meta data # meta data
of.add_meta_arch(params) of.add_meta_arch(params)
of.add_meta_vocab(vocab) of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)
of.write_meta() of.write_meta()
of.close() of.close()
@ -887,7 +906,7 @@ class OutputFile:
return dt.quantize(arr) return dt.quantize(arr)
@staticmethod @staticmethod
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
check_vocab_size(params, vocab) check_vocab_size(params, vocab)
of = OutputFile(fname_out) of = OutputFile(fname_out)
@ -895,6 +914,7 @@ class OutputFile:
# meta data # meta data
of.add_meta_arch(params) of.add_meta_arch(params)
of.add_meta_vocab(vocab) of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)
# tensor info # tensor info
for name, lazy_tensor in model.items(): for name, lazy_tensor in model.items():
@ -906,7 +926,7 @@ class OutputFile:
# tensor data # tensor data
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
if ftype == GGMLFileType.MostlyQ8_0: if ftype == GGMLFileType.MostlyQ8_0:
ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, factory = ProcessPoolExecutor) ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True)
else: else:
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
@ -939,7 +959,8 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
for (name, tensor) in model.items()} for (name, tensor) in model.items()}
def convert_model_names(model: LazyModel, params: Params) -> LazyModel: def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
tmap = gguf.get_tensor_name_map(ARCH, params.n_layer) tmap = gguf.TensorNameMap(ARCH, params.n_layer)
should_skip: Set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
tmp = model tmp = model
@ -955,26 +976,20 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
else: else:
break break
out: LazyModel = {} out: LazyModel = {}
for name, lazy_tensor in model.items(): for name, lazy_tensor in model.items():
name_new = name tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
if name_new is None:
if name in tmap:
name_new = tmap[name]
elif name.endswith(".weight") and name[:-7] in tmap:
name_new = tmap[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tmap:
name_new = tmap[name[:-5]] + ".bias"
else:
raise Exception(f"Unexpected tensor name: {name}") raise Exception(f"Unexpected tensor name: {name}")
if gguf.should_skip_tensor_TMP(ARCH, params.n_layer, name_new): if tensor_type in should_skip:
print(f"skipping tensor {name_new}") print(f"skipping tensor {name_new}")
continue continue
else:
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
out[name_new] = lazy_tensor out[name_new] = lazy_tensor
@ -1116,8 +1131,16 @@ def main(args_in: Optional[List[str]] = None) -> None:
if args.dump_single: if args.dump_single:
model_plus = lazy_load_file(args.model) model_plus = lazy_load_file(args.model)
do_dump_model(model_plus) do_dump_model(model_plus)
return
if not args.vocab_only:
model_plus = load_some_model(args.model) model_plus = load_some_model(args.model)
else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
if args.dump:
do_dump_model(model_plus)
return
params = Params.load(model_plus) params = Params.load(model_plus)
if params.n_ctx == -1: if params.n_ctx == -1:
@ -1139,14 +1162,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
vocab: Vocab vocab: Vocab
if args.vocab_only: if args.vocab_only:
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
assert args.outfile, "need --outfile if using --vocab-only" assert args.outfile, "need --outfile if using --vocab-only"
# FIXME: Try to respect vocab_dir somehow?
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
outfile = args.outfile outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab) OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")
else:
if args.dump:
do_dump_model(model_plus)
return return
if model_plus.vocab is not None and args.vocab_dir is None: if model_plus.vocab is not None and args.vocab_dir is None:
@ -1154,6 +1176,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
else: else:
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
vocab = load_vocab(vocab_dir, args.vocabtype) vocab = load_vocab(vocab_dir, args.vocabtype)
# FIXME: Try to respect vocab_dir somehow?
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
model = model_plus.model model = model_plus.model
model = convert_model_names(model, params) model = convert_model_names(model, params)
@ -1164,7 +1188,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
params.ftype = ftype params.ftype = ftype
print(f"Writing {outfile}, format {ftype}") print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, concurrency = args.concurrency) OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")

View file

@ -25,7 +25,7 @@ else()
add_subdirectory(simple) add_subdirectory(simple)
add_subdirectory(embd-input) add_subdirectory(embd-input)
add_subdirectory(llama-bench) add_subdirectory(llama-bench)
add_subdirectory(beam_search) add_subdirectory(beam-search)
if (LLAMA_METAL) if (LLAMA_METAL)
add_subdirectory(metal) add_subdirectory(metal)
endif() endif()

View file

@ -1,5 +1,5 @@
set(TARGET beam_search) set(TARGET beam-search)
add_executable(${TARGET} beam_search.cpp) add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -11,6 +11,6 @@ cd ..
# #
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
# #
./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \ ./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \ --repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt -r "User:" -f prompts/chat-with-bob.txt

View file

@ -8,7 +8,7 @@ function! Llm()
let buffer_content = join(getline(1, '$'), "\n") let buffer_content = join(getline(1, '$'), "\n")
" Create the JSON payload " Create the JSON payload
let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false} let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
let json_payload.prompt = buffer_content let json_payload.prompt = buffer_content
" Define the curl command " Define the curl command
@ -25,3 +25,4 @@ function! Llm()
endfunction endfunction
command! Llm call Llm() command! Llm call Llm()
noremap <F2> :Llm<CR>

View file

@ -4,6 +4,7 @@
#endif #endif
#include "common.h" #include "common.h"
#include "console.h" #include "console.h"
#include "llama.h" #include "llama.h"
#include "build-info.h" #include "build-info.h"
@ -112,6 +113,15 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc,argv);
#endif // LOG_DISABLE_LOGS
// TODO: Dump params ?
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
// save choice to use color for later // save choice to use color for later
// (note for later: this is a slightly awkward choice) // (note for later: this is a slightly awkward choice)
console::init(params.simple_io, params.use_color); console::init(params.simple_io, params.use_color);
@ -134,34 +144,35 @@ int main(int argc, char ** argv) {
} }
if (params.rope_freq_base != 10000.0) { if (params.rope_freq_base != 10000.0) {
fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
} }
if (params.rope_freq_scale != 1.0) { if (params.rope_freq_scale != 1.0) {
fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
} }
if (params.n_ctx > 2048) { if (params.n_ctx > 2048) {
// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048 // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx); LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) { } else if (params.n_ctx < 8) {
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8; params.n_ctx = 8;
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL); params.seed = time(NULL);
} }
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = gpt_random_prompt(rng);
} }
LOG("%s: llama backend init\n", __func__);
llama_backend_init(params.numa); llama_backend_init(params.numa);
llama_model * model; llama_model * model;
@ -171,6 +182,7 @@ int main(int argc, char ** argv) {
g_ctx = &ctx; g_ctx = &ctx;
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (params.cfg_scale > 1.f) { if (params.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params); struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
@ -178,14 +190,14 @@ int main(int argc, char ** argv) {
} }
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__); LOG_TEE("%s: error: unable to load model\n", __func__);
return 1; return 1;
} }
// print system information // print system information
{ {
fprintf(stderr, "\n"); LOG_TEE("\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", LOG_TEE("system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
} }
@ -193,7 +205,7 @@ int main(int argc, char ** argv) {
// uncomment the "used_mem" line in llama.cpp to see the results // uncomment the "used_mem" line in llama.cpp to see the results
if (params.mem_test) { if (params.mem_test) {
{ {
fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx)); const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
@ -219,7 +231,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> session_tokens; std::vector<llama_token> session_tokens;
if (!path_session.empty()) { if (!path_session.empty()) {
fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
// fopen to check for existing session // fopen to check for existing session
FILE * fp = std::fopen(path_session.c_str(), "rb"); FILE * fp = std::fopen(path_session.c_str(), "rb");
@ -229,33 +241,38 @@ int main(int argc, char ** argv) {
session_tokens.resize(params.n_ctx); session_tokens.resize(params.n_ctx);
size_t n_token_count_out = 0; size_t n_token_count_out = 0;
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1; return 1;
} }
session_tokens.resize(n_token_count_out); session_tokens.resize(n_token_count_out);
llama_set_rng_seed(ctx, params.seed); llama_set_rng_seed(ctx, params.seed);
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
} else { } else {
fprintf(stderr, "%s: session file does not exist, will create\n", __func__); LOG_TEE("%s: session file does not exist, will create\n", __func__);
} }
} }
// Add BOS if SPM tokenizer
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
LOG("add_bos: %d\n", add_bos);
// tokenize the prompt
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
} else { } else {
LOG("use session tokens\n");
embd_inp = session_tokens; embd_inp = session_tokens;
} }
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
// Should not run without any tokens // Should not run without any tokens
if (embd_inp.empty()) { if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(ctx)); embd_inp.push_back(llama_token_bos(ctx));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
} }
// Tokenize negative prompt // Tokenize negative prompt
@ -263,23 +280,31 @@ int main(int argc, char ** argv) {
int guidance_offset = 0; int guidance_offset = 0;
int original_prompt_len = 0; int original_prompt_len = 0;
if (ctx_guidance) { if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
original_prompt_len = original_inp.size(); original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len; guidance_offset = (int)guidance_inp.size() - original_prompt_len;
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
LOG("guidance_offset: %s", log_tostr(guidance_offset));
} }
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
if ((int) embd_inp.size() > n_ctx - 4) { if ((int) embd_inp.size() > n_ctx - 4) {
fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1; return 1;
} }
// debug message about similarity of saved session, if applicable // debug message about similarity of saved session, if applicable
size_t n_matching_session_tokens = 0; size_t n_matching_session_tokens = 0;
if (session_tokens.size()) { if (session_tokens.size() > 0) {
for (llama_token id : session_tokens) { for (llama_token id : session_tokens) {
if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
break; break;
@ -287,22 +312,27 @@ int main(int argc, char ** argv) {
n_matching_session_tokens++; n_matching_session_tokens++;
} }
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
fprintf(stderr, "%s: using full prompt from session file\n", __func__); LOG_TEE("%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) { } else if (n_matching_session_tokens >= embd_inp.size()) {
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__); LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) { } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
__func__, n_matching_session_tokens, embd_inp.size()); __func__, n_matching_session_tokens, embd_inp.size());
} else { } else {
fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n", LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size()); __func__, n_matching_session_tokens, embd_inp.size());
} }
} }
LOGLN(
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
// if we will use the cache for the full prompt without reaching the end of the cache, force // if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token token to recalculate the cached logits // reevaluation of the last token token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
session_tokens.size() > embd_inp.size()) { LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
session_tokens.resize(embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1);
} }
@ -315,6 +345,9 @@ int main(int argc, char ** argv) {
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
// in instruct mode, we inject a prefix and a suffix to each input by the user // in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) { if (params.instruct) {
params.interactive_first = true; params.interactive_first = true;
@ -327,30 +360,30 @@ int main(int argc, char ** argv) {
} }
if (params.verbose_prompt) { if (params.verbose_prompt) {
fprintf(stderr, "\n"); LOG_TEE("\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) { for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
} }
if (ctx_guidance) { if (ctx_guidance) {
fprintf(stderr, "\n"); LOG_TEE("\n");
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) { for (int i = 0; i < (int) guidance_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
} }
} }
if (params.n_keep > 0) { if (params.n_keep > 0) {
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) { for (int i = 0; i < params.n_keep; i++) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
} }
fprintf(stderr, "'\n"); LOG_TEE("'\n");
} }
fprintf(stderr, "\n"); LOG_TEE("\n");
} }
if (params.interactive) { if (params.interactive) {
@ -367,30 +400,30 @@ int main(int argc, char ** argv) {
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true); SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif #endif
fprintf(stderr, "%s: interactive mode on.\n", __func__); LOG_TEE("%s: interactive mode on.\n", __func__);
if (params.antiprompt.size()) { if (params.antiprompt.size()) {
for (auto antiprompt : params.antiprompt) { for (const auto & antiprompt : params.antiprompt) {
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
} }
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
fprintf(stderr, "Input prefix with BOS\n"); LOG_TEE("Input prefix with BOS\n");
} }
if (!params.input_prefix.empty()) { if (!params.input_prefix.empty()) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
} }
if (!params.input_suffix.empty()) { if (!params.input_suffix.empty()) {
fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str()); LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
} }
} }
fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
fprintf(stderr, "\n\n"); LOG_TEE("\n\n");
grammar_parser::parse_state parsed_grammar; grammar_parser::parse_state parsed_grammar;
llama_grammar * grammar = NULL; llama_grammar * grammar = NULL;
@ -400,14 +433,14 @@ int main(int argc, char ** argv) {
if (parsed_grammar.rules.empty()) { if (parsed_grammar.rules.empty()) {
return 1; return 1;
} }
fprintf(stderr, "%s: grammar:\n", __func__); LOG_TEE("%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, parsed_grammar); grammar_parser::print_grammar(stderr, parsed_grammar);
fprintf(stderr, "\n"); LOG_TEE("\n");
{ {
auto it = params.logit_bias.find(llama_token_eos(ctx)); auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) { if (it != params.logit_bias.end() && it->second == -INFINITY) {
fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
} }
} }
@ -430,11 +463,11 @@ int main(int argc, char ** argv) {
" - To return control without starting a new line, end your input with '/'.\n" " - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n"; " - If you want to submit another line, end your input with '\\'.\n";
} }
fprintf(stderr, "== Running in interactive mode. ==\n" LOG_TEE("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
" - Press Ctrl+C to interject at any time.\n" LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
#endif #endif
"%s\n", control_message); LOG_TEE( "%s\n", control_message);
is_interacting = params.interactive_first; is_interacting = params.interactive_first;
} }
@ -459,8 +492,9 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd; std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance; std::vector<llama_token> embd_guidance;
// do one empty run to warm up the model
{ {
LOG("warming up the model with an empty run\n");
const std::vector<llama_token> tmp = { llama_token_bos(ctx), }; const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
llama_reset_timings(ctx); llama_reset_timings(ctx);
@ -471,15 +505,17 @@ int main(int argc, char ** argv) {
if (embd.size() > 0) { if (embd.size() > 0) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value. // --prompt or --file which uses the same value.
auto max_embd_size = n_ctx - 4; int max_embd_size = n_ctx - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary. // Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int) embd.size() > max_embd_size) { if ((int) embd.size() > max_embd_size) {
auto skipped_tokens = embd.size() - max_embd_size; const int skipped_tokens = (int) embd.size() - max_embd_size;
embd.resize(max_embd_size);
console::set_display(console::error); console::set_display(console::error);
printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset); console::set_display(console::reset);
fflush(stdout); fflush(stdout);
embd.resize(max_embd_size);
} }
// infinite text generation via context swapping // infinite text generation via context swapping
@ -488,28 +524,26 @@ int main(int argc, char ** argv) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) { if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
if (params.n_predict == -2) { if (params.n_predict == -2) {
fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__); LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break; break;
} }
const int n_left = n_past - params.n_keep; const int n_left = n_past - params.n_keep;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep);
// always keep the first token - BOS // always keep the first token - BOS
n_past = std::max(1, params.n_keep); n_past = std::max(1, params.n_keep);
n_past_guidance = std::max(1, params.n_keep + guidance_offset); n_past_guidance = std::max(1, params.n_keep + guidance_offset);
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
// insert n_left/2 tokens at the start of embd from last_n_tokens // insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
// stop saving session if we run out of context LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
path_session.clear();
//printf("\n---\n"); LOG("clear session path\n");
//printf("resetting: '"); path_session.clear();
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_piece(ctx, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
} }
// try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
@ -557,11 +591,8 @@ int main(int argc, char ** argv) {
input_buf = embd_guidance.data(); input_buf = embd_guidance.data();
input_size = embd_guidance.size(); input_size = embd_guidance.size();
//fprintf(stderr, "\n---------------------\n");
//for (int i = 0; i < (int) embd_guidance.size(); i++) { LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
//fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i]));
//}
//fprintf(stderr, "\n---------------------\n");
} else { } else {
input_buf = embd.data(); input_buf = embd.data();
input_size = embd.size(); input_size = embd.size();
@ -570,7 +601,7 @@ int main(int argc, char ** argv) {
for (int i = 0; i < input_size; i += params.n_batch) { for (int i = 0; i < input_size; i += params.n_batch) {
int n_eval = std::min(input_size - i, params.n_batch); int n_eval = std::min(input_size - i, params.n_batch);
if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__); LOG_TEE("%s : failed to eval\n", __func__);
return 1; return 1;
} }
@ -583,11 +614,17 @@ int main(int argc, char ** argv) {
if (n_eval > params.n_batch) { if (n_eval > params.n_batch) {
n_eval = params.n_batch; n_eval = params.n_batch;
} }
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__); LOG_TEE("%s : failed to eval\n", __func__);
return 1; return 1;
} }
n_past += n_eval; n_past += n_eval;
LOG("n_past = %d\n", n_past);
} }
if (embd.size() > 0 && !path_session.empty()) { if (embd.size() > 0 && !path_session.empty()) {
@ -600,7 +637,6 @@ int main(int argc, char ** argv) {
embd_guidance.clear(); embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// out of user input, sample next token
const float temp = params.temp; const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
const float top_p = params.top_p; const float top_p = params.top_p;
@ -619,6 +655,8 @@ int main(int argc, char ** argv) {
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false; need_to_save_session = false;
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
LOG("saved session to %s\n", path_session.c_str());
} }
llama_token id = 0; llama_token id = 0;
@ -638,55 +676,68 @@ int main(int argc, char ** argv) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
} }
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) { if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale); llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
} }
// Apply penalties // Apply penalties
float nl_logit = logits[llama_token_nl(ctx)]; float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p, llama_sample_repetition_penalty(ctx, &cur_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty); last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence); last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) { if (!penalize_nl) {
for (size_t idx = 0; idx < candidates_p.size; idx++) { for (size_t idx = 0; idx < cur_p.size; idx++) {
if (candidates_p.data[idx].id == llama_token_nl(ctx)) { if (cur_p.data[idx].id == llama_token_nl(ctx)) {
candidates_p.data[idx].logit = nl_logit; cur_p.data[idx].logit = nl_logit;
break; break;
} }
} }
} }
if (grammar != NULL) { if (grammar != NULL) {
llama_sample_grammar(ctx, &candidates_p, grammar); llama_sample_grammar(ctx, &cur_p, grammar);
} }
if (temp <= 0) { if (temp <= 0) {
// Greedy sampling // Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p); id = llama_sample_token_greedy(ctx, &cur_p);
} else { } else {
if (mirostat == 1) { if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau; static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100; const int mirostat_m = 100;
llama_sample_temperature(ctx, &candidates_p, temp); llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) { } else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau; static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, temp); llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else { } else {
// Temperature sampling // Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1); llama_sample_top_k (ctx, &cur_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1); llama_sample_typical (ctx, &cur_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1); llama_sample_top_p (ctx, &cur_p, top_p, 1);
llama_sample_temperature(ctx, &candidates_p, temp); llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token(ctx, &candidates_p);
{
const int n_top = 10;
LOG("top %d candidates:\n", n_top);
for (int i = 0; i < n_top; i++) {
const llama_token id = cur_p.data[i].id;
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
}
}
id = llama_sample_token(ctx, &cur_p);
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
} }
} }
// printf("`%d`", candidates_p.size); // printf("`%d`", candidates_p.size);
@ -697,9 +748,10 @@ int main(int argc, char ** argv) {
last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id); last_n_tokens.push_back(id);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_n_tokens));
} }
// add it to the context
embd.push_back(id); embd.push_back(id);
// echo this to console // echo this to console
@ -707,8 +759,11 @@ int main(int argc, char ** argv) {
// decrement remaining sampling budget // decrement remaining sampling budget
--n_remain; --n_remain;
LOG("n_remain: %d\n", n_remain);
} else { } else {
// some user input remains from prompt or interaction, forward it to processing // some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) { while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]); embd.push_back(embd_inp[n_consumed]);
last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.erase(last_n_tokens.begin());
@ -742,7 +797,6 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs; // if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) { if ((int) embd_inp.size() <= n_consumed) {
// check for reverse prompt // check for reverse prompt
if (params.antiprompt.size()) { if (params.antiprompt.size()) {
std::string last_output; std::string last_output;
@ -760,7 +814,7 @@ int main(int argc, char ** argv) {
? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding) ? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
: 0; : 0;
if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) { if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
if (params.interactive) { if (params.interactive) {
is_interacting = true; is_interacting = true;
console::set_display(console::user_input); console::set_display(console::user_input);
@ -770,10 +824,16 @@ int main(int argc, char ** argv) {
break; break;
} }
} }
if (is_antiprompt) {
LOG("found antiprompt: %s\n", last_output.c_str());
}
} }
// deal with end of text token in interactive mode // deal with end of text token in interactive mode
if (last_n_tokens.back() == llama_token_eos(ctx)) { if (last_n_tokens.back() == llama_token_eos(ctx)) {
LOG("found EOS token\n");
if (params.interactive) { if (params.interactive) {
if (params.antiprompt.size() != 0) { if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt // tokenize and inject first reverse prompt
@ -792,16 +852,20 @@ int main(int argc, char ** argv) {
} }
if (n_past > 0 && is_interacting) { if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
if (params.instruct) { if (params.instruct) {
printf("\n> "); printf("\n> ");
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(ctx)); embd_inp.push_back(llama_token_bos(ctx));
} }
std::string buffer; std::string buffer;
if (!params.input_prefix.empty()) { if (!params.input_prefix.empty()) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix; buffer += params.input_prefix;
printf("%s", buffer.c_str()); printf("%s", buffer.c_str());
} }
@ -821,23 +885,30 @@ int main(int argc, char ** argv) {
if (buffer.length() > 1) { if (buffer.length() > 1) {
// append input suffix if any // append input suffix if any
if (!params.input_suffix.empty()) { if (!params.input_suffix.empty()) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix; buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str()); printf("%s", params.input_suffix.c_str());
} }
LOG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size(); const size_t original_size = embd_inp.size();
// instruct mode: insert instruction prefix // instruct mode: insert instruction prefix
if (params.instruct && !is_antiprompt) { if (params.instruct && !is_antiprompt) {
LOG("inserting instruction prefix\n");
n_consumed = embd_inp.size(); n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
} }
auto line_inp = ::llama_tokenize(ctx, buffer, false); const auto line_inp = ::llama_tokenize(ctx, buffer, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
// instruct mode: insert response suffix // instruct mode: insert response suffix
if (params.instruct) { if (params.instruct) {
LOG("inserting instruction suffix\n");
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
} }
@ -848,6 +919,9 @@ int main(int argc, char ** argv) {
} }
n_remain -= line_inp.size(); n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain);
} else {
LOG("empty line, passing control back\n");
} }
input_echo = false; // do not echo this again input_echo = false; // do not echo this again
@ -871,7 +945,7 @@ int main(int argc, char ** argv) {
// end of text token // end of text token
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) { if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
fprintf(stderr, " [end of text]\n"); LOG_TEE(" [end of text]\n");
break; break;
} }
@ -884,7 +958,7 @@ int main(int argc, char ** argv) {
} }
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
} }
@ -900,5 +974,9 @@ int main(int argc, char ** argv) {
} }
llama_backend_free(); llama_backend_free();
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n")
#endif // LOG_DISABLE_LOGS
return 0; return 0;
} }

View file

@ -142,6 +142,14 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
fprintf(stderr, "%s: tokenizing the input ..\n", __func__); fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos); std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (int(tokens.size()) < 2*params.n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
params.n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
std::vector<float> logit_history; std::vector<float> logit_history;
std::vector<float> prob_history; std::vector<float> prob_history;
@ -274,6 +282,13 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
auto tim2 = std::chrono::high_resolution_clock::now(); auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*params.n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
params.n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
std::vector<float> logit_history; std::vector<float> logit_history;
logit_history.resize(tokens.size()); logit_history.resize(tokens.size());

View file

@ -321,8 +321,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
//////////// compute graph allocator //////////// compute graph allocator
static bool ggml_is_view(struct ggml_tensor * t) { static bool ggml_is_view(struct ggml_tensor * t) {
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || return t->view_src != NULL;
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
} }
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@ -340,28 +339,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
return true; return true;
} }
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
switch (t->op) {
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
case GGML_OP_VIEW:
return t->src[0];
case GGML_OP_CPY:
return t->src[1];
default:
return NULL;
}
}
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
struct ggml_tensor * parent = t;
do {
parent = get_view_parent(parent);
} while (ggml_is_view(parent));
return parent;
}
static bool ggml_op_can_inplace(enum ggml_op op) { static bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) { switch (op) {
case GGML_OP_SCALE: case GGML_OP_SCALE:
@ -369,7 +346,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
case GGML_OP_ADD: case GGML_OP_ADD:
case GGML_OP_ADD1: case GGML_OP_ADD1:
case GGML_OP_ACC:
case GGML_OP_SUB: case GGML_OP_SUB:
case GGML_OP_MUL: case GGML_OP_MUL:
case GGML_OP_DIV: case GGML_OP_DIV:
@ -379,7 +355,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
case GGML_OP_UNARY: case GGML_OP_UNARY:
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_SET:
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_CONT: case GGML_OP_CONT:
return true; return true;
@ -393,24 +368,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
struct hash_node * ht = alloc->hash_table; struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) { if (node->data == NULL) {
if (ggml_is_view(node)) { if (ggml_is_view(node)) {
size_t offset; assert(node->view_src->data != NULL);
switch(node->op) { node->data = (char *)node->view_src->data + node->view_offs;
case GGML_OP_VIEW:
memcpy(&offset, node->op_params, sizeof(size_t));
node->data = (char *) node->src[0]->data + offset;
break;
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
node->data = node->src[0]->data;
break;
case GGML_OP_CPY:
node->data = node->src[1]->data;
break;
default:
GGML_ASSERT(!"unknown view op");
break;
}
} else { } else {
// see if we can reuse a parent's buffer (inplace) // see if we can reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) { if (ggml_op_can_inplace(node->op)) {
@ -430,7 +389,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
struct hash_node * p_hn = hash_get(ht, parent); struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
if (ggml_is_view(parent)) { if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent); struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = hash_get(ht, view_src); struct hash_node * view_src_hn = hash_get(ht, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@ -472,7 +431,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
struct ggml_tensor * node = gf->nodes[i]; struct ggml_tensor * node = gf->nodes[i];
if (ggml_is_view(node)) { if (ggml_is_view(node)) {
struct ggml_tensor * view_src = get_view_source(node); struct ggml_tensor * view_src = node->view_src;
hash_get(ht, view_src)->n_views += 1; hash_get(ht, view_src)->n_views += 1;
} }
@ -557,7 +516,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
if (p_hn->n_children == 0 && p_hn->n_views == 0) { if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_is_view(parent)) { if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent); struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = hash_get(ht, view_src); struct hash_node * view_src_hn = hash_get(ht, view_src);
view_src_hn->n_views -= 1; view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);

View file

@ -11,6 +11,7 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
// TODO: temporary - reuse llama.cpp logging
#ifdef GGML_METAL_NDEBUG #ifdef GGML_METAL_NDEBUG
#define metal_printf(...) #define metal_printf(...)
#else #else
@ -113,7 +114,7 @@ static NSString * const msl_library_source = @"see metal.metal";
@end @end
struct ggml_metal_context * ggml_metal_init(int n_cb) { struct ggml_metal_context * ggml_metal_init(int n_cb) {
fprintf(stderr, "%s: allocating\n", __func__); metal_printf("%s: allocating\n", __func__);
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
@ -132,7 +133,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
if (error) { if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
} }
} }
@ -146,11 +147,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]); metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
if (error) { if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
} }
@ -162,7 +163,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
#endif #endif
if (error) { if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
} }
} }
@ -174,11 +175,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#define GGML_METAL_ADD_KERNEL(name) \ #define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \ (int) ctx->pipeline_##name.threadExecutionWidth); \
if (error) { \ if (error) { \
fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \ return NULL; \
} }
@ -230,19 +231,19 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#undef GGML_METAL_ADD_KERNEL #undef GGML_METAL_ADD_KERNEL
} }
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
if (ctx->device.maxTransferRate != 0) { if (ctx->device.maxTransferRate != 0) {
fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
} else { } else {
fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__); metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
} }
return ctx; return ctx;
} }
void ggml_metal_free(struct ggml_metal_context * ctx) { void ggml_metal_free(struct ggml_metal_context * ctx) {
fprintf(stderr, "%s: deallocating\n", __func__); metal_printf("%s: deallocating\n", __func__);
#define GGML_METAL_DEL_KERNEL(name) \ #define GGML_METAL_DEL_KERNEL(name) \
[ctx->function_##name release]; \ [ctx->function_##name release]; \
[ctx->pipeline_##name release]; [ctx->pipeline_##name release];
@ -311,7 +312,7 @@ void * ggml_metal_host_malloc(size_t n) {
void * data = NULL; void * data = NULL;
const int result = posix_memalign((void **) &data, getpagesize(), n); const int result = posix_memalign((void **) &data, getpagesize(), n);
if (result != 0) { if (result != 0) {
fprintf(stderr, "%s: error: posix_memalign failed\n", __func__); metal_printf("%s: error: posix_memalign failed\n", __func__);
return NULL; return NULL;
} }
@ -339,7 +340,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
// Metal buffer based on the host memory pointer // Metal buffer based on the host memory pointer
// //
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
const int64_t tsize = ggml_nbytes(t); const int64_t tsize = ggml_nbytes(t);
@ -350,13 +351,13 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs; *offs = (size_t) ioffs;
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
return ctx->buffers[i].metal; return ctx->buffers[i].metal;
} }
} }
fprintf(stderr, "%s: error: buffer is nil\n", __func__); metal_printf("%s: error: buffer is nil\n", __func__);
return nil; return nil;
} }
@ -368,7 +369,7 @@ bool ggml_metal_add_buffer(
size_t size, size_t size,
size_t max_size) { size_t max_size) {
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
fprintf(stderr, "%s: too many buffers\n", __func__); metal_printf("%s: too many buffers\n", __func__);
return false; return false;
} }
@ -378,7 +379,7 @@ bool ggml_metal_add_buffer(
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
return false; return false;
} }
} }
@ -399,11 +400,11 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) { if (ctx->buffers[ctx->n_buffers].metal == nil) {
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
return false; return false;
} }
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
++ctx->n_buffers; ++ctx->n_buffers;
} else { } else {
@ -423,27 +424,27 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) { if (ctx->buffers[ctx->n_buffers].metal == nil) {
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
return false; return false;
} }
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
if (i + size_step < size) { if (i + size_step < size) {
fprintf(stderr, "\n"); metal_printf("\n");
} }
++ctx->n_buffers; ++ctx->n_buffers;
} }
} }
fprintf(stderr, ", (%8.2f / %8.2f)", metal_printf(", (%8.2f / %8.2f)",
ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n"); metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
} else { } else {
fprintf(stderr, "\n"); metal_printf("\n");
} }
} }
@ -453,8 +454,6 @@ bool ggml_metal_add_buffer(
void ggml_metal_set_tensor( void ggml_metal_set_tensor(
struct ggml_metal_context * ctx, struct ggml_metal_context * ctx,
struct ggml_tensor * t) { struct ggml_tensor * t) {
metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
size_t offs; size_t offs;
id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs); id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
@ -464,8 +463,6 @@ void ggml_metal_set_tensor(
void ggml_metal_get_tensor( void ggml_metal_get_tensor(
struct ggml_metal_context * ctx, struct ggml_metal_context * ctx,
struct ggml_tensor * t) { struct ggml_tensor * t) {
metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
size_t offs; size_t offs;
id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs); id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
@ -560,15 +557,13 @@ void ggml_metal_graph_find_concurrency(
} }
if (ctx->concur_list_len > GGML_MAX_CONCUR) { if (ctx->concur_list_len > GGML_MAX_CONCUR) {
fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__); metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
} }
} }
void ggml_metal_graph_compute( void ggml_metal_graph_compute(
struct ggml_metal_context * ctx, struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) { struct ggml_cgraph * gf) {
metal_printf("%s: evaluating graph\n", __func__);
@autoreleasepool { @autoreleasepool {
// if there is ctx->concur_list, dispatch concurrently // if there is ctx->concur_list, dispatch concurrently
@ -616,7 +611,7 @@ void ggml_metal_graph_compute(
continue; continue;
} }
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src0 = gf->nodes[i]->src[0];
struct ggml_tensor * src1 = gf->nodes[i]->src[1]; struct ggml_tensor * src1 = gf->nodes[i]->src[1];
@ -764,7 +759,7 @@ void ggml_metal_graph_compute(
} break; } break;
default: default:
{ {
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} break; } break;
@ -923,7 +918,7 @@ void ggml_metal_graph_compute(
} break; } break;
default: default:
{ {
fprintf(stderr, "Asserting on type %d\n",(int)src0t); metal_printf("Asserting on type %d\n",(int)src0t);
GGML_ASSERT(false && "not implemented"); GGML_ASSERT(false && "not implemented");
} }
}; };
@ -1161,7 +1156,7 @@ void ggml_metal_graph_compute(
} break; } break;
default: default:
{ {
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} }
@ -1186,7 +1181,7 @@ void ggml_metal_graph_compute(
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
if (status != MTLCommandBufferStatusCompleted) { if (status != MTLCommandBufferStatusCompleted) {
fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} }

199
ggml.c
View file

@ -4104,16 +4104,11 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
} }
size_t ggml_nbytes(const struct ggml_tensor * tensor) { size_t ggml_nbytes(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
// this should handle cases where the tensor is not contiguous in memory nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
// probaby just: }
// return nbytes;
// return tensor->ne[3]*tensor->nb[3]
//
// is enough, but just in case, adding the second part
return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
} }
size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
@ -4567,20 +4562,33 @@ static struct ggml_tensor * ggml_new_tensor_impl(
enum ggml_type type, enum ggml_type type,
int n_dims, int n_dims,
const int64_t * ne, const int64_t * ne,
void * data) { struct ggml_tensor * view_src,
size_t view_offs) {
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
size_t data_size = 0; // find the base tensor and absolute offset
if (view_src != NULL && view_src->view_src != NULL) {
view_offs += view_src->view_offs;
view_src = view_src->view_src;
}
if (data == NULL && !ctx->no_alloc) { size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
for (int i = 1; i < n_dims; i++) { for (int i = 1; i < n_dims; i++) {
data_size *= ne[i]; data_size *= ne[i];
} }
GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
void * data = view_src != NULL ? view_src->data : NULL;
if (data != NULL) {
data = (char *) data + view_offs;
} }
if (ctx->scratch.data != NULL && data == NULL) { size_t obj_alloc_size = 0;
if (view_src == NULL && ctx->no_alloc == false) {
if (ctx->scratch.data != NULL) {
// allocate tensor data in the scratch buffer // allocate tensor data in the scratch buffer
if (ctx->scratch.offs + data_size > ctx->scratch.size) { if (ctx->scratch.offs + data_size > ctx->scratch.size) {
GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
@ -4592,11 +4600,13 @@ static struct ggml_tensor * ggml_new_tensor_impl(
data = (char * const) ctx->scratch.data + ctx->scratch.offs; data = (char * const) ctx->scratch.data + ctx->scratch.offs;
ctx->scratch.offs += data_size; ctx->scratch.offs += data_size;
} else {
data_size = 0; // allocate tensor data in the context's memory pool
obj_alloc_size = data_size;
}
} }
struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size); struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
// TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
@ -4616,7 +4626,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.perf_runs =*/ 0, /*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0, /*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0, /*.perf_time_us =*/ 0,
/*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data, /*.view_src =*/ view_src,
/*.view_offs =*/ view_offs,
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
/*.name =*/ { 0 }, /*.name =*/ { 0 },
/*.extra =*/ NULL, /*.extra =*/ NULL,
/*.padding =*/ { 0 }, /*.padding =*/ { 0 },
@ -4640,28 +4652,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
return result; return result;
} }
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
assert(params_size <= GGML_MAX_OP_PARAMS);
memcpy(tensor->op_params, params, params_size);
}
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
return ((const int32_t *)(tensor->op_params))[i];
}
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
((int32_t *)(tensor->op_params))[i] = value;
}
struct ggml_tensor * ggml_new_tensor( struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int n_dims, int n_dims,
const int64_t * ne) { const int64_t * ne) {
return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL); return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
} }
struct ggml_tensor * ggml_new_tensor_1d( struct ggml_tensor * ggml_new_tensor_1d(
@ -4726,7 +4722,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
} }
struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL); return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
}
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
assert(params_size <= GGML_MAX_OP_PARAMS);
memcpy(tensor->op_params, params, params_size);
}
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
return ((const int32_t *)(tensor->op_params))[i];
}
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
((int32_t *)(tensor->op_params))[i] = value;
} }
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@ -5012,14 +5024,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
struct ggml_tensor * ggml_view_tensor( struct ggml_tensor * ggml_view_tensor(
struct ggml_context * ctx, struct ggml_context * ctx,
const struct ggml_tensor * src) { struct ggml_tensor * src) {
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
ggml_format_name(result, "%s (view)", src->name); ggml_format_name(result, "%s (view)", src->name);
result->nb[0] = src->nb[0]; for (int i = 0; i < GGML_MAX_DIMS; i++) {
result->nb[1] = src->nb[1]; result->nb[i] = src->nb[i];
result->nb[2] = src->nb[2]; }
result->nb[3] = src->nb[3];
return result; return result;
} }
@ -6201,7 +6212,7 @@ struct ggml_tensor * ggml_reshape(
//GGML_ASSERT(false); //GGML_ASSERT(false);
} }
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name); ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE; result->op = GGML_OP_RESHAPE;
@ -6225,7 +6236,7 @@ struct ggml_tensor * ggml_reshape_1d(
} }
const int64_t ne[1] = { ne0 }; const int64_t ne[1] = { ne0 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data); struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name); ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE; result->op = GGML_OP_RESHAPE;
@ -6250,7 +6261,7 @@ struct ggml_tensor * ggml_reshape_2d(
} }
const int64_t ne[2] = { ne0, ne1 }; const int64_t ne[2] = { ne0, ne1 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data); struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name); ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE; result->op = GGML_OP_RESHAPE;
@ -6276,7 +6287,7 @@ struct ggml_tensor * ggml_reshape_3d(
} }
const int64_t ne[3] = { ne0, ne1, ne2 }; const int64_t ne[3] = { ne0, ne1, ne2 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data); struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name); ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE; result->op = GGML_OP_RESHAPE;
@ -6286,7 +6297,6 @@ struct ggml_tensor * ggml_reshape_3d(
return result; return result;
} }
struct ggml_tensor * ggml_reshape_4d( struct ggml_tensor * ggml_reshape_4d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -6304,7 +6314,7 @@ struct ggml_tensor * ggml_reshape_4d(
} }
const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data); struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name); ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE; result->op = GGML_OP_RESHAPE;
@ -6314,34 +6324,12 @@ struct ggml_tensor * ggml_reshape_4d(
return result; return result;
} }
// ggml_view_1d static struct ggml_tensor * ggml_view_impl(
static struct ggml_tensor * ggml_view_tensor_offset(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_dims, int n_dims,
const int64_t * ne, const int64_t * ne,
size_t offset) { size_t offset) {
// don't calculate an offset from an unallocated tensor
void * data = NULL;
if (a->data != NULL) {
data = (char *) a->data + offset;
}
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
ggml_format_name(result, "%s (view)", a->name);
ggml_set_op_params(result, &offset, sizeof(offset));
return result;
}
struct ggml_tensor * ggml_view_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
size_t offset) {
bool is_node = false; bool is_node = false;
@ -6349,7 +6337,10 @@ struct ggml_tensor * ggml_view_1d(
is_node = true; is_node = true;
} }
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset); struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
ggml_format_name(result, "%s (view)", a->name);
ggml_set_op_params(result, &offset, sizeof(offset));
result->op = GGML_OP_VIEW; result->op = GGML_OP_VIEW;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -6358,6 +6349,19 @@ struct ggml_tensor * ggml_view_1d(
return result; return result;
} }
// ggml_view_1d
struct ggml_tensor * ggml_view_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
size_t offset) {
struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
return result;
}
// ggml_view_2d // ggml_view_2d
struct ggml_tensor * ggml_view_2d( struct ggml_tensor * ggml_view_2d(
@ -6368,24 +6372,14 @@ struct ggml_tensor * ggml_view_2d(
size_t nb1, size_t nb1,
size_t offset) { size_t offset) {
bool is_node = false; const int64_t ne[2] = { ne0, ne1 };
if (a->grad) { struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
is_node = true;
}
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
result->nb[1] = nb1; result->nb[1] = nb1;
result->nb[2] = result->nb[1]*ne1; result->nb[2] = result->nb[1]*ne1;
result->nb[3] = result->nb[2]; result->nb[3] = result->nb[2];
result->op = GGML_OP_VIEW;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result; return result;
} }
@ -6401,24 +6395,14 @@ struct ggml_tensor * ggml_view_3d(
size_t nb2, size_t nb2,
size_t offset) { size_t offset) {
bool is_node = false; const int64_t ne[3] = { ne0, ne1, ne2 };
if (a->grad) { struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
is_node = true;
}
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
result->nb[1] = nb1; result->nb[1] = nb1;
result->nb[2] = nb2; result->nb[2] = nb2;
result->nb[3] = result->nb[2]*ne2; result->nb[3] = result->nb[2]*ne2;
result->op = GGML_OP_VIEW;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result; return result;
} }
@ -6436,24 +6420,14 @@ struct ggml_tensor * ggml_view_4d(
size_t nb3, size_t nb3,
size_t offset) { size_t offset) {
bool is_node = false; const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
if (a->grad) { struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
is_node = true;
}
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
result->nb[1] = nb1; result->nb[1] = nb1;
result->nb[2] = nb2; result->nb[2] = nb2;
result->nb[3] = nb3; result->nb[3] = nb3;
result->op = GGML_OP_VIEW;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result; return result;
} }
@ -6640,7 +6614,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { n_past, inplace ? 1 : 0 }; int32_t params[] = { n_past };
ggml_set_op_params(result, params, sizeof(params)); ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_DIAG_MASK_INF; result->op = GGML_OP_DIAG_MASK_INF;
@ -6657,7 +6631,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
return ggml_diag_mask_inf_impl(ctx, a, n_past, false); return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
} }
struct ggml_tensor * ggml_diag_mask_inf_inplace( struct ggml_tensor * ggml_diag_mask_inf_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -6680,7 +6653,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { n_past, inplace ? 1 : 0 }; int32_t params[] = { n_past };
ggml_set_op_params(result, params, sizeof(params)); ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_DIAG_MASK_ZERO; result->op = GGML_OP_DIAG_MASK_ZERO;
@ -11936,7 +11909,7 @@ static void ggml_compute_forward_diag_mask_f32(
const int nth = params->nth; const int nth = params->nth;
const int n_past = ((int32_t *) dst->op_params)[0]; const int n_past = ((int32_t *) dst->op_params)[0];
const bool inplace = (bool)((int32_t *) dst->op_params)[1]; const bool inplace = src0->data == dst->data;
GGML_ASSERT(n_past >= 0); GGML_ASSERT(n_past >= 0);

5
ggml.h
View file

@ -479,6 +479,9 @@ extern "C" {
int64_t perf_cycles; int64_t perf_cycles;
int64_t perf_time_us; int64_t perf_time_us;
struct ggml_tensor * view_src;
size_t view_offs;
void * data; void * data;
char name[GGML_MAX_NAME]; char name[GGML_MAX_NAME];
@ -661,7 +664,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

View file

@ -27,8 +27,25 @@ In this case, upgrade Pip to the latest:
pip install --upgrade pip pip install --upgrade pip
``` ```
## Publishing ## Automatic publishing with CI
To publish the package, you need to have `twine` and `build` installed:
There's a GitHub workflow to make a release automatically upon creation of tags in a specified format.
1. Bump the version in `pyproject.toml`.
2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number.
```sh
git tag -a gguf-v1.0.0 -m "Version 1.0 release"
```
3. Push the tags.
```sh
git push origin --tags
```
## Manual publishing
If you want to publish the package manually for any reason, you need to have `twine` and `build` installed:
```sh ```sh
pip install build twine pip install build twine
@ -36,7 +53,7 @@ pip install build twine
Then, follow these steps to release a new version: Then, follow these steps to release a new version:
1. Update the version in `pyproject.toml`. 1. Bump the version in `pyproject.toml`.
2. Build the package: 2. Build the package:
```sh ```sh

View file

@ -4,9 +4,13 @@ import sys
import struct import struct
import tempfile import tempfile
import numpy as np import numpy as np
import json
import os
from pathlib import Path
from enum import IntEnum, auto from enum import IntEnum, auto
from typing import Any, IO, List, Optional from io import BufferedWriter
from typing import Any, BinaryIO, Callable, IO, Dict, List, Optional, Sequence, Tuple, Union
# #
# constants # constants
@ -71,35 +75,35 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
class MODEL_ARCH(IntEnum): class MODEL_ARCH(IntEnum):
LLAMA = auto() LLAMA : int = auto()
FALCON = auto() FALCON : int = auto()
GPT2 = auto() GPT2 : int = auto()
GPTJ = auto() GPTJ : int = auto()
GPTNEOX = auto() GPTNEOX: int = auto()
MPT = auto() MPT : int = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
TOKEN_EMBD = auto() TOKEN_EMBD : int = auto()
POS_EMBD = auto() POS_EMBD : int = auto()
OUTPUT = auto() OUTPUT : int = auto()
OUTPUT_NORM = auto() OUTPUT_NORM : int = auto()
ROPE_FREQS = auto() ROPE_FREQS : int = auto()
ATTN_Q = auto() ATTN_Q : int = auto()
ATTN_K = auto() ATTN_K : int = auto()
ATTN_V = auto() ATTN_V : int = auto()
ATTN_QKV = auto() ATTN_QKV : int = auto()
ATTN_OUT = auto() ATTN_OUT : int = auto()
ATTN_NORM = auto() ATTN_NORM : int = auto()
ATTN_NORM_2 = auto() ATTN_NORM_2 : int = auto()
ATTN_ROT_EMBD = auto() ATTN_ROT_EMBD: int = auto()
FFN_GATE = auto() FFN_GATE : int = auto()
FFN_DOWN = auto() FFN_DOWN : int = auto()
FFN_UP = auto() FFN_UP : int = auto()
FFN_NORM = auto() FFN_NORM : int = auto()
MODEL_ARCH_NAMES = { MODEL_ARCH_NAMES: Dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.GPT2: "gpt2", MODEL_ARCH.GPT2: "gpt2",
@ -108,7 +112,7 @@ MODEL_ARCH_NAMES = {
MODEL_ARCH.MPT: "mpt", MODEL_ARCH.MPT: "mpt",
} }
MODEL_TENSOR_NAMES = { MODEL_TENSOR_NAMES: Dict[MODEL_ARCH, Dict[MODEL_TENSOR, str]] = {
MODEL_ARCH.LLAMA: { MODEL_ARCH.LLAMA: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.OUTPUT_NORM: "output_norm",
@ -154,7 +158,7 @@ MODEL_TENSOR_NAMES = {
} }
# tensors that will not be serialized # tensors that will not be serialized
MODEL_TENSOR_SKIP = { MODEL_TENSOR_SKIP: Dict[MODEL_ARCH, List[MODEL_TENSOR]] = {
MODEL_ARCH.LLAMA: [ MODEL_ARCH.LLAMA: [
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
@ -162,167 +166,198 @@ MODEL_TENSOR_SKIP = {
} }
# TODO: the following helper functions should be removed class TensorNameMap:
# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR) mappings_cfg: Dict[MODEL_TENSOR, Tuple[str, ...]] = {
# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
# REMOVE
def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
for skip in MODEL_TENSOR_SKIP.get(arch, []):
for i in range(n_blocks):
if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
return True
return False
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
tensor_map = {}
# Token embeddings # Token embeddings
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None) MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox "transformer.wte", # gpt2 mpt
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt "transformer.word_embeddings", # falcon
tensor_map["transformer.word_embeddings"] = mapped_to # falcon "model.embed_tokens", # llama-hf
tensor_map["model.embed_tokens"] = mapped_to # llama-hf "tok_embeddings", # llama-pth
tensor_map["tok_embeddings"] = mapped_to # llama-pth ),
# Position embeddings # Position embeddings
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None) MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2
tensor_map["transformer.wpe"] = mapped_to # gpt2 ),
# Output # Output
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None) MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
tensor_map["embed_out"] = mapped_to # gptneox "lm_head", # gpt2 mpt falcon llama-hf
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf "output", # llama-pth
tensor_map["output"] = mapped_to # llama-pth ),
# Output norm # Output norm
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None) MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox "transformer.ln_f", # gpt2 falcon
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon "model.norm", # llama-hf
tensor_map["transformer.norm_f"] = mapped_to # mpt "norm", # llama-pth
tensor_map["model.norm"] = mapped_to # llama-hf ),
tensor_map["norm"] = mapped_to # llama-pth
# Rope frequencies # Rope frequencies
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None) MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs", # llama-pth
),
}
tensor_map["rope.freqs"] = mapped_to # llama-pth block_mappings_cfg: Dict[MODEL_TENSOR, Tuple[str, ...]] = {
# Attention and feed-forward blocks
for i in range(0, n_blocks):
# Attention norm # Attention norm
# TODO: is there a simpler way to write these 2 lines in Python? MODEL_TENSOR.ATTN_NORM: (
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None) "gpt_neox.layers.{bid}.input_layernorm", # gptneox
mapped_to = mapped_to.format(bid=i) if mapped_to else None "transformer.h.{bid}.ln_1", # gpt2
"transformer.blocks.{bid}.norm_1", # mpt
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox "transformer.h.{bid}.input_layernorm", # falcon7b
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2 "transformer.h.{bid}.ln_mlp", # falcon40b
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt "model.layers.{bid}.input_layernorm", # llama-hf
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b "layers.{bid}.attention_norm", # llama-pth
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b ),
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
# Attention norm 2 # Attention norm 2
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None) MODEL_TENSOR.ATTN_NORM_2: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "transformer.h.{bid}.ln_attn", # falcon40b
),
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
# Attention query-key-value # Attention query-key-value
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None) MODEL_TENSOR.ATTN_QKV: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
"transformer.h.{bid}.attn.c_attn", # gpt2
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox "transformer.blocks.{bid}.attn.Wqkv", # mpt
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2 "transformer.h.{bid}.self_attention.query_key_value", # falcon
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt ),
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
# Attention query # Attention query
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None) MODEL_TENSOR.ATTN_Q: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "model.layers.{bid}.self_attn.q_proj", # llama-hf
"layers.{bid}.attention.wq", # llama-pth
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf ),
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
# Attention key # Attention key
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None) MODEL_TENSOR.ATTN_K: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "model.layers.{bid}.self_attn.k_proj", # llama-hf
"layers.{bid}.attention.wk", # llama-pth
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf ),
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
# Attention value # Attention value
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None) MODEL_TENSOR.ATTN_V: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "model.layers.{bid}.self_attn.v_proj", # llama-hf
"layers.{bid}.attention.wv", # llama-pth
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf ),
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
# Attention output # Attention output
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None) MODEL_TENSOR.ATTN_OUT: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "gpt_neox.layers.{bid}.attention.dense", # gptneox
"transformer.h.{bid}.attn.c_proj", # gpt2
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox "transformer.blocks.{bid}.attn.out_proj", # mpt
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2 "transformer.h.{bid}.self_attention.dense", # falcon
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt "model.layers.{bid}.self_attn.o_proj", # llama-hf
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon "layers.{bid}.attention.wo", # llama-pth
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf ),
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
# Rotary embeddings # Rotary embeddings
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None) MODEL_TENSOR.ATTN_ROT_EMBD: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to # llama-hf ),
tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to # llama-pth
# Feed-forward norm # Feed-forward norm
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None) MODEL_TENSOR.FFN_NORM: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox "transformer.blocks.{bid}.norm_2", # mpt
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2 "model.layers.{bid}.post_attention_layernorm", # llama-hf
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt "layers.{bid}.ffn_norm", # llama-pth
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf ),
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
# Feed-forward up # Feed-forward up
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None) MODEL_TENSOR.FFN_UP: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
"transformer.h.{bid}.mlp.c_fc", # gpt2
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox "transformer.blocks.{bid}.ffn.up_proj", # mpt
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2 "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt "model.layers.{bid}.mlp.up_proj", # llama-hf
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon "layers.{bid}.feed_forward.w3", # llama-pth
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf ),
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
# Feed-forward gate # Feed-forward gate
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None) MODEL_TENSOR.FFN_GATE: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "model.layers.{bid}.mlp.gate_proj", # llama-hf
"layers.{bid}.feed_forward.w1", # llama-pth
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf ),
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
# Feed-forward down # Feed-forward down
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None) MODEL_TENSOR.FFN_DOWN: (
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
"transformer.h.{bid}.mlp.c_proj", # gpt2
"transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"model.layers.{bid}.mlp.down_proj", # llama-hf
"layers.{bid}.feed_forward.w2", # llama-pth
),
}
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox mapping: Dict[str, Tuple[MODEL_TENSOR, str]]
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
return tensor_map tensor_names: Dict[MODEL_TENSOR, str]
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
mapping = self.mapping = {}
tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
for tensor, keys in self.mappings_cfg.items():
tensor_name = tensor_names.get(tensor)
if tensor_name is None:
continue
for key in keys:
mapping[key] = (tensor, tensor_name)
for bid in range(n_blocks):
for tensor, keys in self.block_mappings_cfg.items():
tensor_name = tensor_names.get(tensor)
if tensor_name is None:
continue
tensor_name = tensor_name.format(bid = bid)
for key in keys:
key = key.format(bid = bid)
mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> Optional[Tuple[MODEL_TENSOR, str]]:
result = self.mapping.get(key)
if result is not None:
return result
for suffix in try_suffixes:
if key.endswith(suffix):
result = self.mapping.get(key[:-len(suffix)])
if result is not None:
return (result[0], result[1] + suffix)
return None
def get_name(self, key: str, try_suffixes: Sequence[str]) -> Optional[str]:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None
return result[1]
def get_type(self, key: str, try_suffixes: Sequence[str]) -> Optional[MODEL_TENSOR]:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None
return result[0]
def __getitem__(self, key: str) -> str:
try:
return self.mapping[key][1]
except KeyError:
raise KeyError(key)
def __contains__(self, key: str) -> bool:
return key in self.mapping
def __repr__(self) -> str:
return repr(self.mapping)
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
return TensorNameMap(arch, n_blocks)
class TokenType(IntEnum): class TokenType(IntEnum):
NORMAL = 1 NORMAL = 1
@ -388,15 +423,21 @@ class GGUFValueType(IntEnum):
class GGUFWriter: class GGUFWriter:
def __init__(self, path: str, arch: str, use_temp_file = True): fout: BufferedWriter
arch: str
offset_tensor = 0
data_alignment = GGUF_DEFAULT_ALIGNMENT
kv_data = b""
kv_data_count = 0
ti_data = b""
ti_data_count = 0
use_temp_file: bool
temp_file: Optional[tempfile.SpooledTemporaryFile[bytes]] = None
tensors: List[Tuple[np.ndarray[Any, Any], int]]
def __init__(self, path: Union[os.PathLike[str], str], arch: str, use_temp_file = True):
self.fout = open(path, "wb") self.fout = open(path, "wb")
self.arch = arch self.arch = arch
self.offset_tensor = 0
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
self.kv_data = b""
self.kv_data_count = 0
self.ti_data = b""
self.ti_data_count = 0
self.add_architecture() self.add_architecture()
self.use_temp_file = use_temp_file self.use_temp_file = use_temp_file
self.tensors = [] self.tensors = []
@ -470,14 +511,27 @@ class GGUFWriter:
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.STRING) self.add_val(val, GGUFValueType.STRING)
def add_array(self, key: str, val: list): def add_array(self, key: str, val: Sequence[Any]):
if not isinstance(val, list): if not isinstance(val, Sequence):
raise ValueError("Value must be a list for array type") raise ValueError("Value must be a sequence for array type")
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.ARRAY) self.add_val(val, GGUFValueType.ARRAY)
def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True): _simple_value_packing = {
GGUFValueType.UINT8: "<B",
GGUFValueType.INT8: "<b",
GGUFValueType.UINT16: "<H",
GGUFValueType.INT16: "<h",
GGUFValueType.UINT32: "<I",
GGUFValueType.INT32: "<i",
GGUFValueType.FLOAT32: "<f",
GGUFValueType.UINT64: "<Q",
GGUFValueType.INT64: "<q",
GGUFValueType.FLOAT64: "<d",
GGUFValueType.BOOL: "?" ,
}
def add_val(self, val: Any, vtype: Optional[GGUFValueType] = None, add_vtype: bool = True):
if vtype is None: if vtype is None:
vtype = GGUFValueType.get_type(val) vtype = GGUFValueType.get_type(val)
@ -485,47 +539,29 @@ class GGUFWriter:
self.kv_data += struct.pack("<I", vtype) self.kv_data += struct.pack("<I", vtype)
self.kv_data_count += 1 self.kv_data_count += 1
if vtype == GGUFValueType.UINT8: pack_fmt = self._simple_value_packing.get(vtype)
self.kv_data += struct.pack("<B", val) if pack_fmt is not None:
elif vtype == GGUFValueType.INT8: self.kv_data += struct.pack(pack_fmt, val)
self.kv_data += struct.pack("<b", val)
elif vtype == GGUFValueType.UINT16:
self.kv_data += struct.pack("<H", val)
elif vtype == GGUFValueType.INT16:
self.kv_data += struct.pack("<h", val)
elif vtype == GGUFValueType.UINT32:
self.kv_data += struct.pack("<I", val)
elif vtype == GGUFValueType.INT32:
self.kv_data += struct.pack("<i", val)
elif vtype == GGUFValueType.FLOAT32:
self.kv_data += struct.pack("<f", val)
elif vtype == GGUFValueType.UINT64:
self.kv_data += struct.pack("<Q", val)
elif vtype == GGUFValueType.INT64:
self.kv_data += struct.pack("<q", val)
elif vtype == GGUFValueType.FLOAT64:
self.kv_data += struct.pack("<d", val)
elif vtype == GGUFValueType.BOOL:
self.kv_data += struct.pack("?", val)
elif vtype == GGUFValueType.STRING: elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val encoded_val = val.encode("utf8") if isinstance(val, str) else val
self.kv_data += struct.pack("<Q", len(encoded_val)) self.kv_data += struct.pack("<Q", len(encoded_val))
self.kv_data += encoded_val self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY: elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
ltype = set([GGUFValueType.get_type(item) for item in val]) ltype = GGUFValueType.get_type(val[0])
assert len(ltype) == 1, "All items in a GGUF array should be of the same type" if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
self.kv_data += struct.pack("<I", list(ltype)[0]) raise ValueError("All items in a GGUF array should be of the same type")
self.kv_data += struct.pack("<I", ltype)
self.kv_data += struct.pack("<Q", len(val)) self.kv_data += struct.pack("<Q", len(val))
for item in val: for item in val:
self.add_val(item, add_vtype=False) self.add_val(item, add_vtype=False)
else: else:
raise ValueError("Invalid GGUF metadata value type") raise ValueError("Invalid GGUF metadata value type or value")
@staticmethod @staticmethod
def ggml_pad(x: int, n: int) -> int: def ggml_pad(x: int, n: int) -> int:
return ((x + n - 1) // n) * n return ((x + n - 1) // n) * n
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None): def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: Union[np.dtype[np.float16], np.dtype[np.float32]], tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now" assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
encoded_name = name.encode("utf8") encoded_name = name.encode("utf8")
@ -544,16 +580,18 @@ class GGUFWriter:
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1 self.ti_data_count += 1
def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None): def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Optional[Sequence[int]] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
if self.use_temp_file and not hasattr(self, "temp_file"): if self.use_temp_file and self.temp_file is None:
self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024) fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
self.temp_file.seek(0) fp.seek(0)
self.temp_file = fp
self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype) shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if not self.use_temp_file: if self.temp_file is None:
self.tensors.append((tensor, pad)) self.tensors.append((tensor, pad))
return return
@ -562,25 +600,22 @@ class GGUFWriter:
if pad != 0: if pad != 0:
self.temp_file.write(bytes([0] * pad)) self.temp_file.write(bytes([0] * pad))
def write_tensor_data(self, tensor: np.ndarray): def write_padding(self, fp: BinaryIO, n: int, align: Optional[int] = None):
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell() pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
if pad != 0: if pad != 0:
self.fout.write(bytes([0] * pad)) fp.write(bytes([0] * pad))
def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
self.write_padding(self.fout, self.fout.tell())
tensor.tofile(self.fout) tensor.tofile(self.fout)
self.write_padding(self.fout, tensor.nbytes)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if pad != 0:
self.fout.write(bytes([0] * pad))
def write_tensors_to_file(self): def write_tensors_to_file(self):
self.write_ti_data_to_file() self.write_ti_data_to_file()
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell() self.write_padding(self.fout, self.fout.tell())
if pad != 0:
self.fout.write(bytes([0] * pad))
if not self.use_temp_file: if self.temp_file is None:
for (currtensor, currpad) in self.tensors: for (currtensor, currpad) in self.tensors:
currtensor.tofile(self.fout) currtensor.tofile(self.fout)
if currpad != 0: if currpad != 0:
@ -654,10 +689,6 @@ class GGUFWriter:
self.add_bool( self.add_bool(
KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
def add_tensor_data_layout(self, layout: str):
self.add_string(
KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
def add_head_count(self, count: int): def add_head_count(self, count: int):
self.add_uint32( self.add_uint32(
KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count) KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
@ -695,16 +726,16 @@ class GGUFWriter:
def add_tokenizer_model(self, model: str): def add_tokenizer_model(self, model: str):
self.add_string(KEY_TOKENIZER_MODEL, model) self.add_string(KEY_TOKENIZER_MODEL, model)
def add_token_list(self, tokens: List): def add_token_list(self, tokens: Union[Sequence[str], Sequence[bytes], Sequence[bytearray]]):
self.add_array(KEY_TOKENIZER_LIST, tokens) self.add_array(KEY_TOKENIZER_LIST, tokens)
def add_token_merges(self, merges: List): def add_token_merges(self, merges: Union[Sequence[str], Sequence[bytes], Sequence[bytearray]]):
self.add_array(KEY_TOKENIZER_MERGES, merges) self.add_array(KEY_TOKENIZER_MERGES, merges)
def add_token_types(self, types: List[int]): def add_token_types(self, types: Union[Sequence[TokenType], Sequence[int]]):
self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types) self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
def add_token_scores(self, scores: List[float]): def add_token_scores(self, scores: Sequence[float]):
self.add_array(KEY_TOKENIZER_SCORES, scores) self.add_array(KEY_TOKENIZER_SCORES, scores)
def add_bos_token_id(self, id: int): def add_bos_token_id(self, id: int):
@ -723,6 +754,84 @@ class GGUFWriter:
self.add_uint32(KEY_TOKENIZER_PAD_ID, id) self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
class SpecialVocab:
    """Special-token IDs and BPE merges read from a model directory.

    The directory is probed for ``tokenizer.json`` (plus ``tokenizer_config.json``)
    and ``config.json``. IDs resolved through the tokenizer files take precedence;
    ``config.json`` is used when ``tokenizer.json`` is missing and, otherwise, to
    fill in special token types the tokenizer files did not resolve.

    Attributes:
        load_merges: whether BPE merges should be read from ``tokenizer.json``.
        merges: the merges that were loaded (empty when none were found).
        special_token_types: token type names to look up (``bos``, ``eos``, ...).
        special_token_ids: mapping of token type name to the token ID found.
    """
    load_merges: bool = False
    merges: List[str] = []
    special_token_types: Tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
    special_token_ids: Dict[str, int] = {}

    def __init__(self, path: Path, load_merges: bool = False, special_token_types: Optional[Tuple[str, ...]] = None):
        """Load special vocabulary data from the model directory ``path``.

        Args:
            path: directory containing the tokenizer / config JSON files.
            load_merges: also load BPE merges from ``tokenizer.json``.
            special_token_types: overrides the default set of token types.
        """
        # Give every instance its own containers instead of falling back to the
        # mutable class-level defaults, which would be shared between instances.
        self.special_token_ids = {}
        self.merges = []
        self.load_merges = load_merges
        if special_token_types is not None:
            self.special_token_types = special_token_types
        self.load(path)

    def load(self, path: Path):
        """Populate ``merges`` and ``special_token_ids`` from ``path``."""
        if not self.try_load_from_tokenizer_json(path):
            self.try_load_from_config_json(path)
            return
        # tokenizer.json exists, but it may not have resolved every special token
        # (for example when tokenizer_config.json is absent). Let config.json fill
        # the gaps, then restore the values already known so they keep priority.
        resolved = dict(self.special_token_ids)
        self.try_load_from_config_json(path)
        self.special_token_ids.update(resolved)

    def try_load_from_tokenizer_json(self, path: Path) -> bool:
        """Read merges and special token IDs from the tokenizer JSON files.

        Returns:
            False when ``tokenizer.json`` does not exist, True otherwise (even if
            no special token ID could be resolved from it).
        """
        tokenizer_file = path / 'tokenizer.json'
        if not tokenizer_file.is_file():
            return False
        with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
            tokenizer = json.load(f)
        if self.load_merges:
            model = tokenizer.get('model')
            merges = model.get('merges') if isinstance(model, dict) else None
            if isinstance(merges, list) and len(merges) > 0 and isinstance(merges[0], str):
                self.merges = merges
        tokenizer_config_file = path / 'tokenizer_config.json'
        added_tokens = tokenizer.get('added_tokens')
        if not isinstance(added_tokens, list) or not tokenizer_config_file.is_file():
            return True
        with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
            tokenizer_config = json.load(f)
        for typ in self.special_token_types:
            # An entry is either the token text itself or a dict holding it under 'content'.
            entry = tokenizer_config.get(f'{typ}_token')
            if isinstance(entry, dict):
                entry = entry.get('content')
            if not isinstance(entry, str):
                continue
            # Only the first added token whose content matches is considered.
            maybe_token_id = next(
                (atok.get('id') for atok in added_tokens
                 if isinstance(atok, dict) and atok.get('content') == entry),
                None)
            if isinstance(maybe_token_id, int):
                self.special_token_ids[typ] = maybe_token_id
        return True

    def try_load_from_config_json(self, path: Path) -> bool:
        """Read ``<type>_token_id`` integer entries from ``config.json``.

        Returns:
            False when ``config.json`` does not exist, True otherwise.
        """
        config_file = path / 'config.json'
        if not config_file.is_file():
            return False
        with open(config_file, 'r', encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
            maybe_token_id = config.get(f'{typ}_token_id')
            if isinstance(maybe_token_id, int):
                self.special_token_ids[typ] = maybe_token_id
        return True

    def add_to_gguf(self, gw: 'GGUFWriter'):
        """Write the loaded merges and special token IDs through ``gw``.

        Each special token type is dispatched to ``gw.add_<type>_token_id``; types
        for which the writer has no such method are reported and skipped.
        """
        if len(self.merges) > 0:
            print(f'gguf: Adding {len(self.merges)} merge(s).')
            gw.add_token_merges(self.merges)
        for typ, tokid in self.special_token_ids.items():
            handler: Optional[Callable[[int], None]] = getattr(gw, f'add_{typ}_token_id', None)
            if handler is None:
                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
                continue
            print(f'gguf: Setting special token type {typ} to {tokid}')
            handler(tokid)

    def __repr__(self):
        return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
# Example usage: # Example usage:
if __name__ == "__main__": if __name__ == "__main__":
# Example usage with a file # Example usage with a file

0
gguf-py/gguf/py.typed Normal file
View file

View file

@ -5,6 +5,7 @@ description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"] authors = ["GGML <ggml@ggml.ai>"]
packages = [ packages = [
{include = "gguf"}, {include = "gguf"},
{include = "gguf/py.typed"},
] ]
readme = "README.md" readme = "README.md"
homepage = "https://ggml.ai" homepage = "https://ggml.ai"

View file

@ -3211,7 +3211,7 @@ private:
struct llm_bigram_bpe { struct llm_bigram_bpe {
struct comparator { struct comparator {
bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) { bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
} }
}; };
@ -3359,23 +3359,22 @@ private:
} }
// probably not 100% correct // probably not 100% correct
// TODO: this is quite slow - how to make it more efficient? static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
std::vector<std::string> words; std::vector<std::string> words;
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
const std::regex re(pattern); const std::regex re(pattern);
std::smatch m;
while (std::regex_search(text, m, re)) { auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
for (auto x : m) { auto words_end = std::sregex_iterator();
words.push_back(x); auto n_words = std::distance(words_begin, words_end);
words.reserve(n_words);
for (auto it = words_begin; it != words_end; ++it) {
words.push_back(it->str());
} }
text = m.suffix();
}
return words; return words;
} }
const llama_vocab & vocab; const llama_vocab & vocab;

View file

@ -521,7 +521,7 @@ extern "C" {
// If this is not called, or NULL is supplied, everything is output on stderr. // If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx); LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -20,6 +20,7 @@ fi
model="$1" model="$1"
out="../tmp/results-${model}" out="../tmp/results-${model}"
set -o pipefail
set -e set -e
mkdir -p ${out} mkdir -p ${out}

View file

@ -20,6 +20,7 @@ fi
model="$1" model="$1"
out="../tmp/results-${model}" out="../tmp/results-${model}"
set -o pipefail
set -e set -e
mkdir -p ${out} mkdir -p ${out}

View file

@ -17,6 +17,7 @@ if [ ! -z "$3" ]; then
args="$3" args="$3"
fi fi
set -o pipefail
set -e set -e
model="$1" model="$1"

View file

@ -37,3 +37,8 @@ llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp) llama_build_and_test_executable(test-llama-grammar.cpp)
llama_build_and_test_executable(test-grad0.cpp) # SLOW llama_build_and_test_executable(test-grad0.cpp) # SLOW
# llama_build_and_test_executable(test-opt.cpp) # SLOW # llama_build_and_test_executable(test-opt.cpp) # SLOW
# dummy executable - not installed
# Builds test-c.c as a stand-alone executable linked against the llama target.
# TEST_TARGET is the source file name without its extension (NAME_WE), i.e. test-c.
# Only add_executable is used here; these lines do not register it as a test.
# NOTE(review): presumably this exists to verify that llama.h compiles as plain C - confirm.
get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c)
target_link_libraries(${TEST_TARGET} PRIVATE llama)

3
tests/test-c.c Normal file
View file

@ -0,0 +1,3 @@
// This file only includes llama.h from a C translation unit; it has no runtime logic.
// NOTE(review): presumably a compile-only check that the public header is valid C - confirm.
#include "llama.h"
// Intentionally empty entry point: the program does nothing when run.
int main(void) {}