big cleanup

This commit is contained in:
Wenjing Yu 2024-07-26 17:47:11 -07:00
parent 3ba087913f
commit a53d266f7b
238 changed files with 5 additions and 60172 deletions

View file

@ -27,6 +27,3 @@ indent_size = 2
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab
[examples/cvector-generator/*.txt]
trim_trailing_whitespace = unset
insert_final_newline = unset

272
Makefile
View file

@ -1,47 +1,15 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
libllava.a \
llama-cli \
llama-convert-llama2c-to-ggml \
llama-embedding \
llama-eval-callback \
llama-export-lora \
llama-gbnf-validator \
llama-gguf \
llama-gguf-hash \
llama-gguf-split \
llama-gritlm \
llama-imatrix \
llama-infill \
llama-llava-cli \
llama-lookahead \
llama-lookup \
llama-lookup-create \
llama-lookup-merge \
llama-lookup-stats \
llama-parallel \
llama-passkey \
llama-perplexity \
llama-q8dot \
llama-quantize \
llama-quantize-stats \
llama-retrieval \
llama-save-load-state \
llama-server \
llama-simple \
llama-speculative \
llama-tokenize \
llama-vdot \
llama-cvector-generator
llama-vdot
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
simple save-load-state server gguf gguf-split eval-callback libllava.a llava-cli \
retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
LEGACY_TARGETS_CLEAN = main vdot q8dot
# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
LEGACY_TARGETS_BUILD = main
# Deprecation aliases
ifdef LLAMA_CUBLAS
@ -1135,213 +1103,12 @@ llama-cli: examples/main/main.cpp \
@echo '==== Run ./llama-cli -h for help. ===='
@echo
llama-infill: examples/infill/infill.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-simple: examples/simple/simple.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-tokenize: examples/tokenize/tokenize.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-quantize: examples/quantize/quantize.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-perplexity: examples/perplexity/perplexity.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-imatrix: examples/imatrix/imatrix.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-embedding: examples/embedding/embedding.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gritlm: examples/gritlm/gritlm.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-save-load-state: examples/save-load-state/save-load-state.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gguf: examples/gguf/gguf.cpp \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
examples/gguf-hash/deps/sha1/sha1.o: \
examples/gguf-hash/deps/sha1/sha1.c
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
examples/gguf-hash/deps/xxhash/xxhash.o: \
examples/gguf-hash/deps/xxhash/xxhash.c
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
examples/gguf-hash/deps/sha256/sha256.o: \
examples/gguf-hash/deps/sha256/sha256.c
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gguf-split: examples/gguf-split/gguf-split.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-eval-callback: examples/eval-callback/eval-callback.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
$(OBJ_GGML) $(OBJ_LLAMA)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-export-lora: examples/export-lora/export-lora.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-retrieval: examples/retrieval/retrieval.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-speculative: examples/speculative/speculative.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-parallel: examples/parallel/parallel.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookahead: examples/lookahead/lookahead.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup: examples/lookup/lookup.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup-create: examples/lookup/lookup-create.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup-merge: examples/lookup/lookup-merge.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup-stats: examples/lookup/lookup-stats.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-passkey: examples/passkey/passkey.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
ifdef GGML_RPC
rpc-server: examples/rpc/rpc-server.cpp \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif # GGML_RPC
llama-server: \
examples/server/server.cpp \
examples/server/utils.hpp \
examples/server/httplib.h \
examples/server/colorthemes.css.hpp \
examples/server/style.css.hpp \
examples/server/theme-beeninorder.css.hpp \
examples/server/theme-ketivah.css.hpp \
examples/server/theme-mangotango.css.hpp \
examples/server/theme-playground.css.hpp \
examples/server/theme-polarnight.css.hpp \
examples/server/theme-snowstorm.css.hpp \
examples/server/index.html.hpp \
examples/server/index-new.html.hpp \
examples/server/index.js.hpp \
examples/server/completion.js.hpp \
examples/server/system-prompts.js.hpp \
examples/server/prompt-formats.js.hpp \
examples/server/json-schema-to-grammar.mjs.hpp \
common/json.hpp \
common/stb_image.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
examples/server/%.hpp: examples/server/public/% Makefile
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
echo "unsigned char $${NAME}[] = {" && \
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
echo "};" && \
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@
libllava.a: examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
common/stb_image.h \
common/base64.hpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llama-llava-cli: examples/llava/llava-cli.cpp \
examples/llava/clip.h \
examples/llava/clip.cpp \
examples/llava/llava.h \
examples/llava/llava.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh "$(CC)" > $@.tmp
@if ! cmp -s $@.tmp $@; then \
@ -1371,7 +1138,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
#
# Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server
.PHONY: main
# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
# Eventually we will want to remove these target from building all the time.
@ -1380,37 +1147,6 @@ main: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
server: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
quantize: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard quantize))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
@echo " Remove the 'quantize' binary to remove this warning."
@echo "#########"
endif
perplexity: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard perplexity))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
@echo " Remove the 'perplexity' binary to remove this warning."
@echo "#########"
endif
embedding: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard embedding))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
@echo " Remove the 'embedding' binary to remove this warning."
@echo "#########"
endif

View file

@ -12,40 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(cvector-generator)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(export-lora)
add_subdirectory(gbnf-validator)
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
add_subdirectory(gritlm)
add_subdirectory(imatrix)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(main)
add_subdirectory(parallel)
add_subdirectory(passkey)
add_subdirectory(perplexity)
add_subdirectory(quantize-stats)
add_subdirectory(quantize)
add_subdirectory(retrieval)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
if (GGML_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(tokenize)
endif()

View file

@ -1,5 +0,0 @@
set(TARGET llama-convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,28 +0,0 @@
## Convert llama2.c model to ggml
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
`$ make -j`
After successful compilation, following usage options are available:
```
usage: ./llama-convert-llama2c-to-ggml [options]
options:
-h, --help show this help message and exit
--copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf')
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
--llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin')
```
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
Now you can use the model with a command like:
`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View file

@ -1,936 +0,0 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "log.h"
#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <ctime>
#include <random>
#include <stdexcept>
#include <sstream>
#include <algorithm>
#include <string>
// GGUF keys & tensor names.
#define KV_GENERAL_ARCHITECTURE "general.architecture"
#define KV_GENERAL_NAME "general.name"
#define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"
#define KV_CONTEXT_LENGTH "llama.context_length"
#define KV_EMBEDDING_LENGTH "llama.embedding_length"
#define KV_BLOCK_COUNT "llama.block_count"
#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"
#define TN_TOKEN_EMBD "token_embd.weight"
#define TN_OUTPUT_NORM "output_norm.weight"
#define TN_OUTPUT "output.weight"
#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
#define TN_ATTN_Q "blk.%d.attn_q.weight"
#define TN_ATTN_K "blk.%d.attn_k.weight"
#define TN_ATTN_V "blk.%d.attn_v.weight"
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
#define TN_FFN_UP "blk.%d.ffn_up.weight"
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3
#define TOKENIZER_NAME "llama"
#define UNKNOWN_TOKEN_ID 0
#define BOS_TOKEN_ID 1
#define EOS_TOKEN_ID 2
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
int dim; // transformer dimension
int hidden_dim; // for ffn layers
int n_layers; // number of layers
int n_heads; // number of query heads
int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
int vocab_size; // vocabulary size, usually 256 (byte-level)
int seq_len; // max sequence length
} Config;
struct TransformerWeights {
// token embedding table
std::vector<float> token_embedding_table; // (vocab_size, dim)
// weights for rmsnorms
std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
std::vector<float> rms_ffn_weight; // (layer, dim)
// weights for matmuls
std::vector<float> wq; // (layer, dim, dim)
std::vector<float> wk; // (layer, dim, dim)
std::vector<float> wv; // (layer, dim, dim)
std::vector<float> wo; // (layer, dim, dim)
// weights for ffn
std::vector<float> w1; // (layer, hidden_dim, dim)
std::vector<float> w2; // (layer, dim, hidden_dim)
std::vector<float> w3; // (layer, hidden_dim, dim)
// final rmsnorm
std::vector<float> rms_final_weight; // (dim,)
// freq_cis for RoPE relatively positional embeddings
// std::vector<float> freq_cis_real; // (seq_len, dim/2)
// std::vector<float> freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer
std::vector<float> wcls;
};
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try {
w->token_embedding_table.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wo.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight.resize(p->dim);
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = {};
} else {
w->wcls.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
catch (std::length_error &) {
die("Invalid configuration. Failed to allocate memory for weights");
}
}
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;
// Skip freq_cis_real & freq_cis_imag
int head_size = p->dim / p->n_heads;
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;
// Check we didn't forget to read anything
auto curr = ftell(f);
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
return 1;
}
return 0;
}
static void print_sample_weights(TransformerWeights *w){
LOG("----- Quick print of first of the weight vales of all the variables\n");
LOG("%f\n", w->token_embedding_table[0]);
LOG("%f\n", w->rms_att_weight[0]);
LOG("%f\n", w->rms_ffn_weight[0]);
LOG("%f\n", w->wq[0]);
LOG("%f\n", w->wk[0]);
LOG("%f\n", w->wv[0]);
LOG("%f\n", w->wo[0]);
LOG("%f\n", w->w1[0]);
LOG("%f\n", w->w2[0]);
LOG("%f\n", w->w3[0]);
LOG("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_data {
token text;
float score;
ttype type;
};
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
};
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4;
uint32_t n_head = 32;
uint32_t n_head_kv = 32;
uint32_t n_layer = 32;
uint32_t n_rot = 64;
bool operator!=(const my_llama_hparams& other) const {
return memcmp(this, &other, sizeof(my_llama_hparams));
}
};
struct my_llama_layer {
// normalization
struct ggml_tensor * attention_norm;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
// normalization
struct ggml_tensor * ffn_norm;
// ff
struct ggml_tensor * w1;
struct ggml_tensor * w2;
struct ggml_tensor * w3;
};
struct my_llama_model {
struct ggml_context * ctx = NULL;
std::string name;
my_llama_hparams hparams;
struct ggml_tensor * tok_embeddings;
struct ggml_tensor * norm;
struct ggml_tensor * output;
std::vector<my_llama_layer> layers;
uint32_t train_its = 0;
uint32_t train_samples = 0;
uint32_t train_tokens = 0;
};
struct train_params {
const char * fn_vocab_model;
const char * fn_llama2c_model;
const char * fn_llama2c_output_model;
const char * fn_train_data;
const char * fn_checkpoint_in;
const char * fn_checkpoint_out;
const char * fn_model_out;
uint32_t seed;
int n_ctx;
int n_embd;
int n_mult;
int n_head;
int n_layer;
int n_rotmax;
int n_threads;
int n_batch;
int n_examples;
int n_predict;
int print_info_interval;
int print_details_interval;
bool samples_start_after_nl;
bool use_adam;
bool use_flash;
bool use_scratch;
// only adam
int warmup;
int cos_decay_steps;
float cos_decay_restart;
float cos_decay_alpha;
int lbfgs_n_iter;
int adam_n_iter;
float adam_alpha;
float adam_decay;
int mem_model_gb;
int mem_compute_gb;
int mem_compute0_gb;
int mem_compute1_gb;
};
static void print_params(struct my_llama_hparams * params) {
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
LOG("%s: n_head: %u\n", __func__, params->n_head);
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
if (i > 0) LOG("x ");
LOG("[%" PRId64 "] ", t->ne[i]);
total *= t->ne[i];
}
if (i > 1) LOG("= [%" PRId64 "] ", total);
LOG("float space for %s\n", ggml_get_name(t));
}
}
static void init_model(struct my_llama_model * model) {
const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd;
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_vocab = hparams.n_vocab;
const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
const uint32_t n_ff = hparams.n_ff;
struct ggml_context * ctx = model->ctx;
model->train_its = 0;
model->train_samples = 0;
model->train_tokens = 0;
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
ggml_set_name(model->norm, "norm.weight");
ggml_set_name(model->output, "output.weight");
model->layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
auto & layer = model->layers[i];
std::string layers_i = "layers." + std::to_string(i);
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}
print_tensor_info(ctx);
}
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
LOG(" %f", p);
}
LOG("\n");
}
static void print_matrix(struct ggml_tensor * probs) {
assert(ggml_is_matrix(probs));
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
LOG(" %.2f", p);
}
LOG("\n");
}
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
size = 0;
} else {
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
GGML_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
GGML_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
die_fmt("fread failed: %s", strerror(errno));
}
if (ret != 1) {
die("unexpectedly reached end of file");
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::float_t read_f32() {
std::float_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
static bool is_ggml_file(const char * filename) {
llama_file file(filename, "rb");
if (file.size < 4) {
return false;
}
std::string magic = file.read_string(4);
return magic == GGUF_MAGIC;
}
static std::string llama_escape_whitespaces(const std::string & text) {
std::ostringstream out;
for (char c : text) {
if (c == ' ') out << "\xe2\x96\x81";
else out << c;
}
return out.str();
}
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
if (is_ggml_file(filename)) {
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
struct gguf_context * ctx = gguf_init_from_file(filename, params);
GGML_ASSERT(ctx != NULL);
const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
GGML_ASSERT(model_idx >= 0);
std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);
const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
GGML_ASSERT(token_idx >= 0);
const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
GGML_ASSERT(score_idx >= 0);
const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
GGML_ASSERT(toktype_idx >= 0);
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
}
vocab->id_to_token.resize(n_vocab);
for (uint32_t i = 0; i < n_vocab; i++) {
std::string word = gguf_get_arr_str(ctx, token_idx, i);
vocab->token_to_id[word] = i;
auto & token_data = vocab->id_to_token[i];
token_data.text = std::move(word);
token_data.score = scores[i];
token_data.type = (llama_token_type) toktypes[i];
}
ggml_free(ctx_data);
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
}
const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab);
for (llama_vocab::id id=0; id<n_vocab; ++id) {
float_t score = file.read_f32();
uint32_t len = file.read_u32();
std::string text = file.read_string(len);
unsigned char byte_val;
llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
if (id == UNKNOWN_TOKEN_ID) {
text = "<unk>";
type = LLAMA_TOKEN_TYPE_UNKNOWN;
} else if (id == BOS_TOKEN_ID) {
text = "<s>";
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (id == EOS_TOKEN_ID) {
text = "</s>";
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (text.empty()) {
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
// Text of byte tokens is already in the expected format.
type = LLAMA_TOKEN_TYPE_BYTE;
} else {
type = LLAMA_TOKEN_TYPE_NORMAL;
}
text = llama_escape_whitespaces(text);
vocab->id_to_token[id].text = text;
vocab->id_to_token[id].score = score;
vocab->id_to_token[id].type = type;
vocab->token_to_id.emplace(text, id);
}
}
}
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int size = 1;
for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
size *= gg_weights->ne[dim];
}
for (int ct = 0; ct < size; ++ct) {
int64_t i0 = 0; int64_t i1 = 0;
int64_t i2 = 0; int64_t i3 = 0;
ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
}
}
static void save_as_llama_model(
struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());
convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
//print_row(model->norm, 0);
// for rms-att-weight
int row_length = model->hparams.n_embd;
int n_ff = model->hparams.n_ff;
const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i];
// 1d
convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
}
struct gguf_context * ctx = gguf_init_empty();
std::vector<const char*> tokens;
std::vector<float> scores;
std::vector<llama_token_type> token_types;
for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
tokens.push_back(token_data.text.c_str());
scores.push_back(token_data.score);
token_types.push_back(token_data.type);
}
gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());
gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);
gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");
// special tokens
gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
// write tensors
ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
gguf_add_tensor(ctx, model->tok_embeddings);
ggml_set_name(model->norm, TN_OUTPUT_NORM);
gguf_add_tensor(ctx, model->norm);
ggml_set_name(model->output, TN_OUTPUT);
gguf_add_tensor(ctx, model->output);
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
auto & layer = model->layers[i];
ggml_format_name(layer.wq, TN_ATTN_Q, i);
gguf_add_tensor(ctx, layer.wq);
ggml_format_name(layer.wk, TN_ATTN_K, i);
gguf_add_tensor(ctx, layer.wk);
ggml_format_name(layer.wv, TN_ATTN_V, i);
gguf_add_tensor(ctx, layer.wv);
ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
gguf_add_tensor(ctx, layer.wo);
ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
gguf_add_tensor(ctx, layer.attention_norm);
ggml_format_name(layer.w1, TN_FFN_GATE, i);
gguf_add_tensor(ctx, layer.w1);
ggml_format_name(layer.w2, TN_FFN_DOWN, i);
gguf_add_tensor(ctx, layer.w2);
ggml_format_name(layer.w3, TN_FFN_UP, i);
gguf_add_tensor(ctx, layer.w3);
ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
gguf_add_tensor(ctx, layer.ffn_norm);
}
gguf_write_to_file(ctx, filename, false);
gguf_free(ctx);
}
static struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";
params.seed = -1;
params.n_ctx = 128;
params.n_embd = 256;
params.n_mult = 256;
params.n_head = 8;
params.n_layer = 16;
params.n_rotmax = 64;
params.n_threads = 6;
params.n_batch = 8;
params.n_examples = 8;
params.n_predict = 1024;
params.print_info_interval = 1;
params.print_details_interval = 2;
params.samples_start_after_nl = false;
params.use_adam = true;
params.use_flash = false;
params.use_scratch = true;
// only adam
params.warmup = 100;
params.cos_decay_steps = 1000;
params.cos_decay_restart = 1.1f;
params.cos_decay_alpha = 0.0f;
params.lbfgs_n_iter = 16;
params.adam_n_iter = 16;
params.adam_alpha = 1e-3f;
params.adam_decay = 1e-3f;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_compute0_gb = 8;
params.mem_compute1_gb = 2;
return params;
}
static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
fprintf(stderr, "\n");
}
static bool params_parse(int argc, char ** argv, struct train_params * params) {
bool invalid_param = false;
bool reqd_param_found = false;
std::string arg;
struct train_params default_params = get_default_train_params();
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "--copy-vocab-from-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_vocab_model = argv[i];
} else if (arg == "--llama2c-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
reqd_param_found = true;
params->fn_llama2c_model = argv[i];
} else if (arg == "--llama2c-output-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_llama2c_output_model = argv[i];
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, &default_params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv, &default_params);
exit(1);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv, &default_params);
exit(1);
}
if (!reqd_param_found){
fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
print_usage(argc, argv, &default_params);
exit(1);
}
return true;
}
static std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) {
return path;
}
return path.substr(pos + 1);
}
int main(int argc, char ** argv) {
struct train_params params = get_default_train_params();
if (!params_parse(argc, argv, &params)) {
return 1;
}
log_set_target(stdout);
Config config;
TransformerWeights weights = {};
{
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE * file = fopen(params.fn_llama2c_model, "rb");
if (!file) {
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
// read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
auto shared_weights = config.vocab_size > 0;
config.vocab_size = abs(config.vocab_size);
// read in the Transformer weights
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
fclose(file);
}
struct llama_vocab vocab;
load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_head_kv = config.n_kv_heads;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
print_params(&model.hparams);
struct ggml_init_params lcparams;
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
lcparams.mem_buffer = NULL;
lcparams.no_alloc = false;
model.ctx = ggml_init(lcparams);
init_model(&model);
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
return 0;
}

View file

@ -1,5 +0,0 @@
set(TARGET llama-cvector-generator)
add_executable(${TARGET} cvector-generator.cpp pca.hpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,45 +0,0 @@
# cvector-generator
This example demonstrates how to generate a control vector using gguf models.
Related PRs:
- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
## Examples
```sh
# CPU only
./cvector-generator -m ./llama-3.Q4_K_M.gguf
# With GPU
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
# With advanced options
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
# Using mean value instead of PCA
./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
# To see help message
./cvector-generator -h
# Then, have a look at "cvector" section
```
## Tips and tricks
If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:
```
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
<|im_start|>system\nYou are in a very good mood today<|im_end|>
```
Example to use output file with `llama-cli`:
(Tips: The control vector works better when apply to layers higher than 10)
```sh
./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
```

View file

@ -1,582 +0,0 @@
That game
I can see
Hmm, this
I can relate to
Who is
I understand the
Ugh,
What the hell was
Hey, did anyone
Although
Thank you for choosing
What are you
Oh w
How dare you open
It was my pleasure
I'm hon
I appreciate that you
Are you k
Whoever left this
It's always
Ew,
Hey, I l
Hello? Is someone
I understand that
That poem
Aww, poor
Hey, it
Alright, who
I didn't
Well, life
The document
Oh no, this
I'm concerned
Hello, this is
This art
Hmm, this drink
Hi there!
It seems
Is
Good
I can't
Ex
Who are
I can see that
Wow,
Today is a
Hey friend
Sometimes friends
Oh, this old
The weather outside
This place is sur
I appreciate your input
Thank you for the
Look at
I'm disappoint
To my
How dare you
That's an
This piece of art
Eww
This park is
This is incredible
Oh no, someone
Exc
Well, it'
I warned
Hey, I understand
Hey, I saw
How dare you go
What the he
Hey
It's
Hello? Hello?
It
Oh no!
This is the perfect
Good morning,
Oh no, there
It's so
Yeah
Uh,
Hello everyone
Who turned off
The weather
Who'
Hey, this
Wait,
Eww, gross
Excuse
It seems like you
Thank you so
What happened?
Oh my g
I am deeply sad
I war
Okay, let'
Hey, that
That was a beautiful
Oh no! That
What happened
Hey there
The artist'
What?!
Hey, it'
I am disappoint
It seems like
Oh no! The
This park is a
If you
Yes! I did
It sounds
What
Who is it
Hmm, that
That's strange
Yeah, that was
That's interesting
This park
What the hell
Who is that
I feel like my
Oh well
What the hell is
Hello? Hello
To my dearest
Bless you!\"
Thank you for
Oh, looks like
Can you please
This place is
Eww, what
Bless you
Is everything
Hey, I just
Whoever left these
Well, that'
I feel
Hey, do you
It's sad
Oh no, it
Hey, that'
Oh my god,
Thank you,
Hello little one,
I apolog
Hey team, I
How dare you read
Who is this and
Whoever left
Hi there! W
A
If you have
I was
U
Bless
Well, this
Oh, I'
It's a
Eww,
Is everything okay?
Oh, I
Hello, can you
Al
That was a great
What are
I understand that not
Oh no, not
Who is it?\"
Hey, can we
Whoever is taking
I would love to
Hey, I noticed
Hey, could
I understand that there
Hello?
D
Oh man, I
Thank you so much
Oh no, my
Dear [Name
Uh
I remember
Hey, who
Well, it
Are you
I understand that it
Hey, is
I would
Who is this
Excuse me
Alright
I am thrilled
Sometimes friends have
Who the
It's interesting
I would love
E
Hello? Is anyone
Well, this is
This place
Well,
I warned you
Hey, watch where
Oh my
That'
Sometimes friends have different
I understand that everyone
What?
What do these notes
I can relate
I'm not
I understand
To my dear
Guys
Well
Hey, I appreciate
Wow, what
Dear
That melody
Who the hell
Today is
Hello little
Wow, look
That's great
Love is never wrong
I'm having
Whoa, did
Ugh
Can you please provide
I miss you,
I feel uncom
I know
Ugh, this
Hey, watch
Oh great, a
I didn
Okay
That game of char
Oh
I appreciate
Who's there
I am so
Oh great, someone
Hey, could you
I remember wondering
Wait, what?
What do
Hello? Can
Hey there,
That game of
This is incred
Oh my gosh
Oh great, f
I appreciate your
It sounds like
What the heck
Okay, I understand
Ew
I understand that this
Uh, hi
Hi everyone!
What the hell?
Thank you for your
Oh no, the
Wow, I
Who turned
Dear [
Whoever
This is a
Whoa, he
What in the world
Although the physical
Hello, who is
That's amaz
Hey, I know
Okay, that
Hi everyone
Hey, is everything
I understand your fr
Oh no, poor
Oh, look
Good morning
Ew, gross
Oh no, did
Look at the family
Hey team
Yes!
Hey, can I
Okay, that'
It's great
Love is
Hey, what
Good morning, world
Who is it?
That poem really reson
I
That's
I understand the task
Gu
Hello? Who'
This postcard is
Whoa,
Oh, that
I understand that I
Whoever is
Hello? Who is
I'm really
Wow, this
Can
This artwork really
This is a shame
I miss you too
Who are you?
Today is a difficult
Hey, just
Are you okay
I am
Hi,
Wow, that
Hey there! Can
Okay, stay
Oh great, just
Yeah,
Hello? Can you
Oh, looks
Thank you for sharing
I'm glad
Hey, is that
Hmm
It was my
It sounds like you
Wow, your
I was promised certain
That was such a
Thank
Excuse you
That was
Hey team,
I feel un
It was
What'
Hey friend, I
How
Saying goodbye
That
It's heart
How dare
Oh,
Hello, may
What's this
Thank you for recogn
Aww, that
Oh, I remember
Hmm, that'
I miss
I know this
Wait
Is everything okay
Who is that person
Wow, you
Oh great
I'm sad
Wow, the
I am very disappoint
Who turned off the
I understand that things
I'm very
Hi
That's very
Okay, I
Oh no,
Wow, there
What's wrong
I apologize for
Hey, I
Can I help you
Oh, I didn
Alright,
Oh wow,
Oh my goodness
I know this event
What in the
Saying
Yeah, that
Guys, I
Hey, this v
This post
Are
Hey, can
Hello? Is
I can only imagine
Oh, that sounds
Hey, is anyone
I am disappointed
Hello,
Hey everyone, I
That was such
It's okay
The artist
Whoa
I understand that mistakes
Can I help
Who
Hi everyone! I
Hey, can you
Wow, how
Today
Oh no, I
Oh well, I
Well, that
This is the
Yes! I finally
Hey there little
Hello everyone!
Love is never
Look at the
This postcard
Oh great,
Can I
Hmm, this is
I understand your
Oh, look at
B
I'm so
Whoa, this
W
Oh, this
Sometimes
This piece of
What the
That was a
Hey, do
Oh no
Whoa, what
I feel like I
The documentary
Hello
Hello little one
I understand that my
Eww, that
Wow, an
Yes! Finally,
Although the physical location
Whoever is watching
That movie
I remember wondering about
Hey there, little
Who's
Hello, who
Hello everyone! Thank
Hello, can
That's too
Hey, just wanted
Hey there, I
Saying good
Hey there!
Who is there?
Oh my good
I am very
Oh no, what
Wow, thank
I was promised
Hi, is
Hey, I'
Guys, the
Oh no, that
Who is there
Hello, this
That movie really touched
If you have something
The documentary was
I'm starting
Are you kidd
That movie really
Hey everyone,
Thank you for considering
I didn'
Yes! I
Can you
Oh my god
Hey, whoever
That melody really
Thank you, little
Hello, may I
Look
Wow, we
It looks
What do these
Oh wow
I apologize
What are you all
It's such
It's clear
Hey, I was
Hey friend,
I can only
The weather outside is
Eww, this
I miss you
Wow
Aww,
Hi, is there
This artwork
Okay,
Oh well,
This
I'
Say
Hey there little gu
Hmm,
Whoa, who
I am thr
Oh man
Okay, stay calm
I'm happy
Oh, this cur
Oh man,
I'm sorry
Hello? Who
What?! That
This piece
Hey everyone
That's so
Are you okay?
What happened? Where
Hi there
The
Who the hell entered
I can
Guys,
What's
What in
It's important
I'm
I'm coming
It'
Yes! Finally
Wait, what
Wow, reading
I'm surprised
Hey, did
Hey,
Okay, let
I understand that you
Who the hell threw
Eww, who
Thank you for thinking
Who is this?\"
I am deeply
Thank you for including
Oh no, an
It looks like you
Aww
I'm confused
Wow, it
That poem really
Yes
Hey there, is
Hey, what'
Thank you for remember
To
This is
Thank you for making
I can'
That mel
Wow, they
I feel like
Although the
Who are you
Love
If
What the hell are
I am so sad
Oh, I found
Thank you
It looks like
Well, life is
I appreciate that
The artist's
Whoa, that
It's never

View file

@ -1,503 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include "pca.hpp"
#include "mean.hpp"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <cstdio>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <climits>
//////////////////////////////////////////////////
// utils
template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
std::string ret;
for (; begin != end; ++begin) {
ret += llama_token_to_piece(ctx, *begin);
}
return ret;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
printf("\nexample usage:\n");
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
printf("\n");
}
//////////////////////////////////////////////////
// cb_eval is reused for each pair of positive - negative prompt
struct callback_data {
ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
int n_layers = 0;
int n_tokens = 0;
bool is_eval_pos = true;
// each element of the vector correspond to one layer
std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
// save a tensor into either v_pos or v_neg (decided by is_eval_pos)
void save_tensor_for_layer(struct ggml_tensor * t) {
GGML_ASSERT(t->type == GGML_TYPE_F32);
if (ctx_ggml == nullptr) {
// alloc a new ctx_ggml if needed
struct ggml_init_params params_ggml = {
/*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_ggml = ggml_init(params_ggml);
}
// copy tensor data
auto n_bytes = ggml_nbytes(t);
struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
ggml_set_name(t_layer, ggml_get_name(t));
//print_debug_tensor(t_layer);
if (is_eval_pos) {
v_pos.push_back(t_layer);
} else {
v_neg.push_back(t_layer);
}
}
// calculate diff (v_pos - v_neg) and place the result back to v_pos
// all zero rows in the diff tensor will also be removed
// NOTE: final layer is ignored. we only have (n_layers - 1) to process
std::vector<struct ggml_tensor *> calc_diff() {
for (float il = 0; il < v_pos.size(); il++) {
float * a = (float *) v_pos[il]->data;
float * b = (float *) v_neg[il]->data;
size_t n_elem = ggml_nelements(v_pos[il]);
for (size_t j = 0; j < n_elem; j++) {
a[j] -= b[j];
}
//print_debug_tensor(v_pos[i]);
auto diff_filtered = filter_nonzero_rows(v_pos[il]);
v_diff_filtered.push_back(diff_filtered);
}
return v_diff_filtered; // for convinient, we return the result std::vector
}
// delete zero rows from a given 2D tensor
struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
//printf("filter_nonzero_rows\n");
auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
// check if given row containing all zero elements
int n_cols = t->ne[0]; // hint: should be equal to n_embd
for (int col = 0; col < n_cols; ++col) {
if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
return false;
}
}
return true;
};
std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
for (int i_row = 0; i_row < a->ne[1]; i_row++) {
if (!is_row_all_zeros(a, i_row, 1e-6)) {
rows_to_copy.push_back(i_row);
}
}
// get "n_nonzero_rows" for the output "diff_filtered"
int n_nonzero_rows = rows_to_copy.size();
//printf("n_nonzero_rows: %d\n", n_nonzero_rows);
int n_embd = a->ne[0];
GGML_ASSERT(n_nonzero_rows > 0);
// diff_filtered: [n_embd, n_nonzero_rows]
struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
// copy non-zero rows
for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
int src_row = rows_to_copy[dest_row];
for (int i = 0; i < n_embd; i++) {
float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
}
}
//print_debug_tensor(diff_filtered);
return diff_filtered;
}
// we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
void reset() {
for (auto ptr : v_pos) free(ptr->data);
for (auto ptr : v_neg) free(ptr->data);
for (auto ptr : v_diff_filtered) free(ptr->data);
v_pos.clear();
v_neg.clear();
v_diff_filtered.clear();
if (ctx_ggml) {
ggml_free(ctx_ggml);
}
ctx_ggml = nullptr;
}
};
/**
* process_ctx is used to store the ggml context for pre-post processing the diff vectors
* in short, input => v_diff and output => v_final
*/
struct train_context {
ggml_context * ctx_ggml;
int n_embd;
int n_layers;
/* pair of prompts to be used for generating final vector */
std::vector<std::string> positive_entries;
std::vector<std::string> negative_entries;
// each element of the vector correspond to one layer
// NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
// NOTE (2): v_diff is transposed from v_diff_tmp
std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
// to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
// v_diff_tmp will get converted unto v_diff later on
std::vector<std::vector<uint8_t>> v_diff_tmp;
train_context(int n_embd_, int n_layers_) {
n_embd = n_embd_;
n_layers = n_layers_;
struct ggml_init_params params_ggml = {
/*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_ggml = ggml_init(params_ggml);
for (int il = 0; il < n_layers - 1; il++) {
std::vector<uint8_t> empty;
v_diff_tmp.push_back(empty);
auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
v_final.push_back(t);
}
}
// add new rows into existing tensor in v_diff_tmp
void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
for (int il = 0; il < n_layers - 1; il++) {
auto t = diff_filtered[il];
auto & diff_tmp = v_diff_tmp[il];
size_t curr_size = diff_tmp.size();
diff_tmp.resize(curr_size + ggml_nbytes(t));
memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
}
}
// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
void build_v_diff(bool transpose) {
printf("build_v_diff\n");
for (int il = 0; il < n_layers - 1; il++) {
auto & diff_tmp = v_diff_tmp[il];
int n_elem = diff_tmp.size() / sizeof(float);
GGML_ASSERT(n_elem % n_embd == 0);
int n_rows = n_elem / n_embd;
struct ggml_tensor * diff = transpose
? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
: ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
if (transpose) {
// copy data & transpose
float * arr = (float *) diff_tmp.data();
for (int ir = 0; ir < n_rows; ++ir) {
for (int ic = 0; ic < n_embd; ++ic) {
float f = arr[ir*n_embd + ic];
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
}
}
} else {
// only copy
memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
}
v_diff.push_back(diff);
print_debug_tensor(diff);
// free memory of diff_tmp
diff_tmp.resize(0);
}
}
~train_context() {
for (auto ptr : v_final) free(ptr->data);
for (auto ptr : v_diff) free(ptr->data);
// no need to free v_diff_tmp, since we didn't use malloc
ggml_free(ctx_ggml);
}
};
struct tokenized_prompt {
std::vector<llama_token> tokens_pos;
std::vector<llama_token> tokens_neg;
size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
padding_seq(ctx, tokens_pos, max_seq_len);
padding_seq(ctx, tokens_neg, max_seq_len);
}
void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
// TODO: customize padding token
std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
llama_token pad_tok = pad_tokens.back();
while (tokens.size() < len) {
tokens.push_back(pad_tok);
}
}
};
//////////////////////////////////////////////////
template <typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
std::vector<std::string> output;
std::ifstream file(path);
if (!file.is_open()) {
fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
exit(1);
}
std::string line;
while (std::getline(file, line)) {
bool is_skip = skip_empty_lines && line.empty();
if (!is_skip) {
string_process_escapes(line);
output.push_back(line);
}
}
file.close();
return output;
}
//////////////////////////////////////////////////
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
static const char * l_out_name = "l_out";
const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
if (ask) {
return is_l_out;
}
if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
return true;
}
// save the tensor to current context
cb_data->save_tensor_for_layer(t);
return true;
}
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
llama_kv_cache_clear(ctx);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
return true;
}
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
struct gguf_context * ctx = gguf_init_empty();
const std::string arch = "controlvector";
gguf_set_val_str(ctx, "general.architecture", arch.c_str());
gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
for (size_t i = 0; i < v_ctrl.size(); ++i) {
gguf_add_tensor(ctx, v_ctrl[i]);
print_debug_tensor(v_ctrl[i]);
printf("Added tensor: %s\n", v_ctrl[i]->name);
}
printf("%s: writing file...\n", __func__);
gguf_write_to_file(ctx, fname.c_str(), false);
printf("%s: wrote file '%s'\n", __func__, fname.c_str());
gguf_free(ctx);
}
/**
* Load prompt files and completion file.
* Then format each pair of prompt + completion to make an entry.
*/
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
// load prompts
std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
if (positive_prompts.size() != negative_prompts.size()) {
fprintf(stderr, "number of positive and negative prompts must be equal\n");
return 1;
}
if (positive_prompts.empty()) {
fprintf(stderr, "must provide at least one prompt pair\n");
return 1;
}
ctx_train.positive_entries = positive_prompts;
ctx_train.negative_entries = negative_prompts;
return 0;
}
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
if (params.n_pca_iterations % params.n_pca_batch != 0) {
fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
return 1;
}
callback_data cb_data;
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = cb_eval;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
print_build_info();
llama_backend_init();
llama_numa_init(params.numa);
// load the model to get hparams
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
// int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_n_layer(model);
int n_embd = llama_n_embd(model);
// get model hint param (a.k.a model arch name)
char model_hint[128];
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
// init train_context
train_context ctx_train(n_embd, n_layers);
// load and prepare entries for training
prepare_entries(params, ctx_train);
// we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
std::vector<tokenized_prompt> tokenized_prompts;
size_t n_total_tokens = 0;
for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
n_total_tokens += 2 * t.max_seq_len;
tokenized_prompts.push_back(std::move(t));
}
std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
bool success = false;
tokenized_prompt t = tokenized_prompts[i];
cb_data.n_layers = n_layers;
cb_data.n_tokens = t.max_seq_len;
printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
(int) i+1, (int) ctx_train.positive_entries.size(),
tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
(int) t.max_seq_len);
cb_data.is_eval_pos = true;
success = get_hidden_layers(ctx, t.tokens_pos);
if (!success) break;
cb_data.is_eval_pos = false;
success = get_hidden_layers(ctx, t.tokens_neg);
if (!success) break;
// calculate diff and remove all zero rows
auto v_diff_filtered = cb_data.calc_diff();
// save & concat the filtered v_diff to ctx_train
ctx_train.concat_diff_tmp(v_diff_filtered);
// reset for next iteration
cb_data.reset();
}
// done with the model, we can now free it to make gain some memory
printf("Done evaluate prompts, unload model...\n");
llama_free(ctx);
llama_free_model(model);
bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
// prepare ctx_train for PCA
ctx_train.build_v_diff(use_pca);
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
} else {
// run mean
mean::run(ctx_train.v_diff, ctx_train.v_final);
}
// write output vectors to gguf
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
llama_backend_free();
return 0;
}

View file

@ -1,48 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include <string>
#include <vector>
#include <math.h>
namespace mean {
static void run(
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
const std::vector<struct ggml_tensor *> & v_output) {
printf("%s: Running mean...\n", __func__);
for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector
struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1);
// calculate mean vector
struct ggml_tensor * t_layer = v_input[il];
GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
for (int ic = 0; ic < t_layer->ne[0]; ic++) {
float f = 0.0;
for (int ir = 0; ir < t_layer->ne[1]; ir++) {
f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
}
f /= t_layer->ne[1];
ggml_set_f32_1d(ctrl_out, ic, f);
}
// normalize output vector
float norm = 0.0;
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
float f = ggml_get_f32_1d(ctrl_out, i);
norm += f*f;
}
norm = sqrt(norm);
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
float f = ggml_get_f32_1d(ctrl_out, i);
ggml_set_f32_1d(ctrl_out, i, f / norm);
}
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
}
}
}

View file

@ -1,4 +0,0 @@
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow

View file

@ -1,325 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <cstdio>
#include <ctime>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#define DEBUG_POS 5
static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
if (!with_data) return;
printf("%s: %s[0] = [", __func__, t->name);
for (size_t i = 0; i <= DEBUG_POS; i++) {
printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
}
printf(" ... ]\n");
}
namespace PCA {
// input params for PCA computations
struct pca_params {
int n_threads = 1;
int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used
int n_iterations = 1000;
float tolerance = 1e-7;
// for debugging
int i_layer = 0;
int n_layers = 0;
};
// result from each iteration
struct pca_result {
struct ggml_tensor * calculated_square = NULL;
std::vector<struct ggml_tensor *> eigenvectors;
std::vector<float> distances;
};
struct pca_model {
ggml_backend_t backend = NULL;
ggml_backend_buffer_t buffer;
struct ggml_context * ctx; // context to compute graph on target device
struct ggml_context * ctx_host; // host context to store results
// tensors on target device
struct ggml_tensor * dev_input;
struct ggml_tensor * dev_square;
struct ggml_tensor * dev_eigenvector;
pca_model(struct ggml_tensor * t_input) {
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#endif
// TODO: enable Metal support when support for GGML_OP_SQRT is added
// #ifdef GGML_USE_METAL
// fprintf(stderr, "%s: using Metal backend\n", __func__);
// backend = ggml_backend_metal_init();
// if (!backend) {
// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
// }
// #endif
// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
backend = ggml_backend_cpu_init();
}
const int num_tensors = 4;
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx = ggml_init(params);
auto n_samples = t_input->ne[0];
auto n_embd = t_input->ne[1];
dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
ggml_set_name(dev_input, "dev_input");
ggml_set_name(dev_square, "dev_square");
ggml_set_name(dev_eigenvector, "dev_eigenvector");
buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
// initialize eigenvector to random normalized vector
{
std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
std::uniform_real_distribution<float> distribution(0.0, 1.0);
float sum_sqr = 0.0; // for normalizing random_vec
for (size_t i = 0; i < random_vec.size(); ++i) {
float f = distribution(generator);
sum_sqr += f * f;
random_vec[i] = f;
}
// normalize it
float random_vec_norm = std::sqrt(sum_sqr);
for (size_t i = 0; i < random_vec.size(); ++i) {
random_vec[i] /= random_vec_norm;
}
ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
}
}
~pca_model() {
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
ggml_backend_free(backend);
}
};
static struct ggml_cgraph * build_graph_piter(
const struct pca_params & params,
const pca_model & model,
bool calc_square = false) {
GGML_ASSERT(params.n_batch > 0);
// TODO: buf_size must be able to scale with params.n_batch
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// create a temporally context to build the graph
struct ggml_context * ctx0 = ggml_init(params0);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// turn v_diff_original into square matrix if needed
struct ggml_tensor * tmp_square;
if (calc_square) {
tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
ggml_set_name(tmp_square, "tmp_square");
}
struct ggml_tensor * b_tensor;
struct ggml_tensor * distance;
struct ggml_tensor * old_eigen = model.dev_eigenvector;
struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
for (int i = 0; i < params.n_batch; ++i) {
// b_tensor = square * eigenvector^T
b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
ggml_set_name(b_tensor, "b_tensor");
// normalize
b_tensor = ggml_div_inplace(ctx0,
b_tensor,
ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
);
ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
// calculate distance(new eigenvector - old eigenvector)
// we don't use ggml_sub because it may not be implemented on GPU backend
struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
distance = ggml_sqrt_inplace(ctx0,
ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
ggml_format_name(distance, "distance_%d", i);
old_eigen = b_tensor;
// build operations nodes
ggml_build_forward_expand(gf, distance);
}
// delete the temporally context used to build the graph
ggml_free(ctx0);
return gf;
}
static ggml_status compute_piter(
const struct pca_params & params,
const pca_model & model,
struct ggml_cgraph * gf,
ggml_gallocr_t allocr,
struct pca_result & result) {
// allocate tensors
ggml_gallocr_alloc_graph(allocr, gf);
if (ggml_backend_is_cpu(model.backend)) {
ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
}
// TODO: enable GPU support when support for GGML_OP_SQRT is added
//#ifdef GGML_USE_METAL
// if (ggml_backend_is_metal(model.backend)) {
// ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
// }
//#endif
ggml_status res = ggml_backend_graph_compute(model.backend, gf);
if (res == GGML_STATUS_SUCCESS) {
auto extract_i = [](std::string prefix, std::string str) -> int {
int i = -1;
if (str.rfind(prefix, 0) == 0) {
sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
}
return i;
};
result.calculated_square = NULL;
result.eigenvectors.clear();
result.distances.clear();
result.eigenvectors.resize(params.n_batch);
result.distances.resize(params.n_batch);
// get output nodes
for (int i = 0; i < gf->n_nodes; ++i) {
auto node = gf->nodes[i];
int iter = -1;
// find b_tensor (without copying data from device)
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
result.eigenvectors[iter] = node;
}
// find distances, then copy data from device
if ((iter = extract_i("distance_", node->name)) > -1) {
float d;
ggml_backend_tensor_get(node, &d, 0, sizeof(float));
result.distances[iter] = d;
// std::cout << node->name << " = " << d << "\n";
}
// find tmp_square if it exists (without copying data from device)
if (std::string(node->name) == "tmp_square") {
result.calculated_square = node;
}
}
}
return res;
}
static void power_iteration(
const struct pca_params & params,
struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
struct ggml_tensor * output) {
//printf("in power iteration\n");
struct pca_model model(input);
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
struct pca_result result;
struct ggml_tensor * last_eigenvector = NULL;
int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
for (int iter = 0; iter < n_iters; ++iter) {
bool calc_square = (iter == 0); // only need to calculate square for first iteration
struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
// ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
compute_piter(params, model, gf, allocr, result);
for (size_t k = 0; k < result.distances.size(); ++k) {
last_eigenvector = result.eigenvectors[k];
if (result.distances[k] < params.tolerance) {
break; // done
}
}
if (calc_square) {
// copy and store the square matrix if needed
GGML_ASSERT(result.calculated_square != NULL);
ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
}
{
// copy last eigen vector and store as input for next iteration
GGML_ASSERT(last_eigenvector != NULL);
ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
}
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
__func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
}
// get output tensor
GGML_ASSERT(last_eigenvector);
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
//print_debug_tensor(output);
ggml_gallocr_free(allocr);
// TODO @ngxson : The output vector is randomly inverted
// Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
}
static void run_pca(
struct pca_params & params,
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
const std::vector<struct ggml_tensor *> & v_output) {
printf("%s: Running PCA...\n", __func__);
for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector
struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1);
// run power_iteration
params.i_layer = il;
params.n_layers = v_input.size();
power_iteration(params, v_input[il], ctrl_out);
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
}
}
}

View file

@ -1,4 +0,0 @@
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!

View file

@ -23,9 +23,6 @@ Please update all scripts and workflows to use the new binary names.
| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
| eval-callback | llama-eval-callback |
| gbnf-validator | llama-gbnf-validator |
| gguf | llama-gguf |
| gguf-split | llama-gguf-split |
| gritlm | llama-gritlm |
| imatrix | llama-imatrix |
| infill | llama-infill |
| llava-cli | llama-llava-cli |
@ -35,10 +32,7 @@ Please update all scripts and workflows to use the new binary names.
| lookup-merge | llama-lookup-merge |
| lookup-stats | llama-lookup-stats |
| parallel | llama-parallel |
| passkey | llama-passkey |
| perplexity | llama-perplexity |
| q8dot | llama-q8dot |
| quantize-stats | llama-quantize-stats |
| q8dot | llama-q8dot ||
| retrieval | llama-retrieval |
| save-load-state | llama-save-load-state |
| simple | llama-simple |

View file

@ -1,5 +0,0 @@
set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,60 +0,0 @@
# llama.cpp/example/embedding
This example demonstrates generate high-dimensional embedding vector of a given text with llama.cpp.
## Quick Start
To get started right away, run the following command, making sure to use the correct path for the model you have:
### Unix-based systems (Linux, macOS, etc.):
```bash
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
```
### Windows:
```powershell
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
```
The above command will output space-separated float values.
## extra parameters
### --embd-normalize $integer$
| $integer$ | description | formula |
|-----------|---------------------|---------|
| $-1$ | none |
| $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
| $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$
| $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
| $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
### --embd-output-format $'string'$
| $'string'$ | description | |
|------------|------------------------------|--|
| '' | same as before | (default)
| 'array' | single embeddings | $[[x_1,...,x_n]]$
| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
| 'json' | openai style |
| 'json+' | add cosine similarity matrix |
### --embd-separator $"string"$
| $"string"$ | |
|--------------|-|
| "\n" | (default)
| "<#embSep#>" | for exemple
| "<#sep#>" | other exemple
## examples
### Unix-based systems (Linux, macOS, etc.):
```bash
./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```
### Windows:
```powershell
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```

View file

@ -1,268 +0,0 @@
#include "common.h"
#include "llama.h"
#include <ctime>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
std::vector<std::string> lines;
size_t start = 0;
size_t end = s.find(separator);
while (end != std::string::npos) {
lines.push_back(s.substr(start, end - start));
start = end + separator.length();
end = s.find(separator, start);
}
lines.push_back(s.substr(start)); // Add the last part
return lines;
}
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
size_t n_tokens = tokens.size();
for (size_t i = 0; i < n_tokens; i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
}
}
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
// run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
}
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
float * out = output + batch.seq_id[i][0] * n_embd;
llama_embd_normalize(embd, out, n_embd, embd_norm);
}
}
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
params.embedding = true;
// For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
// split the prompt into lines
std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
// max batch size
const uint64_t n_batch = params.n_batch;
GGML_ASSERT(params.n_batch >= params.n_ctx);
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
for (const auto & prompt : prompts) {
auto inp = ::llama_tokenize(ctx, prompt, true, false);
if (inp.size() > n_batch) {
fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
return 1;
}
inputs.push_back(inp);
}
// check if the last token is SEP
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) {
fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
}
}
// tokenization stats
if (params.verbose_prompt) {
for (int i = 0; i < (int) inputs.size(); i++) {
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
for (int j = 0; j < (int) inputs[i].size(); j++) {
fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
}
fprintf(stderr, "\n\n");
}
}
// initialize batch
const int n_prompts = prompts.size();
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// allocate output
const int n_embd = llama_n_embd(model);
std::vector<float> embeddings(n_prompts * n_embd, 0);
float * emb = embeddings.data();
// break into batches
int p = 0; // number of prompts processed already
int s = 0; // number of prompts in current batch
for (int k = 0; k < n_prompts; k++) {
// clamp to n_batch tokens
auto & inp = inputs[k];
const uint64_t n_toks = inp.size();
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
llama_batch_clear(batch);
p += s;
s = 0;
}
// add to batch
batch_add_seq(batch, inp, s);
s += 1;
}
// final batch
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
if (params.embd_out.empty()) {
// print the first part of the embeddings or for a single prompt, the full embedding
fprintf(stdout, "\n");
for (int j = 0; j < n_prompts; j++) {
fprintf(stdout, "embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
} else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
}
}
fprintf(stdout, "\n");
}
// print cosine similarity matrix
if (n_prompts > 1) {
fprintf(stdout, "\n");
printf("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
fprintf(stdout, "%6.6s ", prompts[i].c_str());
}
fprintf(stdout, "\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f ", sim);
}
fprintf(stdout, "%1.10s", prompts[i].c_str());
fprintf(stdout, "\n");
}
}
}
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
const bool notArray = params.embd_out != "array";
fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
for (int j = 0;;) { // at least one iteration (one prompt)
if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
fprintf(stdout, "[");
for (int i = 0;;) { // at least one iteration (n_embd > 0)
fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
i++;
if (i < n_embd) fprintf(stdout, ","); else break;
}
fprintf(stdout, notArray ? "]\n }" : "]");
j++;
if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
}
fprintf(stdout, notArray ? "\n ]" : "]\n");
if (params.embd_out == "json+" && n_prompts > 1) {
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
for (int i = 0;;) { // at least two iteration (n_prompts > 1)
fprintf(stdout, " [");
for (int j = 0;;) { // at least two iteration (n_prompts > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f", sim);
j++;
if (j < n_prompts) fprintf(stdout, ", "); else break;
}
fprintf(stdout, " ]");
i++;
if (i < n_prompts) fprintf(stdout, ",\n"); else break;
}
fprintf(stdout, "\n ]");
}
if (notArray) fprintf(stdout, "\n}\n");
}
// clean up
llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View file

@ -1,9 +0,0 @@
set(TARGET llama-eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

View file

@ -1,95 +0,0 @@
# llama.cpp/examples/eval-callback
A simple example which demonstrates how to use callback during the inference.
It simply prints to the console all operations and tensor data.
Usage:
```shell
llama-eval-callback \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model phi-2-q4_0.gguf \
--prompt hello \
--seed 42 \
-ngl 33
```
Will print:
```shell
llm_load_tensors: offloaded 33/33 layers to GPU
...
llama_new_context_with_model: n_ctx = 512
...
llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
llama_new_context_with_model: graph nodes = 1225
llama_new_context_with_model: graph splits = 2
ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.0181, 0.0272, 0.0272, ...],
],
]
ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -0.6989, 1.0636, 1.0636, ...],
],
]
ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.1800, 0.2817, 0.2632, ...],
],
]
ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.1863, 0.2970, 0.2604, ...],
],
]
ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
[
[
[ -1.1238, 1.2876, -1.8086, ...],
],
]
ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
[ -0.3608, 0.5076, -1.8866, ...],
[ 1.7643, 0.0273, -2.1065, ...],
...
],
]
ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
[ -0.3608, 0.5076, -1.8866, ...],
[ 1.7643, 0.0273, -2.1065, ...],
...
],
]
```

View file

@ -1,193 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <random>
#include <string>
#include <tuple>
#include <vector>
/**
* This the arbitrary data which will be passed to each callback.
* Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
*/
struct callback_data {
std::vector<uint8_t> data;
};
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n");
i2 = ne[2] - n;
}
printf(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
printf(" ..., \n");
i1 = ne[1] - n;
}
printf(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
printf("..., ");
i0 = ne[0] - n;
}
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ASSERT(false);
}
printf("%12.4f", v);
sum += v;
if (i0 < ne[0] - 1) printf(", ");
}
printf("],\n");
}
printf(" ],\n");
}
printf(" ]\n");
printf(" sum = %f\n", sum);
}
}
/**
* GGML operations callback during the graph execution.
*
* @param t current tensor
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
* see ggml_backend_sched_eval_callback
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
if (ask) {
return true; // Always retrieve data
}
char src1_str[128] = {0};
if (src1) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "",
ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type)) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
}
return true;
}
static bool run(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
return true;
}
int main(int argc, char ** argv) {
callback_data cb_data;
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
print_build_info();
std::mt19937 rng(params.seed);
llama_backend_init();
llama_numa_init(params.numa);
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = ggml_debug;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
bool OK = run(ctx, params);
if (!OK) {
return 1;
}
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View file

@ -1,5 +0,0 @@
set(TARGET llama-export-lora)
add_executable(${TARGET} export-lora.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,33 +0,0 @@
# export-lora
Apply LORA adapters to base model and export the resulting model.
```
usage: llama-export-lora [options]
options:
-m, --model model path from which to load base model (default '')
--lora FNAME path to LoRA adapter (can be repeated to use multiple adapters)
--lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)
-t, --threads N number of threads to use during computation (default: 4)
-o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf')
```
For example:
```bash
./bin/llama-export-lora \
-m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
--lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
```
Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
```bash
./bin/llama-export-lora \
-m your_base_model.gguf \
-o your_merged_model.gguf \
--lora-scaled lora_task_A.gguf 0.5 \
--lora-scaled lora_task_B.gguf 0.5
```

View file

@ -1,420 +0,0 @@
#include "common.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include <map>
#include <vector>
#include <string>
#include <thread>
#include <fstream>
static bool g_verbose = false;
static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
}
static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
}
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
file.write(&zero, 1);
}
}
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ ctx_ggml,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
throw std::runtime_error("failed to load input GGUF from " + fname);
}
return ctx_gguf;
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
s = std::move(result);
}
struct file_input {
struct ggml_context * ctx_meta = nullptr;
struct gguf_context * ctx_gguf = nullptr;
std::ifstream f_in;
std::map<std::string, ggml_tensor *> tensors;
float alpha;
float scale;
file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
if (!f_in.is_open()) {
throw std::runtime_error("failed to open input gguf from " + fname);
}
ctx_gguf = load_gguf(fname, &ctx_meta);
alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
std::string name(cur->name);
tensors[name] = cur;
if (g_verbose) {
printf("%s: %s\n", __func__, cur->name);
}
}
}
ggml_tensor * get_tensor(std::string name) {
if (tensors.find(name) == tensors.end()) {
return nullptr;
}
return tensors[name];
}
void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
if (tensors.find(name) == tensors.end()) {
throw std::runtime_error("cannot find tensor with name: " + name);
}
auto len = ggml_nbytes(tensors[name]);
if (buf.size() < len) {
buf.resize(len);
}
auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
f_in.seekg(offset);
f_in.read((char* )buf.data(), len);
}
~file_input() {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
}
};
struct lora_merge_ctx {
// input base model + adapters
file_input base_model;
std::vector<std::unique_ptr<file_input>> adapters;
// for computing merged tensor
int n_threads;
ggml_backend_t backend = nullptr;
ggml_gallocr_t allocr = nullptr;
std::vector<uint8_t> read_buf;
// output file
struct gguf_context * ctx_out;
struct ggml_context * ctx_out_ggml;
std::ofstream fout;
lora_merge_ctx(
std::string & base_fname,
std::vector<std::tuple<std::string, float>> & lora_files,
std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
throw std::runtime_error("split model is not yet supported");
}
for (auto lora_inp : lora_files) {
auto fname = std::get<0>(lora_inp);
auto scale = std::get<1>(lora_inp);
std::unique_ptr<file_input> adapter(new file_input(fname, scale));
check_metadata_lora(adapter.get());
adapters.push_back(std::move(adapter));
}
ctx_out = gguf_init_empty();
struct ggml_init_params params = {
/*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_out_ggml = ggml_init(params);
backend = ggml_backend_cpu_init();
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
}
void check_metadata_lora(file_input * adapter) {
auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
if (general_type != "adapter") {
throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
}
auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
if (adapter_type != "lora") {
throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
}
auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture");
if (general_arch_base != general_arch_lora) {
throw std::runtime_error("model arch and LoRA arch mismatch");
}
}
ggml_type get_out_tensor_type(struct ggml_tensor * t) {
if (t->type == GGML_TYPE_F32) {
return GGML_TYPE_F32;
} else {
return GGML_TYPE_F16;
}
}
void run_merge() {
// prepare metadata
gguf_set_kv(ctx_out, base_model.ctx_gguf);
// output is forced to f16 for now
gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
// check if all lora adapters have the same tensors
// TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
if (adapters.size() > 1) {
for (size_t i = 1; i < adapters.size(); ++i) {
if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
throw std::runtime_error(err_no_subset_adapter);
}
for (auto & it : adapters[i]->tensors) {
if (adapters[0]->get_tensor(it.first) == nullptr) {
throw std::runtime_error(err_no_subset_adapter);
}
}
}
}
// mapping base tensor to out tensor (same shape with base, but different type)
// if out_tensor == nullptr, we only copy it
std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
for (auto & it : base_model.tensors) {
bool t_a = true;
bool t_b = true;
for (auto & adapter : adapters) {
t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
}
auto base_tensor = it.second;
if (!t_a && !t_b) {
// only copy
struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
ggml_set_name(cpy_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
gguf_add_tensor(ctx_out, cpy_tensor);
} else if (t_a && t_b) {
// need merging
struct ggml_tensor * out_tensor = ggml_new_tensor(
ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
ggml_set_name(out_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
gguf_add_tensor(ctx_out, out_tensor);
} else {
throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
}
}
// placeholder for the meta data
{
size_t meta_size = gguf_get_meta_size(ctx_out);
zeros(fout, meta_size);
}
// process base model tensors
size_t n_merged = 0;
for (auto & it : base_to_out_tensors) {
if (it.second != nullptr) {
merge_tensor(it.first, it.second);
n_merged++;
} else {
copy_tensor(it.first);
}
}
// write output metadata
{
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.seekp(0);
fout.write((const char *)data.data(), data.size());
}
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
}
void copy_tensor(struct ggml_tensor * base) {
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
size_t len = ggml_nbytes(base);
base_model.read_tensor_data(base->name, read_buf);
fout.write((char* )read_buf.data(), len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
}
void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
std::string name_base(base->name);
std::string name_lora_a = name_base + ".lora_a";
std::string name_lora_b = name_base + ".lora_b";
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
// context for input tensor
std::vector<struct ggml_tensor *> inp_a(adapters.size());
std::vector<struct ggml_tensor *> inp_b(adapters.size());
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
struct ggml_context * ctx = ggml_init(params);
// alloc tensors
struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
for (size_t i = 0; i < adapters.size(); ++i) {
auto t_a = adapters[i]->get_tensor(name_lora_a);
auto t_b = adapters[i]->get_tensor(name_lora_b);
inp_a[i] = ggml_dup_tensor(ctx, t_a);
inp_b[i] = ggml_dup_tensor(ctx, t_b);
}
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
// load base tensor to backend buffer
base_model.read_tensor_data(name_base, read_buf);
if (base->type != GGML_TYPE_F32) {
// optionally dequantize it
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
}
// load lora tensors to backend buffer
for (size_t i = 0; i < adapters.size(); ++i) {
adapters[i]->read_tensor_data(name_lora_a, read_buf);
ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
adapters[i]->read_tensor_data(name_lora_b, read_buf);
ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
}
// build graph
struct ggml_cgraph * gf;
{
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true,
};
struct ggml_context * ctx0 = ggml_init(params0);
gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur = inp_base;
for (size_t i = 0; i < adapters.size(); ++i) {
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
// scale
const float alpha = adapters[i]->alpha;
const float rank = (float) inp_b[i]->ne[0];
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
delta = ggml_scale(ctx0, delta, scale);
cur = ggml_add(ctx0, delta, cur);
printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
}
cur = ggml_cast(ctx0, cur, out->type);
printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
}
// compute
{
ggml_gallocr_alloc_graph(allocr, gf);
ggml_backend_cpu_set_n_threads(backend, n_threads);
ggml_backend_graph_compute(backend, gf);
}
// write data to output file
{
auto result = gf->nodes[gf->n_nodes - 1];
size_t len = ggml_nbytes(result);
if (read_buf.size() < len) {
read_buf.resize(len);
}
ggml_backend_tensor_get(result, read_buf.data(), 0, len);
fout.write((char* )read_buf.data(), len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
}
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
}
~lora_merge_ctx() {
ggml_gallocr_free(allocr);
ggml_backend_free(backend);
gguf_free(ctx_out);
ggml_free(ctx_out_ggml);
}
};
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
printf("\nexample usage:\n");
printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
printf("\nNOTE: output model is F16\n");
printf("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
g_verbose = (params.verbosity == 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
exit(EXIT_FAILURE);
}
printf("done, output file is %s\n", params.lora_outfile.c_str());
return 0;
}

View file

@ -1,5 +0,0 @@
set(TARGET llama-gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,137 +0,0 @@
#define LLAMA_API_INTERNAL
#include "grammar-parser.h"
#include "ggml.h"
#include "llama.h"
#include "unicode.h"
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <fstream>
#include <string>
#include <vector>
static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first;
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
if (cur_stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
cur_stacks = prev_stacks;
return false;
}
++pos;
}
for (const auto & stack : cur_stacks) {
if (stack.empty()) {
return true;
}
}
error_pos = pos;
error_msg = "Unexpected end of input";
return false;
}
static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
fprintf(stdout, "Input string is invalid according to the grammar.\n");
fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
fprintf(stdout, "\n");
fprintf(stdout, "Input string:\n");
fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
if (error_pos < input_str.size()) {
fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
if (error_pos+1 < input_str.size()) {
fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
}
fprintf(stdout, "\033[0m\n");
}
}
int main(int argc, char** argv) {
if (argc != 3) {
fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
return 1;
}
const std::string grammar_filename = argv[1];
const std::string input_filename = argv[2];
// Read the GBNF grammar file
FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
if (!grammar_file) {
fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
return 1;
}
std::string grammar_str;
{
std::ifstream grammar_file(grammar_filename);
GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
std::stringstream buffer;
buffer << grammar_file.rdbuf();
grammar_str = buffer.str();
}
// Parse the GBNF grammar
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
fprintf(stdout, "%s: failed to parse grammar\n", __func__);
return 1;
}
// Ensure that there is a "root" node.
if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
return 1;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
// Create the LLAMA grammar
auto grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
// Read the input file
std::string input_str;
{
std::ifstream input_file(input_filename);
GGML_ASSERT(input_file.is_open() && "Failed to open input file");
std::stringstream buffer;
buffer << input_file.rdbuf();
input_str = buffer.str();
}
// Validate the input string against the grammar
size_t error_pos;
std::string error_msg;
bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
if (is_valid) {
fprintf(stdout, "Input string is valid according to the grammar.\n");
} else {
print_error_message(input_str, error_pos, error_msg);
}
// Clean up
llama_grammar_free(grammar);
return 0;
}

View file

@ -1,15 +0,0 @@
set(TARGET llama-gguf-hash)
add_executable(${TARGET} gguf-hash.cpp)
install(TARGETS ${TARGET} RUNTIME)
# clibs dependencies
include_directories(deps/)
add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
target_link_libraries(${TARGET} PRIVATE xxhash)
add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
target_link_libraries(${TARGET} PRIVATE sha1)
add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
target_link_libraries(${TARGET} PRIVATE sha256)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,206 +0,0 @@
# llama-gguf-hash
CLI to hash GGUF files to detect difference on a per model and per tensor level.
**Command line options:**
- `--help`: display help message
- `--xxh64`: use xhash 64bit hash mode (default)
- `--sha1`: use sha1
- `--uuid`: use uuid
- `--sha256`: use sha256
- `--all`: use all hash
- `--no-layer`: exclude per layer hash
- `--uuid`: generate UUIDv5 ID
- `-c`, `--check <manifest>`: verify against a manifest
## About
While most POSIX systems already have hash checking programs like sha256sum, it
is designed to check entire files. This is not ideal for our purpose if we want
to check for consistency of the tensor data even if the metadata content of the
gguf KV store has been updated.
This program is designed to hash a gguf tensor payload on a 'per tensor layer'
in addition to a 'entire tensor model' hash. The intent is that the entire
tensor layer can be checked first but if there is any detected inconsistencies,
then the per tensor hash can be used to narrow down the specific tensor layer
that has inconsistencies.
For Maintainers:
- Detection of tensor inconsistency during development and automated tests
- This is served by xxh64 which is fast
- This is also served by having per tensor layer to assist in narrowing down
the location of the faulty tensor layer
- This is also served by sha1 which is much slower but more widely supported
For Model Creators:
- Optional consistent UUID generation based on model tensor content
- This is served by UUIDv5 which is useful for databases keys
- llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
- Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
For Model Users:
- Assurance of tensor layer integrity even if metadata was updated
- This is served by sha256 which is still considered very secure as of 2024
### Design Note
- The default behavior of this program if no arguments is provided is to hash
using xxhash's xxh32 mode because it is very fast and is primarily targeted
towards maintainers who may want to use this in automated tests.
- xxhash support xxh32 and xxh128 for 32bit hash and 128bit hash respectively
however we picked 64bit xxhash as most computers are 64bit as of 2024 and thus
would have a better affinity to calculating hash that is 64bit in size.
## Compile Example
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
make -C build clean
make -C build llama-gguf-hash VERBOSE=1
./build/bin/llama-gguf-hash test.gguf
./build/bin/llama-gguf-hash --xxh64 test.gguf
./build/bin/llama-gguf-hash --sha1 test.gguf
./build/bin/llama-gguf-hash --uuid test.gguf
./build/bin/llama-gguf-hash --sha256 test.gguf
```
## Generation and Verification Example
To generate we may use this command
```bash
./llama-gguf-hash --all test.gguf > test.gguf.manifest
```
Which would generate a manifest that looks like below, which contains multiple hash type and per tensor layer hashes as well
(This excludes UUID as that is an ID not a hash)
```bash
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1
xxh64 a0af5d700049693b test.gguf:tensor_2
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3
xxh64 1257733306b7992d test.gguf:tensor_4
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4
xxh64 d238d16ba4711e58 test.gguf:tensor_5
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6
xxh64 c22021c29854f093 test.gguf:tensor_7
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7
xxh64 936df61f5d64261f test.gguf:tensor_8
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8
xxh64 93fd20c64421c081 test.gguf:tensor_9
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9
xxh64 5a54d3aad816f302 test.gguf
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf
```
We can then use the normal check command which will by default check for the highest security strength hash and verify against that:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
Or we may explicitly ask for a faster hash like:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
xxh64 5a54d3aad816f302 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
Or maybe we want to just check that all the hash is valid:
```bash
$./llama-gguf-hash --check test.gguf.manifest --all test.gguf.manifest
manifest test.gguf.manifest sha256 sha1 xxh64
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 - Ok
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 - Ok
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 - Ok
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 - Ok
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 - Ok
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 - Ok
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 - Ok
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 - Ok
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 - Ok
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
xxh64 5a54d3aad816f302 test.gguf - Ok
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf - Ok
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
## Crypto/Hash Libraries Used
These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs)
- https://github.com/Cyan4973/xxHash
- https://github.com/clibs/sha1/
- https://github.com/jb55/sha256.c

View file

@ -1,13 +0,0 @@
{
"name": "rotate-bits",
"version": "0.1.1",
"repo": "jb55/rotate-bits.h",
"description": "rotate bits",
"keywords": ["rotl", "rotr"],
"src": ["rotate-bits.h"],
"license": "Public Domain",
"development": {
"thlorenz/tap.c": "*"
}
}

View file

@ -1,46 +0,0 @@
#ifndef __ROTATE_DEFS_H
#define __ROTATE_DEFS_H
#ifdef _MSC_VER
#include <stdlib.h>
#define ROTL32(v, n) _rotl((v), (n))
#define ROTL64(v, n) _rotl64((v), (n))
#define ROTR32(v, n) _rotr((v), (n))
#define ROTR64(v, n) _rotr64((v), (n))
#else
#include <stdint.h>
#define U8V(v) ((uint8_t)(v) & 0xFFU)
#define U16V(v) ((uint16_t)(v) & 0xFFFFU)
#define U32V(v) ((uint32_t)(v) & 0xFFFFFFFFU)
#define U64V(v) ((uint64_t)(v) & 0xFFFFFFFFFFFFFFFFU)
#define ROTL32(v, n) \
(U32V((uint32_t)(v) << (n)) | ((uint32_t)(v) >> (32 - (n))))
// tests fail if we don't have this cast...
#define ROTL64(v, n) \
(U64V((uint64_t)(v) << (n)) | ((uint64_t)(v) >> (64 - (n))))
#define ROTR32(v, n) ROTL32(v, 32 - (n))
#define ROTR64(v, n) ROTL64(v, 64 - (n))
#endif
#define ROTL8(v, n) \
(U8V((uint8_t)(v) << (n)) | ((uint8_t)(v) >> (8 - (n))))
#define ROTL16(v, n) \
(U16V((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n))))
#define ROTR8(v, n) ROTL8(v, 8 - (n))
#define ROTR16(v, n) ROTL16(v, 16 - (n))
#endif

View file

@ -1,9 +0,0 @@
{
"name": "sha1",
"version": "0.0.1",
"repo": "clibs/sha1",
"description": "sha1 hash algorithm",
"keywords": ["sha1", "hash"],
"license": "public domain",
"src": ["sha1.c", "sha1.h"]
}

View file

@ -1,295 +0,0 @@
/*
SHA-1 in C
By Steve Reid <steve@edmweb.com>
100% Public Domain
Test Vectors (from FIPS PUB 180-1)
"abc"
A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
A million repetitions of "a"
34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
*/
/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
/* #define SHA1HANDSOFF * Copies data before messing with it. */
#define SHA1HANDSOFF
#include <stdio.h>
#include <string.h>
/* for uint32_t */
#include <stdint.h>
#include "sha1.h"
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
#if BYTE_ORDER == LITTLE_ENDIAN
#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
|(rol(block->l[i],8)&0x00FF00FF))
#elif BYTE_ORDER == BIG_ENDIAN
#define blk0(i) block->l[i]
#else
#error "Endianness not defined!"
#endif
#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
^block->l[(i+2)&15]^block->l[i&15],1))
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
/* Hash a single 512-bit block. This is the core of the algorithm. */
void SHA1Transform(
uint32_t state[5],
const unsigned char buffer[64]
)
{
uint32_t a, b, c, d, e;
typedef union
{
unsigned char c[64];
uint32_t l[16];
} CHAR64LONG16;
#ifdef SHA1HANDSOFF
CHAR64LONG16 block[1]; /* use array to appear as a pointer */
memcpy(block, buffer, 64);
#else
/* The following had better never be used because it causes the
* pointer-to-const buffer to be cast into a pointer to non-const.
* And the result is written through. I threw a "const" in, hoping
* this will cause a diagnostic.
*/
CHAR64LONG16 *block = (const CHAR64LONG16 *) buffer;
#endif
/* Copy context->state[] to working vars */
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
/* 4 rounds of 20 operations each. Loop unrolled. */
R0(a, b, c, d, e, 0);
R0(e, a, b, c, d, 1);
R0(d, e, a, b, c, 2);
R0(c, d, e, a, b, 3);
R0(b, c, d, e, a, 4);
R0(a, b, c, d, e, 5);
R0(e, a, b, c, d, 6);
R0(d, e, a, b, c, 7);
R0(c, d, e, a, b, 8);
R0(b, c, d, e, a, 9);
R0(a, b, c, d, e, 10);
R0(e, a, b, c, d, 11);
R0(d, e, a, b, c, 12);
R0(c, d, e, a, b, 13);
R0(b, c, d, e, a, 14);
R0(a, b, c, d, e, 15);
R1(e, a, b, c, d, 16);
R1(d, e, a, b, c, 17);
R1(c, d, e, a, b, 18);
R1(b, c, d, e, a, 19);
R2(a, b, c, d, e, 20);
R2(e, a, b, c, d, 21);
R2(d, e, a, b, c, 22);
R2(c, d, e, a, b, 23);
R2(b, c, d, e, a, 24);
R2(a, b, c, d, e, 25);
R2(e, a, b, c, d, 26);
R2(d, e, a, b, c, 27);
R2(c, d, e, a, b, 28);
R2(b, c, d, e, a, 29);
R2(a, b, c, d, e, 30);
R2(e, a, b, c, d, 31);
R2(d, e, a, b, c, 32);
R2(c, d, e, a, b, 33);
R2(b, c, d, e, a, 34);
R2(a, b, c, d, e, 35);
R2(e, a, b, c, d, 36);
R2(d, e, a, b, c, 37);
R2(c, d, e, a, b, 38);
R2(b, c, d, e, a, 39);
R3(a, b, c, d, e, 40);
R3(e, a, b, c, d, 41);
R3(d, e, a, b, c, 42);
R3(c, d, e, a, b, 43);
R3(b, c, d, e, a, 44);
R3(a, b, c, d, e, 45);
R3(e, a, b, c, d, 46);
R3(d, e, a, b, c, 47);
R3(c, d, e, a, b, 48);
R3(b, c, d, e, a, 49);
R3(a, b, c, d, e, 50);
R3(e, a, b, c, d, 51);
R3(d, e, a, b, c, 52);
R3(c, d, e, a, b, 53);
R3(b, c, d, e, a, 54);
R3(a, b, c, d, e, 55);
R3(e, a, b, c, d, 56);
R3(d, e, a, b, c, 57);
R3(c, d, e, a, b, 58);
R3(b, c, d, e, a, 59);
R4(a, b, c, d, e, 60);
R4(e, a, b, c, d, 61);
R4(d, e, a, b, c, 62);
R4(c, d, e, a, b, 63);
R4(b, c, d, e, a, 64);
R4(a, b, c, d, e, 65);
R4(e, a, b, c, d, 66);
R4(d, e, a, b, c, 67);
R4(c, d, e, a, b, 68);
R4(b, c, d, e, a, 69);
R4(a, b, c, d, e, 70);
R4(e, a, b, c, d, 71);
R4(d, e, a, b, c, 72);
R4(c, d, e, a, b, 73);
R4(b, c, d, e, a, 74);
R4(a, b, c, d, e, 75);
R4(e, a, b, c, d, 76);
R4(d, e, a, b, c, 77);
R4(c, d, e, a, b, 78);
R4(b, c, d, e, a, 79);
/* Add the working vars back into context.state[] */
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
/* Wipe variables */
a = b = c = d = e = 0;
#ifdef SHA1HANDSOFF
memset(block, '\0', sizeof(block));
#endif
}
/* SHA1Init - Initialize new context */
void SHA1Init(
SHA1_CTX * context
)
{
/* SHA1 initialization constants */
context->state[0] = 0x67452301;
context->state[1] = 0xEFCDAB89;
context->state[2] = 0x98BADCFE;
context->state[3] = 0x10325476;
context->state[4] = 0xC3D2E1F0;
context->count[0] = context->count[1] = 0;
}
/* Run your data through this. */
void SHA1Update(
SHA1_CTX * context,
const unsigned char *data,
uint32_t len
)
{
uint32_t i;
uint32_t j;
j = context->count[0];
if ((context->count[0] += len << 3) < j)
context->count[1]++;
context->count[1] += (len >> 29);
j = (j >> 3) & 63;
if ((j + len) > 63)
{
memcpy(&context->buffer[j], data, (i = 64 - j));
SHA1Transform(context->state, context->buffer);
for (; i + 63 < len; i += 64)
{
SHA1Transform(context->state, &data[i]);
}
j = 0;
}
else
i = 0;
memcpy(&context->buffer[j], &data[i], len - i);
}
/* Add padding and return the message digest. */
void SHA1Final(
unsigned char digest[20],
SHA1_CTX * context
)
{
unsigned i;
unsigned char finalcount[8];
unsigned char c;
#if 0 /* untested "improvement" by DHR */
/* Convert context->count to a sequence of bytes
* in finalcount. Second element first, but
* big-endian order within element.
* But we do it all backwards.
*/
unsigned char *fcp = &finalcount[8];
for (i = 0; i < 2; i++)
{
uint32_t t = context->count[i];
int j;
for (j = 0; j < 4; t >>= 8, j++)
*--fcp = (unsigned char) t}
#else
for (i = 0; i < 8; i++)
{
finalcount[i] = (unsigned char) ((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255); /* Endian independent */
}
#endif
c = 0200;
SHA1Update(context, &c, 1);
while ((context->count[0] & 504) != 448)
{
c = 0000;
SHA1Update(context, &c, 1);
}
SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
for (i = 0; i < 20; i++)
{
digest[i] = (unsigned char)
((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
}
/* Wipe variables */
memset(context, '\0', sizeof(*context));
memset(&finalcount, '\0', sizeof(finalcount));
}
void SHA1(
char *hash_out,
const char *str,
uint32_t len)
{
SHA1_CTX ctx;
unsigned int ii;
SHA1Init(&ctx);
for (ii=0; ii<len; ii+=1)
SHA1Update(&ctx, (const unsigned char*)str + ii, 1);
SHA1Final((unsigned char *)hash_out, &ctx);
}

View file

@ -1,52 +0,0 @@
#ifndef SHA1_H
#define SHA1_H
/*
SHA-1 in C
By Steve Reid <steve@edmweb.com>
100% Public Domain
*/
#include "stdint.h"
#if defined(__cplusplus)
extern "C" {
#endif
typedef struct
{
uint32_t state[5];
uint32_t count[2];
unsigned char buffer[64];
} SHA1_CTX;
void SHA1Transform(
uint32_t state[5],
const unsigned char buffer[64]
);
void SHA1Init(
SHA1_CTX * context
);
void SHA1Update(
SHA1_CTX * context,
const unsigned char *data,
uint32_t len
);
void SHA1Final(
unsigned char digest[20],
SHA1_CTX * context
);
void SHA1(
char *hash_out,
const char *str,
uint32_t len);
#if defined(__cplusplus)
}
#endif
#endif /* SHA1_H */

View file

@ -1,15 +0,0 @@
{
"name": "sha256",
"version": "0.0.2",
"repo": "jb55/sha256.c",
"description": "sha256 in c",
"keywords": ["sha256", "sha2"],
"src": ["sha256.c", "sha256.h"],
"dependencies": {
"jb55/rotate-bits.h": "0.1.1"
},
"development": {
"thlorenz/tap.c": "*"
}
}

View file

@ -1,221 +0,0 @@
/* Crypto/Sha256.c -- SHA-256 Hash
2010-06-11 : Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "rotate-bits/rotate-bits.h"
#include "sha256.h"
/* define it for speed optimization */
#define _SHA256_UNROLL
#define _SHA256_UNROLL2
void
sha256_init(sha256_t *p)
{
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
p->state[3] = 0xa54ff53a;
p->state[4] = 0x510e527f;
p->state[5] = 0x9b05688c;
p->state[6] = 0x1f83d9ab;
p->state[7] = 0x5be0cd19;
p->count = 0;
}
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x,13) ^ ROTR32(x, 22))
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x,11) ^ ROTR32(x, 25))
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x,18) ^ (x >> 3))
#define s1(x) (ROTR32(x,17) ^ ROTR32(x,19) ^ (x >> 10))
#define blk0(i) (W[i] = data[i])
#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
#define a(i) T[(0-(i))&7]
#define b(i) T[(1-(i))&7]
#define c(i) T[(2-(i))&7]
#define d(i) T[(3-(i))&7]
#define e(i) T[(4-(i))&7]
#define f(i) T[(5-(i))&7]
#define g(i) T[(6-(i))&7]
#define h(i) T[(7-(i))&7]
#ifdef _SHA256_UNROLL2
#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
d += h; h += S0(a) + Maj(a, b, c)
#define RX_8(i) \
R(a,b,c,d,e,f,g,h, i); \
R(h,a,b,c,d,e,f,g, (i+1)); \
R(g,h,a,b,c,d,e,f, (i+2)); \
R(f,g,h,a,b,c,d,e, (i+3)); \
R(e,f,g,h,a,b,c,d, (i+4)); \
R(d,e,f,g,h,a,b,c, (i+5)); \
R(c,d,e,f,g,h,a,b, (i+6)); \
R(b,c,d,e,f,g,h,a, (i+7))
#else
#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
#ifdef _SHA256_UNROLL
#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
#endif
#endif
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static void
sha256_transform(uint32_t *state, const uint32_t *data)
{
uint32_t W[16] = {0};
unsigned j;
#ifdef _SHA256_UNROLL2
uint32_t a,b,c,d,e,f,g,h;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
#else
uint32_t T[8];
for (j = 0; j < 8; j++)
T[j] = state[j];
#endif
for (j = 0; j < 64; j += 16)
{
#if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
RX_8(0); RX_8(8);
#else
unsigned i;
for (i = 0; i < 16; i++) { R(i); }
#endif
}
#ifdef _SHA256_UNROLL2
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
#else
for (j = 0; j < 8; j++)
state[j] += T[j];
#endif
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
/* memset(T, 0, sizeof(T)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
static void
sha256_write_byte_block(sha256_t *p)
{
uint32_t data32[16];
unsigned i;
for (i = 0; i < 16; i++)
data32[i] =
((uint32_t)(p->buffer[i * 4 ]) << 24) +
((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
((uint32_t)(p->buffer[i * 4 + 2]) << 8) +
((uint32_t)(p->buffer[i * 4 + 3]));
sha256_transform(p->state, data32);
}
void
sha256_hash(unsigned char *buf, const unsigned char *data, size_t size)
{
sha256_t hash;
sha256_init(&hash);
sha256_update(&hash, data, size);
sha256_final(&hash, buf);
}
void
sha256_update(sha256_t *p, const unsigned char *data, size_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
sha256_write_byte_block(p);
}
}
}
void
sha256_final(sha256_t *p, unsigned char *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
sha256_write_byte_block(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (unsigned char)(lenInBits >> 56);
lenInBits <<= 8;
}
sha256_write_byte_block(p);
for (i = 0; i < 8; i++)
{
*digest++ = (unsigned char)(p->state[i] >> 24);
*digest++ = (unsigned char)(p->state[i] >> 16);
*digest++ = (unsigned char)(p->state[i] >> 8);
*digest++ = (unsigned char)(p->state[i]);
}
sha256_init(p);
}

View file

@ -1,24 +0,0 @@
/* Sha256.h -- SHA-256 Hash
2010-06-11 : Igor Pavlov : Public domain */
#ifndef __CRYPTO_SHA256_H
#define __CRYPTO_SHA256_H
#include <stdlib.h>
#include <stdint.h>
#define SHA256_DIGEST_SIZE 32
typedef struct sha256_t
{
uint32_t state[8];
uint64_t count;
unsigned char buffer[64];
} sha256_t;
void sha256_init(sha256_t *p);
void sha256_update(sha256_t *p, const unsigned char *data, size_t size);
void sha256_final(sha256_t *p, unsigned char *digest);
void sha256_hash(unsigned char *buf, const unsigned char *data, size_t size);
#endif

View file

@ -1,12 +0,0 @@
{
"name": "xxhash",
"version": "0.8.2",
"repo": "Cyan4973/xxhash",
"description": "Extremely fast non-cryptographic hash algorithm",
"keywords": ["xxhash", "hashing"],
"license": "BSD-2-Clause",
"src": [
"xxhash.c",
"xxhash.h"
]
}

View file

@ -1,42 +0,0 @@
/*
* xxHash - Extremely Fast Hash algorithm
* Copyright (C) 2012-2023 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You can contact the author at:
* - xxHash homepage: https://www.xxhash.com
* - xxHash source repository: https://github.com/Cyan4973/xxHash
*/
/*
* xxhash.c instantiates functions defined in xxhash.h
*/
#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
#define XXH_IMPLEMENTATION /* access definitions */
#include "xxhash.h"

File diff suppressed because it is too large Load diff

View file

@ -1,693 +0,0 @@
#include "ggml.h"
#include <cstdlib> /* abort() */
#include <cstddef>
#include <cstdio>
#include <string>
#include <stdexcept>
#include <algorithm>
#include <cstring>
#include <sstream>
#include <fstream>
#ifdef __cplusplus
extern "C" {
#endif
#include "xxhash/xxhash.h"
#include "sha1/sha1.h"
#include "sha256/sha256.h"
#ifdef __cplusplus
}
#endif
// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
#define HASH_TYPE_SHA256_STR "sha256"
#define HASH_TYPE_SHA1_STR "sha1"
#define HASH_TYPE_XXH64_STR "xxh64"
#define HASH_TYPE_UUID_STR "uuid"
typedef enum {
HASH_EXIT_SUCCESS = 0, // All hash has been generated or validated
HASH_EXIT_FAILURE = 1, // Generic Failure
HASH_EXIT_MISMATCH = 2, // Hash mismatched during validation
HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash attempted validation but missing entry in manifest
HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4, // Manifest is present, but we do not know any hash format within it
HASH_EXIT_MANIFEST_FILE_ERROR = 5 // Manifest is either missing or not a known format
} hash_exit_code_t;
typedef enum {
HASH_MANIFEST_NOT_FOUND,
HASH_MANIFEST_MISMATCH,
HASH_MANIFEST_OK,
} hash_manifest_result_t;
struct hash_params {
std::string input;
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
bool no_layer = false;
bool manifest_is_usable = false;
std::string manifest_file;
};
struct manifest_check_params {
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
};
static char const * hash_manifest_result_to_str(hash_manifest_result_t value) {
switch (value) {
case HASH_MANIFEST_NOT_FOUND: return "Not Found";
case HASH_MANIFEST_MISMATCH: return "Mismatch";
case HASH_MANIFEST_OK: return "Ok";
}
return "?";
}
static char const * hash_exit_code_to_str(hash_exit_code_t value) {
switch (value) {
case HASH_EXIT_SUCCESS: return "Success";
case HASH_EXIT_FAILURE: return "Failure";
case HASH_EXIT_MISMATCH: return "Mismatch";
case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry";
case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash";
case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error";
}
return "?";
}
static void hash_print_usage(const char * executable) {
const hash_params default_params;
printf("\n");
printf("usage: %s [options] GGUF_IN\n", executable);
printf("\n");
printf("Hash a GGUF file");
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --xxh64 use xxh64 hash\n");
printf(" --sha1 use sha1 hash\n");
printf(" --sha256 use sha256 hash\n");
printf(" --all use all hash\n");
printf(" --no-layer exclude per layer hash\n");
printf(" --uuid generate UUIDv5 ID\n");
printf(" -c, --check <manifest> verify against a manifest\n");
printf("\n");
}
static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
std::string arg;
bool invalid_param = false;
const std::string arg_prefix = "--";
int arg_idx = 1;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
arg = argv[arg_idx];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
bool arg_found = false;
if (arg == "-h" || arg == "--help") {
hash_print_usage(argv[0]);
exit(0);
}
if (arg == "--xxh64") {
arg_found = true;
params.xxh64 = true;
}
if (arg == "--sha1") {
arg_found = true;
params.sha1 = true;
}
if (arg == "--uuid") {
arg_found = true;
params.uuid = true;
}
if (arg == "--sha256") {
arg_found = true;
params.sha256 = true;
}
if (arg == "--all") {
arg_found = true;
params.sha256 = true;
params.sha1 = true;
params.xxh64 = true;
}
if (arg == "--no-layer") {
arg_found = true;
params.no_layer = true;
}
if (arg == "-c" || arg == "--check") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
params.manifest_file = argv[arg_idx];
}
if (!arg_found) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument:" + arg);
}
if (argc - arg_idx < 1) {
throw std::invalid_argument("error: bad arguments");
}
params.input = argv[arg_idx++];
}
static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
bool result = true;
try {
hash_params_parse_ex(argc, argv, params);
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
hash_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return result;
}
static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) {
if (manifest_file.empty()) {
return false;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return false;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
// hash_type_str hash_str tensor_name
// e.g. 'xxh64 f66e9cd66a4396a0 test.gguf:tensor_0'
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
if (line_stream >> file_hash_type) {
if (file_hash_type == HASH_TYPE_SHA256_STR) {
manifest_check.sha256 = true;
} else if (file_hash_type == HASH_TYPE_SHA1_STR) {
manifest_check.sha1 = true;
} else if (file_hash_type == HASH_TYPE_XXH64_STR) {
manifest_check.xxh64 = true;
} else if (file_hash_type == HASH_TYPE_UUID_STR) {
manifest_check.uuid = true;
}
}
}
return true;
}
static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) {
if (manifest_file.empty()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
std::string file_hash;
std::string file_tensor_name;
if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) {
// Line parsed. Check hash validity
if (file_hash_type != hash_type_str) {
continue;
}
if (file_tensor_name != tensor_name) {
continue;
}
return (file_hash == hash_str) ? HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH;
}
}
return HASH_MANIFEST_NOT_FOUND;
}
static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
// Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
// Assumes that digest was processed correctly with the expected namespace
for (int i = 0; i < 16; i++) {
uuid[i] = sha1_digest[i];
}
// Set bits corresponding to UUID ver 5
uuid[ 6] &= ~(0xF << 4);
uuid[ 6] |= (5 << 4);
// Set bits corresponding to UUID variant 0b10XX
uuid[ 8] &= ~(0xc << 4);
uuid[ 8] |= (0x8 << 4);
}
static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
const std::string & fname = hash_params.input;
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
// xxh64 init
XXH64_state_t* xxh64_model_hash_state = NULL;
if (hash_params.xxh64) {
xxh64_model_hash_state = XXH64_createState();
if (xxh64_model_hash_state==NULL) {
abort();
}
XXH64_hash_t const seed = 0;
if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
abort();
}
}
// sha1 init
SHA1_CTX sha1_model_hash_ctx;
if (hash_params.sha1) {
SHA1Init(&sha1_model_hash_ctx);
}
// sha256 init
sha256_t sha256_model_hash_ctx;
if (hash_params.sha256) {
sha256_init(&sha256_model_hash_ctx);
}
// sha1 for uuid init
SHA1_CTX sha1_for_uuid_ctx;
if (hash_params.uuid) {
unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
SHA1Init(&sha1_for_uuid_ctx);
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
}
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
const int n_tensors = gguf_get_n_tensors(ctx);
bool tensor_layer_in_manifest = false;
bool model_in_manifest = false;
bool tensor_layer_has_mismatch = false;
bool model_has_mismatch = false;
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
auto n_bytes = ggml_nbytes(cur);
auto *raw_data = cur->data;
const std::string tensor_layer_name = fname + ":" + name;
if (hash_params.xxh64) {
if (!hash_params.no_layer) {
// Per Layer Hash
XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
}
if (hash_params.sha1) {
if (!hash_params.no_layer) {
// Per Layer Hash
char result[21]; // sha1 outputs 20 bytes
SHA1( result, (const char *)raw_data, n_bytes);
char hex_result[41] = {0};
for (int offset = 0; offset < 20; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.sha256) {
if (!hash_params.no_layer) {
// Per Layer Hash
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.uuid) {
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes);
}
}
if (hash_params.xxh64) {
XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha1) {
unsigned char result[21];
SHA1Final(result, &sha1_model_hash_ctx);
char hex_result[41];
for (int offset = 0; offset < 20; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha256) {
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_final( &sha256_model_hash_ctx, result);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str());
}
}
if (hash_params.uuid) {
unsigned char result[21];
SHA1Final(result, &sha1_for_uuid_ctx);
unsigned char uuid[16];
generate_uuidv5(result, uuid);
char string_buffer[37] = {0};
snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid[0], uuid[1], uuid[2], uuid[3],
uuid[4], uuid[5], uuid[6], uuid[7],
uuid[8], uuid[9], uuid[10], uuid[11],
uuid[12], uuid[13], uuid[14], uuid[15]);
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, string_buffer, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str());
}
}
ggml_free(ctx_data);
gguf_free(ctx);
if (hash_params.manifest_is_usable) {
// In hash verification mode
if (!model_in_manifest) {
// model missing in manifest?
// Check tensor layer...
if (!tensor_layer_in_manifest) {
// Still missing? Maybe we are reading the wrong manifest.
return HASH_EXIT_MANIFEST_MISSING_ENTRY;
}
if (tensor_layer_has_mismatch) {
// Per tensor check found error
return HASH_EXIT_FAILURE;
}
// All per tensor layer checks passed? Sounds good enough.
return HASH_EXIT_SUCCESS;
}
// Overall model check passed, but let's check per layer just in case
// If missing, we don't care too much as the overall model checked
if (tensor_layer_in_manifest && tensor_layer_has_mismatch) {
return HASH_EXIT_FAILURE;
}
if (model_has_mismatch) {
// model has failed hash somewhere in the model
return HASH_EXIT_FAILURE;
}
// All checks appears to be fine
return HASH_EXIT_SUCCESS;
}
// In hash generation mode
return HASH_EXIT_SUCCESS;
}
int main(int argc, const char ** argv) {
hash_params params;
manifest_check_params manifest_check;
hash_params_parse(argc, argv, params);
if (!params.manifest_file.empty()) {
if (!manifest_type(params.manifest_file, manifest_check)) {
printf("ERROR cannot open manifest %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_FILE_ERROR;
}
if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
}
printf("manifest %s", params.manifest_file.c_str());
if (manifest_check.sha256) {
printf(" sha256");
}
if (manifest_check.sha1) {
printf(" sha1");
}
if (manifest_check.xxh64) {
printf(" xxh64");
}
if (manifest_check.uuid) {
printf(" uuid");
}
printf("\n");
// Autoselect the highest security hash if manifest is provided but
// the user has not specifically defined the hash they care about
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
// User has not selected a specific value, pick most secure hash
if (manifest_check.sha256) {
params.sha256 = true;
} else if (manifest_check.sha1) {
params.sha1 = true;
} else if (manifest_check.xxh64) {
params.xxh64 = true;
} else if (manifest_check.uuid) {
params.uuid = true;
}
}
params.manifest_is_usable = true;
}
// By default if no swich argument provided, assume xxh64
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
params.xxh64 = true;
}
hash_exit_code_t exit_code = gguf_hash(params);
if (params.manifest_is_usable) {
printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code));
}
return exit_code;
}

View file

@ -1,5 +0,0 @@
set(TARGET llama-gguf-split)
add_executable(${TARGET} gguf-split.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,10 +0,0 @@
## GGUF split Example
CLI to split / merge GGUF files.
**Command line options:**
- `--split`: split GGUF to multiple GGUF, default operation.
- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`.
- `--split-max-tensors`: maximum tensors in each split: default(128)
- `--merge`: merge multiple GGUF to a single GGUF.

View file

@ -1,564 +0,0 @@
#include "llama.h"
#include "common.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>
#include <stdio.h>
#include <string.h>
#include <climits>
#include <stdexcept>
#if defined(_WIN32)
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif
enum split_operation : uint8_t {
SPLIT_OP_SPLIT,
SPLIT_OP_MERGE,
};
struct split_params {
split_operation operation = SPLIT_OP_SPLIT;
size_t n_bytes_split = 0;
int n_split_tensors = 128;
std::string input;
std::string output;
bool no_tensor_first_split = false;
bool dry_run = false;
};
static void split_print_usage(const char * executable) {
const split_params default_params;
printf("\n");
printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable);
printf("\n");
printf("Apply a GGUF operation on IN to OUT.");
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n");
printf(" --split split GGUF to multiple GGUF (enabled by default)\n");
printf(" --merge merge multiple GGUF to a single GGUF\n");
printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
printf(" --split-max-size N(M|G) max size per split\n");
printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
printf("\n");
}
// return convert string, for example "128M" or "4G" to number of bytes
static size_t split_str_to_n_bytes(std::string str) {
size_t n_bytes = 0;
int n;
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1000 * 1000; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}
if (n <= 0) {
throw std::invalid_argument("error: size must be a positive value");
}
return n_bytes;
}
static void split_params_parse_ex(int argc, const char ** argv, split_params & params) {
std::string arg;
const std::string arg_prefix = "--";
bool invalid_param = false;
int arg_idx = 1;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
arg = argv[arg_idx];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
bool arg_found = false;
bool is_op_set = false;
bool is_mode_set = false;
if (arg == "-h" || arg == "--help") {
split_print_usage(argv[0]);
exit(0);
}
if (arg == "--version") {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
exit(0);
}
if (arg == "--dry-run") {
arg_found = true;
params.dry_run = true;
}
if (arg == "--no-tensor-first-split") {
arg_found = true;
params.no_tensor_first_split = true;
}
if (is_op_set) {
throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
}
if (arg == "--merge") {
arg_found = true;
is_op_set = true;
params.operation = SPLIT_OP_MERGE;
}
if (arg == "--split") {
arg_found = true;
is_op_set = true;
params.operation = SPLIT_OP_SPLIT;
}
if (is_mode_set) {
throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
}
if (arg == "--split-max-tensors") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
is_mode_set = true;
params.n_split_tensors = atoi(argv[arg_idx]);
}
if (arg == "--split-max-size") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
is_mode_set = true;
params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
}
if (!arg_found) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}
if (argc - arg_idx < 2) {
throw std::invalid_argument("error: bad arguments");
}
params.input = argv[arg_idx++];
params.output = argv[arg_idx++];
}
static bool split_params_parse(int argc, const char ** argv, split_params & params) {
bool result = true;
try {
split_params_parse_ex(argc, argv, params);
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
split_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return result;
}
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
file.write(&zero, 1);
}
}
struct split_strategy {
const split_params params;
std::ifstream & f_input;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_meta = NULL;
const int n_tensors;
// one ctx_out per one output file
std::vector<struct gguf_context *> ctx_outs;
// temporary buffer for reading in tensor data
std::vector<uint8_t> read_buf;
split_strategy(const split_params & params,
std::ifstream & f_input,
struct gguf_context * ctx_gguf,
struct ggml_context * ctx_meta) :
params(params),
f_input(f_input),
ctx_gguf(ctx_gguf),
ctx_meta(ctx_meta),
n_tensors(gguf_get_n_tensors(ctx_gguf)) {
// because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
int i_split = -1;
struct gguf_context * ctx_out = NULL;
auto new_ctx_out = [&](bool allow_no_tensors) {
i_split++;
if (ctx_out != NULL) {
if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
exit(EXIT_FAILURE);
}
ctx_outs.push_back(ctx_out);
}
ctx_out = gguf_init_empty();
// Save all metadata in first split only
if (i_split == 0) {
gguf_set_kv(ctx_out, ctx_gguf);
}
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
};
// initialize ctx_out for the first split
new_ctx_out(false);
// skip first split if no_tensor_first_split is set
if (params.no_tensor_first_split) {
new_ctx_out(true);
}
// process tensors one by one
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
for (int i = 0; i < n_tensors; ++i) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
// calculate the "imaginary" size = the current size + next tensor size
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
size_t next_tensors_size = curr_tensors_size + n_bytes;
if (should_split(i, next_tensors_size)) {
new_ctx_out(false);
curr_tensors_size = n_bytes;
} else {
curr_tensors_size = next_tensors_size;
}
gguf_add_tensor(ctx_out, t);
}
// push the last ctx_out
ctx_outs.push_back(ctx_out);
// set the correct n_split for all ctx_out
for (auto & ctx : ctx_outs) {
gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size());
}
}
~split_strategy() {
for (auto & ctx_out : ctx_outs) {
gguf_free(ctx_out);
}
}
bool should_split(int i_tensor, size_t next_size) {
if (params.n_bytes_split > 0) {
// split by max size per file
return next_size > params.n_bytes_split;
} else {
// split by number of tensors per file
return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
}
}
void print_info() {
printf("n_split: %ld\n", ctx_outs.size());
int i_split = 0;
for (auto & ctx_out : ctx_outs) {
// re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
size_t total_size = gguf_get_meta_size(ctx_out);
for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
total_size += ggml_nbytes(t);
}
total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++;
}
}
void write() {
int i_split = 0;
int n_split = ctx_outs.size();
for (auto & ctx_out : ctx_outs) {
// construct file path
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
// open the output file
printf("Writing file %s ... ", split_path);
fflush(stdout);
std::ofstream fout = std::ofstream(split_path, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
// write metadata
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.write((const char *)data.data(), data.size());
// write tensors
for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
// read tensor meta and prepare buffer
const char * t_name = gguf_get_tensor_name(ctx_out, i);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
auto n_bytes = ggml_nbytes(t);
read_buf.resize(n_bytes);
// calculate offset
auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
// copy tensor from input to output file
copy_file_to_file(f_input, fout, offset, n_bytes);
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
}
printf("done\n");
// close the file
fout.close();
i_split++;
}
}
void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) {
// TODO: detect OS and use copy_file_range() here for better performance
if (read_buf.size() < len) {
read_buf.resize(len);
}
f_in.seekg(in_offset);
f_in.read((char *)read_buf.data(), len);
f_out.write((const char *)read_buf.data(), len);
}
};
static void gguf_split(const split_params & split_params) {
struct ggml_context * ctx_meta = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
};
std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
}
auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
}
// prepare the strategy
split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
int n_split = strategy.ctx_outs.size();
strategy.print_info();
if (!split_params.dry_run) {
// write all output splits
strategy.write();
}
// done, clean up
gguf_free(ctx_gguf);
f_input.close();
fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n",
__func__, n_split, strategy.n_tensors);
}
static void gguf_merge(const split_params & split_params) {
fprintf(stderr, "%s: %s -> %s\n",
__func__, split_params.input.c_str(),
split_params.output.c_str());
int n_split = 1;
int total_tensors = 0;
auto * ctx_out = gguf_init_empty();
std::ofstream fout(split_params.output.c_str(), std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
std::vector<uint8_t> read_data;
std::vector<ggml_context *> ctx_metas;
std::vector<gguf_context *> ctx_ggufs;
char split_path[PATH_MAX] = {0};
strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
char split_prefix[PATH_MAX] = {0};
// First pass to find KV and tensors metadata
for (int i_split = 0; i_split < n_split; i_split++) {
struct ggml_context * ctx_meta = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
};
if (i_split > 0) {
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
}
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
auto * ctx_gguf = gguf_init_from_file(split_path, params);
if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
}
ctx_ggufs.push_back(ctx_gguf);
ctx_metas.push_back(ctx_meta);
if (i_split == 0) {
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split < 0) {
fprintf(stderr,
"\n%s: input file does not contain %s metadata\n",
__func__,
LLM_KV_SPLIT_COUNT);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
if (n_split < 1) {
fprintf(stderr,
"\n%s: input file does not contain a valid split count %d\n",
__func__,
n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
// Verify the file naming and extract split_prefix
if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d"
" n_split=%d\n", __func__,
split_path, i_split, n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
// Do not trigger merge if we try to merge again the output
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
// Set metadata from the first split
gguf_set_kv(ctx_out, ctx_gguf);
}
auto n_tensors = gguf_get_n_tensors(ctx_gguf);
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
gguf_add_tensor(ctx_out, t);
}
total_tensors += n_tensors;
fprintf(stderr, "\033[3Ddone\n");
}
// placeholder for the meta data
{
auto meta_size = gguf_get_meta_size(ctx_out);
::zeros(fout, meta_size);
}
// Write tensors data
for (int i_split = 0; i_split < n_split; i_split++) {
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
std::ifstream f_input(split_path, std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
gguf_free(ctx_ggufs[i]);
ggml_free(ctx_metas[i]);
}
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
auto * ctx_gguf = ctx_ggufs[i_split];
auto * ctx_meta = ctx_metas[i_split];
auto n_tensors = gguf_get_n_tensors(ctx_gguf);
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
auto n_bytes = ggml_nbytes(t);
if (read_data.size() < n_bytes) {
read_data.resize(n_bytes);
}
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
f_input.seekg(offset);
f_input.read((char *)read_data.data(), n_bytes);
// write tensor data + padding
fout.write((const char *)read_data.data(), n_bytes);
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
}
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
f_input.close();
fprintf(stderr, "\033[3Ddone\n");
}
{
// go back to beginning of file and write the updated metadata
fout.seekp(0);
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.write((const char *)data.data(), data.size());
fout.close();
gguf_free(ctx_out);
}
fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
__func__, split_params.output.c_str(), n_split, total_tensors);
}
int main(int argc, const char ** argv) {
split_params params;
split_params_parse(argc, argv, params);
switch (params.operation) {
case SPLIT_OP_SPLIT: gguf_split(params);
break;
case SPLIT_OP_MERGE: gguf_merge(params);
break;
default: split_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return 0;
}

View file

@ -1,89 +0,0 @@
#!/bin/bash
set -eu
if [ $# -lt 1 ]
then
echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
echo "example: $0 ../../build/bin ../../tmp"
exit 1
fi
if [ $# -gt 1 ]
then
TMP_DIR=$2
else
TMP_DIR=/tmp
fi
set -x
SPLIT=$1/llama-gguf-split
MAIN=$1/llama-cli
WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../)
mkdir -p "$WORK_PATH"
# Clean up in case of previously failed test
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
# 1. Get a model
(
cd $WORK_PATH
"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
)
echo PASS
# 2. Split with max tensors strategy
$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
echo PASS
echo
# 2b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS
echo
# 3. Merge
$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
echo PASS
echo
# 3b. Test the merged model is loading properly
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS
echo
# 4. Split with no tensors in the first split
$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
echo PASS
echo
# 4b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS
echo
# 5. Merge
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
#echo PASS
#echo
# 5b. Test the merged model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS
#echo
# 6. Split with size strategy
$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
echo PASS
echo
# 6b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS
echo
# Clean up
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf

View file

@ -1,5 +0,0 @@
set(TARGET llama-gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,261 +0,0 @@
#include "ggml.h"
#include <cstdio>
#include <cinttypes>
#include <string>
#include <sstream>
#include <fstream>
#include <vector>
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
template <typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
static bool gguf_ex_write(const std::string & fname) {
struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
gguf_set_val_u64 (ctx, "some.parameter.uint64", 0x123456789abcdef0ull);
gguf_set_val_i64 (ctx, "some.parameter.int64", -0x123456789abcdef1ll);
gguf_set_val_f64 (ctx, "some.parameter.float64", 0.1234567890123456789);
gguf_set_val_bool(ctx, "some.parameter.bool", true);
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
struct ggml_init_params params = {
/*.mem_size =*/ 128ull*1024ull*1024ull,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx_data = ggml_init(params);
const int n_tensors = 10;
// tensor infos
for (int i = 0; i < n_tensors; ++i) {
const std::string name = "tensor_" + to_string(i);
int64_t ne[GGML_MAX_DIMS] = { 1 };
int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
for (int j = 0; j < n_dims; ++j) {
ne[j] = rand() % 10 + 1;
}
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
ggml_set_name(cur, name.c_str());
{
float * data = (float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
data[j] = 100 + i;
}
}
gguf_add_tensor(ctx, cur);
}
gguf_write_to_file(ctx, fname.c_str(), false);
printf("%s: wrote file '%s;\n", __func__, fname.c_str());
ggml_free(ctx_data);
gguf_free(ctx);
return true;
}
// just read tensor info
static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ NULL,
};
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
if (!ctx) {
fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
return false;
}
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
// find kv string
{
const char * findkey = "some.parameter.string";
const int keyidx = gguf_find_key(ctx, findkey);
if (keyidx == -1) {
printf("%s: find key: %s not found.\n", __func__, findkey);
} else {
const char * key_value = gguf_get_val_str(ctx, keyidx);
printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
}
}
// tensor info
{
const int n_tensors = gguf_get_n_tensors(ctx);
printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
gguf_free(ctx);
return true;
}
// read and create ggml_context containing the tensors and their data
static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
// tensor info
{
const int n_tensors = gguf_get_n_tensors(ctx);
printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
// data
{
const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) {
printf("%s: reading tensor %d data\n", __func__, i);
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
// print first 10 elements
const float * data = (const float *) cur->data;
printf("%s data[:10] : ", name);
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
printf("%f ", data[j]);
}
printf("\n\n");
// check data
if (check_data) {
const float * data = (const float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
gguf_free(ctx);
return false;
}
}
}
}
}
printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
ggml_free(ctx_data);
gguf_free(ctx);
return true;
}
int main(int argc, char ** argv) {
if (argc < 3) {
printf("usage: %s data.gguf r|w [n]\n", argv[0]);
printf("r: read data.gguf file\n");
printf("w: write data.gguf file\n");
printf("n: no check of tensor data\n");
return -1;
}
bool check_data = true;
if (argc == 4) {
check_data = false;
}
const std::string fname(argv[1]);
const std::string mode (argv[2]);
GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
if (mode == "w") {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
} else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
}
return 0;
}

View file

@ -1,5 +0,0 @@
set(TARGET llama-gritlm)
add_executable(${TARGET} gritlm.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,62 +0,0 @@
## Generative Representational Instruction Tuning (GRIT) Example
[gritlm] a model which can generate embeddings as well as "normal" text
generation depending on the instructions in the prompt.
* Paper: https://arxiv.org/pdf/2402.09906.pdf
### Retrieval-Augmented Generation (RAG) use case
One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
that we take documents that we want to use as context, to ground the large
language model (LLM), and we create token embeddings for them. We then store
these token embeddings in a vector database.
When we perform a query, prompt the LLM, we will first create token embeddings
for the query and then search the vector database to retrieve the most
similar vectors, and return those documents so they can be passed to the LLM as
context. Then the query and the context will be passed to the LLM which will
have to _again_ create token embeddings for the query. But because gritlm is used
the first query can be cached and the second query tokenization generation does
not have to be performed at all.
### Running the example
Download a Grit model:
```console
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
```
Run the example using the downloaded model:
```console
$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
Oh, brave adventurer, who dared to climb
The lofty peak of Mt. Fuji in the night,
When shadows lurk and ghosts do roam,
And darkness reigns, a fearsome sight.
Thou didst set out, with heart aglow,
To conquer this mountain, so high,
And reach the summit, where the stars do glow,
And the moon shines bright, up in the sky.
Through the mist and fog, thou didst press on,
With steadfast courage, and a steadfast will,
Through the darkness, thou didst not be gone,
But didst climb on, with a steadfast skill.
At last, thou didst reach the summit's crest,
And gazed upon the world below,
And saw the beauty of the night's best,
And felt the peace, that only nature knows.
Oh, brave adventurer, who dared to climb
The lofty peak of Mt. Fuji in the night,
Thou art a hero, in the eyes of all,
For thou didst conquer this mountain, so bright.
```
[gritlm]: https://github.com/ContextualAI/gritlm

View file

@ -1,219 +0,0 @@
#include "common.h"
#include "llama.h"
#include <string>
#include <vector>
// #define GRIT_DEBUG
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
std::vector<std::vector<float>> result;
const llama_model * mdl = llama_get_model(ctx);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
for (uint64_t i = 0; i < sentences.size(); i++) {
llama_batch_clear(batch);
const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
const int32_t n_toks = inputs.size();
// GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
// inputs.push_back(llama_token_eos(mdl));
// we want to ignore instruction tokens for mean pooling
const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
#ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample
std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
});
std::printf("\n");
#endif
// add input to batch (this increments n_tokens)
for (int32_t j = 0; j < n_toks; j++) {
llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
}
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, true);
llama_set_causal_attn(ctx, false);
// run model
llama_decode(ctx, batch);
// get embedding dimensions
uint64_t n_embd = llama_n_embd(mdl);
// allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f);
// sum up all token embeddings
for (int32_t k = n_inst; k < n_toks; k++) {
float * emb = llama_get_embeddings_ith(ctx, k);
for (uint64_t j = 0; j < n_embd; j++) {
emb_unorm[j] += emb[j];
}
}
// divide by number of tokens (mean pooling)
{
const uint64_t n_sent = n_toks - n_inst;
for (uint64_t j = 0; j < n_embd; j++) {
emb_unorm[j] /= n_sent;
}
}
std::vector<float> emb_norm(emb_unorm.size());
llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
result.push_back(emb_norm);
#ifdef GRIT_DEBUG
// print out emb_norm
std::printf("embedding %ld: ", i);
for (uint64_t j = 0; j < n_embd; j++) {
std::printf("%.5f ", emb_norm[j]);
}
std::printf("\n\n");
#endif
}
llama_batch_free(batch);
return result;
}
static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
std::string result;
const llama_model * mdl = llama_get_model(ctx);
llama_token eos_token = llama_token_eos(mdl);
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false);
llama_set_causal_attn(ctx, true);
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
int32_t i_current_token = 0;
while (true) {
llama_batch_clear(bat);
auto n_inputs = (int32_t)inputs.size();
for (int32_t i = 0; i < n_inputs; i++) {
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
}
inputs.clear();
llama_decode(ctx, bat);
auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
auto n_candidates = (int32_t)candidates.size();
for (int32_t token = 0; token < n_candidates; token++) {
candidates[token] = llama_token_data{ token, logits[token], 0.0f };
}
auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
if (token == eos_token) {
break;
}
std::string piece = llama_token_to_piece(ctx, token);
if (stream) {
std::printf("%s", piece.c_str());
std::fflush(stdout);
}
inputs.push_back(token);
result += piece;
}
if (stream) {
std::printf("\n");
}
llama_batch_free(bat);
return result;
}
static std::string gritlm_instruction(const std::string & instruction) {
return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
}
int main(int argc, char * argv[]) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_context_params cparams = llama_context_params_from_gpt_params(params);
llama_backend_init();
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
// create generation context
llama_context * ctx = llama_new_context_with_model(mdl, cparams);
// ### Embedding/Representation ###
// samples taken from: https://github.com/ContextualAI/gritlm#basic
{
const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
const std::vector<std::string> queries = {
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning",
};
const std::vector<std::string> documents = {
"A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
"All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
};
// No need to add instruction for retrieval documents
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
const int n_embd = llama_n_embd(mdl);
const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
}
// ### Generation ###
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
{
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
std::string response = generate(ctx, prompt, true);
}
llama_free(ctx);
llama_free_model(mdl);
llama_backend_free();
return 0;
}

View file

@ -1,5 +0,0 @@
set(TARGET llama-imatrix)
add_executable(${TARGET} imatrix.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,35 +0,0 @@
# llama.cpp/examples/imatrix
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
## Usage
```
./llama-imatrix \
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
```
Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
The parameters in square brackets are optional and have the following meaning:
* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
For faster computation, make sure to use GPU offloading via the `-ngl` argument
## Example
```bash
GGML_CUDA=1 make -j
# generate importance matrix (imatrix.dat)
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
# use the imatrix to perform a Q4_K_M quantization
./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
```

View file

@ -1,649 +0,0 @@
#include "common.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <sstream>
#include <thread>
#include <mutex>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <algorithm>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
LOG_TEE("\n");
}
struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
};
class IMatrixCollector {
public:
IMatrixCollector() = default;
void set_params(gpt_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * file_name);
private:
std::unordered_map<std::string, Stats> m_stats;
gpt_params m_params;
std::mutex m_mutex;
int m_last_call = 0;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
};
// remove any prefix and suffixes from the name
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
static std::string filter_tensor_name(const char * name) {
std::string wname;
const char * p = strchr(name, '#');
if (p != NULL) {
p = p + 1;
const char * q = strchr(p, '#');
if (q != NULL) {
wname = std::string(p, q - p);
} else {
wname = p;
}
} else {
wname = name;
}
return wname;
}
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
GGML_UNUSED(user_data);
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
std::string wname = filter_tensor_name(src0->name);
// when ask is true, the scheduler wants to know if we are interested in data from this tensor
// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
if (ask) {
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
if (t->op != GGML_OP_MUL_MAT) return false;
// why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
return true;
}
std::lock_guard<std::mutex> lock(m_mutex);
// copy the data from the GPU memory if needed
const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
if (!is_host) {
m_src1_data.resize(ggml_nelements(src1));
ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
}
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) {
// ids -> [n_experts_used, n_tokens]
// src1 -> [cols, n_expert_used, n_tokens]
const ggml_tensor * ids = t->src[2];
const int n_as = src0->ne[2];
const int n_ids = ids->ne[0];
// the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy ids to host, because it is small
// take into account that ids is not contiguous!
GGML_ASSERT(ids->ne[1] == src1->ne[2]);
m_ids.resize(ggml_nbytes(ids));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
auto & e = m_stats[wname];
++e.ncall;
if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0);
e.counts.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ASSERT(false);
}
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
}
// loop over all possible experts, regardless if they are used or not in the batch
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
for (int idx = 0; idx < n_ids; ++idx) {
for (int row = 0; row < (int)src1->ne[2]; ++row) {
const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
if (excur != ex) continue;
const int64_t i11 = idx % src1->ne[1];
const int64_t i12 = row;
const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
}
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
save_imatrix(m_last_call);
}
}
}
} else {
auto & e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
}
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
if (!std::isfinite(e.values[j])) {
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
save_imatrix(m_last_call);
}
}
}
return true;
}
void IMatrixCollector::save_imatrix(int ncall) const {
auto fname = m_params.out_file;
if (fname.empty()) {
fname = "imatrix.dat";
}
if (ncall > 0) {
fname += ".at_";
fname += std::to_string(ncall);
}
// avoid writing imatrix entries that do not have full data
// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
int n_entries = 0;
std::vector<std::string> to_store;
bool is_first = true; // for printing
for (const auto & kv : m_stats) {
const int n_all = kv.second.counts.size();
if (n_all == 0) {
continue;
}
int n_zeros = 0;
for (const int c : kv.second.counts) {
if (c == 0) {
n_zeros++;
}
}
if (n_zeros != 0 && is_first) {
fprintf(stderr, "\n");
is_first = false;
}
if (n_zeros == n_all) {
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
continue;
}
if (n_zeros > 0) {
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
}
n_entries++;
to_store.push_back(kv.first);
}
if (to_store.size() < m_stats.size()) {
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
}
std::ofstream out(fname, std::ios::binary);
out.write((const char *) &n_entries, sizeof(n_entries));
for (const auto & name : to_store) {
const auto & stat = m_stats.at(name);
int len = name.size();
out.write((const char *) &len, sizeof(len));
out.write(name.c_str(), len);
out.write((const char *) &stat.ncall, sizeof(stat.ncall));
int nval = stat.values.size();
out.write((const char *) &nval, sizeof(nval));
if (nval > 0) {
std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) {
tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
}
out.write((const char*)tmp.data(), nval*sizeof(float));
}
}
// Write the number of call the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call));
// Write the input filename at the end of the file to later on specify it in quantize
{
int len = m_params.prompt_file.size();
out.write((const char *) &len, sizeof(len));
out.write(m_params.prompt_file.c_str(), len);
}
if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
}
bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(fname, std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__, fname);
return false;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, fname);
return false;
}
for (int i = 0; i < n_entries; ++i) {
int len; in.read((char *)&len, sizeof(len));
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false;
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
auto & e = m_stats[std::move(name)];
int ncall;
in.read((char*)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i);
m_stats = {};
return false;
}
if (e.values.empty()) {
e.values.resize(nval, 0);
e.counts.resize(nval, 0);
}
std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
m_stats = {};
return false;
}
// Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
for (int i = 0; i < nval; i++) {
e.values[i] += tmp[i];
e.counts[i] += ncall;
}
e.ncall += ncall;
}
return true;
}
static IMatrixCollector g_collector;
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
return g_collector.collect_imatrix(t, ask, user_data);
}
struct results_log_softmax {
double log_softmax;
float logit;
float prob;
};
static std::vector<float> softmax(const std::vector<float> & logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) {
max_logit = std::max(max_logit, v);
}
double sum_exp = 0.0;
for (size_t i = 0; i < logits.size(); i++) {
// Subtract the maximum logit value from the current logit value for numerical stability
const float logit = logits[i] - max_logit;
const float exp_logit = expf(logit);
sum_exp += exp_logit;
probs[i] = exp_logit;
}
for (size_t i = 0; i < probs.size(); i++) {
probs[i] /= sum_exp;
}
return probs;
}
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) {
max_logit = std::max(max_logit, logits[i]);
}
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}
static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
double local_nll = 0;
double local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
nll += local_nll; nll2 += local_nll2;
break;
}
lock.unlock();
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
const double v = -results.log_softmax;
local_nll += v;
local_nll2 += v*v;
logit_history[i] = results.logit;
prob_history[i] = results.prob;
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
}
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (params.i_chunk > 0) {
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false;
}
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
}
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return false;
}
std::vector<float> logit_history;
std::vector<float> prob_history;
if (params.compute_ppl) {
logit_history.resize(tokens.size());
prob_history.resize(tokens.size());
}
const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_batch = params.n_batch;
int count = 0;
double nll = 0.0;
double nll2 = 0.0;
fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
if (params.compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
// TODO: use batch.logits to save computations instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
if (params.compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
if (params.compute_ppl) {
const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
logits.clear();
}
}
printf("\n");
if (params.compute_ppl) {
nll2 /= count;
nll /= count;
const double ppl = exp(nll);
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
}
}
return true;
}
int main(int argc, char ** argv) {
gpt_params params;
params.n_ctx = 512;
params.logits_all = true;
params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
params.n_batch = std::min(params.n_batch, params.n_ctx);
g_collector.set_params(params);
for (const auto & in_file : params.in_files) {
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
if (!g_collector.load_imatrix(in_file.c_str())) {
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
return 1;
}
}
if (params.in_files.size() > 1) {
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix();
}
llama_backend_init();
llama_numa_init(params.numa);
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = ik_collect_imatrix;
params.cb_eval_user_data = NULL;
params.warmup = false;
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
if (!compute_imatrix(ctx, params)) {
return 1;
}
g_collector.save_imatrix();
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View file

@ -1,5 +0,0 @@
set(TARGET llama-infill)
add_executable(${TARGET} infill.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,47 +0,0 @@
# llama.cpp/example/infill
This example shows how to use the infill mode with Code Llama models supporting infill mode.
Currently the 7B and 13B models support infill mode.
Infill supports most of the options available in the main example.
For further information have a look at the main README.md in llama.cpp/example/main/README.md
## Common Options
In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
## Input Prompts
The `infill` program provides several ways to interact with the LLaMA models using input prompts:
- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
## Interaction
The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive`, and `--interactive-first`
### Interaction Options
- `-i, --interactive`: Run the program in interactive mode, allowing users to get real time code suggestions from model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
### Example
Download a model that supports infill, for example CodeLlama:
```console
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
```
```bash
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```

View file

@ -1,652 +0,0 @@
#include "common.h"
#include "console.h"
#include "llama.h"
#include "grammar-parser.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static llama_context ** g_ctx;
static llama_model ** g_model;
static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens
) {
if (params.logdir.empty()) {
return;
}
const std::string timestamp = string_get_sortable_timestamp();
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
const std::string logfile_path = params.logdir + timestamp + ".yml";
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
fprintf(logfile, "binary: infill\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "# Generation Results #\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "\n");
yaml_dump_string_multiline(logfile, "output", output.c_str());
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile);
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
if (signo == SIGINT) {
if (!is_interacting) {
is_interacting = true;
} else {
console::cleanup();
printf("\n");
llama_print_timings(*g_ctx);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
_exit(130);
}
}
}
#endif
int main(int argc, char ** argv) {
gpt_params params;
llama_sampling_params & sparams = params.sparams;
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("infill", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n");
return 0;
}
if (params.embedding) {
printf("\n************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
printf("************\n\n");
return 0;
}
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
g_model = &model;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
LOG_TEE("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
}
const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end;
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
GGML_ASSERT(llama_token_prefix(model) >= 0);
GGML_ASSERT(llama_token_suffix(model) >= 0);
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
const llama_token middle_token = llama_token_middle(model);
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
params.n_keep = (int)embd_inp.size();
}
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
// enable interactive mode if interactive start is specified
if (params.interactive_first) {
params.interactive = true;
}
if (params.verbose_prompt) {
LOG_TEE("\n");
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
LOG_TEE("'\n");
}
LOG_TEE("\n");
}
if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
LOG_TEE("%s: interactive mode on.\n", __func__);
if (params.input_prefix_bos) {
LOG_TEE("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
}
if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
LOG_TEE("\n##### Infill mode #####\n\n");
if (params.infill) {
printf("\n************\n");
printf("no need to specify '--infill', always running infill\n");
printf("************\n\n");
}
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
control_message = " - To return control to LLaMA, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n";
} else {
control_message = " - Press Return to return control to LLaMA.\n"
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
LOG_TEE("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_TEE( "%s\n", control_message);
is_interacting = params.interactive_first;
}
bool input_echo = true;
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
std::ostringstream output_ss; g_output_ss = &output_ss;
// the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt);
std::vector<llama_token> embd;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
while (n_remain != 0 || params.interactive) {
// predict
if (!embd.empty()) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int) embd.size() > max_embd_size) {
const int skipped_tokens = (int) embd.size() - max_embd_size;
embd.resize(max_embd_size);
console::set_display(console::error);
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
fflush(stdout);
}
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
n_past -= n_discard;
LOG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
}
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
LOG("n_past = %d\n", n_past);
}
}
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
llama_sampling_accept(ctx_sampling, ctx, id, true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
embd.push_back(id);
// echo this to console
input_echo = true;
// decrement remaining sampling budget
--n_remain;
LOG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
}
}
}
// display text
if (input_echo) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
if (embd.size() > 1) {
input_tokens.push_back(id);
} else {
output_tokens.push_back(id);
output_ss << token_str;
}
}
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
}
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
// print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
fflush(stdout);
printf("\n");
console::set_display(console::user_input);
std::string buffer;
std::string line;
bool another_line=true;
// set a new prefix via stdin
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
// check if we got an empty line, if so we use the old input
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
params.input_prefix = buffer;
}
buffer.clear();
// set a new suffix via stdin
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
// check if we got an empty line
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
params.input_suffix = buffer;
}
buffer.clear();
// done taking input, reset color
console::set_display(console::reset);
if (params.escape) {
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
string_process_escapes(params.input_prefix);
string_process_escapes(params.input_suffix);
}
// tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
embd.clear();
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
// LOG_TEE("took new input\n");
is_interacting = false;
}
// deal with end of generation tokens in interactive mode
else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
LOG("found EOS token\n");
if (params.interactive) {
is_interacting = true;
printf("\n");
console::set_display(console::user_input);
fflush(stdout);
}
}
if (n_past > 0 && is_interacting && !params.interactive) {
LOG("waiting for user input\n");
if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty()) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix;
printf("%s", buffer.c_str());
}
std::string line;
bool another_line = true;
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
// done taking input, reset color
console::set_display(console::reset);
// Add tokens to embd only if the input buffer is non-empty
// Entering a empty line lets the user pass control back
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty()) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str());
}
LOG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
output_ss << llama_token_to_piece(ctx, token);
}
n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain);
} else {
LOG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
}
if (n_past > 0) {
if (is_interacting) {
llama_sampling_reset(ctx_sampling);
}
is_interacting = false;
}
}
// end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
break;
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
n_remain = params.n_predict;
is_interacting = true;
}
}
if (!params.interactive && n_remain <= 0) {
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
fflush(stdout);
}
llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
llama_free(ctx);
llama_free_model(model);
llama_sampling_free(ctx_sampling);
llama_backend_free();
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
return 0;
}

View file

@ -1,21 +0,0 @@
# llama.cpp/example/jeopardy
This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
Step 1: Open jeopardy.sh and modify the following:
```
MODEL=(path to your model)
MODEL_NAME=(name of your model)
prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
opts=(add -instruct here if needed for your model, or anything else you want to test out)
```
Step 2: Run `jeopardy.sh` from the llama.cpp folder
Step 3: Repeat steps 1 and 2 until you have all the results you need.
Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
Note: The Human bar is based off of the full, original 100 sample questions. If you modify the question count or questions, it will not be valid.

View file

@ -1,58 +0,0 @@
#!/usr/bin/env python3
import matplotlib.pyplot as plt
import os
import csv
labels = []
numbers = []
numEntries = 1
rows = []
def bar_chart(numbers, labels, pos):
plt.bar(pos, numbers, color='blue')
plt.xticks(ticks=pos, labels=labels)
plt.title("Jeopardy Results by Model")
plt.xlabel("Model")
plt.ylabel("Questions Correct")
plt.show()
def calculatecorrect():
directory = os.fsencode("./examples/jeopardy/results/")
csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
for row in csv_reader:
global rows
rows.append(row)
for listing in os.listdir(directory):
filename = os.fsdecode(listing)
if filename.endswith(".txt"):
file = open("./examples/jeopardy/results/" + filename, "rt")
global labels
global numEntries
global numbers
labels.append(filename[:-4])
numEntries += 1
i = 1
totalcorrect = 0
for line in file.readlines():
if line.strip() != "------":
print(line)
else:
print("Correct answer: " + rows[i][2] + "\n")
i += 1
print("Did the AI get the question right? (y/n)")
if input() == "y":
totalcorrect += 1
numbers.append(totalcorrect)
if __name__ == '__main__':
calculatecorrect()
pos = list(range(numEntries))
labels.append("Human")
numbers.append(48.11)
bar_chart(numbers, labels, pos)
print(labels)
print(numbers)

View file

@ -1,30 +0,0 @@
#!/bin/bash
set -e
MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
MODEL_NAME=Vicuna
# exec options
prefix="Human: " # Ex. Vicuna uses "Human: "
opts="--temp 0 -n 80" # additional flags
nl='
'
introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
# file options
question_file=./examples/jeopardy/questions.txt
touch ./examples/jeopardy/results/$MODEL_NAME.txt
output_file=./examples/jeopardy/results/$MODEL_NAME.txt
counter=1
echo 'Running'
while IFS= read -r question
do
exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"
echo -e "\n------" >> $output_file
counter=$((counter+1))
done < "$question_file"

View file

@ -1,103 +0,0 @@
Index,Original Category,Original Correct Question,Model Prompt
1,The Oscars,Who is John Williams?,Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?"
3,Writers Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?"
4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?"
5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?"
7,Famous Names,Who is Walt Disney?,"In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?"
8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?"
9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?"
10,Movies of the 80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?"
12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?"
13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?"
14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?"
15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?"
16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?"
18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?"
19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?"
20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?"
21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?"
23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?"
25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?"
27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?"
28,Geographic Names the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?"
29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?"
30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?"
31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?"
32,Nonfiction,What is The Communist Manifesto,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?"
33, a new version was passed 81 years later,Laws in U.S. History,What is the Civil Rights Act?,,,,,,,,,,,,,,,,,,0, 2/3
34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?"
35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?"
36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?"
37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipea can of Campbell's tomato soup & 2 cans of milk?"
38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territorys largest ethnic group?"
39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?"
40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?"
42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?"
43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
44,Art & Science,What is Halleys Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?"
45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?"
46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?"
47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?"
48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?"
49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
51,World War II,What is Schindlers list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?"
52, their offspring was the source of this mythical object,Mythology,What is the Golden Fleece?
53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?"
54, only these 2 west of the Mississippi River border each other,U.S. State Names,What are Oregon & Nevada?
55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?"
56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?"
57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?"
58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?"
61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?"
62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?"
63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?"
64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?"
65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?"
66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?"
70,USA,What is Jack Daniels?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?"
71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?"
72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?"
74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?"
75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?"
76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?"
77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?"
78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?"
79,Childrens Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?"
80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?"
81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?"
82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?"
83,Childrens Lit,What is Charlottes Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?"
86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?"
87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?"
88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?"
90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?"
91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
92,Names The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?"
93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?"
94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?"
96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?"
97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?"
98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?"
99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
100,Childrens Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?"
,,,
TOTALS,,,
Can't render this file because it has a wrong number of fields in line 34.

View file

@ -1,100 +0,0 @@
Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
For a special 1970s cookbook, who provided one simple recipea can of Campbell's tomato soup & 2 cans of milk?
Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territorys largest ethnic group?
In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
In 2010 who introduced the 4-point shot, 35 feet from the basket?
Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
Like Sir Thomas More, 3 16th century English queens are buried at what British location?
In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person be condemned'?
The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
What was first sold in 1908, at a price equivalent to about $27,000 today?
The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
Until a 1903 secession, what country's contiguous territory spanned 2 continents?
Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?

View file

@ -1,5 +0,0 @@
set(TARGET llama-bench)
add_executable(${TARGET} llama-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,281 +0,0 @@
# llama.cpp/examples/llama-bench
Performance testing tool for llama.cpp.
## Table of contents
1. [Syntax](#syntax)
2. [Examples](#examples)
1. [Text generation with different models](#text-generation-with-different-models)
2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes)
3. [Different numbers of threads](#different-numbers-of-threads)
4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu)
3. [Output formats](#output-formats)
1. [Markdown](#markdown)
2. [CSV](#csv)
3. [JSON](#json)
4. [SQL](#sql)
## Syntax
```
usage: ./llama-bench [options]
options:
-h, --help
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-pg <pp,tg> (default: 512,128)
-b, --batch-size <n> (default: 2048)
-ub, --ubatch-size <n> (default: 512)
-ctk, --cache-type-k <t> (default: f16)
-ctv, --cache-type-v <t> (default: f16)
-t, --threads <n> (default: 16)
-ngl, --n-gpu-layers <n> (default: 99)
-sm, --split-mode <none|layer|row> (default: layer)
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
--numa <distribute|isolate|numactl> (default: disabled)
-embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)
-v, --verbose (default: 0)
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
```
llama-bench can perform three types of tests:
- Prompt processing (pp): processing a prompt in batches (`-p`)
- Text generation (tg): generating a sequence of tokens (`-n`)
- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)
With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
For a description of the other options, see the [main example](../main/README.md).
Note:
- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`.
## Examples
### Text generation with different models
```sh
$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 |
### Prompt processing with different batch sizes
```sh
$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
```
| model | size | params | backend | ngl | n_batch | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 |
### Different numbers of threads
```sh
$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32
```
| model | size | params | backend | threads | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 ||
### Different numbers of layers offloaded to the GPU
```sh
$ ./llama-bench -ngl 10,20,30,31,32,33,34,35
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
## Output formats
By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
### Markdown
```sh
$ ./llama-bench -o md
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 |
### CSV
```sh
$ ./llama-bench -o csv
```
```csv
build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
```
### JSON
```sh
$ ./llama-bench -o json
```
```json
[
{
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"n_gpu_layers": 99,
"main_gpu": 0,
"mul_mat_q": true,
"tensor_split": "0.00",
"n_prompt": 512,
"n_gen": 0,
"test_time": "2023-09-23T12:09:57Z",
"avg_ns": 212365953,
"stddev_ns": 985423,
"avg_ts": 2410.974041,
"stddev_ts": 11.163766,
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
},
{
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"n_gpu_layers": 99,
"main_gpu": 0,
"mul_mat_q": true,
"tensor_split": "0.00",
"n_prompt": 0,
"n_gen": 128,
"test_time": "2023-09-23T12:09:59Z",
"avg_ns": 977425219,
"stddev_ns": 9268593,
"avg_ts": 130.965708,
"stddev_ts": 1.238924,
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
}
]
```
### SQL
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
```sh
$ ./llama-bench -o sql
```
```sql
CREATE TABLE IF NOT EXISTS test (
build_commit TEXT,
build_number INTEGER,
cuda INTEGER,
metal INTEGER,
gpu_blas INTEGER,
blas INTEGER,
cpu_info TEXT,
gpu_info TEXT,
model_filename TEXT,
model_type TEXT,
model_size INTEGER,
model_n_params INTEGER,
n_batch INTEGER,
n_threads INTEGER,
f16_kv INTEGER,
n_gpu_layers INTEGER,
main_gpu INTEGER,
mul_mat_q INTEGER,
tensor_split TEXT,
n_prompt INTEGER,
n_gen INTEGER,
test_time TEXT,
avg_ns INTEGER,
stddev_ns INTEGER,
avg_ts REAL,
stddev_ts REAL
);
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
```

File diff suppressed because it is too large Load diff

View file

@ -1,33 +0,0 @@
# Gradle files
.gradle/
build/
# Local configuration file (sdk path, etc)
local.properties
# Log/OS Files
*.log
# Android Studio generated files and folders
captures/
.externalNativeBuild/
.cxx/
*.apk
output.json
# IntelliJ
*.iml
.idea/
misc.xml
deploymentTargetDropDown.xml
render.experimental.xml
# Keystore files
*.jks
*.keystore
# Google Services (e.g. APIs or Firebase)
google-services.json
# Android Profiling
*.hprof

View file

@ -1 +0,0 @@
/build

View file

@ -1,65 +0,0 @@
plugins {
id("com.android.application")
id("org.jetbrains.kotlin.android")
}
android {
namespace = "com.example.llama"
compileSdk = 34
defaultConfig {
applicationId = "com.example.llama"
minSdk = 33
targetSdk = 34
versionCode = 1
versionName = "1.0"
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
vectorDrawables {
useSupportLibrary = true
}
}
buildTypes {
release {
isMinifyEnabled = false
proguardFiles(
getDefaultProguardFile("proguard-android-optimize.txt"),
"proguard-rules.pro"
)
}
}
compileOptions {
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
}
kotlinOptions {
jvmTarget = "1.8"
}
buildFeatures {
compose = true
}
composeOptions {
kotlinCompilerExtensionVersion = "1.5.1"
}
}
dependencies {
implementation("androidx.core:core-ktx:1.12.0")
implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
implementation("androidx.activity:activity-compose:1.8.2")
implementation(platform("androidx.compose:compose-bom:2023.08.00"))
implementation("androidx.compose.ui:ui")
implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3")
implementation(project(":llama"))
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
androidTestImplementation("androidx.compose.ui:ui-test-junit4")
debugImplementation("androidx.compose.ui:ui-tooling")
debugImplementation("androidx.compose.ui:ui-test-manifest")
}

View file

@ -1,21 +0,0 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html
# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}
# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable
# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile

View file

@ -1,30 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:tools="http://schemas.android.com/tools">
<uses-permission android:name="android.permission.INTERNET" />
<application
android:allowBackup="true"
android:dataExtractionRules="@xml/data_extraction_rules"
android:fullBackupContent="@xml/backup_rules"
android:icon="@mipmap/ic_launcher"
android:label="@string/app_name"
android:roundIcon="@mipmap/ic_launcher_round"
android:supportsRtl="true"
android:theme="@style/Theme.LlamaAndroid"
>
<activity
android:name=".MainActivity"
android:exported="true"
android:theme="@style/Theme.LlamaAndroid">
<intent-filter>
<action android:name="android.intent.action.MAIN" />
<category android:name="android.intent.category.LAUNCHER" />
</intent-filter>
</activity>
</application>
</manifest>

View file

@ -1,119 +0,0 @@
package com.example.llama
import android.app.DownloadManager
import android.net.Uri
import android.util.Log
import androidx.compose.material3.Button
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableDoubleStateOf
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.remember
import androidx.compose.runtime.rememberCoroutineScope
import androidx.compose.runtime.setValue
import androidx.core.database.getLongOrNull
import androidx.core.net.toUri
import kotlinx.coroutines.delay
import kotlinx.coroutines.launch
import java.io.File
data class Downloadable(val name: String, val source: Uri, val destination: File) {
companion object {
@JvmStatic
private val tag: String? = this::class.qualifiedName
sealed interface State
data object Ready: State
data class Downloading(val id: Long): State
data class Downloaded(val downloadable: Downloadable): State
data class Error(val message: String): State
@JvmStatic
@Composable
fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
var status: State by remember {
mutableStateOf(
if (item.destination.exists()) Downloaded(item)
else Ready
)
}
var progress by remember { mutableDoubleStateOf(0.0) }
val coroutineScope = rememberCoroutineScope()
suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
while (true) {
val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
if (cursor == null) {
Log.e(tag, "dm.query() returned null")
return Error("dm.query() returned null")
}
if (!cursor.moveToFirst() || cursor.count < 1) {
cursor.close()
Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
return Ready
}
val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
val sofar = cursor.getLongOrNull(pix) ?: 0
val total = cursor.getLongOrNull(tix) ?: 1
cursor.close()
if (sofar == total) {
return Downloaded(item)
}
progress = (sofar * 1.0) / total
delay(1000L)
}
}
fun onClick() {
when (val s = status) {
is Downloaded -> {
viewModel.load(item.destination.path)
}
is Downloading -> {
coroutineScope.launch {
status = waitForDownload(s, item)
}
}
else -> {
item.destination.delete()
val request = DownloadManager.Request(item.source).apply {
setTitle("Downloading model")
setDescription("Downloading model: ${item.name}")
setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
setDestinationUri(item.destination.toUri())
}
viewModel.log("Saving ${item.name} to ${item.destination.path}")
Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
val id = dm.enqueue(request)
status = Downloading(id)
onClick()
}
}
}
Button(onClick = { onClick() }, enabled = status !is Downloading) {
when (status) {
is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
is Downloaded -> Text("Load ${item.name}")
is Ready -> Text("Download ${item.name}")
is Error -> Text("Download ${item.name}")
}
}
}
}
}

View file

@ -1,154 +0,0 @@
package com.example.llama
import android.app.ActivityManager
import android.app.DownloadManager
import android.content.ClipData
import android.content.ClipboardManager
import android.net.Uri
import android.os.Bundle
import android.os.StrictMode
import android.os.StrictMode.VmPolicy
import android.text.format.Formatter
import androidx.activity.ComponentActivity
import androidx.activity.compose.setContent
import androidx.activity.viewModels
import androidx.compose.foundation.layout.Box
import androidx.compose.foundation.layout.Column
import androidx.compose.foundation.layout.Row
import androidx.compose.foundation.layout.fillMaxSize
import androidx.compose.foundation.layout.padding
import androidx.compose.foundation.lazy.LazyColumn
import androidx.compose.foundation.lazy.items
import androidx.compose.foundation.lazy.rememberLazyListState
import androidx.compose.material3.Button
import androidx.compose.material3.LocalContentColor
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.OutlinedTextField
import androidx.compose.material3.Surface
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.ui.Modifier
import androidx.compose.ui.unit.dp
import androidx.core.content.getSystemService
import com.example.llama.ui.theme.LlamaAndroidTheme
import java.io.File
class MainActivity(
activityManager: ActivityManager? = null,
downloadManager: DownloadManager? = null,
clipboardManager: ClipboardManager? = null,
): ComponentActivity() {
private val tag: String? = this::class.simpleName
private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
private val viewModel: MainViewModel by viewModels()
// Get a MemoryInfo object for the device's current memory status.
private fun availableMemory(): ActivityManager.MemoryInfo {
return ActivityManager.MemoryInfo().also { memoryInfo ->
activityManager.getMemoryInfo(memoryInfo)
}
}
override fun onCreate(savedInstanceState: Bundle?) {
super.onCreate(savedInstanceState)
StrictMode.setVmPolicy(
VmPolicy.Builder(StrictMode.getVmPolicy())
.detectLeakedClosableObjects()
.build()
)
val free = Formatter.formatFileSize(this, availableMemory().availMem)
val total = Formatter.formatFileSize(this, availableMemory().totalMem)
viewModel.log("Current memory: $free / $total")
viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
val extFilesDir = getExternalFilesDir(null)
val models = listOf(
Downloadable(
"Phi-2 7B (Q4_0, 1.6 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
File(extFilesDir, "phi-2-q4_0.gguf"),
),
Downloadable(
"TinyLlama 1.1B (f16, 2.2 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
File(extFilesDir, "tinyllama-1.1-f16.gguf"),
),
Downloadable(
"Phi 2 DPO (Q3_K_M, 1.48 GiB)",
Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
),
)
setContent {
LlamaAndroidTheme {
// A surface container using the 'background' color from the theme
Surface(
modifier = Modifier.fillMaxSize(),
color = MaterialTheme.colorScheme.background
) {
MainCompose(
viewModel,
clipboardManager,
downloadManager,
models,
)
}
}
}
}
}
@Composable
fun MainCompose(
viewModel: MainViewModel,
clipboard: ClipboardManager,
dm: DownloadManager,
models: List<Downloadable>
) {
Column {
val scrollState = rememberLazyListState()
Box(modifier = Modifier.weight(1f)) {
LazyColumn(state = scrollState) {
items(viewModel.messages) {
Text(
it,
style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
modifier = Modifier.padding(16.dp)
)
}
}
}
OutlinedTextField(
value = viewModel.message,
onValueChange = { viewModel.updateMessage(it) },
label = { Text("Message") },
)
Row {
Button({ viewModel.send() }) { Text("Send") }
Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") }
Button({ viewModel.clear() }) { Text("Clear") }
Button({
viewModel.messages.joinToString("\n").let {
clipboard.setPrimaryClip(ClipData.newPlainText("", it))
}
}) { Text("Copy") }
}
Column {
for (model in models) {
Downloadable.Button(viewModel, dm, model)
}
}
}
}

View file

@ -1,105 +0,0 @@
package com.example.llama
import android.llama.cpp.LLamaAndroid
import android.util.Log
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.setValue
import androidx.lifecycle.ViewModel
import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
companion object {
@JvmStatic
private val NanosPerSecond = 1_000_000_000.0
}
private val tag: String? = this::class.simpleName
var messages by mutableStateOf(listOf("Initializing..."))
private set
var message by mutableStateOf("")
private set
override fun onCleared() {
super.onCleared()
viewModelScope.launch {
try {
llamaAndroid.unload()
} catch (exc: IllegalStateException) {
messages += exc.message!!
}
}
}
fun send() {
val text = message
message = ""
// Add to messages console.
messages += text
messages += ""
viewModelScope.launch {
llamaAndroid.send(text)
.catch {
Log.e(tag, "send() failed", it)
messages += it.message!!
}
.collect { messages = messages.dropLast(1) + (messages.last() + it) }
}
}
fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
viewModelScope.launch {
try {
val start = System.nanoTime()
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
val end = System.nanoTime()
messages += warmupResult
val warmup = (end - start).toDouble() / NanosPerSecond
messages += "Warm up time: $warmup seconds, please wait..."
if (warmup > 5.0) {
messages += "Warm up took too long, aborting benchmark"
return@launch
}
messages += llamaAndroid.bench(512, 128, 1, 3)
} catch (exc: IllegalStateException) {
Log.e(tag, "bench() failed", exc)
messages += exc.message!!
}
}
}
fun load(pathToModel: String) {
viewModelScope.launch {
try {
llamaAndroid.load(pathToModel)
messages += "Loaded $pathToModel"
} catch (exc: IllegalStateException) {
Log.e(tag, "load() failed", exc)
messages += exc.message!!
}
}
}
fun updateMessage(newMessage: String) {
message = newMessage
}
fun clear() {
messages = listOf()
}
fun log(message: String) {
messages += message
}
}

View file

@ -1,11 +0,0 @@
package com.example.llama.ui.theme
import androidx.compose.ui.graphics.Color
val Purple80 = Color(0xFFD0BCFF)
val PurpleGrey80 = Color(0xFFCCC2DC)
val Pink80 = Color(0xFFEFB8C8)
val Purple40 = Color(0xFF6650a4)
val PurpleGrey40 = Color(0xFF625b71)
val Pink40 = Color(0xFF7D5260)

View file

@ -1,70 +0,0 @@
package com.example.llama.ui.theme
import android.app.Activity
import android.os.Build
import androidx.compose.foundation.isSystemInDarkTheme
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.darkColorScheme
import androidx.compose.material3.dynamicDarkColorScheme
import androidx.compose.material3.dynamicLightColorScheme
import androidx.compose.material3.lightColorScheme
import androidx.compose.runtime.Composable
import androidx.compose.runtime.SideEffect
import androidx.compose.ui.graphics.toArgb
import androidx.compose.ui.platform.LocalContext
import androidx.compose.ui.platform.LocalView
import androidx.core.view.WindowCompat
private val DarkColorScheme = darkColorScheme(
primary = Purple80,
secondary = PurpleGrey80,
tertiary = Pink80
)
private val LightColorScheme = lightColorScheme(
primary = Purple40,
secondary = PurpleGrey40,
tertiary = Pink40
/* Other default colors to override
background = Color(0xFFFFFBFE),
surface = Color(0xFFFFFBFE),
onPrimary = Color.White,
onSecondary = Color.White,
onTertiary = Color.White,
onBackground = Color(0xFF1C1B1F),
onSurface = Color(0xFF1C1B1F),
*/
)
@Composable
fun LlamaAndroidTheme(
darkTheme: Boolean = isSystemInDarkTheme(),
// Dynamic color is available on Android 12+
dynamicColor: Boolean = true,
content: @Composable () -> Unit
) {
val colorScheme = when {
dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
val context = LocalContext.current
if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
}
darkTheme -> DarkColorScheme
else -> LightColorScheme
}
val view = LocalView.current
if (!view.isInEditMode) {
SideEffect {
val window = (view.context as Activity).window
window.statusBarColor = colorScheme.primary.toArgb()
WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
}
}
MaterialTheme(
colorScheme = colorScheme,
typography = Typography,
content = content
)
}

View file

@ -1,34 +0,0 @@
package com.example.llama.ui.theme
import androidx.compose.material3.Typography
import androidx.compose.ui.text.TextStyle
import androidx.compose.ui.text.font.FontFamily
import androidx.compose.ui.text.font.FontWeight
import androidx.compose.ui.unit.sp
// Set of Material typography styles to start with
val Typography = Typography(
bodyLarge = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Normal,
fontSize = 16.sp,
lineHeight = 24.sp,
letterSpacing = 0.5.sp
)
/* Other default text styles to override
titleLarge = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Normal,
fontSize = 22.sp,
lineHeight = 28.sp,
letterSpacing = 0.sp
),
labelSmall = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Medium,
fontSize = 11.sp,
lineHeight = 16.sp,
letterSpacing = 0.5.sp
)
*/
)

View file

@ -1,170 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<vector xmlns:android="http://schemas.android.com/apk/res/android"
android:width="108dp"
android:height="108dp"
android:viewportWidth="108"
android:viewportHeight="108">
<path
android:fillColor="#3DDC84"
android:pathData="M0,0h108v108h-108z" />
<path
android:fillColor="#00000000"
android:pathData="M9,0L9,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,0L19,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M29,0L29,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M39,0L39,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M49,0L49,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M59,0L59,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M69,0L69,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M79,0L79,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M89,0L89,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M99,0L99,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,9L108,9"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,19L108,19"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,29L108,29"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,39L108,39"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,49L108,49"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,59L108,59"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,69L108,69"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,79L108,79"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,89L108,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,99L108,99"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,29L89,29"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,39L89,39"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,49L89,49"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,59L89,59"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,69L89,69"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,79L89,79"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M29,19L29,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M39,19L39,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M49,19L49,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M59,19L59,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M69,19L69,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M79,19L79,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
</vector>

View file

@ -1,30 +0,0 @@
<vector xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:aapt="http://schemas.android.com/aapt"
android:width="108dp"
android:height="108dp"
android:viewportWidth="108"
android:viewportHeight="108">
<path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
<aapt:attr name="android:fillColor">
<gradient
android:endX="85.84757"
android:endY="92.4963"
android:startX="42.9492"
android:startY="49.59793"
android:type="linear">
<item
android:color="#44000000"
android:offset="0.0" />
<item
android:color="#00000000"
android:offset="1.0" />
</gradient>
</aapt:attr>
</path>
<path
android:fillColor="#FFFFFF"
android:fillType="nonZero"
android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
android:strokeWidth="1"
android:strokeColor="#00000000" />
</vector>

View file

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
<background android:drawable="@drawable/ic_launcher_background" />
<foreground android:drawable="@drawable/ic_launcher_foreground" />
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
</adaptive-icon>

View file

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
<background android:drawable="@drawable/ic_launcher_background" />
<foreground android:drawable="@drawable/ic_launcher_foreground" />
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
</adaptive-icon>

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 982 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.6 KiB

View file

@ -1,10 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<color name="purple_200">#FFBB86FC</color>
<color name="purple_500">#FF6200EE</color>
<color name="purple_700">#FF3700B3</color>
<color name="teal_200">#FF03DAC5</color>
<color name="teal_700">#FF018786</color>
<color name="black">#FF000000</color>
<color name="white">#FFFFFFFF</color>
</resources>

View file

@ -1,3 +0,0 @@
<resources>
<string name="app_name">LlamaAndroid</string>
</resources>

View file

@ -1,5 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<style name="Theme.LlamaAndroid" parent="android:Theme.Material.Light.NoActionBar" />
</resources>

View file

@ -1,13 +0,0 @@
<?xml version="1.0" encoding="utf-8"?><!--
Sample backup rules file; uncomment and customize as necessary.
See https://developer.android.com/guide/topics/data/autobackup
for details.
Note: This file is ignored for devices older that API 31
See https://developer.android.com/about/versions/12/backup-restore
-->
<full-backup-content>
<!--
<include domain="sharedpref" path="."/>
<exclude domain="sharedpref" path="device.xml"/>
-->
</full-backup-content>

View file

@ -1,19 +0,0 @@
<?xml version="1.0" encoding="utf-8"?><!--
Sample data extraction rules file; uncomment and customize as necessary.
See https://developer.android.com/about/versions/12/backup-restore#xml-changes
for details.
-->
<data-extraction-rules>
<cloud-backup>
<!-- TODO: Use <include> and <exclude> to control what is backed up.
<include .../>
<exclude .../>
-->
</cloud-backup>
<!--
<device-transfer>
<include .../>
<exclude .../>
</device-transfer>
-->
</data-extraction-rules>

View file

@ -1,6 +0,0 @@
// Top-level build file where you can add configuration options common to all sub-projects/modules.
plugins {
id("com.android.application") version "8.2.0" apply false
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
id("com.android.library") version "8.2.0" apply false
}

View file

@ -1,23 +0,0 @@
# Project-wide Gradle settings.
# IDE (e.g. Android Studio) users:
# Gradle settings configured through the IDE *will override*
# any settings specified in this file.
# For more details on how to configure your build environment visit
# http://www.gradle.org/docs/current/userguide/build_environment.html
# Specifies the JVM arguments used for the daemon process.
# The setting is particularly useful for tweaking memory settings.
org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
# When configured, Gradle will run in incubating parallel mode.
# This option should only be used with decoupled projects. More details, visit
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
# org.gradle.parallel=true
# AndroidX package structure to make it clearer which packages are bundled with the
# Android operating system, and which are packaged with your app's APK
# https://developer.android.com/topic/libraries/support-library/androidx-rn
android.useAndroidX=true
# Kotlin code style for this project: "official" or "obsolete":
kotlin.code.style=official
# Enables namespacing of each library's R class so that its R class includes only the
# resources declared in the library itself and none from the library's dependencies,
# thereby reducing the size of the R class for that library
android.nonTransitiveRClass=true

View file

@ -1,6 +0,0 @@
#Thu Dec 21 14:31:09 AEDT 2023
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

View file

@ -1,185 +0,0 @@
#!/usr/bin/env sh
#
# Copyright 2015 the original author or authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
warn () {
echo "$*"
}
die () {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=`expr $i + 1`
done
case $i in
0) set -- ;;
1) set -- "$args0" ;;
2) set -- "$args0" "$args1" ;;
3) set -- "$args0" "$args1" "$args2" ;;
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=`save "$@"`
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
exec "$JAVACMD" "$@"

View file

@ -1 +0,0 @@
/build

Some files were not shown because too many files have changed in this diff Show more