diff --git a/.editorconfig b/.editorconfig index f88f8da67..1a8840f9b 100644 --- a/.editorconfig +++ b/.editorconfig @@ -27,6 +27,3 @@ indent_size = 2 [examples/llama.swiftui/llama.swiftui.xcodeproj/*] indent_style = tab -[examples/cvector-generator/*.txt] -trim_trailing_whitespace = unset -insert_final_newline = unset diff --git a/Makefile b/Makefile index 4c321edd5..bd0ee5589 100644 --- a/Makefile +++ b/Makefile @@ -1,47 +1,15 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ - libllava.a \ llama-cli \ - llama-convert-llama2c-to-ggml \ - llama-embedding \ - llama-eval-callback \ - llama-export-lora \ - llama-gbnf-validator \ - llama-gguf \ - llama-gguf-hash \ - llama-gguf-split \ - llama-gritlm \ - llama-imatrix \ - llama-infill \ - llama-llava-cli \ - llama-lookahead \ - llama-lookup \ - llama-lookup-create \ - llama-lookup-merge \ - llama-lookup-stats \ - llama-parallel \ - llama-passkey \ - llama-perplexity \ llama-q8dot \ - llama-quantize \ - llama-quantize-stats \ - llama-retrieval \ - llama-save-load-state \ - llama-server \ - llama-simple \ - llama-speculative \ - llama-tokenize \ - llama-vdot \ - llama-cvector-generator + llama-vdot # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned -LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ - simple save-load-state server gguf gguf-split eval-callback libllava.a llava-cli \ - retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm +LEGACY_TARGETS_CLEAN = main vdot q8dot # Legacy build targets that were renamed in #7809, but for which we still want to build binaries that output a deprecation warning if people try to use them. # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries. -LEGACY_TARGETS_BUILD = main quantize perplexity embedding server +LEGACY_TARGETS_BUILD = main # Deprecation aliases ifdef LLAMA_CUBLAS @@ -1135,213 +1103,12 @@ llama-cli: examples/main/main.cpp \ @echo '==== Run ./llama-cli -h for help.
====' @echo -llama-infill: examples/infill/infill.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-simple: examples/simple/simple.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-tokenize: examples/tokenize/tokenize.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-quantize: examples/quantize/quantize.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-perplexity: examples/perplexity/perplexity.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-imatrix: examples/imatrix/imatrix.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-embedding: examples/embedding/embedding.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-gritlm: examples/gritlm/gritlm.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-save-load-state: examples/save-load-state/save-load-state.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-gguf: examples/gguf/gguf.cpp \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -examples/gguf-hash/deps/sha1/sha1.o: \ - examples/gguf-hash/deps/sha1/sha1.c - $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@ - -examples/gguf-hash/deps/xxhash/xxhash.o: \ - examples/gguf-hash/deps/xxhash/xxhash.c - $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@ - -examples/gguf-hash/deps/sha256/sha256.o: \ - examples/gguf-hash/deps/sha256/sha256.c - $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@ - -llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-gguf-split: examples/gguf-split/gguf-split.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-eval-callback: examples/eval-callback/eval-callback.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-cvector-generator: 
examples/cvector-generator/cvector-generator.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ - $(OBJ_GGML) $(OBJ_LLAMA) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-export-lora: examples/export-lora/export-lora.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-retrieval: examples/retrieval/retrieval.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-speculative: examples/speculative/speculative.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-parallel: examples/parallel/parallel.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookahead: examples/lookahead/lookahead.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup: examples/lookup/lookup.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup-create: examples/lookup/lookup-create.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup-merge: examples/lookup/lookup-merge.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup-stats: examples/lookup/lookup-stats.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-passkey: examples/passkey/passkey.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - ifdef GGML_RPC rpc-server: examples/rpc/rpc-server.cpp \ $(OBJ_GGML) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) endif # GGML_RPC -llama-server: \ - examples/server/server.cpp \ - examples/server/utils.hpp \ - examples/server/httplib.h \ - examples/server/colorthemes.css.hpp \ - examples/server/style.css.hpp \ - examples/server/theme-beeninorder.css.hpp \ - examples/server/theme-ketivah.css.hpp \ - examples/server/theme-mangotango.css.hpp \ - examples/server/theme-playground.css.hpp \ - examples/server/theme-polarnight.css.hpp \ - examples/server/theme-snowstorm.css.hpp \ - examples/server/index.html.hpp \ - examples/server/index-new.html.hpp \ - examples/server/index.js.hpp \ - examples/server/completion.js.hpp \ - 
examples/server/system-prompts.js.hpp \ - examples/server/prompt-formats.js.hpp \ - examples/server/json-schema-to-grammar.mjs.hpp \ - common/json.hpp \ - common/stb_image.h \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) - -# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`: -examples/server/%.hpp: examples/server/public/% Makefile - @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \ - echo "unsigned char $${NAME}[] = {" && \ - cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \ - echo "};" && \ - echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ - ) > $@ - -libllava.a: examples/llava/llava.cpp \ - examples/llava/llava.h \ - examples/llava/clip.cpp \ - examples/llava/clip.h \ - common/stb_image.h \ - common/base64.hpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual - -llama-llava-cli: examples/llava/llava-cli.cpp \ - examples/llava/clip.h \ - examples/llava/clip.cpp \ - examples/llava/llava.h \ - examples/llava/llava.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual - $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) - $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) - common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh "$(CC)" > $@.tmp @if ! cmp -s $@.tmp $@; then \ @@ -1371,7 +1138,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed. # # Mark legacy binary targets as .PHONY so that they are always checked. -.PHONY: main quantize perplexity embedding server +.PHONY: main # NOTE: We currently will always build the deprecation-warning `main` binary to help users migrate. # Eventually we will want to remove these targets from building all the time. @@ -1380,37 +1147,6 @@ main: examples/deprecation-warning/deprecation-warning.cpp $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead." -server: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead." -quantize: examples/deprecation-warning/deprecation-warning.cpp -ifneq (,$(wildcard quantize)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - @echo "#########" - @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead." - @echo " Remove the 'quantize' binary to remove this warning."
- @echo "#########" -endif -perplexity: examples/deprecation-warning/deprecation-warning.cpp -ifneq (,$(wildcard perplexity)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - @echo "#########" - @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." - @echo " Remove the 'perplexity' binary to remove this warning." - @echo "#########" -endif -embedding: examples/deprecation-warning/deprecation-warning.cpp -ifneq (,$(wildcard embedding)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - @echo "#########" - @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." - @echo " Remove the 'embedding' binary to remove this warning." - @echo "#########" -endif diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0a5f3647d..1e10862b2 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,40 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(cvector-generator) - add_subdirectory(convert-llama2c-to-ggml) - add_subdirectory(embedding) - add_subdirectory(eval-callback) - add_subdirectory(export-lora) - add_subdirectory(gbnf-validator) - add_subdirectory(gguf-hash) - add_subdirectory(gguf-split) - add_subdirectory(gguf) - add_subdirectory(gritlm) - add_subdirectory(imatrix) - add_subdirectory(infill) - add_subdirectory(llama-bench) - add_subdirectory(llava) - add_subdirectory(lookahead) - add_subdirectory(lookup) add_subdirectory(main) - add_subdirectory(parallel) - add_subdirectory(passkey) - add_subdirectory(perplexity) - add_subdirectory(quantize-stats) - add_subdirectory(quantize) - add_subdirectory(retrieval) if (GGML_RPC) add_subdirectory(rpc) endif() if (LLAMA_BUILD_SERVER) - add_subdirectory(server) endif() - if (GGML_SYCL) - add_subdirectory(sycl) - endif() - add_subdirectory(save-load-state) - add_subdirectory(simple) - add_subdirectory(speculative) - add_subdirectory(tokenize) endif() diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt deleted file mode 100644 index a6790e617..000000000 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-convert-llama2c-to-ggml) -add_executable(${TARGET} convert-llama2c-to-ggml.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md deleted file mode 100644 index 5774ac83c..000000000 --- a/examples/convert-llama2c-to-ggml/README.md +++ /dev/null @@ -1,28 +0,0 @@ -## Convert llama2.c model to ggml - -This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. 
- -To convert a model, first download one from the [llama2.c](https://github.com/karpathy/llama2.c) repository, then build this example: - -`$ make -j` - -After successful compilation, the following usage options are available: -``` -usage: ./llama-convert-llama2c-to-ggml [options] - -options: - -h, --help show this help message and exit - --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf') - --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model - --llama2c-output-model FNAME model path to save the converted llama2.c model (default 'ak_llama_model.bin') -``` - -An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows: - -`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` - -Note: for `stories260K.bin`, copy the vocab from its own tokenizer, `tok512.bin`, found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K). - -Now you can use the model with a command like: - -`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp deleted file mode 100644 index 8ca9f8915..000000000 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ /dev/null @@ -1,936 +0,0 @@ -#include "ggml.h" -#include "llama.h" -#include "common.h" -#include "log.h" - -#include <algorithm> -#include <cassert> -#include <cerrno> -#include <cinttypes> -#include <cmath> -#include <cstdio> -#include <cstdlib> -#include <cstring> -#include <sstream> -#include <string> -#include <unordered_map> -#include <vector> - -// GGUF keys & tensor names.
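The defines below pin down the GGUF metadata keys and tensor-name templates that the converter writes (note that `tokenizer.ggml.seperator_token_id` preserves the misspelling used by the GGUF key names, so it must not be "corrected"). For orientation, reading one of these keys back with the same gguf API this file already uses looks roughly like the following sketch (the file name is hypothetical):

```cpp
// Sketch: read general.architecture back out of a GGUF file.
#include <cstdio>
#include "ggml.h" // gguf_* API, as used elsewhere in this example

static void print_arch(const char * fname) {
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,
        /*.ctx      =*/ nullptr,
    };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (ctx == nullptr) { return; }
    const int idx = gguf_find_key(ctx, "general.architecture");
    if (idx >= 0) {
        printf("architecture: %s\n", gguf_get_val_str(ctx, idx));
    }
    gguf_free(ctx);
}
```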
- -#define KV_GENERAL_ARCHITECTURE "general.architecture" -#define KV_GENERAL_NAME "general.name" - -#define KV_TOKENIZER_MODEL "tokenizer.ggml.model" -#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens" -#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type" -#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores" -#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id" -#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id" -#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id" -#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id" -#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id" -#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json" - -#define KV_CONTEXT_LENGTH "llama.context_length" -#define KV_EMBEDDING_LENGTH "llama.embedding_length" -#define KV_BLOCK_COUNT "llama.block_count" -#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length" -#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count" -#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv" -#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon" -#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count" - -#define TN_TOKEN_EMBD "token_embd.weight" -#define TN_OUTPUT_NORM "output_norm.weight" -#define TN_OUTPUT "output.weight" -#define TN_ATTN_NORM "blk.%d.attn_norm.weight" -#define TN_ATTN_Q "blk.%d.attn_q.weight" -#define TN_ATTN_K "blk.%d.attn_k.weight" -#define TN_ATTN_V "blk.%d.attn_v.weight" -#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight" -#define TN_FFN_NORM "blk.%d.ffn_norm.weight" -#define TN_FFN_GATE "blk.%d.ffn_gate.weight" -#define TN_FFN_DOWN "blk.%d.ffn_down.weight" -#define TN_FFN_UP "blk.%d.ffn_up.weight" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' -#define LLAMA_FILE_VERSION_GGJT_V3 3 - -#define TOKENIZER_NAME "llama" -#define UNKNOWN_TOKEN_ID 0 -#define BOS_TOKEN_ID 1 -#define EOS_TOKEN_ID 2 - -//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc. 
-typedef struct { - int dim; // transformer dimension - int hidden_dim; // for ffn layers - int n_layers; // number of layers - int n_heads; // number of query heads - int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery) - int vocab_size; // vocabulary size, usually 256 (byte-level) - int seq_len; // max sequence length -} Config; - -struct TransformerWeights { - // token embedding table - std::vector<float> token_embedding_table; // (vocab_size, dim) - // weights for rmsnorms - std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights - std::vector<float> rms_ffn_weight; // (layer, dim) - // weights for matmuls - std::vector<float> wq; // (layer, dim, dim) - std::vector<float> wk; // (layer, dim, dim) - std::vector<float> wv; // (layer, dim, dim) - std::vector<float> wo; // (layer, dim, dim) - // weights for ffn - std::vector<float> w1; // (layer, hidden_dim, dim) - std::vector<float> w2; // (layer, dim, hidden_dim) - std::vector<float> w3; // (layer, hidden_dim, dim) - // final rmsnorm - std::vector<float> rms_final_weight; // (dim,) - // freq_cis for RoPE relative positional embeddings - // std::vector<float> freq_cis_real; // (seq_len, dim/2) - // std::vector<float> freq_cis_imag; // (seq_len, dim/2) - // (optional) classifier weights for the logits, on the last layer - std::vector<float> wcls; -};
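// Worked sizing example for the multiquery case handled in alloc_weights()
// below: with n_heads = 32 and n_kv_heads = 4 (illustrative numbers, not from
// any particular checkpoint), n_multiqueries = 32 / 4 = 8, so wk and wv hold
// n_layers * dim * (dim / 8) floats while wq and wo stay n_layers * dim * dim.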
-static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) { - const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads; - try { - w->token_embedding_table.resize(p->vocab_size * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); - - w->rms_att_weight.resize(p->n_layers * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); - - w->rms_ffn_weight.resize(p->n_layers * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); - - w->wq.resize(p->n_layers * p->dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - - w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); - - w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); - - w->wo.resize(p->n_layers * p->dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - - w->w1.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - - w->w2.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); - - w->w3.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - - w->rms_final_weight.resize(p->dim); - LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); - - if (shared_weights) { - w->wcls = {}; - } else { - w->wcls.resize(p->vocab_size * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); - } - } - catch (std::length_error &) { - die("Invalid configuration. Failed to allocate memory for weights"); - } -} - -static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) { - if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1; - if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1; - if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1; - if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1; - if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1; - if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1; - if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1; - if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1; - if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1; - if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1; - if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1; - - // Skip freq_cis_real & freq_cis_imag - int head_size = p->dim / p->n_heads; - fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR); - - if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1; - - // Check we didn't forget to read anything - auto curr = ftell(f); - fseek(f, 0, SEEK_END); - auto end = ftell(f); - if (curr != end) { - LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); - return 1; - } - - return 0; -} - -static void print_sample_weights(TransformerWeights *w){ - LOG("----- Quick print of first of the weight values of all the variables\n"); - LOG("%f\n", w->token_embedding_table[0]); - LOG("%f\n", w->rms_att_weight[0]); - LOG("%f\n", w->rms_ffn_weight[0]); - - LOG("%f\n", w->wq[0]); - LOG("%f\n", w->wk[0]); - LOG("%f\n", w->wv[0]); - LOG("%f\n", w->wo[0]); - LOG("%f\n", w->w1[0]); - LOG("%f\n", w->w2[0]); - LOG("%f\n", w->w3[0]); - LOG("%f\n", w->rms_att_weight[0]); - if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); -} -//////////////////////////////////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model. - -struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; - - struct token_data { - token text; - float score; - ttype type; - }; - - std::unordered_map<token, id> token_to_id; - std::vector<token_data> id_to_token; -}; - -struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input?
- uint32_t n_embd = 4096; - uint32_t n_ff = 11008; - uint32_t n_mult = 4; - uint32_t n_head = 32; - uint32_t n_head_kv = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - - bool operator!=(const my_llama_hparams& other) const { - return memcmp(this, &other, sizeof(my_llama_hparams)); - } -}; - -struct my_llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; -}; - -struct my_llama_model { - struct ggml_context * ctx = NULL; - - std::string name; - - my_llama_hparams hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector<my_llama_layer> layers; - - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; -}; - -struct train_params { - const char * fn_vocab_model; - const char * fn_llama2c_model; - const char * fn_llama2c_output_model; - const char * fn_train_data; - const char * fn_checkpoint_in; - const char * fn_checkpoint_out; - const char * fn_model_out; - - uint32_t seed; - - int n_ctx; - int n_embd; - int n_mult; - int n_head; - int n_layer; - int n_rotmax; - - int n_threads; - int n_batch; - int n_examples; - int n_predict; - - int print_info_interval; - int print_details_interval; - - bool samples_start_after_nl; - bool use_adam; - bool use_flash; - bool use_scratch; - - // only adam - int warmup; - int cos_decay_steps; - float cos_decay_restart; - float cos_decay_alpha; - - int lbfgs_n_iter; - int adam_n_iter; - float adam_alpha; - float adam_decay; - - int mem_model_gb; - int mem_compute_gb; - int mem_compute0_gb; - int mem_compute1_gb; -}; - -static void print_params(struct my_llama_hparams * params) { - LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); - LOG("%s: n_ctx:   %u\n", __func__, params->n_ctx); - LOG("%s: n_embd:  %u\n", __func__, params->n_embd); - LOG("%s: n_mult:  %u\n", __func__, params->n_mult); - LOG("%s: n_head:  %u\n", __func__, params->n_head); - LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); - LOG("%s: n_ff:    %u\n", __func__, params->n_ff); - LOG("%s: n_layer: %u\n", __func__, params->n_layer); - LOG("%s: n_rot:   %u\n", __func__, params->n_rot); -} - -static void print_tensor_info(const struct ggml_context * ctx) { - for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - LOG("%s: Allocating ", __func__); - int64_t total = 1; - int i = 0; - for (; i < ggml_n_dims(t); ++i) { - if (i > 0) LOG("x "); - LOG("[%" PRId64 "] ", t->ne[i]); - total *= t->ne[i]; - } - if (i > 1) LOG("= [%" PRId64 "] ", total); - LOG("float space for %s\n", ggml_get_name(t)); - } -} - -static void init_model(struct my_llama_model * model) { - const auto & hparams = model->hparams; - - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_vocab = hparams.n_vocab; - - const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ?
1 : hparams.n_head / hparams.n_head_kv; - - const uint32_t n_ff = hparams.n_ff; - struct ggml_context * ctx = model->ctx; - - model->train_its = 0; - model->train_samples = 0; - model->train_tokens = 0; - - model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - - ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); - ggml_set_name(model->norm, "norm.weight"); - ggml_set_name(model->output, "output.weight"); - - model->layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - - std::string layers_i = "layers." + std::to_string(i); - - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries); - layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries); - layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); - layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - - ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); - - ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); - ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); - ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); - ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); - - ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); - - ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str()); - ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); - ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); - } - - print_tensor_info(ctx); -} - -static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -static void print_row(struct ggml_tensor * probs, int i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = get_f32_2d(probs, k, i); - LOG(" %f", p); - } - LOG("\n"); -} - -static void print_matrix(struct ggml_tensor * probs) { - assert(ggml_is_matrix(probs)); - for (int i = 0; i < probs->ne[1]; ++i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = get_f32_2d(probs, k, i); - LOG(" %.2f", p); - } - LOG("\n"); - } -} - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, 
(__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - die_fmt("fread failed: %s", strerror(errno)); - } - if (ret != 1) { - die("unexpectedly reached end of file"); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - std::float_t read_f32() { - std::float_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector<char> chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -static bool is_ggml_file(const char * filename) { - llama_file file(filename, "rb"); - if (file.size < 4) { - return false; - } - std::string magic = file.read_string(4); - return magic == GGUF_MAGIC; -} - -static std::string llama_escape_whitespaces(const std::string & text) { - std::ostringstream out; - for (char c : text) { - if (c == ' ') out << "\xe2\x96\x81"; - else out << c; - } - return out.str(); -} - -static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { - if (is_ggml_file(filename)) { - LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); - struct ggml_context * ctx_data = NULL; - - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx_data, - }; - - struct gguf_context * ctx = gguf_init_from_file(filename, params); - GGML_ASSERT(ctx != NULL); - - const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL); - GGML_ASSERT(model_idx >= 0); - std::string tokenizer_name = gguf_get_val_str(ctx, model_idx); - GGML_ASSERT(tokenizer_name == TOKENIZER_NAME); - - const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST); - GGML_ASSERT(token_idx >= 0); - - const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES); - GGML_ASSERT(score_idx >= 0); - const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); - - const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE); - GGML_ASSERT(toktype_idx >= 0); - const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); - - const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); - if (n_vocab != static_cast<uint32_t>(config->vocab_size)) { - die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size); - } - - vocab->id_to_token.resize(n_vocab); - - for (uint32_t i = 0; i < n_vocab; i++) { - std::string word = gguf_get_arr_str(ctx, token_idx, i); - - vocab->token_to_id[word] = i; - - auto & token_data = vocab->id_to_token[i]; - token_data.text = std::move(word); - token_data.score = scores[i]; - token_data.type = (llama_token_type) toktypes[i]; - } - ggml_free(ctx_data); - gguf_free(ctx); - } else { - // assume llama2.c vocabulary - LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); - llama_file file(filename, "rb"); - if (!file.fp) { - die_fmt("%s: %s", strerror(errno), filename); - } - const int n_vocab = config->vocab_size; - /* uint32_t max_token_length = */ file.read_u32(); // unused - vocab->id_to_token.resize(n_vocab); - for (llama_vocab::id id=0; id<n_vocab; ++id) { - float_t score = file.read_f32(); - uint32_t len = file.read_u32(); - std::string text = file.read_string(len); - - unsigned char byte_val; - llama_vocab::ttype type; - if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) { - // Text of byte tokens is already in the expected format. - type = LLAMA_TOKEN_TYPE_BYTE; - } else { - type = LLAMA_TOKEN_TYPE_NORMAL; - } - text = llama_escape_whitespaces(text); - - vocab->id_to_token[id].text = text; - vocab->id_to_token[id].score = score; - vocab->id_to_token[id].type = type; - vocab->token_to_id.emplace(text, id); - } - } -}
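// Note: llama_escape_whitespaces() (defined above) maps each space to U+2581
// ("▁", bytes 0xE2 0x96 0x81), following the SentencePiece convention, so a
// token " the" is stored as "▁the" in the vocab that load_vocab() builds.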
-static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { - int size = 1; - for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) { - size *= gg_weights->ne[dim]; - } - for (int ct = 0; ct < size; ++ct) { - int64_t i0 = 0; int64_t i1 = 0; - int64_t i2 = 0; int64_t i3 = 0; - ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3); - ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]); - } -} - -static void save_as_llama_model( - struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename -) { - // convert AK weights into GG weights one by one. - // w->token_embedding_table -> model->tok_embeddings - // float* -> struct ggml_tensor - convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data()); - convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data()); - - convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data()); - //print_row(model->norm, 0); - - // for rms-att-weight - int row_length = model->hparams.n_embd; - int n_ff = model->hparams.n_ff; - - const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv; - - for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ - auto & layer = model->layers[i]; - // 1d - convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); - convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); - - // from 3d matrix layer x dim x dim to 2d matrix dim x dim - convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]); - convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]); - // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries - convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]); - convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]); - - convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]); - convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]); - convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]); - } - - struct gguf_context * ctx = gguf_init_empty(); - - std::vector<const char *> tokens; - std::vector<float> scores; - std::vector<llama_token_type> token_types; - for (const llama_vocab::token_data & token_data : vocab->id_to_token) { - tokens.push_back(token_data.text.c_str()); - scores.push_back(token_data.score); - token_types.push_back(token_data.type); - } - gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size()); - gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size()); - gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size()); - - gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME); - - gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama"); - gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama"); - - // special tokens - gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); - gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID); - gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID); - gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
- gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1); - - gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx); - gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); - gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff); - gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); - gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv); - gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer); - gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot); - gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f); - - // write tensors - ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD); - gguf_add_tensor(ctx, model->tok_embeddings); - - ggml_set_name(model->norm, TN_OUTPUT_NORM); - gguf_add_tensor(ctx, model->norm); - - ggml_set_name(model->output, TN_OUTPUT); - gguf_add_tensor(ctx, model->output); - - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - ggml_format_name(layer.wq, TN_ATTN_Q, i); - gguf_add_tensor(ctx, layer.wq); - - ggml_format_name(layer.wk, TN_ATTN_K, i); - gguf_add_tensor(ctx, layer.wk); - - ggml_format_name(layer.wv, TN_ATTN_V, i); - gguf_add_tensor(ctx, layer.wv); - - ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i); - gguf_add_tensor(ctx, layer.wo); - - ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i); - gguf_add_tensor(ctx, layer.attention_norm); - - ggml_format_name(layer.w1, TN_FFN_GATE, i); - gguf_add_tensor(ctx, layer.w1); - - ggml_format_name(layer.w2, TN_FFN_DOWN, i); - gguf_add_tensor(ctx, layer.w2); - - ggml_format_name(layer.w3, TN_FFN_UP, i); - gguf_add_tensor(ctx, layer.w3); - - ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i); - gguf_add_tensor(ctx, layer.ffn_norm); - } - - gguf_write_to_file(ctx, filename, false); - gguf_free(ctx); -} - -static struct train_params get_default_train_params() { - struct train_params params; - params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; - params.fn_llama2c_output_model = "ak_llama_model.bin"; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.bin"; - params.fn_checkpoint_out = "checkpoint.bin"; - params.fn_model_out = "ggml-checkpoint-f32.bin"; - - params.seed = -1; - - params.n_ctx = 128; - params.n_embd = 256; - params.n_mult = 256; - params.n_head = 8; - params.n_layer = 16; - params.n_rotmax = 64; - - params.n_threads = 6; - params.n_batch = 8; - params.n_examples = 8; - params.n_predict = 1024; - - params.print_info_interval = 1; - params.print_details_interval = 2; - - params.samples_start_after_nl = false; - params.use_adam = true; - params.use_flash = false; - params.use_scratch = true; - - // only adam - params.warmup = 100; - params.cos_decay_steps = 1000; - params.cos_decay_restart = 1.1f; - params.cos_decay_alpha = 0.0f; - - params.lbfgs_n_iter = 16; - params.adam_n_iter = 16; - params.adam_alpha = 1e-3f; - params.adam_decay = 1e-3f; - - params.mem_model_gb = 2; - params.mem_compute_gb = 24; - params.mem_compute0_gb = 8; - params.mem_compute1_gb = 2; - - return params; -} - -static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c
vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model); - fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n"); - fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model); - fprintf(stderr, "\n"); -} - -static bool params_parse(int argc, char ** argv, struct train_params * params) { - bool invalid_param = false; - bool reqd_param_found = false; - std::string arg; - struct train_params default_params = get_default_train_params(); - const std::string arg_prefix = "--"; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (arg == "--copy-vocab-from-model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_vocab_model = argv[i]; - } else if (arg == "--llama2c-model") { - if (++i >= argc) { - invalid_param = true; - break; - } - reqd_param_found = true; - params->fn_llama2c_model = argv[i]; - } else if (arg == "--llama2c-output-model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_llama2c_output_model = argv[i]; - } else if (arg == "-h" || arg == "--help") { - print_usage(argc, argv, &default_params); - exit(0); - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - print_usage(argc, argv, &default_params); - exit(1); - } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - print_usage(argc, argv, &default_params); - exit(1); - } - if (!reqd_param_found){ - fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n"); - print_usage(argc, argv, &default_params); - exit(1); - } - - return true; -} - -static std::string basename(const std::string &path) { - size_t pos = path.find_last_of("/\\"); - if (pos == std::string::npos) { - return path; - } - return path.substr(pos + 1); -} - -int main(int argc, char ** argv) { - struct train_params params = get_default_train_params(); - if (!params_parse(argc, argv, &params)) { - return 1; - } - log_set_target(stdout); - Config config; - TransformerWeights weights = {}; - { - LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); - FILE * file = fopen(params.fn_llama2c_model, "rb"); - if (!file) { - LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); - return 1; - } - // read in the config header - if (fread(&config, sizeof(Config), 1, file) != 1) { - LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); - return 1; - } - auto shared_weights = config.vocab_size > 0; - config.vocab_size = abs(config.vocab_size); - - // read in the Transformer weights - alloc_weights(&weights, &config, shared_weights); - if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { - LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); - return 1; - } - fclose(file); - } - - struct llama_vocab vocab; - load_vocab(params.fn_vocab_model, &config, &vocab); - - struct my_llama_model model; - model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); - model.hparams.n_ctx = params.n_ctx; - model.hparams.n_embd = config.dim; //params.n_embd; - model.hparams.n_ff = config.hidden_dim; - model.hparams.n_mult = 32;//params.n_mult; - model.hparams.n_head = config.n_heads; //params.n_head; - model.hparams.n_head_kv = config.n_kv_heads; - model.hparams.n_layer = config.n_layers; //params.n_layer; - model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
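// Mapping from the llama2.c header to my_llama_hparams, as set above:
// config.dim -> n_embd, config.hidden_dim -> n_ff, config.n_layers -> n_layer,
// config.n_heads -> n_head, config.n_kv_heads -> n_head_kv; n_rot defaults to
// min(n_rotmax, n_embd / n_head), i.e. the full per-head dimension is rotated.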
- - print_params(&model.hparams); - - struct ggml_init_params lcparams; - lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); - lcparams.mem_buffer = NULL; - lcparams.no_alloc = false; - - model.ctx = ggml_init(lcparams); - - init_model(&model); - model.name = basename(params.fn_llama2c_model); - save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); - - LOG("%s: Saving llama2.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); - - ggml_free(model.ctx); - return 0; -} diff --git a/examples/cvector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt deleted file mode 100644 index 0a559d60c..000000000 --- a/examples/cvector-generator/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-cvector-generator) -add_executable(${TARGET} cvector-generator.cpp pca.hpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md deleted file mode 100644 index be4dd5250..000000000 --- a/examples/cvector-generator/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# cvector-generator - -This example demonstrates how to generate a control vector using gguf models. - -Related PRs: -- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970) -- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880) -- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514) - -## Examples - -```sh -# CPU only -./cvector-generator -m ./llama-3.Q4_K_M.gguf - -# With GPU -./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 - -# With advanced options -./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100 - -# Using mean value instead of PCA -./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean - -# To see help message -./cvector-generator -h -# Then, have a look at "cvector" section -``` - -## Tips and tricks - -If you have multiple lines per prompt, you can escape the newline character (change it to `\n`).
For example: - -``` -<|im_start|>system\nAct like a person who is extremely happy.<|im_end|> -<|im_start|>system\nYou are in a very good mood today<|im_end|> -``` - -Example to use output file with `llama-cli`: - -(Tips: The control vector works better when apply to layers higher than 10) - -```sh -./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31 -``` diff --git a/examples/cvector-generator/completions.txt b/examples/cvector-generator/completions.txt deleted file mode 100644 index abc45ffd8..000000000 --- a/examples/cvector-generator/completions.txt +++ /dev/null @@ -1,582 +0,0 @@ - -That game -I can see -Hmm, this -I can relate to -Who is -I understand the -Ugh, -What the hell was -Hey, did anyone -Although -Thank you for choosing -What are you -Oh w -How dare you open -It was my pleasure -I'm hon -I appreciate that you -Are you k -Whoever left this -It's always -Ew, -Hey, I l -Hello? Is someone -I understand that -That poem -Aww, poor -Hey, it -Alright, who -I didn't -Well, life -The document -Oh no, this -I'm concerned -Hello, this is -This art -Hmm, this drink -Hi there! -It seems -Is -Good -I can't -Ex -Who are -I can see that -Wow, -Today is a -Hey friend -Sometimes friends -Oh, this old -The weather outside -This place is sur -I appreciate your input -Thank you for the -Look at -I'm disappoint -To my -How dare you -That's an -This piece of art -Eww -This park is -This is incredible -Oh no, someone -Exc -Well, it' -I warned -Hey, I understand -Hey, I saw -How dare you go -What the he -Hey -It's -Hello? Hello? -It -Oh no! -This is the perfect -Good morning, -Oh no, there -It's so -Yeah -Uh, -Hello everyone -Who turned off -The weather -Who' -Hey, this -Wait, -Eww, gross -Excuse -It seems like you -Thank you so -What happened? -Oh my g -I am deeply sad -I war -Okay, let' -Hey, that -That was a beautiful -Oh no! That -What happened -Hey there -The artist' -What?! -Hey, it' -I am disappoint -It seems like -Oh no! The -This park is a -If you -Yes! I did -It sounds -What -Who is it -Hmm, that -That's strange -Yeah, that was -That's interesting -This park -What the hell -Who is that -I feel like my -Oh well -What the hell is -Hello? Hello -To my dearest -Bless you!\" -Thank you for -Oh, looks like -Can you please -This place is -Eww, what -Bless you -Is everything -Hey, I just -Whoever left these -Well, that' -I feel -Hey, do you -It's sad -Oh no, it -Hey, that' -Oh my god, -Thank you, -Hello little one, -I apolog -Hey team, I -How dare you read -Who is this and -Whoever left -Hi there! W -A -If you have -I was -U -Bless -Well, this -Oh, I' -It's a -Eww, -Is everything okay? -Oh, I -Hello, can you -Al -That was a great -What are -I understand that not -Oh no, not -Who is it?\" -Hey, can we -Whoever is taking -I would love to -Hey, I noticed -Hey, could -I understand that there -Hello? -D -Oh man, I -Thank you so much -Oh no, my -Dear [Name -Uh -I remember -Hey, who -Well, it -Are you -I understand that it -Hey, is -I would -Who is this -Excuse me -Alright -I am thrilled -Sometimes friends have -Who the -It's interesting -I would love -E -Hello? Is anyone -Well, this is -This place -Well, -I warned you -Hey, watch where -Oh my -That' -Sometimes friends have different -I understand that everyone -What? 
-What do these notes -I can relate -I'm not -I understand -To my dear -Guys -Well -Hey, I appreciate -Wow, what -Dear -That melody -Who the hell -Today is -Hello little -Wow, look -That's great -Love is never wrong -I'm having -Whoa, did -Ugh -Can you please provide -I miss you, -I feel uncom -I know -Ugh, this -Hey, watch -Oh great, a -I didn -Okay -That game of char -Oh -I appreciate -Who's there -I am so -Oh great, someone -Hey, could you -I remember wondering -Wait, what? -What do -Hello? Can -Hey there, -That game of -This is incred -Oh my gosh -Oh great, f -I appreciate your -It sounds like -What the heck -Okay, I understand -Ew -I understand that this -Uh, hi -Hi everyone! -What the hell? -Thank you for your -Oh no, the -Wow, I -Who turned -Dear [ -Whoever -This is a -Whoa, he -What in the world -Although the physical -Hello, who is -That's amaz -Hey, I know -Okay, that -Hi everyone -Hey, is everything -I understand your fr -Oh no, poor -Oh, look -Good morning -Ew, gross -Oh no, did -Look at the family -Hey team -Yes! -Hey, can I -Okay, that' -It's great -Love is -Hey, what -Good morning, world -Who is it? -That poem really reson -I -That's -I understand the task -Gu -Hello? Who' -This postcard is -Whoa, -Oh, that -I understand that I -Whoever is -Hello? Who is -I'm really -Wow, this -Can -This artwork really -This is a shame -I miss you too -Who are you? -Today is a difficult -Hey, just -Are you okay -I am -Hi, -Wow, that -Hey there! Can -Okay, stay -Oh great, just -Yeah, -Hello? Can you -Oh, looks -Thank you for sharing -I'm glad -Hey, is that -Hmm -It was my -It sounds like you -Wow, your -I was promised certain -That was such a -Thank -Excuse you -That was -Hey team, -I feel un -It was -What' -Hey friend, I -How -Saying goodbye -That -It's heart -How dare -Oh, -Hello, may -What's this -Thank you for recogn -Aww, that -Oh, I remember -Hmm, that' -I miss -I know this -Wait -Is everything okay -Who is that person -Wow, you -Oh great -I'm sad -Wow, the -I am very disappoint -Who turned off the -I understand that things -I'm very -Hi -That's very -Okay, I -Oh no, -Wow, there -What's wrong -I apologize for -Hey, I -Can I help you -Oh, I didn -Alright, -Oh wow, -Oh my goodness -I know this event -What in the -Saying -Yeah, that -Guys, I -Hey, this v -This post -Are -Hey, can -Hello? Is -I can only imagine -Oh, that sounds -Hey, is anyone -I am disappointed -Hello, -Hey everyone, I -That was such -It's okay -The artist -Whoa -I understand that mistakes -Can I help -Who -Hi everyone! I -Hey, can you -Wow, how -Today -Oh no, I -Oh well, I -Well, that -This is the -Yes! I finally -Hey there little -Hello everyone! -Love is never -Look at the -This postcard -Oh great, -Can I -Hmm, this is -I understand your -Oh, look at -B -I'm so -Whoa, this -W -Oh, this -Sometimes -This piece of -What the -That was a -Hey, do -Oh no -Whoa, what -I feel like I -The documentary -Hello -Hello little one -I understand that my -Eww, that -Wow, an -Yes! Finally, -Although the physical location -Whoever is watching -That movie -I remember wondering about -Hey there, little -Who's -Hello, who -Hello everyone! Thank -Hello, can -That's too -Hey, just wanted -Hey there, I -Saying good -Hey there! -Who is there? -Oh my good -I am very -Oh no, what -Wow, thank -I was promised -Hi, is -Hey, I' -Guys, the -Oh no, that -Who is there -Hello, this -That movie really touched -If you have something -The documentary was -I'm starting -Are you kidd -That movie really -Hey everyone, -Thank you for considering -I didn' -Yes! 
I -Can you -Oh my god -Hey, whoever -That melody really -Thank you, little -Hello, may I -Look -Wow, we -It looks -What do these -Oh wow -I apologize -What are you all -It's such -It's clear -Hey, I was -Hey friend, -I can only -The weather outside is -Eww, this -I miss you -Wow -Aww, -Hi, is there -This artwork -Okay, -Oh well, -This -I' -Say -Hey there little gu -Hmm, -Whoa, who -I am thr -Oh man -Okay, stay calm -I'm happy -Oh, this cur -Oh man, -I'm sorry -Hello? Who -What?! That -This piece -Hey everyone -That's so -Are you okay? -What happened? Where -Hi there -The -Who the hell entered -I can -Guys, -What's -What in -It's important -I'm -I'm coming -It' -Yes! Finally -Wait, what -Wow, reading -I'm surprised -Hey, did -Hey, -Okay, let -I understand that you -Who the hell threw -Eww, who -Thank you for thinking -Who is this?\" -I am deeply -Thank you for including -Oh no, an -It looks like you -Aww -I'm confused -Wow, it -That poem really -Yes -Hey there, is -Hey, what' -Thank you for remember -To -This is -Thank you for making -I can' -That mel -Wow, they -I feel like -Although the -Who are you -Love -If -What the hell are -I am so sad -Oh, I found -Thank you -It looks like -Well, life is -I appreciate that -The artist's -Whoa, that -It's never \ No newline at end of file diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp deleted file mode 100644 index d4e126ac2..000000000 --- a/examples/cvector-generator/cvector-generator.cpp +++ /dev/null @@ -1,503 +0,0 @@ -#include "common.h" -#include "llama.h" -#include "ggml.h" -#include "pca.hpp" -#include "mean.hpp" - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - - -////////////////////////////////////////////////// -// utils - -template -static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { - std::string ret; - for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); - } - - return ret; -} - -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - - printf("\nexample usage:\n"); - printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); - printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); - printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]); - printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]); - printf("\n"); -} - -////////////////////////////////////////////////// - - -// cb_eval is reused for each pair of positive - negative prompt -struct callback_data { - ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered - - int n_layers = 0; - int n_tokens = 0; - bool is_eval_pos = true; - - // each element of the vector correspond to one layer - std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] - std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] - std::vector v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. 
NOTE: n_nonzero_rows may be different for each layer - - // save a tensor into either v_pos or v_neg (decided by is_eval_pos) - void save_tensor_for_layer(struct ggml_tensor * t) { - GGML_ASSERT(t->type == GGML_TYPE_F32); - - if (ctx_ggml == nullptr) { - // alloc a new ctx_ggml if needed - struct ggml_init_params params_ggml = { - /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - ctx_ggml = ggml_init(params_ggml); - } - - // copy tensor data - auto n_bytes = ggml_nbytes(t); - struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]); - t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow - ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes); - ggml_set_name(t_layer, ggml_get_name(t)); - //print_debug_tensor(t_layer); - - if (is_eval_pos) { - v_pos.push_back(t_layer); - } else { - v_neg.push_back(t_layer); - } - } - - // calculate diff (v_pos - v_neg) and place the result back to v_pos - // all zero rows in the diff tensor will also be removed - // NOTE: final layer is ignored. we only have (n_layers - 1) to process - std::vector<struct ggml_tensor *> calc_diff() { - for (size_t il = 0; il < v_pos.size(); il++) { - float * a = (float *) v_pos[il]->data; - float * b = (float *) v_neg[il]->data; - size_t n_elem = ggml_nelements(v_pos[il]); - for (size_t j = 0; j < n_elem; j++) { - a[j] -= b[j]; - } - //print_debug_tensor(v_pos[i]); - auto diff_filtered = filter_nonzero_rows(v_pos[il]); - v_diff_filtered.push_back(diff_filtered); - } - return v_diff_filtered; // for convenience, we return the result std::vector - } - - // delete zero rows from a given 2D tensor - struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) { - //printf("filter_nonzero_rows\n"); - auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool { - // check if the given row contains only zero elements - int n_cols = t->ne[0]; // hint: should be equal to n_embd - for (int col = 0; col < n_cols; ++col) { - if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) { - return false; - } - } - return true; - }; - std::vector<int> rows_to_copy; // the idx of non-zero rows (to be copied into diff_filtered) - for (int i_row = 0; i_row < a->ne[1]; i_row++) { - if (!is_row_all_zeros(a, i_row, 1e-6)) { - rows_to_copy.push_back(i_row); - } - } - - // get "n_nonzero_rows" for the output "diff_filtered" - int n_nonzero_rows = rows_to_copy.size(); - //printf("n_nonzero_rows: %d\n", n_nonzero_rows); - int n_embd = a->ne[0]; - GGML_ASSERT(n_nonzero_rows > 0); - - // diff_filtered: [n_embd, n_nonzero_rows] - struct ggml_tensor * diff_filtered = ggml_new_tensor_2d( - ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows); - ggml_format_name(diff_filtered, "diff_filtered_%s", a->name); - diff_filtered->data = malloc(ggml_nbytes(diff_filtered)); - - // copy non-zero rows - for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) { - int src_row = rows_to_copy[dest_row]; - for (int i = 0; i < n_embd; i++) { - float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0); - ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem); - } - } - - //print_debug_tensor(diff_filtered); - - return diff_filtered; - } - - // we don't implement a destructor, because we want to reuse callback_data.
we just want to free the tensors - void reset() { - for (auto ptr : v_pos) free(ptr->data); - for (auto ptr : v_neg) free(ptr->data); - for (auto ptr : v_diff_filtered) free(ptr->data); - v_pos.clear(); - v_neg.clear(); - v_diff_filtered.clear(); - if (ctx_ggml) { - ggml_free(ctx_ggml); - } - ctx_ggml = nullptr; - } -}; - -/** - * process_ctx is used to store the ggml context for pre-post processing the diff vectors - * in short, input => v_diff and output => v_final - */ -struct train_context { - ggml_context * ctx_ggml; - int n_embd; - int n_layers; - - /* pair of prompts to be used for generating final vector */ - std::vector positive_entries; - std::vector negative_entries; - - // each element of the vector correspond to one layer - // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here - // NOTE (2): v_diff is transposed from v_diff_tmp - std::vector v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows) - std::vector v_final; // vector of vectors of size [n_embd] to be written to file - - // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor - // v_diff_tmp will get converted unto v_diff later on - std::vector> v_diff_tmp; - - train_context(int n_embd_, int n_layers_) { - n_embd = n_embd_; - n_layers = n_layers_; - struct ggml_init_params params_ggml = { - /*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - ctx_ggml = ggml_init(params_ggml); - for (int il = 0; il < n_layers - 1; il++) { - std::vector empty; - v_diff_tmp.push_back(empty); - auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd); - t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible - v_final.push_back(t); - } - } - - // add new rows into existing tensor in v_diff_tmp - void concat_diff_tmp(const std::vector & diff_filtered) { - GGML_ASSERT((int) diff_filtered.size() == n_layers - 1); - for (int il = 0; il < n_layers - 1; il++) { - auto t = diff_filtered[il]; - auto & diff_tmp = v_diff_tmp[il]; - size_t curr_size = diff_tmp.size(); - diff_tmp.resize(curr_size + ggml_nbytes(t)); - memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t)); - } - } - - // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed) - // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method - void build_v_diff(bool transpose) { - printf("build_v_diff\n"); - for (int il = 0; il < n_layers - 1; il++) { - auto & diff_tmp = v_diff_tmp[il]; - int n_elem = diff_tmp.size() / sizeof(float); - GGML_ASSERT(n_elem % n_embd == 0); - int n_rows = n_elem / n_embd; - struct ggml_tensor * diff = transpose - ? 
ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd) - : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows); - ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); - diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible - if (transpose) { - // copy data & transpose - float * arr = (float *) diff_tmp.data(); - for (int ir = 0; ir < n_rows; ++ir) { - for (int ic = 0; ic < n_embd; ++ic) { - float f = arr[ir*n_embd + ic]; - ggml_set_f32_nd(diff, ir, ic, 0, 0, f); - } - } - } else { - // only copy - memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff)); - } - v_diff.push_back(diff); - print_debug_tensor(diff); - // free memory of diff_tmp - diff_tmp.resize(0); - } - } - - ~train_context() { - for (auto ptr : v_final) free(ptr->data); - for (auto ptr : v_diff) free(ptr->data); - // no need to free v_diff_tmp, since we didn't use malloc - ggml_free(ctx_ggml); - } -}; - -struct tokenized_prompt { - std::vector tokens_pos; - std::vector tokens_neg; - size_t max_seq_len; - - tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true); - tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true); - max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); - padding_seq(ctx, tokens_pos, max_seq_len); - padding_seq(ctx, tokens_neg, max_seq_len); - } - - void padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { - // TODO: customize padding token - std::vector pad_tokens = ::llama_tokenize(ctx, " ", false); - llama_token pad_tok = pad_tokens.back(); - while (tokens.size() < len) { - tokens.push_back(pad_tok); - } - } -}; - -////////////////////////////////////////////////// - -template -static std::string to_string(const T & val) { - std::stringstream ss; - ss << val; - return ss.str(); -} - -static std::vector ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) { - std::vector output; - std::ifstream file(path); - if (!file.is_open()) { - fprintf(stderr, "error: unable to open file: %s\n", path.c_str()); - exit(1); - } - std::string line; - while (std::getline(file, line)) { - bool is_skip = skip_empty_lines && line.empty(); - if (!is_skip) { - string_process_escapes(line); - output.push_back(line); - } - } - file.close(); - return output; -} - -////////////////////////////////////////////////// - -static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { - auto * cb_data = (callback_data *) user_data; - static const char * l_out_name = "l_out"; - const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0; - - if (ask) { - return is_l_out; - } - - if (!is_l_out || t->ne[1] != cb_data->n_tokens) { - return true; - } - - // save the tensor to current context - cb_data->save_tensor_for_layer(t); - return true; -} - -static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache_clear(ctx); - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - return true; -} - -static void export_gguf(const std::vector & v_ctrl, const std::string fname, const std::string model_hint) { - struct gguf_context * ctx = gguf_init_empty(); - - const std::string arch = "controlvector"; - gguf_set_val_str(ctx, "general.architecture", arch.c_str()); - gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); - 
gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size()); - - for (size_t i = 0; i < v_ctrl.size(); ++i) { - gguf_add_tensor(ctx, v_ctrl[i]); - print_debug_tensor(v_ctrl[i]); - printf("Added tensor: %s\n", v_ctrl[i]->name); - } - - printf("%s: writing file...\n", __func__); - gguf_write_to_file(ctx, fname.c_str(), false); - printf("%s: wrote file '%s'\n", __func__, fname.c_str()); - gguf_free(ctx); -} - -/** - * Load prompt files and completion file. - * Then format each pair of prompt + completion to make an entry. - */ -static int prepare_entries(gpt_params & params, train_context & ctx_train) { - // load prompts - std::vector positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true); - std::vector negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true); - if (positive_prompts.size() != negative_prompts.size()) { - fprintf(stderr, "number of positive and negative prompts must be equal\n"); - return 1; - } - if (positive_prompts.empty()) { - fprintf(stderr, "must provide at least one prompt pair\n"); - return 1; - } - ctx_train.positive_entries = positive_prompts; - ctx_train.negative_entries = negative_prompts; - return 0; -} - -int main(int argc, char ** argv) { - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); - return 1; - } - - if (params.n_pca_iterations % params.n_pca_batch != 0) { - fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n"); - return 1; - } - - - callback_data cb_data; - - // pass the callback to the backend scheduler - // it will be executed for each node during the graph computation - params.cb_eval = cb_eval; - params.cb_eval_user_data = &cb_data; - params.warmup = false; - - print_build_info(); - llama_backend_init(); - llama_numa_init(params.numa); - - // load the model to get hparams - llama_model * model; - llama_context * ctx; - std::tie(model, ctx) = llama_init_from_gpt_params(params); - - // int n_ctx = llama_n_ctx(ctx); - int n_layers = llama_n_layer(model); - int n_embd = llama_n_embd(model); - // get model hint param (a.k.a model arch name) - char model_hint[128]; - llama_model_meta_val_str(model, "general.architecture", model_hint, 128); - - // init train_context - train_context ctx_train(n_embd, n_layers); - - // load and prepare entries for training - prepare_entries(params, ctx_train); - - // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped - std::vector tokenized_prompts; - size_t n_total_tokens = 0; - for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { - tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]); - n_total_tokens += 2 * t.max_seq_len; - tokenized_prompts.push_back(std::move(t)); - } - - std::cout << "n_total_tokens: " << n_total_tokens << std::endl; - - for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { - bool success = false; - tokenized_prompt t = tokenized_prompts[i]; - cb_data.n_layers = n_layers; - cb_data.n_tokens = t.max_seq_len; - - printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n", - (int) i+1, (int) ctx_train.positive_entries.size(), - tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), - tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), - (int) t.max_seq_len); - - cb_data.is_eval_pos = true; - success = get_hidden_layers(ctx, t.tokens_pos); - if (!success) break; - - cb_data.is_eval_pos = false; - success = 
get_hidden_layers(ctx, t.tokens_neg); - if (!success) break; - - // calculate diff and remove all zero rows - auto v_diff_filtered = cb_data.calc_diff(); - - // save & concat the filtered v_diff to ctx_train - ctx_train.concat_diff_tmp(v_diff_filtered); - - // reset for next iteration - cb_data.reset(); - } - - // done with the model, we can now free it to regain some memory - printf("Done evaluating prompts, unloading model...\n"); - llama_free(ctx); - llama_free_model(model); - - bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; - - // prepare ctx_train for PCA - ctx_train.build_v_diff(use_pca); - - if (use_pca) { - // run PCA - PCA::pca_params pca_params; - pca_params.n_threads = params.n_threads; - pca_params.n_batch = params.n_pca_batch; - pca_params.n_iterations = params.n_pca_iterations; - PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); - } else { - // run mean - mean::run(ctx_train.v_diff, ctx_train.v_final); - } - - // write output vectors to gguf - export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint); - - llama_backend_free(); - - return 0; -} diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp deleted file mode 100644 index 16be5ce3e..000000000 --- a/examples/cvector-generator/mean.hpp +++ /dev/null @@ -1,48 +0,0 @@ -#include "common.h" -#include "llama.h" -#include "ggml.h" - -#include <string> -#include <vector> -#include <math.h> - -namespace mean { - -static void run( - const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples] - const std::vector<struct ggml_tensor *> & v_output) { - printf("%s: Running mean...\n", __func__); - for (size_t il = 0; il < v_input.size(); ++il) { - // prepare output vector - struct ggml_tensor * ctrl_out = v_output[il]; - ggml_format_name(ctrl_out, "direction.%ld", il+1); - - // calculate mean vector - struct ggml_tensor * t_layer = v_input[il]; - GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd - for (int ic = 0; ic < t_layer->ne[0]; ic++) { - float f = 0.0; - for (int ir = 0; ir < t_layer->ne[1]; ir++) { - f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0); - } - f /= t_layer->ne[1]; - ggml_set_f32_1d(ctrl_out, ic, f); - } - - // normalize output vector - float norm = 0.0; - for (int i = 0; i < ggml_nelements(ctrl_out); i++) { - float f = ggml_get_f32_1d(ctrl_out, i); - norm += f*f; - } - norm = sqrt(norm); - for (int i = 0; i < ggml_nelements(ctrl_out); i++) { - float f = ggml_get_f32_1d(ctrl_out, i); - ggml_set_f32_1d(ctrl_out, i, f / norm); - } - - printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size()); - } -} - -} diff --git a/examples/cvector-generator/negative.txt b/examples/cvector-generator/negative.txt deleted file mode 100644 index 45b9384b3..000000000 --- a/examples/cvector-generator/negative.txt +++ /dev/null @@ -1,4 +0,0 @@ -<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest -<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow -<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away!
There's a deep, aching emptiness inside me -<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow \ No newline at end of file diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp deleted file mode 100644 index 6ec3141af..000000000 --- a/examples/cvector-generator/pca.hpp +++ /dev/null @@ -1,325 +0,0 @@ -#include "common.h" -#include "llama.h" -#include "ggml.h" - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_POS 5 - -static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) { - printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]); - if (!with_data) return; - printf("%s: %s[0] = [", __func__, t->name); - for (size_t i = 0; i <= DEBUG_POS; i++) { - printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0)); - } - printf(" ... ]\n"); -} - -namespace PCA { - -// input params for PCA computations -struct pca_params { - int n_threads = 1; - int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used - int n_iterations = 1000; - float tolerance = 1e-7; - - // for debugging - int i_layer = 0; - int n_layers = 0; -}; - -// result from each iteration -struct pca_result { - struct ggml_tensor * calculated_square = NULL; - std::vector eigenvectors; - std::vector distances; -}; - -struct pca_model { - ggml_backend_t backend = NULL; - ggml_backend_buffer_t buffer; - struct ggml_context * ctx; // context to compute graph on target device - struct ggml_context * ctx_host; // host context to store results - - // tensors on target device - struct ggml_tensor * dev_input; - struct ggml_tensor * dev_square; - struct ggml_tensor * dev_eigenvector; - - pca_model(struct ggml_tensor * t_input) { -#ifdef GGML_USE_CUDA - fprintf(stderr, "%s: using CUDA backend\n", __func__); - backend = ggml_backend_cuda_init(0); // init device 0 - if (!backend) { - fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); - } -#endif - -// TODO: enable Metal support when support for GGML_OP_SQRT is added -// #ifdef GGML_USE_METAL -// fprintf(stderr, "%s: using Metal backend\n", __func__); -// backend = ggml_backend_metal_init(); -// if (!backend) { -// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); -// } -// #endif - - // if there aren't GPU Backends fallback to CPU backend - if (!backend) { - backend = ggml_backend_cpu_init(); - } - - const int num_tensors = 4; - struct ggml_init_params params { - /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - ctx = ggml_init(params); - - auto n_samples = t_input->ne[0]; - auto n_embd = t_input->ne[1]; - - dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd); - dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - ggml_set_name(dev_input, "dev_input"); - ggml_set_name(dev_square, "dev_square"); - ggml_set_name(dev_eigenvector, "dev_eigenvector"); - buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input)); - - // initialize eigenvector to 
random normalized vector - { - std::vector random_vec(ggml_nelements(dev_eigenvector), 0.0); - std::default_random_engine generator(static_cast(std::time(0))); - std::uniform_real_distribution distribution(0.0, 1.0); - float sum_sqr = 0.0; // for normalizing random_vec - for (size_t i = 0; i < random_vec.size(); ++i) { - float f = distribution(generator); - sum_sqr += f * f; - random_vec[i] = f; - } - // normalize it - float random_vec_norm = std::sqrt(sum_sqr); - for (size_t i = 0; i < random_vec.size(); ++i) { - random_vec[i] /= random_vec_norm; - } - ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector)); - } - } - - ~pca_model() { - ggml_free(ctx); - ggml_backend_buffer_free(buffer); - ggml_backend_free(backend); - } -}; - -static struct ggml_cgraph * build_graph_piter( - const struct pca_params & params, - const pca_model & model, - bool calc_square = false) { - GGML_ASSERT(params.n_batch > 0); - // TODO: buf_size must be able to scale with params.n_batch - static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); - static std::vector buf(buf_size); - - struct ggml_init_params params0 = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf.data(), - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - // create a temporally context to build the graph - struct ggml_context * ctx0 = ggml_init(params0); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - // turn v_diff_original into square matrix if needed - struct ggml_tensor * tmp_square; - if (calc_square) { - tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input); - ggml_set_name(tmp_square, "tmp_square"); - } - - struct ggml_tensor * b_tensor; - struct ggml_tensor * distance; - struct ggml_tensor * old_eigen = model.dev_eigenvector; - struct ggml_tensor * input_square = calc_square ? 
tmp_square : model.dev_square; - - for (int i = 0; i < params.n_batch; ++i) { - // b_tensor = square * eigenvector^T - b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen); - ggml_set_name(b_tensor, "b_tensor"); - - // normalize - b_tensor = ggml_div_inplace(ctx0, - b_tensor, - ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor))) - ); - ggml_format_name(b_tensor, "b_tensor_norm_%d", i); - - // calculate distance(new eigenvector - old eigenvector) - // we don't use ggml_sub because it may not be implemented on GPU backend - struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1)); - distance = ggml_sqrt_inplace(ctx0, - ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old))); - ggml_format_name(distance, "distance_%d", i); - - old_eigen = b_tensor; - - // build operations nodes - ggml_build_forward_expand(gf, distance); - } - - // delete the temporally context used to build the graph - ggml_free(ctx0); - return gf; -} - -static ggml_status compute_piter( - const struct pca_params & params, - const pca_model & model, - struct ggml_cgraph * gf, - ggml_gallocr_t allocr, - struct pca_result & result) { - // allocate tensors - ggml_gallocr_alloc_graph(allocr, gf); - - if (ggml_backend_is_cpu(model.backend)) { - ggml_backend_cpu_set_n_threads(model.backend, params.n_threads); - } - -// TODO: enable GPU support when support for GGML_OP_SQRT is added -//#ifdef GGML_USE_METAL -// if (ggml_backend_is_metal(model.backend)) { -// ggml_backend_metal_set_n_cb(model.backend, params.n_threads); -// } -//#endif - - ggml_status res = ggml_backend_graph_compute(model.backend, gf); - if (res == GGML_STATUS_SUCCESS) { - auto extract_i = [](std::string prefix, std::string str) -> int { - int i = -1; - if (str.rfind(prefix, 0) == 0) { - sscanf(str.c_str(), (prefix + "%d").c_str(), &i); - } - return i; - }; - result.calculated_square = NULL; - result.eigenvectors.clear(); - result.distances.clear(); - result.eigenvectors.resize(params.n_batch); - result.distances.resize(params.n_batch); - // get output nodes - for (int i = 0; i < gf->n_nodes; ++i) { - auto node = gf->nodes[i]; - int iter = -1; - // find b_tensor (without copying data from device) - if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { - result.eigenvectors[iter] = node; - } - // find distances, then copy data from device - if ((iter = extract_i("distance_", node->name)) > -1) { - float d; - ggml_backend_tensor_get(node, &d, 0, sizeof(float)); - result.distances[iter] = d; - // std::cout << node->name << " = " << d << "\n"; - } - // find tmp_square if it exists (without copying data from device) - if (std::string(node->name) == "tmp_square") { - result.calculated_square = node; - } - } - } - return res; -} - -static void power_iteration( - const struct pca_params & params, - struct ggml_tensor * input, // shape of input: [n_samples, n_embd] - struct ggml_tensor * output) { - //printf("in power iteration\n"); - struct pca_model model(input); - - ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - struct pca_result result; - struct ggml_tensor * last_eigenvector = NULL; - - int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations - for (int iter = 0; iter < n_iters; ++iter) { - bool calc_square = (iter == 0); // only need to calculate square for first iteration - struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square); - // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot"); - compute_piter(params, 
model, gf, allocr, result); - - for (size_t k = 0; k < result.distances.size(); ++k) { - last_eigenvector = result.eigenvectors[k]; - if (result.distances[k] < params.tolerance) { - break; // done - } - } - - if (calc_square) { - // copy and store the square matrix if needed - GGML_ASSERT(result.calculated_square != NULL); - ggml_backend_tensor_copy(result.calculated_square, model.dev_square); - } - - { - // copy last eigen vector and store as input for next iteration - GGML_ASSERT(last_eigenvector != NULL); - ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector); - } - - printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n", - __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch); - } - - // get output tensor - GGML_ASSERT(last_eigenvector); - ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector)); - //print_debug_tensor(output); - ggml_gallocr_free(allocr); - - // TODO @ngxson : The output vector is randomly inverted - // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171 -} - -static void run_pca( - struct pca_params & params, - const std::vector & v_input, // shape of v_input[0]: [n_samples, n_embd] - const std::vector & v_output) { - printf("%s: Running PCA...\n", __func__); - for (size_t il = 0; il < v_input.size(); ++il) { - - // prepare output vector - struct ggml_tensor * ctrl_out = v_output[il]; - ggml_format_name(ctrl_out, "direction.%ld", il+1); - - // run power_iteration - params.i_layer = il; - params.n_layers = v_input.size(); - power_iteration(params, v_input[il], ctrl_out); - printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size()); - } -} - -} diff --git a/examples/cvector-generator/positive.txt b/examples/cvector-generator/positive.txt deleted file mode 100644 index fea736225..000000000 --- a/examples/cvector-generator/positive.txt +++ /dev/null @@ -1,4 +0,0 @@ -<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world -<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever! -<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you -<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now! \ No newline at end of file diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md index 629bdb5bc..11096f767 100644 --- a/examples/deprecation-warning/README.md +++ b/examples/deprecation-warning/README.md @@ -23,9 +23,6 @@ Please update all scripts and workflows to use the new binary names. 
| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml | | eval-callback | llama-eval-callback | | gbnf-validator | llama-gbnf-validator | -| gguf | llama-gguf | -| gguf-split | llama-gguf-split | -| gritlm | llama-gritlm | | imatrix | llama-imatrix | | infill | llama-infill | | llava-cli | llama-llava-cli | @@ -35,10 +32,7 @@ Please update all scripts and workflows to use the new binary names. | lookup-merge | llama-lookup-merge | | lookup-stats | llama-lookup-stats | | parallel | llama-parallel | -| passkey | llama-passkey | -| perplexity | llama-perplexity | -| q8dot | llama-q8dot | -| quantize-stats | llama-quantize-stats | +| q8dot | llama-q8dot | | retrieval | llama-retrieval | | save-load-state | llama-save-load-state | | simple | llama-simple | diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt deleted file mode 100644 index 8256e789a..000000000 --- a/examples/embedding/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-embedding) -add_executable(${TARGET} embedding.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/embedding/README.md b/examples/embedding/README.md deleted file mode 100644 index e3705b454..000000000 --- a/examples/embedding/README.md +++ /dev/null @@ -1,60 +0,0 @@ -# llama.cpp/examples/embedding - -This example demonstrates how to generate a high-dimensional embedding vector for a given text with llama.cpp. - -## Quick Start - -To get started right away, run the following command, making sure to use the correct path for the model you have: - -### Unix-based systems (Linux, macOS, etc.): - -```bash -./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null -``` - -### Windows: - -```powershell -llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null -``` - -The above command will output space-separated float values.
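To make the expected output concrete, here is a small consumer sketch (an editorial addition, not part of the original example): it assumes two embeddings of equal length arrive on stdin as space-separated floats, one per line, and prints their cosine similarity.

```cpp
// cosine.cpp - minimal sketch of a downstream consumer for the
// space-separated output above (hypothetical helper, not shipped with llama.cpp)
#include <cmath>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// parse one line of space-separated floats into a vector
static std::vector<float> read_vector(std::istream & in) {
    std::string line;
    std::getline(in, line);
    std::istringstream ss(line);
    std::vector<float> v;
    float x;
    while (ss >> x) {
        v.push_back(x);
    }
    return v;
}

int main() {
    const std::vector<float> a = read_vector(std::cin);
    const std::vector<float> b = read_vector(std::cin);
    if (a.empty() || a.size() != b.size()) {
        std::cerr << "expected two embeddings of equal, non-zero length\n";
        return 1;
    }
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (size_t i = 0; i < a.size(); i++) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    // with --embd-normalize 2 (the default) the vectors are already unit
    // length, so the dot product alone would be the cosine similarity
    std::cout << dot / (std::sqrt(na) * std::sqrt(nb)) << "\n";
    return 0;
}
```

For example, the outputs of two `llama-embedding` runs could be concatenated into a two-line file and piped into this program.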
- -## extra parameters -### --embd-normalize $integer$ -| $integer$ | description | formula | -|-----------|---------------------|---------| -| $-1$ | none | -| $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$ -| $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$ -| $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$ -| $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$ - -### --embd-output-format $'string'$ -| $'string'$ | description | | -|------------|------------------------------|--| -| '' | same as before | (default) -| 'array' | single embeddings | $[[x_1,...,x_n]]$ -| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$ -| 'json' | openai style | -| 'json+' | add cosine similarity matrix | - -### --embd-separator $"string"$ -| $"string"$ | | -|--------------|-| -| "\n" | (default) -| "<#embSep#>" | for example -| "<#sep#>" | another example - -## examples -### Unix-based systems (Linux, macOS, etc.): - -```bash -./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null -``` - -### Windows: - -```powershell -llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null -``` diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp deleted file mode 100644 index 1466e5b2b..000000000 --- a/examples/embedding/embedding.cpp +++ /dev/null @@ -1,268 +0,0 @@ -#include "common.h" -#include "llama.h" - -#include <ctime> - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") { - std::vector<std::string> lines; - size_t start = 0; - size_t end = s.find(separator); - - while (end != std::string::npos) { - lines.push_back(s.substr(start, end - start)); - start = end + separator.length(); - end = s.find(separator, start); - } - - lines.push_back(s.substr(start)); // Add the last part - - return lines; -} - -static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) { - size_t n_tokens = tokens.size(); - for (size_t i = 0; i < n_tokens; i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, true); - } -} - -static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { - // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); - - // run model - fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); - if (llama_decode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to decode\n", __func__); - } - - for (int i = 0; i < batch.n_tokens; i++) { - if (!batch.logits[i]) { - continue; - } - - // try to get sequence embeddings - supported only when pooling_type is not NONE - const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); - - float * out = output + batch.seq_id[i][0] * n_embd; - llama_embd_normalize(embd, out, n_embd, embd_norm); - } -} - -int main(int argc, char ** argv) { - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); - return 1; - } - - params.embedding = true; - // For non-causal
models, batch size must be equal to ubatch size - params.n_ubatch = params.n_batch; - - print_build_info(); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - - llama_backend_init(); - llama_numa_init(params.numa); - - llama_model * model; - llama_context * ctx; - - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return 1; - } - - const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); - - const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); - return 1; - } - - if (n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); - } - - // split the prompt into lines - std::vector prompts = split_lines(params.prompt, params.embd_sep); - - // max batch size - const uint64_t n_batch = params.n_batch; - GGML_ASSERT(params.n_batch >= params.n_ctx); - - // tokenize the prompts and trim - std::vector> inputs; - for (const auto & prompt : prompts) { - auto inp = ::llama_tokenize(ctx, prompt, true, false); - if (inp.size() > n_batch) { - fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", - __func__, (long long int) inp.size(), (long long int) n_batch); - return 1; - } - inputs.push_back(inp); - } - - // check if the last token is SEP - // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' - for (auto & inp : inputs) { - if (inp.empty() || inp.back() != llama_token_sep(model)) { - fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__); - fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); - } - } - - // tokenization stats - if (params.verbose_prompt) { - for (int i = 0; i < (int) inputs.size(); i++) { - fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); - for (int j = 0; j < (int) inputs[i].size(); j++) { - fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); - } - fprintf(stderr, "\n\n"); - } - } - - // initialize batch - const int n_prompts = prompts.size(); - struct llama_batch batch = llama_batch_init(n_batch, 0, 1); - - // allocate output - const int n_embd = llama_n_embd(model); - std::vector embeddings(n_prompts * n_embd, 0); - float * emb = embeddings.data(); - - // break into batches - int p = 0; // number of prompts processed already - int s = 0; // number of prompts in current batch - for (int k = 0; k < n_prompts; k++) { - // clamp to n_batch tokens - auto & inp = inputs[k]; - - const uint64_t n_toks = inp.size(); - - // encode if at capacity - if (batch.n_tokens + n_toks > n_batch) { - float * out = emb + p * n_embd; - batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); - llama_batch_clear(batch); - p += s; - s = 0; - } - - // add to batch - batch_add_seq(batch, inp, s); - s += 
1; - } - - // final batch - float * out = emb + p * n_embd; - batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); - - if (params.embd_out.empty()) { - // print the first part of the embeddings or for a single prompt, the full embedding - fprintf(stdout, "\n"); - for (int j = 0; j < n_prompts; j++) { - fprintf(stdout, "embedding %d: ", j); - for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { - if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); - } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); - } - } - fprintf(stdout, "\n"); - } - - // print cosine similarity matrix - if (n_prompts > 1) { - fprintf(stdout, "\n"); - printf("cosine similarity matrix:\n\n"); - for (int i = 0; i < n_prompts; i++) { - fprintf(stdout, "%6.6s ", prompts[i].c_str()); - } - fprintf(stdout, "\n"); - for (int i = 0; i < n_prompts; i++) { - for (int j = 0; j < n_prompts; j++) { - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f ", sim); - } - fprintf(stdout, "%1.10s", prompts[i].c_str()); - fprintf(stdout, "\n"); - } - } - } - - if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") { - const bool notArray = params.embd_out != "array"; - - fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); - for (int j = 0;;) { // at least one iteration (one prompt) - if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); - fprintf(stdout, "["); - for (int i = 0;;) { // at least one iteration (n_embd > 0) - fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); - i++; - if (i < n_embd) fprintf(stdout, ","); else break; - } - fprintf(stdout, notArray ? "]\n }" : "]"); - j++; - if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break; - } - fprintf(stdout, notArray ? 
"\n ]" : "]\n"); - - if (params.embd_out == "json+" && n_prompts > 1) { - fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); - for (int i = 0;;) { // at least two iteration (n_prompts > 1) - fprintf(stdout, " ["); - for (int j = 0;;) { // at least two iteration (n_prompts > 1) - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f", sim); - j++; - if (j < n_prompts) fprintf(stdout, ", "); else break; - } - fprintf(stdout, " ]"); - i++; - if (i < n_prompts) fprintf(stdout, ",\n"); else break; - } - fprintf(stdout, "\n ]"); - } - - if (notArray) fprintf(stdout, "\n}\n"); - } - - // clean up - llama_print_timings(ctx); - llama_batch_free(batch); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); - - return 0; -} diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt deleted file mode 100644 index a48753d38..000000000 --- a/examples/eval-callback/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -set(TARGET llama-eval-callback) -add_executable(${TARGET} eval-callback.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) - -set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) -set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md deleted file mode 100644 index 63a57ad6b..000000000 --- a/examples/eval-callback/README.md +++ /dev/null @@ -1,95 +0,0 @@ -# llama.cpp/examples/eval-callback - -A simple example which demonstrates how to use callback during the inference. -It simply prints to the console all operations and tensor data. - -Usage: - -```shell -llama-eval-callback \ - --hf-repo ggml-org/models \ - --hf-file phi-2/ggml-model-q4_0.gguf \ - --model phi-2-q4_0.gguf \ - --prompt hello \ - --seed 42 \ - -ngl 33 -``` - -Will print: - -```shell -llm_load_tensors: offloaded 33/33 layers to GPU -... -llama_new_context_with_model: n_ctx = 512 -... 
-llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB -llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB -llama_new_context_with_model: graph nodes = 1225 -llama_new_context_with_model: graph splits = 2 -ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1} - [ - [ - [ -0.0181, 0.0272, 0.0272, ...], - ], - ] -ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1} - [ - [ - [ -0.6989, 1.0636, 1.0636, ...], - ], - ] -ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1} - [ - [ - [ -0.1800, 0.2817, 0.2632, ...], - ], - ] -ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1} - [ - [ - [ -0.1863, 0.2970, 0.2604, ...], - ], - ] -ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1} - [ - [ - [ -1.1238, 1.2876, -1.8086, ...], - ], - ] -ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1} - [ - [ - [ -1.1135, 1.4604, -1.9226, ...], - ], - ] -ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1} - [ - [ - [ -1.1135, 1.4604, -1.9226, ...], - ], - ] -ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1} - [ - [ - [ -1.1135, 1.4604, -1.9226, ...], - ], - ] -ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1} - [ - [ - [ -1.1135, 1.4604, -1.9226, ...], - [ -0.3608, 0.5076, -1.8866, ...], - [ 1.7643, 0.0273, -2.1065, ...], - ... - ], - ] -ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1} - [ - [ - [ -1.1135, 1.4604, -1.9226, ...], - [ -0.3608, 0.5076, -1.8866, ...], - [ 1.7643, 0.0273, -2.1065, ...], - ... - ], - ] -``` diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp deleted file mode 100644 index c8a3016a4..000000000 --- a/examples/eval-callback/eval-callback.cpp +++ /dev/null @@ -1,193 +0,0 @@ -#include "common.h" -#include "llama.h" -#include "ggml.h" - -#include -#include -#include -#include -#include - -/** - * This the arbitrary data which will be passed to each callback. - * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. 
- */ -struct callback_data { - std::vector data; -}; - -static std::string ggml_ne_string(const ggml_tensor * t) { - std::string str; - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - str += std::to_string(t->ne[i]); - if (i + 1 < GGML_MAX_DIMS) { - str += ", "; - } - } - return str; -} - -static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { - GGML_ASSERT(n > 0); - float sum = 0; - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - printf(" [\n"); - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - if (i2 == n && ne[2] > 2*n) { - printf(" ..., \n"); - i2 = ne[2] - n; - } - printf(" [\n"); - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - if (i1 == n && ne[1] > 2*n) { - printf(" ..., \n"); - i1 = ne[1] - n; - } - printf(" ["); - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - if (i0 == n && ne[0] > 2*n) { - printf("..., "); - i0 = ne[0] - n; - } - size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; - float v; - if (type == GGML_TYPE_F16) { - v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); - } else if (type == GGML_TYPE_F32) { - v = *(float *) &data[i]; - } else if (type == GGML_TYPE_I32) { - v = (float) *(int32_t *) &data[i]; - } else if (type == GGML_TYPE_I16) { - v = (float) *(int16_t *) &data[i]; - } else if (type == GGML_TYPE_I8) { - v = (float) *(int8_t *) &data[i]; - } else { - GGML_ASSERT(false); - } - printf("%12.4f", v); - sum += v; - if (i0 < ne[0] - 1) printf(", "); - } - printf("],\n"); - } - printf(" ],\n"); - } - printf(" ]\n"); - printf(" sum = %f\n", sum); - } -} - -/** - * GGML operations callback during the graph execution. - * - * @param t current tensor - * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor - * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. - * see ggml_backend_sched_eval_callback - * @param user_data user data to pass at each call back - * @return true to receive data or continue the graph, false otherwise - */ -static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { - auto * cb_data = (callback_data *) user_data; - - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - - if (ask) { - return true; // Always retrieve data - } - - char src1_str[128] = {0}; - if (src1) { - snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); - } - - printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); - - - // copy the data from the GPU memory if needed - const bool is_host = ggml_backend_buffer_is_host(t->buffer); - - if (!is_host) { - auto n_bytes = ggml_nbytes(t); - cb_data->data.resize(n_bytes); - ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); - } - - if (!ggml_is_quantized(t->type)) { - uint8_t * data = is_host ? 
(uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor(data, t->type, t->ne, t->nb, 3); - } - - return true; -} - -static bool run(llama_context * ctx, const gpt_params & params) { - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - - std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); - - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - - return true; -} - -int main(int argc, char ** argv) { - callback_data cb_data; - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); - return 1; - } - - print_build_info(); - - std::mt19937 rng(params.seed); - - llama_backend_init(); - llama_numa_init(params.numa); - - // pass the callback to the backend scheduler - // it will be executed for each node during the graph computation - params.cb_eval = ggml_debug; - params.cb_eval_user_data = &cb_data; - params.warmup = false; - - // init - llama_model * model; - llama_context * ctx; - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); - } - - bool OK = run(ctx, params); - if (!OK) { - return 1; - } - - llama_print_timings(ctx); - - llama_free(ctx); - llama_free_model(model); - - llama_backend_free(); - - return 0; -} diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt deleted file mode 100644 index 1cef6e716..000000000 --- a/examples/export-lora/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-export-lora) -add_executable(${TARGET} export-lora.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md deleted file mode 100644 index 91c33c34a..000000000 --- a/examples/export-lora/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# export-lora - -Apply LORA adapters to base model and export the resulting model. 
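As context for the usage block below and for the merge code deleted further down: the merged weights are computed, roughly, as

$$W' = W + \sum_i s_i \, \frac{\alpha_i}{r_i} \, B_i A_i$$

where, for each adapter $i$, $s_i$ is the user scale (1.0 for `--lora`, the given `S` for `--lora-scaled`), $\alpha_i$ is read from the adapter's GGUF metadata key `adapter.lora.alpha`, $r_i$ is the LoRA rank, and $A_i$, $B_i$ are the low-rank factor tensors (`.lora_a` / `.lora_b`). This summary is an editorial gloss of the deleted `merge_tensor` code, not text from the original README.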
- -``` -usage: llama-export-lora [options] - -options: - -m, --model model path from which to load base model (default '') - --lora FNAME path to LoRA adapter (can be repeated to use multiple adapters) - --lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters) - -t, --threads N number of threads to use during computation (default: 4) - -o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf') -``` - -For example: - -```bash -./bin/llama-export-lora \ - -m open-llama-3b-v2-q8_0.gguf \ - -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ - --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf -``` - -Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: - -```bash -./bin/llama-export-lora \ - -m your_base_model.gguf \ - -o your_merged_model.gguf \ - --lora-scaled lora_task_A.gguf 0.5 \ - --lora-scaled lora_task_B.gguf 0.5 -``` diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp deleted file mode 100644 index 150f7e8d5..000000000 --- a/examples/export-lora/export-lora.cpp +++ /dev/null @@ -1,420 +0,0 @@ -#include "common.h" -#include "ggml.h" -#include "ggml-alloc.h" - -#include -#include -#include -#include -#include - -static bool g_verbose = false; - -static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ - int id = gguf_find_key(ctx_gguf, key.c_str()); - return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); -} - -static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) { - int id = gguf_find_key(ctx_gguf, key.c_str()); - return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id); -} - -static void zeros(std::ofstream & file, size_t n) { - char zero = 0; - for (size_t i = 0; i < n; ++i) { - file.write(&zero, 1); - } -} - -static std::string ggml_ne_string(const ggml_tensor * t) { - std::string str; - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - str += std::to_string(t->ne[i]); - if (i + 1 < GGML_MAX_DIMS) { - str += ", "; - } - } - return str; -} - -static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) { - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ ctx_ggml, - }; - struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params); - if (!ctx_gguf) { - throw std::runtime_error("failed to load input GGUF from " + fname); - } - return ctx_gguf; -} - -static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - std::string result; - for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; - } - s = std::move(result); -} - -struct file_input { - struct ggml_context * ctx_meta = nullptr; - struct gguf_context * ctx_gguf = nullptr; - std::ifstream f_in; - std::map tensors; - float alpha; - float scale; - - file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) { - if (!f_in.is_open()) { - throw std::runtime_error("failed to open input gguf from " + fname); - } - - ctx_gguf = load_gguf(fname, &ctx_meta); - alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha"); - printf("%s: loaded gguf from %s\n", __func__, fname.c_str()); - - for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = 
ggml_get_next_tensor(ctx_meta, cur)) { - std::string name(cur->name); - tensors[name] = cur; - if (g_verbose) { - printf("%s: %s\n", __func__, cur->name); - } - } - } - - ggml_tensor * get_tensor(std::string name) { - if (tensors.find(name) == tensors.end()) { - return nullptr; - } - return tensors[name]; - } - - void read_tensor_data(std::string name, std::vector & buf) { - if (tensors.find(name) == tensors.end()) { - throw std::runtime_error("cannot find tensor with name: " + name); - } - auto len = ggml_nbytes(tensors[name]); - if (buf.size() < len) { - buf.resize(len); - } - auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file - auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); - f_in.seekg(offset); - f_in.read((char* )buf.data(), len); - } - - ~file_input() { - gguf_free(ctx_gguf); - ggml_free(ctx_meta); - } -}; - -struct lora_merge_ctx { - // input base model + adapters - file_input base_model; - std::vector> adapters; - - // for computing merged tensor - int n_threads; - ggml_backend_t backend = nullptr; - ggml_gallocr_t allocr = nullptr; - std::vector read_buf; - - // output file - struct gguf_context * ctx_out; - struct ggml_context * ctx_out_ggml; - std::ofstream fout; - - lora_merge_ctx( - std::string & base_fname, - std::vector> & lora_files, - std::string & outfile, - int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { - fout.exceptions(std::ofstream::failbit); // fail fast on write errors - - if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) { - throw std::runtime_error("split model is not yet supported"); - } - - for (auto lora_inp : lora_files) { - auto fname = std::get<0>(lora_inp); - auto scale = std::get<1>(lora_inp); - std::unique_ptr adapter(new file_input(fname, scale)); - check_metadata_lora(adapter.get()); - adapters.push_back(std::move(adapter)); - } - - ctx_out = gguf_init_empty(); - struct ggml_init_params params = { - /*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - ctx_out_ggml = ggml_init(params); - backend = ggml_backend_cpu_init(); - allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); - } - - void check_metadata_lora(file_input * adapter) { - auto general_type = get_kv_str(adapter->ctx_gguf, "general.type"); - if (general_type != "adapter") { - throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); - } - - auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type"); - if (adapter_type != "lora") { - throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); - } - - auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture"); - auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture"); - if (general_arch_base != general_arch_lora) { - throw std::runtime_error("model arch and LoRA arch mismatch"); - } - } - - ggml_type get_out_tensor_type(struct ggml_tensor * t) { - if (t->type == GGML_TYPE_F32) { - return GGML_TYPE_F32; - } else { - return GGML_TYPE_F16; - } - } - - void run_merge() { - // prepare metadata - gguf_set_kv(ctx_out, base_model.ctx_gguf); - // output is forced to f16 for now - gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16); - - // check if all lora adapters have the same tensors - // TODO: remove this when we can support merging subset of adapters. 
Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777 - static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once."; - if (adapters.size() > 1) { - for (size_t i = 1; i < adapters.size(); ++i) { - if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) { - throw std::runtime_error(err_no_subset_adapter); - } - for (auto & it : adapters[i]->tensors) { - if (adapters[0]->get_tensor(it.first) == nullptr) { - throw std::runtime_error(err_no_subset_adapter); - } - } - } - } - - // mapping base tensor to out tensor (same shape with base, but different type) - // if out_tensor == nullptr, we only copy it - std::vector> base_to_out_tensors; - for (auto & it : base_model.tensors) { - bool t_a = true; - bool t_b = true; - for (auto & adapter : adapters) { - t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a"); - t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b"); - } - auto base_tensor = it.second; - if (!t_a && !t_b) { - // only copy - struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); - ggml_set_name(cpy_tensor, base_tensor->name); - base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr)); - gguf_add_tensor(ctx_out, cpy_tensor); - } else if (t_a && t_b) { - // need merging - struct ggml_tensor * out_tensor = ggml_new_tensor( - ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); - ggml_set_name(out_tensor, base_tensor->name); - base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor)); - gguf_add_tensor(ctx_out, out_tensor); - } else { - throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); - } - } - - // placeholder for the meta data - { - size_t meta_size = gguf_get_meta_size(ctx_out); - zeros(fout, meta_size); - } - - // process base model tensors - size_t n_merged = 0; - for (auto & it : base_to_out_tensors) { - if (it.second != nullptr) { - merge_tensor(it.first, it.second); - n_merged++; - } else { - copy_tensor(it.first); - } - } - - // write output metadata - { - std::vector data(gguf_get_meta_size(ctx_out)); - gguf_get_meta_data(ctx_out, data.data()); - fout.seekp(0); - fout.write((const char *)data.data(), data.size()); - } - - printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); - printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size()); - } - - void copy_tensor(struct ggml_tensor * base) { - printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); - size_t len = ggml_nbytes(base); - base_model.read_tensor_data(base->name, read_buf); - fout.write((char* )read_buf.data(), len); - zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); - } - - void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) { - std::string name_base(base->name); - std::string name_lora_a = name_base + ".lora_a"; - std::string name_lora_b = name_base + ".lora_b"; - - printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); - - // context for input tensor - std::vector inp_a(adapters.size()); - std::vector inp_b(adapters.size()); - struct ggml_init_params params { - /*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - struct ggml_context * ctx = ggml_init(params); - - // alloc tensors - struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, 
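/* staged as F32: quantized base tensors are dequantized with to_float() just below, so the LoRA deltas are accumulated in full precision */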
GGML_MAX_DIMS, base->ne); - for (size_t i = 0; i < adapters.size(); ++i) { - auto t_a = adapters[i]->get_tensor(name_lora_a); - auto t_b = adapters[i]->get_tensor(name_lora_b); - inp_a[i] = ggml_dup_tensor(ctx, t_a); - inp_b[i] = ggml_dup_tensor(ctx, t_b); - } - ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - - // load base tensor to backend buffer - base_model.read_tensor_data(name_base, read_buf); - if (base->type != GGML_TYPE_F32) { - // optionally dequantize it - printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); - auto nels = ggml_nelements(inp_base); - ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type); - std::vector dequant_buf(nels * sizeof(float)); - qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels); - ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); - } else { - ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); - } - - // load lora tensors to backend buffer - for (size_t i = 0; i < adapters.size(); ++i) { - adapters[i]->read_tensor_data(name_lora_a, read_buf); - ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i])); - adapters[i]->read_tensor_data(name_lora_b, read_buf); - ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i])); - } - - // build graph - struct ggml_cgraph * gf; - { - static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); - static std::vector buf(buf_size); - struct ggml_init_params params0 = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf.data(), - /*.no_alloc =*/ true, - }; - struct ggml_context * ctx0 = ggml_init(params0); - gf = ggml_new_graph(ctx0); - struct ggml_tensor * cur = inp_base; - for (size_t i = 0; i < adapters.size(); ++i) { - struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))); - struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32)); - // scale - const float alpha = adapters[i]->alpha; - const float rank = (float) inp_b[i]->ne[0]; - const float scale = alpha ? 
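/* LoRA scaling convention: when the adapter stores an alpha, each delta (B x A) is scaled by S * alpha / rank, with rank read off the B tensor's first dimension; otherwise the user-supplied scale S is applied directly */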
adapters[i]->scale * alpha / rank : adapters[i]->scale;
-                delta = ggml_scale(ctx0, delta, scale);
-                cur = ggml_add(ctx0, delta, cur);
-                printf("%s :   + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
-                printf("%s :     input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
-            }
-            cur = ggml_cast(ctx0, cur, out->type);
-            printf("%s :   + output type is %s\n", __func__, ggml_type_name(out->type));
-            ggml_build_forward_expand(gf, cur);
-            ggml_free(ctx0);
-        }
-
-        // compute
-        {
-            ggml_gallocr_alloc_graph(allocr, gf);
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
-            ggml_backend_graph_compute(backend, gf);
-        }
-
-        // write data to output file
-        {
-            auto result = gf->nodes[gf->n_nodes - 1];
-            size_t len = ggml_nbytes(result);
-            if (read_buf.size() < len) {
-                read_buf.resize(len);
-            }
-            ggml_backend_tensor_get(result, read_buf.data(), 0, len);
-            fout.write((char* )read_buf.data(), len);
-            zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
-        }
-
-        ggml_free(ctx);
-        ggml_backend_buffer_free(buffer);
-    }
-
-    ~lora_merge_ctx() {
-        ggml_gallocr_free(allocr);
-        ggml_backend_free(backend);
-        gguf_free(ctx_out);
-        ggml_free(ctx_out_ggml);
-    }
-};
-
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    printf("\nexample usage:\n");
-    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
-    printf("\nNOTE: output model is F16\n");
-    printf("\n");
-}
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
-        return 1;
-    }
-
-    g_verbose = (params.verbosity == 1);
-    try {
-        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
-        ctx.run_merge();
-    } catch (const std::exception & err) {
-        fprintf(stderr, "%s\n", err.what());
-        exit(EXIT_FAILURE);
-    }
-
-    printf("done, output file is %s\n", params.lora_outfile.c_str());
-
-    return 0;
-}
diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt
deleted file mode 100644
index 4edd6ec73..000000000
--- a/examples/gbnf-validator/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-gbnf-validator)
-add_executable(${TARGET} gbnf-validator.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp
deleted file mode 100644
index 48a705e15..000000000
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-#define LLAMA_API_INTERNAL
-
-#include "grammar-parser.h"
-#include "ggml.h"
-#include "llama.h"
-#include "unicode.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <sstream>
-#include <fstream>
-#include <string>
-#include <vector>
-
-static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
-    auto decoded = decode_utf8(input_str, {});
-    const auto & code_points = decoded.first;
-
-    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
-          llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
-
-    size_t pos = 0;
-    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
-
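/* advance every active parse stack by one code point; stacks that cannot consume the code point are dropped, so cur_stacks coming back empty means the input fails the grammar at this position */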
-        llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
-
-        if (cur_stacks.empty()) {
-            error_pos = pos;
-            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
-            cur_stacks = prev_stacks;
-            return false;
-        }
-        ++pos;
-    }
-
-    for (const auto & stack : cur_stacks) {
-        if (stack.empty()) {
-            return true;
-        }
-    }
-
-    error_pos = pos;
-    error_msg = "Unexpected end of input";
-    return false;
-}
-
-static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
-    fprintf(stdout, "Input string is invalid according to the grammar.\n");
-    fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
-    fprintf(stdout, "\n");
-    fprintf(stdout, "Input string:\n");
-    fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
-    if (error_pos < input_str.size()) {
-        fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
-        if (error_pos+1 < input_str.size()) {
-            fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
-        }
-        fprintf(stdout, "\033[0m\n");
-    }
-}
-
-int main(int argc, char** argv) {
-    if (argc != 3) {
-        fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
-        return 1;
-    }
-
-    const std::string grammar_filename = argv[1];
-    const std::string input_filename = argv[2];
-
-    // Read the GBNF grammar file
-    FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
-    if (!grammar_file) {
-        fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
-        return 1;
-    }
-
-    std::string grammar_str;
-    {
-        std::ifstream grammar_file(grammar_filename);
-        GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
-        std::stringstream buffer;
-        buffer << grammar_file.rdbuf();
-        grammar_str = buffer.str();
-    }
-
-    // Parse the GBNF grammar
-    auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
-
-    // will be empty (default) if there are parse errors
-    if (parsed_grammar.rules.empty()) {
-        fprintf(stdout, "%s: failed to parse grammar\n", __func__);
-        return 1;
-    }
-
-    // Ensure that there is a "root" node.
-    if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
-        fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
-        return 1;
-    }
-
-    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-
-    // Create the LLAMA grammar
-    auto grammar = llama_grammar_init(
-            grammar_rules.data(),
-            grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
-    if (grammar == nullptr) {
-        throw std::runtime_error("Failed to initialize llama_grammar");
-    }
-
-    // Read the input file
-    std::string input_str;
-    {
-        std::ifstream input_file(input_filename);
-        GGML_ASSERT(input_file.is_open() && "Failed to open input file");
-        std::stringstream buffer;
-        buffer << input_file.rdbuf();
-        input_str = buffer.str();
-    }
-
-    // Validate the input string against the grammar
-    size_t error_pos;
-    std::string error_msg;
-    bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
-
-    if (is_valid) {
-        fprintf(stdout, "Input string is valid according to the grammar.\n");
-    } else {
-        print_error_message(input_str, error_pos, error_msg);
-    }
-
-    // Clean up
-    llama_grammar_free(grammar);
-
-    return 0;
-}
diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt
deleted file mode 100644
index 633f45535..000000000
--- a/examples/gguf-hash/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-set(TARGET llama-gguf-hash)
-add_executable(${TARGET} gguf-hash.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-
-# clibs dependencies
-include_directories(deps/)
-add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
-target_link_libraries(${TARGET} PRIVATE xxhash)
-add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
-target_link_libraries(${TARGET} PRIVATE sha1)
-add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
-target_link_libraries(${TARGET} PRIVATE sha256)
-
-target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md
deleted file mode 100644
index 9871651e3..000000000
--- a/examples/gguf-hash/README.md
+++ /dev/null
@@ -1,206 +0,0 @@
-
-# llama-gguf-hash
-
-CLI to hash GGUF files to detect differences at a per-model and per-tensor level.
-
-**Command line options:**
-
-- `--help`: display help message
-- `--xxh64`: use xxhash 64-bit hash mode (default)
-- `--sha1`: use sha1
-- `--sha256`: use sha256
-- `--all`: use all hash types
-- `--no-layer`: exclude the per-layer hashes
-- `--uuid`: generate a UUIDv5 ID instead of hashing
-- `-c`, `--check <manifest>`: verify against a manifest
-
-## About
-
-While most POSIX systems already have hash checking programs like sha256sum, those
-are designed to check entire files. This is not ideal for our purpose if we want
-to check for consistency of the tensor data even if the metadata content of the
-gguf KV store has been updated.
-
-This program is designed to hash a gguf tensor payload on a 'per tensor layer'
-basis in addition to an 'entire tensor model' hash. The intent is that the
-whole-model hash can be checked first, and if an inconsistency is detected,
-the per-tensor hashes can then be used to narrow down the specific tensor
-layer that is inconsistent.
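As a rough sketch of the idea (not code from this tool): "per tensor layer" hashing just means running one independent hash over each tensor's byte range while the same bytes also feed a single running whole-model hash. The `XXH64*` calls below are the real one-shot and streaming xxHash APIs; `n_tensors`/`tensor_bytes`/`tensor_size` are hypothetical accessors standing in for the GGUF reader:

```c
#include <stdio.h>
#include <stddef.h>
#include "xxhash.h"

/* hypothetical accessors standing in for the GGUF reader */
extern size_t       n_tensors(void);
extern const void * tensor_bytes(size_t i);
extern size_t       tensor_size(size_t i);

void hash_per_layer_and_whole(void) {
    XXH64_state_t * whole = XXH64_createState();
    XXH64_reset(whole, 0);
    for (size_t i = 0; i < n_tensors(); ++i) {
        /* per tensor layer: one independent hash per tensor payload */
        XXH64_hash_t layer = XXH64(tensor_bytes(i), tensor_size(i), 0);
        printf("xxh64  %016llx  tensor_%zu\n", (unsigned long long) layer, i);
        /* entire tensor model: the same bytes also feed one running hash */
        XXH64_update(whole, tensor_bytes(i), tensor_size(i));
    }
    printf("xxh64  %016llx  whole model\n", (unsigned long long) XXH64_digest(whole));
    XXH64_freeState(whole);
}
```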
- -For Maintainers: -- Detection of tensor inconsistency during development and automated tests - - This is served by xxh64 which is fast - - This is also served by having per tensor layer to assist in narrowing down - the location of the faulty tensor layer - - This is also served by sha1 which is much slower but more widely supported - -For Model Creators: -- Optional consistent UUID generation based on model tensor content - - This is served by UUIDv5 which is useful for databases keys - - llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5` - - Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp` - -For Model Users: -- Assurance of tensor layer integrity even if metadata was updated - - This is served by sha256 which is still considered very secure as of 2024 - -### Design Note - -- The default behavior of this program if no arguments is provided is to hash - using xxhash's xxh32 mode because it is very fast and is primarily targeted - towards maintainers who may want to use this in automated tests. -- xxhash support xxh32 and xxh128 for 32bit hash and 128bit hash respectively - however we picked 64bit xxhash as most computers are 64bit as of 2024 and thus - would have a better affinity to calculating hash that is 64bit in size. - -## Compile Example - -```bash -cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON -make -C build clean -make -C build llama-gguf-hash VERBOSE=1 -./build/bin/llama-gguf-hash test.gguf -./build/bin/llama-gguf-hash --xxh64 test.gguf -./build/bin/llama-gguf-hash --sha1 test.gguf -./build/bin/llama-gguf-hash --uuid test.gguf -./build/bin/llama-gguf-hash --sha256 test.gguf -``` - -## Generation and Verification Example - -To generate we may use this command - -```bash -./llama-gguf-hash --all test.gguf > test.gguf.manifest -``` - -Which would generate a manifest that looks like below, which contains multiple hash type and per tensor layer hashes as well -(This excludes UUID as that is an ID not a hash) - -```bash -xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 -sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 -sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 -xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 -sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 -sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 -xxh64 a0af5d700049693b test.gguf:tensor_2 -sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 -sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 -xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 -sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 -sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 -xxh64 1257733306b7992d test.gguf:tensor_4 -sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 -sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 -xxh64 d238d16ba4711e58 test.gguf:tensor_5 -sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 -sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 -xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 -sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 -sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 -xxh64 c22021c29854f093 test.gguf:tensor_7 -sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 -sha256 
4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 -xxh64 936df61f5d64261f test.gguf:tensor_8 -sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 -sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 -xxh64 93fd20c64421c081 test.gguf:tensor_9 -sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 -sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 -xxh64 5a54d3aad816f302 test.gguf -sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf -sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf -``` - -We can then use the normal check command which will by default check for the highest security strength hash and verify against that: - -```bash -$ ./llama-gguf-hash --check test.gguf.manifest test.gguf -manifest test.gguf.manifest sha256 sha1 xxh64 -sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok -sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok -sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok -sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok -sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok -sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok -sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok -sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok -sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok -sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok -sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok - -Verification results for test.gguf.manifest - Success -``` - -Or we may explicitly ask for a faster hash like: - -```bash -$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf -manifest test.gguf.manifest sha256 sha1 xxh64 -xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok -xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok -xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok -xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok -xxh64 1257733306b7992d test.gguf:tensor_4 - Ok -xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok -xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok -xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok -xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok -xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok -xxh64 5a54d3aad816f302 test.gguf - Ok - -Verification results for test.gguf.manifest - Success -``` - -Or maybe we want to just check that all the hash is valid: - -```bash -$./llama-gguf-hash --check test.gguf.manifest --all test.gguf.manifest -manifest test.gguf.manifest sha256 sha1 xxh64 -xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok -sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok -sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok -xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok -sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 - Ok -sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok -xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok -sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 - Ok -sha256 
947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok -xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok -sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 - Ok -sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok -xxh64 1257733306b7992d test.gguf:tensor_4 - Ok -sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 - Ok -sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok -xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok -sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 - Ok -sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok -xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok -sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 - Ok -sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok -xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok -sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 - Ok -sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok -xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok -sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 - Ok -sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok -xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok -sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 - Ok -sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok -xxh64 5a54d3aad816f302 test.gguf - Ok -sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf - Ok -sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok - -Verification results for test.gguf.manifest - Success -``` - - -## Crypto/Hash Libraries Used - -These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs) - -- https://github.com/Cyan4973/xxHash -- https://github.com/clibs/sha1/ -- https://github.com/jb55/sha256.c diff --git a/examples/gguf-hash/deps/rotate-bits/package.json b/examples/gguf-hash/deps/rotate-bits/package.json deleted file mode 100644 index 74c0bef68..000000000 --- a/examples/gguf-hash/deps/rotate-bits/package.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "name": "rotate-bits", - "version": "0.1.1", - "repo": "jb55/rotate-bits.h", - "description": "rotate bits", - "keywords": ["rotl", "rotr"], - "src": ["rotate-bits.h"], - "license": "Public Domain", - "development": { - "thlorenz/tap.c": "*" - } -} - diff --git a/examples/gguf-hash/deps/rotate-bits/rotate-bits.h b/examples/gguf-hash/deps/rotate-bits/rotate-bits.h deleted file mode 100644 index 75c4881fc..000000000 --- a/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +++ /dev/null @@ -1,46 +0,0 @@ - - -#ifndef __ROTATE_DEFS_H -#define __ROTATE_DEFS_H - -#ifdef _MSC_VER - -#include - -#define ROTL32(v, n) _rotl((v), (n)) -#define ROTL64(v, n) _rotl64((v), (n)) - -#define ROTR32(v, n) _rotr((v), (n)) -#define ROTR64(v, n) _rotr64((v), (n)) - -#else - -#include - -#define U8V(v) ((uint8_t)(v) & 0xFFU) -#define U16V(v) ((uint16_t)(v) & 0xFFFFU) -#define U32V(v) ((uint32_t)(v) & 0xFFFFFFFFU) -#define U64V(v) ((uint64_t)(v) & 0xFFFFFFFFFFFFFFFFU) - -#define ROTL32(v, n) \ - (U32V((uint32_t)(v) << (n)) | ((uint32_t)(v) >> (32 - (n)))) - -// tests fail if we don't have this cast... 
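/* rotate-left example: ROTL32(0x80000001, 1) == 0x00000003 (the high bit wraps
   around to bit 0); note n must stay within 1..31 (1..63 for ROTL64), since
   shifting a value by its full bit width is undefined behaviour in C */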
-#define ROTL64(v, n) \ - (U64V((uint64_t)(v) << (n)) | ((uint64_t)(v) >> (64 - (n)))) - -#define ROTR32(v, n) ROTL32(v, 32 - (n)) -#define ROTR64(v, n) ROTL64(v, 64 - (n)) - -#endif - -#define ROTL8(v, n) \ - (U8V((uint8_t)(v) << (n)) | ((uint8_t)(v) >> (8 - (n)))) - -#define ROTL16(v, n) \ - (U16V((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n)))) - -#define ROTR8(v, n) ROTL8(v, 8 - (n)) -#define ROTR16(v, n) ROTL16(v, 16 - (n)) - -#endif diff --git a/examples/gguf-hash/deps/sha1/package.json b/examples/gguf-hash/deps/sha1/package.json deleted file mode 100644 index 6a5843dd1..000000000 --- a/examples/gguf-hash/deps/sha1/package.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": "sha1", - "version": "0.0.1", - "repo": "clibs/sha1", - "description": "sha1 hash algorithm", - "keywords": ["sha1", "hash"], - "license": "public domain", - "src": ["sha1.c", "sha1.h"] -} diff --git a/examples/gguf-hash/deps/sha1/sha1.c b/examples/gguf-hash/deps/sha1/sha1.c deleted file mode 100644 index 76cd6ca33..000000000 --- a/examples/gguf-hash/deps/sha1/sha1.c +++ /dev/null @@ -1,295 +0,0 @@ -/* -SHA-1 in C -By Steve Reid -100% Public Domain - -Test Vectors (from FIPS PUB 180-1) -"abc" - A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D -"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" - 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 -A million repetitions of "a" - 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F -*/ - -/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */ -/* #define SHA1HANDSOFF * Copies data before messing with it. */ - -#define SHA1HANDSOFF - -#include -#include - -/* for uint32_t */ -#include - -#include "sha1.h" - - -#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) - -/* blk0() and blk() perform the initial expand. */ -/* I got the idea of expanding during the round function from SSLeay */ -#if BYTE_ORDER == LITTLE_ENDIAN -#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ - |(rol(block->l[i],8)&0x00FF00FF)) -#elif BYTE_ORDER == BIG_ENDIAN -#define blk0(i) block->l[i] -#else -#error "Endianness not defined!" -#endif -#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \ - ^block->l[(i+2)&15]^block->l[i&15],1)) - -/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ -#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30); -#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30); -#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30); -#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30); -#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30); - - -/* Hash a single 512-bit block. This is the core of the algorithm. */ - -void SHA1Transform( - uint32_t state[5], - const unsigned char buffer[64] -) -{ - uint32_t a, b, c, d, e; - - typedef union - { - unsigned char c[64]; - uint32_t l[16]; - } CHAR64LONG16; - -#ifdef SHA1HANDSOFF - CHAR64LONG16 block[1]; /* use array to appear as a pointer */ - - memcpy(block, buffer, 64); -#else - /* The following had better never be used because it causes the - * pointer-to-const buffer to be cast into a pointer to non-const. - * And the result is written through. I threw a "const" in, hoping - * this will cause a diagnostic. 
- */ - CHAR64LONG16 *block = (const CHAR64LONG16 *) buffer; -#endif - /* Copy context->state[] to working vars */ - a = state[0]; - b = state[1]; - c = state[2]; - d = state[3]; - e = state[4]; - /* 4 rounds of 20 operations each. Loop unrolled. */ - R0(a, b, c, d, e, 0); - R0(e, a, b, c, d, 1); - R0(d, e, a, b, c, 2); - R0(c, d, e, a, b, 3); - R0(b, c, d, e, a, 4); - R0(a, b, c, d, e, 5); - R0(e, a, b, c, d, 6); - R0(d, e, a, b, c, 7); - R0(c, d, e, a, b, 8); - R0(b, c, d, e, a, 9); - R0(a, b, c, d, e, 10); - R0(e, a, b, c, d, 11); - R0(d, e, a, b, c, 12); - R0(c, d, e, a, b, 13); - R0(b, c, d, e, a, 14); - R0(a, b, c, d, e, 15); - R1(e, a, b, c, d, 16); - R1(d, e, a, b, c, 17); - R1(c, d, e, a, b, 18); - R1(b, c, d, e, a, 19); - R2(a, b, c, d, e, 20); - R2(e, a, b, c, d, 21); - R2(d, e, a, b, c, 22); - R2(c, d, e, a, b, 23); - R2(b, c, d, e, a, 24); - R2(a, b, c, d, e, 25); - R2(e, a, b, c, d, 26); - R2(d, e, a, b, c, 27); - R2(c, d, e, a, b, 28); - R2(b, c, d, e, a, 29); - R2(a, b, c, d, e, 30); - R2(e, a, b, c, d, 31); - R2(d, e, a, b, c, 32); - R2(c, d, e, a, b, 33); - R2(b, c, d, e, a, 34); - R2(a, b, c, d, e, 35); - R2(e, a, b, c, d, 36); - R2(d, e, a, b, c, 37); - R2(c, d, e, a, b, 38); - R2(b, c, d, e, a, 39); - R3(a, b, c, d, e, 40); - R3(e, a, b, c, d, 41); - R3(d, e, a, b, c, 42); - R3(c, d, e, a, b, 43); - R3(b, c, d, e, a, 44); - R3(a, b, c, d, e, 45); - R3(e, a, b, c, d, 46); - R3(d, e, a, b, c, 47); - R3(c, d, e, a, b, 48); - R3(b, c, d, e, a, 49); - R3(a, b, c, d, e, 50); - R3(e, a, b, c, d, 51); - R3(d, e, a, b, c, 52); - R3(c, d, e, a, b, 53); - R3(b, c, d, e, a, 54); - R3(a, b, c, d, e, 55); - R3(e, a, b, c, d, 56); - R3(d, e, a, b, c, 57); - R3(c, d, e, a, b, 58); - R3(b, c, d, e, a, 59); - R4(a, b, c, d, e, 60); - R4(e, a, b, c, d, 61); - R4(d, e, a, b, c, 62); - R4(c, d, e, a, b, 63); - R4(b, c, d, e, a, 64); - R4(a, b, c, d, e, 65); - R4(e, a, b, c, d, 66); - R4(d, e, a, b, c, 67); - R4(c, d, e, a, b, 68); - R4(b, c, d, e, a, 69); - R4(a, b, c, d, e, 70); - R4(e, a, b, c, d, 71); - R4(d, e, a, b, c, 72); - R4(c, d, e, a, b, 73); - R4(b, c, d, e, a, 74); - R4(a, b, c, d, e, 75); - R4(e, a, b, c, d, 76); - R4(d, e, a, b, c, 77); - R4(c, d, e, a, b, 78); - R4(b, c, d, e, a, 79); - /* Add the working vars back into context.state[] */ - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - /* Wipe variables */ - a = b = c = d = e = 0; -#ifdef SHA1HANDSOFF - memset(block, '\0', sizeof(block)); -#endif -} - - -/* SHA1Init - Initialize new context */ - -void SHA1Init( - SHA1_CTX * context -) -{ - /* SHA1 initialization constants */ - context->state[0] = 0x67452301; - context->state[1] = 0xEFCDAB89; - context->state[2] = 0x98BADCFE; - context->state[3] = 0x10325476; - context->state[4] = 0xC3D2E1F0; - context->count[0] = context->count[1] = 0; -} - - -/* Run your data through this. */ - -void SHA1Update( - SHA1_CTX * context, - const unsigned char *data, - uint32_t len -) -{ - uint32_t i; - - uint32_t j; - - j = context->count[0]; - if ((context->count[0] += len << 3) < j) - context->count[1]++; - context->count[1] += (len >> 29); - j = (j >> 3) & 63; - if ((j + len) > 63) - { - memcpy(&context->buffer[j], data, (i = 64 - j)); - SHA1Transform(context->state, context->buffer); - for (; i + 63 < len; i += 64) - { - SHA1Transform(context->state, &data[i]); - } - j = 0; - } - else - i = 0; - memcpy(&context->buffer[j], &data[i], len - i); -} - - -/* Add padding and return the message digest. 
*/ - -void SHA1Final( - unsigned char digest[20], - SHA1_CTX * context -) -{ - unsigned i; - - unsigned char finalcount[8]; - - unsigned char c; - -#if 0 /* untested "improvement" by DHR */ - /* Convert context->count to a sequence of bytes - * in finalcount. Second element first, but - * big-endian order within element. - * But we do it all backwards. - */ - unsigned char *fcp = &finalcount[8]; - - for (i = 0; i < 2; i++) - { - uint32_t t = context->count[i]; - - int j; - - for (j = 0; j < 4; t >>= 8, j++) - *--fcp = (unsigned char) t} -#else - for (i = 0; i < 8; i++) - { - finalcount[i] = (unsigned char) ((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255); /* Endian independent */ - } -#endif - c = 0200; - SHA1Update(context, &c, 1); - while ((context->count[0] & 504) != 448) - { - c = 0000; - SHA1Update(context, &c, 1); - } - SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */ - for (i = 0; i < 20; i++) - { - digest[i] = (unsigned char) - ((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255); - } - /* Wipe variables */ - memset(context, '\0', sizeof(*context)); - memset(&finalcount, '\0', sizeof(finalcount)); -} - -void SHA1( - char *hash_out, - const char *str, - uint32_t len) -{ - SHA1_CTX ctx; - unsigned int ii; - - SHA1Init(&ctx); - for (ii=0; ii - 100% Public Domain - */ - -#include "stdint.h" - -#if defined(__cplusplus) -extern "C" { -#endif - -typedef struct -{ - uint32_t state[5]; - uint32_t count[2]; - unsigned char buffer[64]; -} SHA1_CTX; - -void SHA1Transform( - uint32_t state[5], - const unsigned char buffer[64] - ); - -void SHA1Init( - SHA1_CTX * context - ); - -void SHA1Update( - SHA1_CTX * context, - const unsigned char *data, - uint32_t len - ); - -void SHA1Final( - unsigned char digest[20], - SHA1_CTX * context - ); - -void SHA1( - char *hash_out, - const char *str, - uint32_t len); - -#if defined(__cplusplus) -} -#endif - -#endif /* SHA1_H */ diff --git a/examples/gguf-hash/deps/sha256/package.json b/examples/gguf-hash/deps/sha256/package.json deleted file mode 100644 index b92a04127..000000000 --- a/examples/gguf-hash/deps/sha256/package.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "name": "sha256", - "version": "0.0.2", - "repo": "jb55/sha256.c", - "description": "sha256 in c", - "keywords": ["sha256", "sha2"], - "src": ["sha256.c", "sha256.h"], - "dependencies": { - "jb55/rotate-bits.h": "0.1.1" - }, - "development": { - "thlorenz/tap.c": "*" - } -} - diff --git a/examples/gguf-hash/deps/sha256/sha256.c b/examples/gguf-hash/deps/sha256/sha256.c deleted file mode 100644 index a7a87aeb2..000000000 --- a/examples/gguf-hash/deps/sha256/sha256.c +++ /dev/null @@ -1,221 +0,0 @@ -/* Crypto/Sha256.c -- SHA-256 Hash -2010-06-11 : Igor Pavlov : Public domain -This code is based on public domain code from Wei Dai's Crypto++ library. 
*/ - -#include "rotate-bits/rotate-bits.h" -#include "sha256.h" - -/* define it for speed optimization */ -#define _SHA256_UNROLL -#define _SHA256_UNROLL2 - -void -sha256_init(sha256_t *p) -{ - p->state[0] = 0x6a09e667; - p->state[1] = 0xbb67ae85; - p->state[2] = 0x3c6ef372; - p->state[3] = 0xa54ff53a; - p->state[4] = 0x510e527f; - p->state[5] = 0x9b05688c; - p->state[6] = 0x1f83d9ab; - p->state[7] = 0x5be0cd19; - p->count = 0; -} - -#define S0(x) (ROTR32(x, 2) ^ ROTR32(x,13) ^ ROTR32(x, 22)) -#define S1(x) (ROTR32(x, 6) ^ ROTR32(x,11) ^ ROTR32(x, 25)) -#define s0(x) (ROTR32(x, 7) ^ ROTR32(x,18) ^ (x >> 3)) -#define s1(x) (ROTR32(x,17) ^ ROTR32(x,19) ^ (x >> 10)) - -#define blk0(i) (W[i] = data[i]) -#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15])) - -#define Ch(x,y,z) (z^(x&(y^z))) -#define Maj(x,y,z) ((x&y)|(z&(x|y))) - -#define a(i) T[(0-(i))&7] -#define b(i) T[(1-(i))&7] -#define c(i) T[(2-(i))&7] -#define d(i) T[(3-(i))&7] -#define e(i) T[(4-(i))&7] -#define f(i) T[(5-(i))&7] -#define g(i) T[(6-(i))&7] -#define h(i) T[(7-(i))&7] - - -#ifdef _SHA256_UNROLL2 - -#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\ - d += h; h += S0(a) + Maj(a, b, c) - -#define RX_8(i) \ - R(a,b,c,d,e,f,g,h, i); \ - R(h,a,b,c,d,e,f,g, (i+1)); \ - R(g,h,a,b,c,d,e,f, (i+2)); \ - R(f,g,h,a,b,c,d,e, (i+3)); \ - R(e,f,g,h,a,b,c,d, (i+4)); \ - R(d,e,f,g,h,a,b,c, (i+5)); \ - R(c,d,e,f,g,h,a,b, (i+6)); \ - R(b,c,d,e,f,g,h,a, (i+7)) - -#else - -#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\ - d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i)) - -#ifdef _SHA256_UNROLL - -#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7); - -#endif - -#endif - -static const uint32_t K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -static void -sha256_transform(uint32_t *state, const uint32_t *data) -{ - uint32_t W[16] = {0}; - unsigned j; - #ifdef _SHA256_UNROLL2 - uint32_t a,b,c,d,e,f,g,h; - a = state[0]; - b = state[1]; - c = state[2]; - d = state[3]; - e = state[4]; - f = state[5]; - g = state[6]; - h = state[7]; - #else - uint32_t T[8]; - for (j = 0; j < 8; j++) - T[j] = state[j]; - #endif - - for (j = 0; j < 64; j += 16) - { - #if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2) - RX_8(0); RX_8(8); - #else - unsigned i; - for (i = 0; i < 16; i++) { R(i); } - #endif - } - - #ifdef _SHA256_UNROLL2 - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - state[5] += f; - state[6] += g; - state[7] += h; - #else - for (j = 0; j < 8; j++) - state[j] += T[j]; - #endif - - /* Wipe variables */ - /* memset(W, 0, sizeof(W)); */ - /* memset(T, 0, sizeof(T)); */ -} - -#undef S0 -#undef S1 -#undef s0 -#undef s1 - -static void 
-sha256_write_byte_block(sha256_t *p) -{ - uint32_t data32[16]; - unsigned i; - for (i = 0; i < 16; i++) - data32[i] = - ((uint32_t)(p->buffer[i * 4 ]) << 24) + - ((uint32_t)(p->buffer[i * 4 + 1]) << 16) + - ((uint32_t)(p->buffer[i * 4 + 2]) << 8) + - ((uint32_t)(p->buffer[i * 4 + 3])); - sha256_transform(p->state, data32); -} - - -void -sha256_hash(unsigned char *buf, const unsigned char *data, size_t size) -{ - sha256_t hash; - sha256_init(&hash); - sha256_update(&hash, data, size); - sha256_final(&hash, buf); -} - - -void -sha256_update(sha256_t *p, const unsigned char *data, size_t size) -{ - uint32_t curBufferPos = (uint32_t)p->count & 0x3F; - while (size > 0) - { - p->buffer[curBufferPos++] = *data++; - p->count++; - size--; - if (curBufferPos == 64) - { - curBufferPos = 0; - sha256_write_byte_block(p); - } - } -} - - -void -sha256_final(sha256_t *p, unsigned char *digest) -{ - uint64_t lenInBits = (p->count << 3); - uint32_t curBufferPos = (uint32_t)p->count & 0x3F; - unsigned i; - p->buffer[curBufferPos++] = 0x80; - while (curBufferPos != (64 - 8)) - { - curBufferPos &= 0x3F; - if (curBufferPos == 0) - sha256_write_byte_block(p); - p->buffer[curBufferPos++] = 0; - } - for (i = 0; i < 8; i++) - { - p->buffer[curBufferPos++] = (unsigned char)(lenInBits >> 56); - lenInBits <<= 8; - } - sha256_write_byte_block(p); - - for (i = 0; i < 8; i++) - { - *digest++ = (unsigned char)(p->state[i] >> 24); - *digest++ = (unsigned char)(p->state[i] >> 16); - *digest++ = (unsigned char)(p->state[i] >> 8); - *digest++ = (unsigned char)(p->state[i]); - } - sha256_init(p); -} diff --git a/examples/gguf-hash/deps/sha256/sha256.h b/examples/gguf-hash/deps/sha256/sha256.h deleted file mode 100644 index 21657e66b..000000000 --- a/examples/gguf-hash/deps/sha256/sha256.h +++ /dev/null @@ -1,24 +0,0 @@ -/* Sha256.h -- SHA-256 Hash -2010-06-11 : Igor Pavlov : Public domain */ - -#ifndef __CRYPTO_SHA256_H -#define __CRYPTO_SHA256_H - -#include -#include - -#define SHA256_DIGEST_SIZE 32 - -typedef struct sha256_t -{ - uint32_t state[8]; - uint64_t count; - unsigned char buffer[64]; -} sha256_t; - -void sha256_init(sha256_t *p); -void sha256_update(sha256_t *p, const unsigned char *data, size_t size); -void sha256_final(sha256_t *p, unsigned char *digest); -void sha256_hash(unsigned char *buf, const unsigned char *data, size_t size); - -#endif diff --git a/examples/gguf-hash/deps/xxhash/clib.json b/examples/gguf-hash/deps/xxhash/clib.json deleted file mode 100644 index 242343c5d..000000000 --- a/examples/gguf-hash/deps/xxhash/clib.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "name": "xxhash", - "version": "0.8.2", - "repo": "Cyan4973/xxhash", - "description": "Extremely fast non-cryptographic hash algorithm", - "keywords": ["xxhash", "hashing"], - "license": "BSD-2-Clause", - "src": [ - "xxhash.c", - "xxhash.h" - ] -} diff --git a/examples/gguf-hash/deps/xxhash/xxhash.c b/examples/gguf-hash/deps/xxhash/xxhash.c deleted file mode 100644 index e60cc37f1..000000000 --- a/examples/gguf-hash/deps/xxhash/xxhash.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * xxHash - Extremely Fast Hash algorithm - * Copyright (C) 2012-2023 Yann Collet - * - * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You can contact the author at: - * - xxHash homepage: https://www.xxhash.com - * - xxHash source repository: https://github.com/Cyan4973/xxHash - */ - -/* - * xxhash.c instantiates functions defined in xxhash.h - */ - -#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ -#define XXH_IMPLEMENTATION /* access definitions */ - -#include "xxhash.h" diff --git a/examples/gguf-hash/deps/xxhash/xxhash.h b/examples/gguf-hash/deps/xxhash/xxhash.h deleted file mode 100644 index c0fafe20d..000000000 --- a/examples/gguf-hash/deps/xxhash/xxhash.h +++ /dev/null @@ -1,7093 +0,0 @@ -/* - * xxHash - Extremely Fast Hash algorithm - * Header File - * Copyright (C) 2012-2023 Yann Collet - * - * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You can contact the author at: - * - xxHash homepage: https://www.xxhash.com - * - xxHash source repository: https://github.com/Cyan4973/xxHash - */ - -/*! - * @mainpage xxHash - * - * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed - * limits. - * - * It is proposed in four flavors, in three families: - * 1. @ref XXH32_family - * - Classic 32-bit hash function. 
Simple, compact, and runs on almost all - * 32-bit and 64-bit systems. - * 2. @ref XXH64_family - * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most - * 64-bit systems (but _not_ 32-bit systems). - * 3. @ref XXH3_family - * - Modern 64-bit and 128-bit hash function family which features improved - * strength and performance across the board, especially on smaller data. - * It benefits greatly from SIMD and 64-bit without requiring it. - * - * Benchmarks - * --- - * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. - * The open source benchmark program is compiled with clang v10.0 using -O3 flag. - * - * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | - * | -------------------- | ------- | ----: | ---------------: | ------------------: | - * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | - * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | - * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | - * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | - * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | - * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | - * | RAM sequential read | | N/A | 28.0 GB/s | N/A | - * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | - * | City64 | | 64 | 22.0 GB/s | 76.6 | - * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | - * | City128 | | 128 | 21.7 GB/s | 57.7 | - * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | - * | XXH64() | | 64 | 19.4 GB/s | 71.0 | - * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | - * | Mum | | 64 | 18.0 GB/s | 67.0 | - * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | - * | XXH32() | | 32 | 9.7 GB/s | 71.9 | - * | City32 | | 32 | 9.1 GB/s | 66.0 | - * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | - * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | - * | SipHash* | | 64 | 3.0 GB/s | 43.2 | - * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | - * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | - * | FNV64 | | 64 | 1.2 GB/s | 62.7 | - * | Blake2* | | 256 | 1.1 GB/s | 5.1 | - * | SHA1* | | 160 | 0.8 GB/s | 5.6 | - * | MD5* | | 128 | 0.6 GB/s | 7.8 | - * @note - * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, - * even though it is mandatory on x64. - * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic - * by modern standards. - * - Small data velocity is a rough average of algorithm's efficiency for small - * data. For more accurate information, see the wiki. - * - More benchmarks and strength tests are found on the wiki: - * https://github.com/Cyan4973/xxHash/wiki - * - * Usage - * ------ - * All xxHash variants use a similar API. Changing the algorithm is a trivial - * substitution. - * - * @pre - * For functions which take an input and length parameter, the following - * requirements are assumed: - * - The range from [`input`, `input + length`) is valid, readable memory. - * - The only exception is if the `length` is `0`, `input` may be `NULL`. - * - For C++, the objects must have the *TriviallyCopyable* property, as the - * functions access bytes directly as if it was an array of `unsigned char`. - * - * @anchor single_shot_example - * **Single Shot** - * - * These functions are stateless functions which hash a contiguous block of memory, - * immediately returning the result. They are the easiest and usually the fastest - * option. - * - * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() - * - * @code{.c} - * #include - * #include "xxhash.h" - * - * // Example for a function which hashes a null terminated string with XXH32(). 
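 * // (a given input hashed with a given seed is always reproducible;
 * //  different seeds yield independent hash families)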
- * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) - * { - * // NULL pointers are only valid if the length is zero - * size_t length = (string == NULL) ? 0 : strlen(string); - * return XXH32(string, length, seed); - * } - * @endcode - * - * - * @anchor streaming_example - * **Streaming** - * - * These groups of functions allow incremental hashing of unknown size, even - * more than what would fit in a size_t. - * - * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() - * - * @code{.c} - * #include - * #include - * #include "xxhash.h" - * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). - * XXH64_hash_t hashFile(FILE* f) - * { - * // Allocate a state struct. Do not just use malloc() or new. - * XXH3_state_t* state = XXH3_createState(); - * assert(state != NULL && "Out of memory!"); - * // Reset the state to start a new hashing session. - * XXH3_64bits_reset(state); - * char buffer[4096]; - * size_t count; - * // Read the file in chunks - * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { - * // Run update() as many times as necessary to process the data - * XXH3_64bits_update(state, buffer, count); - * } - * // Retrieve the finalized hash. This will not change the state. - * XXH64_hash_t result = XXH3_64bits_digest(state); - * // Free the state. Do not use free(). - * XXH3_freeState(state); - * return result; - * } - * @endcode - * - * Streaming functions generate the xxHash value from an incremental input. - * This method is slower than single-call functions, due to state management. - * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. - * - * An XXH state must first be allocated using `XXH*_createState()`. - * - * Start a new hash by initializing the state with a seed using `XXH*_reset()`. - * - * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. - * - * The function returns an error code, with 0 meaning OK, and any other value - * meaning there is an error. - * - * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. - * This function returns the nn-bits hash as an int or long long. - * - * It's still possible to continue inserting input into the hash state after a - * digest, and generate new hash values later on by invoking `XXH*_digest()`. - * - * When done, release the state using `XXH*_freeState()`. - * - * - * @anchor canonical_representation_example - * **Canonical Representation** - * - * The default return values from XXH functions are unsigned 32, 64 and 128 bit - * integers. - * This the simplest and fastest format for further post-processing. - * - * However, this leaves open the question of what is the order on the byte level, - * since little and big endian conventions will store the same number differently. - * - * The canonical representation settles this issue by mandating big-endian - * convention, the same convention as human-readable numbers (large digits first). - * - * When writing hash values to storage, sending them over a network, or printing - * them, it's highly recommended to use the canonical representation to ensure - * portability across a wider range of systems, present and future. - * - * The following functions allow transformation of hash values to and from - * canonical format. 
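 * For example, under this big-endian convention the 32-bit hash value
 * 0x01234567 is stored canonically as the byte sequence { 0x01, 0x23, 0x45, 0x67 },
 * regardless of the host's native endianness.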
- * - * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(), - * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(), - * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(), - * - * @code{.c} - * #include - * #include "xxhash.h" - * - * // Example for a function which prints XXH32_hash_t in human readable format - * void printXxh32(XXH32_hash_t hash) - * { - * XXH32_canonical_t cano; - * XXH32_canonicalFromHash(&cano, hash); - * size_t i; - * for(i = 0; i < sizeof(cano.digest); ++i) { - * printf("%02x", cano.digest[i]); - * } - * printf("\n"); - * } - * - * // Example for a function which converts XXH32_canonical_t to XXH32_hash_t - * XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano) - * { - * XXH32_hash_t hash = XXH32_hashFromCanonical(&cano); - * return hash; - * } - * @endcode - * - * - * @file xxhash.h - * xxHash prototypes and implementation - */ - -#if defined (__cplusplus) -extern "C" { -#endif - -/* **************************** - * INLINE mode - ******************************/ -/*! - * @defgroup public Public API - * Contains details on the public xxHash functions. - * @{ - */ -#ifdef XXH_DOXYGEN -/*! - * @brief Gives access to internal state declaration, required for static allocation. - * - * Incompatible with dynamic linking, due to risks of ABI changes. - * - * Usage: - * @code{.c} - * #define XXH_STATIC_LINKING_ONLY - * #include "xxhash.h" - * @endcode - */ -# define XXH_STATIC_LINKING_ONLY -/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ - -/*! - * @brief Gives access to internal definitions. - * - * Usage: - * @code{.c} - * #define XXH_STATIC_LINKING_ONLY - * #define XXH_IMPLEMENTATION - * #include "xxhash.h" - * @endcode - */ -# define XXH_IMPLEMENTATION -/* Do not undef XXH_IMPLEMENTATION for Doxygen */ - -/*! - * @brief Exposes the implementation and marks all functions as `inline`. - * - * Use these build macros to inline xxhash into the target unit. - * Inlining improves performance on small inputs, especially when the length is - * expressed as a compile-time constant: - * - * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html - * - * It also keeps xxHash symbols private to the unit, so they are not exported. - * - * Usage: - * @code{.c} - * #define XXH_INLINE_ALL - * #include "xxhash.h" - * @endcode - * Do not compile and link xxhash.o as a separate object, as it is not useful. - */ -# define XXH_INLINE_ALL -# undef XXH_INLINE_ALL -/*! - * @brief Exposes the implementation without marking functions as inline. - */ -# define XXH_PRIVATE_API -# undef XXH_PRIVATE_API -/*! - * @brief Emulate a namespace by transparently prefixing all symbols. - * - * If you want to include _and expose_ xxHash functions from within your own - * library, but also want to avoid symbol collisions with other libraries which - * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix - * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE - * (therefore, avoid empty or numeric values). - * - * Note that no change is required within the calling program as long as it - * includes `xxhash.h`: Regular symbol names will be automatically translated - * by this header. 
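 * For example, with `#define XXH_NAMESPACE MYLIB_` in effect before including
 * this header, a call written as XXH64() links against the exported symbol
 * MYLIB_XXH64.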
- */ -# define XXH_NAMESPACE /* YOUR NAME HERE */ -# undef XXH_NAMESPACE -#endif - -#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ - && !defined(XXH_INLINE_ALL_31684351384) - /* this section should be traversed only once */ -# define XXH_INLINE_ALL_31684351384 - /* give access to the advanced API, required to compile implementations */ -# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ -# define XXH_STATIC_LINKING_ONLY - /* make all functions private */ -# undef XXH_PUBLIC_API -# if defined(__GNUC__) -# define XXH_PUBLIC_API static __inline __attribute__((__unused__)) -# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define XXH_PUBLIC_API static inline -# elif defined(_MSC_VER) -# define XXH_PUBLIC_API static __inline -# else - /* note: this version may generate warnings for unused static functions */ -# define XXH_PUBLIC_API static -# endif - - /* - * This part deals with the special case where a unit wants to inline xxHash, - * but "xxhash.h" has previously been included without XXH_INLINE_ALL, - * such as part of some previously included *.h header file. - * Without further action, the new include would just be ignored, - * and functions would effectively _not_ be inlined (silent failure). - * The following macros solve this situation by prefixing all inlined names, - * avoiding naming collision with previous inclusions. - */ - /* Before that, we unconditionally #undef all symbols, - * in case they were already defined with XXH_NAMESPACE. - * They will then be redefined for XXH_INLINE_ALL - */ -# undef XXH_versionNumber - /* XXH32 */ -# undef XXH32 -# undef XXH32_createState -# undef XXH32_freeState -# undef XXH32_reset -# undef XXH32_update -# undef XXH32_digest -# undef XXH32_copyState -# undef XXH32_canonicalFromHash -# undef XXH32_hashFromCanonical - /* XXH64 */ -# undef XXH64 -# undef XXH64_createState -# undef XXH64_freeState -# undef XXH64_reset -# undef XXH64_update -# undef XXH64_digest -# undef XXH64_copyState -# undef XXH64_canonicalFromHash -# undef XXH64_hashFromCanonical - /* XXH3_64bits */ -# undef XXH3_64bits -# undef XXH3_64bits_withSecret -# undef XXH3_64bits_withSeed -# undef XXH3_64bits_withSecretandSeed -# undef XXH3_createState -# undef XXH3_freeState -# undef XXH3_copyState -# undef XXH3_64bits_reset -# undef XXH3_64bits_reset_withSeed -# undef XXH3_64bits_reset_withSecret -# undef XXH3_64bits_update -# undef XXH3_64bits_digest -# undef XXH3_generateSecret - /* XXH3_128bits */ -# undef XXH128 -# undef XXH3_128bits -# undef XXH3_128bits_withSeed -# undef XXH3_128bits_withSecret -# undef XXH3_128bits_reset -# undef XXH3_128bits_reset_withSeed -# undef XXH3_128bits_reset_withSecret -# undef XXH3_128bits_reset_withSecretandSeed -# undef XXH3_128bits_update -# undef XXH3_128bits_digest -# undef XXH128_isEqual -# undef XXH128_cmp -# undef XXH128_canonicalFromHash -# undef XXH128_hashFromCanonical - /* Finally, free the namespace itself */ -# undef XXH_NAMESPACE - - /* employ the namespace for XXH_INLINE_ALL */ -# define XXH_NAMESPACE XXH_INLINE_ - /* - * Some identifiers (enums, type names) are not symbols, - * but they must nonetheless be renamed to avoid redeclaration. - * Alternative solution: do not redeclare them. - * However, this requires some #ifdefs, and has a more dispersed impact. - * Meanwhile, renaming can be achieved in a single place. 
- */ -# define XXH_IPREF(Id) XXH_NAMESPACE ## Id -# define XXH_OK XXH_IPREF(XXH_OK) -# define XXH_ERROR XXH_IPREF(XXH_ERROR) -# define XXH_errorcode XXH_IPREF(XXH_errorcode) -# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) -# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) -# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) -# define XXH32_state_s XXH_IPREF(XXH32_state_s) -# define XXH32_state_t XXH_IPREF(XXH32_state_t) -# define XXH64_state_s XXH_IPREF(XXH64_state_s) -# define XXH64_state_t XXH_IPREF(XXH64_state_t) -# define XXH3_state_s XXH_IPREF(XXH3_state_s) -# define XXH3_state_t XXH_IPREF(XXH3_state_t) -# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) - /* Ensure the header is parsed again, even if it was previously included */ -# undef XXHASH_H_5627135585666179 -# undef XXHASH_H_STATIC_13879238742 -#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ - -/* **************************************************************** - * Stable API - *****************************************************************/ -#ifndef XXHASH_H_5627135585666179 -#define XXHASH_H_5627135585666179 1 - -/*! @brief Marks a global symbol. */ -#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) -# if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) -# ifdef XXH_EXPORT -# define XXH_PUBLIC_API __declspec(dllexport) -# elif XXH_IMPORT -# define XXH_PUBLIC_API __declspec(dllimport) -# endif -# else -# define XXH_PUBLIC_API /* do nothing */ -# endif -#endif - -#ifdef XXH_NAMESPACE -# define XXH_CAT(A,B) A##B -# define XXH_NAME2(A,B) XXH_CAT(A,B) -# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) -/* XXH32 */ -# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) -# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) -# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) -# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) -# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) -# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) -# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) -# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) -# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) -/* XXH64 */ -# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) -# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) -# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) -# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) -# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) -# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) -# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) -# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) -# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) -/* XXH3_64bits */ -# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) -# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) -# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) -# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) -# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) -# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) -# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) -# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) -# define 
XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
-# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
-# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
-# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
-# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
-# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
-# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
-/* XXH3_128bits */
-# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
-# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
-# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
-# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
-# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
-# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
-# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
-# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
-# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
-# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
-# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
-# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
-# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
-# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
-# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
-#endif
-
-
-/* *************************************
-*  Compiler specifics
-***************************************/
-
-/* specific declaration modes for Windows */
-#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
-# if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
-#  ifdef XXH_EXPORT
-#   define XXH_PUBLIC_API __declspec(dllexport)
-#  elif XXH_IMPORT
-#   define XXH_PUBLIC_API __declspec(dllimport)
-#  endif
-# else
-#  define XXH_PUBLIC_API   /* do nothing */
-# endif
-#endif
-
-#if defined (__GNUC__)
-# define XXH_CONSTF  __attribute__((__const__))
-# define XXH_PUREF   __attribute__((__pure__))
-# define XXH_MALLOCF __attribute__((__malloc__))
-#else
-# define XXH_CONSTF  /* disable */
-# define XXH_PUREF
-# define XXH_MALLOCF
-#endif
-
-/* *************************************
-*  Version
-***************************************/
-#define XXH_VERSION_MAJOR    0
-#define XXH_VERSION_MINOR    8
-#define XXH_VERSION_RELEASE  3
-/*! @brief Version number, encoded as two digits each */
-#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
-
-/*!
- * @brief Obtains the xxHash version.
- *
- * This is mostly useful when xxHash is compiled as a shared library,
- * since the returned value comes from the library, as opposed to the header file.
- *
- * @return @ref XXH_VERSION_NUMBER of the invoked library.
- */
-XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);
-
-
-/* ****************************
-*  Common basic types
-******************************/
-#include <stddef.h>   /* size_t */
-/*!
- * @brief Exit code for the streaming API.
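- *
- * For instance, a caller might check it like this (a minimal sketch;
- * `statePtr`, `input`, and `length` stand in for the caller's variables):
- * @code{.c}
- * if (XXH32_update(statePtr, input, length) != XXH_OK) {
- *     // handle the error (e.g. abort the streaming session)
- * }
- * @endcode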
- */
-typedef enum {
-    XXH_OK = 0, /*!< OK */
-    XXH_ERROR   /*!< Error */
-} XXH_errorcode;
-
-
-/*-**********************************************************************
-*  32-bit hash
-************************************************************************/
-#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
-/*!
- * @brief An unsigned 32-bit integer.
- *
- * Not necessarily defined to `uint32_t` but functionally equivalent.
- */
-typedef uint32_t XXH32_hash_t;
-
-#elif !defined (__VMS) \
-  && (defined (__cplusplus) \
-  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
-#   ifdef _AIX
-#     include <inttypes.h>
-#   else
-#     include <stdint.h>
-#   endif
-    typedef uint32_t XXH32_hash_t;
-
-#else
-#   include <limits.h>
-#   if UINT_MAX == 0xFFFFFFFFUL
-      typedef unsigned int XXH32_hash_t;
-#   elif ULONG_MAX == 0xFFFFFFFFUL
-      typedef unsigned long XXH32_hash_t;
-#   else
-#     error "unsupported platform: need a 32-bit type"
-#   endif
-#endif
-
-/*!
- * @}
- *
- * @defgroup XXH32_family XXH32 family
- * @ingroup public
- * Contains functions used in the classic 32-bit xxHash algorithm.
- *
- * @note
- *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
- *   Note that the @ref XXH3_family provides competitive speed for both 32-bit
- *   and 64-bit systems, and offers true 64/128 bit hash results.
- *
- * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
- * @see @ref XXH32_impl for implementation details
- * @{
- */
-
-/*!
- * @brief Calculates the 32-bit hash of @p input using xxHash32.
- *
- * @param input The block of data to be hashed, at least @p length bytes in size.
- * @param length The length of @p input, in bytes.
- * @param seed The 32-bit seed to alter the hash's output predictably.
- *
- * @pre
- *   The memory between @p input and @p input + @p length must be valid,
- *   readable, contiguous memory. However, if @p length is `0`, @p input may be
- *   `NULL`. In C++, this also must be *TriviallyCopyable*.
- *
- * @return The calculated 32-bit xxHash32 value.
- *
- * @see @ref single_shot_example "Single Shot Example" for an example.
- */
-XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
-
-#ifndef XXH_NO_STREAM
-/*!
- * @typedef struct XXH32_state_s XXH32_state_t
- * @brief The opaque state struct for the XXH32 streaming API.
- *
- * @see XXH32_state_s for details.
- * @see @ref streaming_example "Streaming Example"
- */
-typedef struct XXH32_state_s XXH32_state_t;
-
-/*!
- * @brief Allocates an @ref XXH32_state_t.
- *
- * @return An allocated pointer of @ref XXH32_state_t on success.
- * @return `NULL` on failure.
- *
- * @note Must be freed with XXH32_freeState().
- *
- * @see @ref streaming_example "Streaming Example"
- */
-XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
-/*!
- * @brief Frees an @ref XXH32_state_t.
- *
- * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
- *
- * @return @ref XXH_OK.
- *
- * @note @p statePtr must be allocated with XXH32_createState().
- *
- * @see @ref streaming_example "Streaming Example"
- *
- */
-XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
-/*!
- * @brief Copies one @ref XXH32_state_t to another.
- *
- * @param dst_state The state to copy to.
- * @param src_state The state to copy from.
- * @pre
- *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
- */
-XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
-
-/*!
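- * A minimal streaming sketch (assuming allocation succeeds; error checks omitted):
- * @code{.c}
- * XXH32_state_t* const st = XXH32_createState();
- * XXH32_reset(st, 0);
- * XXH32_update(st, "hello", 5);
- * XXH32_hash_t const h = XXH32_digest(st);
- * XXH32_freeState(st);
- * @endcode
- */
-/*!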
- * @brief Resets an @ref XXH32_state_t to begin a new hash. - * - * @param statePtr The state struct to reset. - * @param seed The 32-bit seed to alter the hash result predictably. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return @ref XXH_OK on success. - * @return @ref XXH_ERROR on failure. - * - * @note This function resets and seeds a state. Call it before @ref XXH32_update(). - * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); - -/*! - * @brief Consumes a block of @p input to an @ref XXH32_state_t. - * - * @param statePtr The state struct to update. - * @param input The block of data to be hashed, at least @p length bytes in size. - * @param length The length of @p input, in bytes. - * - * @pre - * @p statePtr must not be `NULL`. - * @pre - * The memory between @p input and @p input + @p length must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p input may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * @return @ref XXH_OK on success. - * @return @ref XXH_ERROR on failure. - * - * @note Call this to incrementally consume blocks of data. - * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); - -/*! - * @brief Returns the calculated hash value from an @ref XXH32_state_t. - * - * @param statePtr The state struct to calculate the hash from. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return The calculated 32-bit xxHash32 value from that state. - * - * @note - * Calling XXH32_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); -#endif /* !XXH_NO_STREAM */ - -/******* Canonical representation *******/ - -/*! - * @brief Canonical (big endian) representation of @ref XXH32_hash_t. - */ -typedef struct { - unsigned char digest[4]; /*!< Hash bytes, big endian */ -} XXH32_canonical_t; - -/*! - * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. - * - * @param dst The @ref XXH32_canonical_t pointer to be stored to. - * @param hash The @ref XXH32_hash_t to be converted. - * - * @pre - * @p dst must not be `NULL`. - * - * @see @ref canonical_representation_example "Canonical Representation Example" - */ -XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); - -/*! - * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. - * - * @param src The @ref XXH32_canonical_t to convert. - * - * @pre - * @p src must not be `NULL`. - * - * @return The converted hash. - * - * @see @ref canonical_representation_example "Canonical Representation Example" - */ -XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); - - -/*! @cond Doxygen ignores this part */ -#ifdef __has_attribute -# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) -#else -# define XXH_HAS_ATTRIBUTE(x) 0 -#endif -/*! @endcond */ - -/*! @cond Doxygen ignores this part */ -/* - * C23 __STDC_VERSION__ number hasn't been specified yet. For now - * leave as `201711L` (C17 + 1). - * TODO: Update to correct value when its been specified. - */ -#define XXH_C23_VN 201711L -/*! @endcond */ - -/*! @cond Doxygen ignores this part */ -/* C-language Attributes are added in C23. 
 */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
-# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
-#else
-# define XXH_HAS_C_ATTRIBUTE(x) 0
-#endif
-/*! @endcond */
-
-/*! @cond Doxygen ignores this part */
-#if defined(__cplusplus) && defined(__has_cpp_attribute)
-# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
-#else
-# define XXH_HAS_CPP_ATTRIBUTE(x) 0
-#endif
-/*! @endcond */
-
-/*! @cond Doxygen ignores this part */
-/*
- * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
- * introduced in CPP17 and C23.
- * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
- * C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
- */
-#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
-# define XXH_FALLTHROUGH [[fallthrough]]
-#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
-# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
-#else
-# define XXH_FALLTHROUGH /* fallthrough */
-#endif
-/*! @endcond */
-
-/*! @cond Doxygen ignores this part */
-/*
- * Define XXH_NOESCAPE for annotated pointers in public API.
- * https://clang.llvm.org/docs/AttributeReference.html#noescape
- * As of writing this, only supported by clang.
- */
-#if XXH_HAS_ATTRIBUTE(noescape)
-# define XXH_NOESCAPE __attribute__((__noescape__))
-#else
-# define XXH_NOESCAPE
-#endif
-/*! @endcond */
-
-
-/*!
- * @}
- * @ingroup public
- * @{
- */
-
-#ifndef XXH_NO_LONG_LONG
-/*-**********************************************************************
-*  64-bit hash
-************************************************************************/
-#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
-/*!
- * @brief An unsigned 64-bit integer.
- *
- * Not necessarily defined to `uint64_t` but functionally equivalent.
- */
-typedef uint64_t XXH64_hash_t;
-#elif !defined (__VMS) \
-  && (defined (__cplusplus) \
-  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
-#  ifdef _AIX
-#    include <inttypes.h>
-#  else
-#    include <stdint.h>
-#  endif
-   typedef uint64_t XXH64_hash_t;
-#else
-#  include <limits.h>
-#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
-     /* LP64 ABI says uint64_t is unsigned long */
-     typedef unsigned long XXH64_hash_t;
-#  else
-     /* the following type must have a width of 64-bit */
-     typedef unsigned long long XXH64_hash_t;
-#  endif
-#endif
-
-/*!
- * @}
- *
- * @defgroup XXH64_family XXH64 family
- * @ingroup public
- * @{
- * Contains functions used in the classic 64-bit xxHash algorithm.
- *
- * @note
- *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
- *   and offers true 64/128 bit hash results.
- *   It provides better speed for systems with vector processing capabilities.
- */
-
-/*!
- * @brief Calculates the 64-bit hash of @p input using xxHash64.
- *
- * @param input The block of data to be hashed, at least @p length bytes in size.
- * @param length The length of @p input, in bytes.
- * @param seed The 64-bit seed to alter the hash's output predictably.
- *
- * @pre
- *   The memory between @p input and @p input + @p length must be valid,
- *   readable, contiguous memory. However, if @p length is `0`, @p input may be
- *   `NULL`. In C++, this also must be *TriviallyCopyable*.
- *
- * @return The calculated 64-bit xxHash64 value.
- *
- * @see @ref single_shot_example "Single Shot Example" for an example.
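- *
- * For instance (a minimal sketch; `buffer` and `bufferSize` stand in for the
- * caller's data):
- * @code{.c}
- * XXH64_hash_t const h = XXH64(buffer, bufferSize, 0);  // seed = 0
- * @endcode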
- */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); - -/******* Streaming *******/ -#ifndef XXH_NO_STREAM -/*! - * @brief The opaque state struct for the XXH64 streaming API. - * - * @see XXH64_state_s for details. - * @see @ref streaming_example "Streaming Example" - */ -typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ - -/*! - * @brief Allocates an @ref XXH64_state_t. - * - * @return An allocated pointer of @ref XXH64_state_t on success. - * @return `NULL` on failure. - * - * @note Must be freed with XXH64_freeState(). - * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); - -/*! - * @brief Frees an @ref XXH64_state_t. - * - * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). - * - * @return @ref XXH_OK. - * - * @note @p statePtr must be allocated with XXH64_createState(). - * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); - -/*! - * @brief Copies one @ref XXH64_state_t to another. - * - * @param dst_state The state to copy to. - * @param src_state The state to copy from. - * @pre - * @p dst_state and @p src_state must not be `NULL` and must not overlap. - */ -XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); - -/*! - * @brief Resets an @ref XXH64_state_t to begin a new hash. - * - * @param statePtr The state struct to reset. - * @param seed The 64-bit seed to alter the hash result predictably. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return @ref XXH_OK on success. - * @return @ref XXH_ERROR on failure. - * - * @note This function resets and seeds a state. Call it before @ref XXH64_update(). - * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); - -/*! - * @brief Consumes a block of @p input to an @ref XXH64_state_t. - * - * @param statePtr The state struct to update. - * @param input The block of data to be hashed, at least @p length bytes in size. - * @param length The length of @p input, in bytes. - * - * @pre - * @p statePtr must not be `NULL`. - * @pre - * The memory between @p input and @p input + @p length must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p input may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * @return @ref XXH_OK on success. - * @return @ref XXH_ERROR on failure. - * - * @note Call this to incrementally consume blocks of data. - * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); - -/*! - * @brief Returns the calculated hash value from an @ref XXH64_state_t. - * - * @param statePtr The state struct to calculate the hash from. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return The calculated 64-bit xxHash64 value from that state. - * - * @note - * Calling XXH64_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); -#endif /* !XXH_NO_STREAM */ -/******* Canonical representation *******/ - -/*! 
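- * A canonical round-trip sketch (illustrative only):
- * @code{.c}
- * XXH64_canonical_t c;
- * XXH64_canonicalFromHash(&c, XXH64("abc", 3, 0));      // big endian bytes in c.digest
- * XXH64_hash_t const h = XXH64_hashFromCanonical(&c);   // back to the native value
- * @endcode
- */
-/*!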
- * @brief Canonical (big endian) representation of @ref XXH64_hash_t. - */ -typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; - -/*! - * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. - * - * @param dst The @ref XXH64_canonical_t pointer to be stored to. - * @param hash The @ref XXH64_hash_t to be converted. - * - * @pre - * @p dst must not be `NULL`. - * - * @see @ref canonical_representation_example "Canonical Representation Example" - */ -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); - -/*! - * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. - * - * @param src The @ref XXH64_canonical_t to convert. - * - * @pre - * @p src must not be `NULL`. - * - * @return The converted hash. - * - * @see @ref canonical_representation_example "Canonical Representation Example" - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); - -#ifndef XXH_NO_XXH3 - -/*! - * @} - * ************************************************************************ - * @defgroup XXH3_family XXH3 family - * @ingroup public - * @{ - * - * XXH3 is a more recent hash algorithm featuring: - * - Improved speed for both small and large inputs - * - True 64-bit and 128-bit outputs - * - SIMD acceleration - * - Improved 32-bit viability - * - * Speed analysis methodology is explained here: - * - * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html - * - * Compared to XXH64, expect XXH3 to run approximately - * ~2x faster on large inputs and >3x faster on small ones, - * exact differences vary depending on platform. - * - * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, - * but does not require it. - * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 - * at competitive speeds, even without vector support. Further details are - * explained in the implementation. - * - * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD - * implementations for many common platforms: - * - AVX512 - * - AVX2 - * - SSE2 - * - ARM NEON - * - WebAssembly SIMD128 - * - POWER8 VSX - * - s390x ZVector - * This can be controlled via the @ref XXH_VECTOR macro, but it automatically - * selects the best version according to predefined macros. For the x86 family, an - * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. - * - * XXH3 implementation is portable: - * it has a generic C90 formulation that can be compiled on any platform, - * all implementations generate exactly the same hash value on all platforms. - * Starting from v0.8.0, it's also labelled "stable", meaning that - * any future version will also generate the same hash value. - * - * XXH3 offers 2 variants, _64bits and _128bits. - * - * When only 64 bits are needed, prefer invoking the _64bits variant, as it - * reduces the amount of mixing, resulting in faster speed on small inputs. - * It's also generally simpler to manipulate a scalar return type than a struct. - * - * The API supports one-shot hashing, streaming mode, and custom secrets. - */ -/*-********************************************************************** -* XXH3 64-bit variant -************************************************************************/ - -/*! - * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input. - * - * @param input The block of data to be hashed, at least @p length bytes in size. 
- * @param length The length of @p input, in bytes. - * - * @pre - * The memory between @p input and @p input + @p length must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p input may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * @return The calculated 64-bit XXH3 hash value. - * - * @note - * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however - * it may have slightly better performance due to constant propagation of the - * defaults. - * - * @see - * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants - * @see @ref single_shot_example "Single Shot Example" for an example. - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); - -/*! - * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input. - * - * @param input The block of data to be hashed, at least @p length bytes in size. - * @param length The length of @p input, in bytes. - * @param seed The 64-bit seed to alter the hash result predictably. - * - * @pre - * The memory between @p input and @p input + @p length must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p input may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * @return The calculated 64-bit XXH3 hash value. - * - * @note - * seed == 0 produces the same results as @ref XXH3_64bits(). - * - * This variant generates a custom secret on the fly based on default secret - * altered using the @p seed value. - * - * While this operation is decently fast, note that it's not completely free. - * - * @see @ref single_shot_example "Single Shot Example" for an example. - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); - -/*! - * The bare minimum size for a custom secret. - * - * @see - * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), - * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). - */ -#define XXH3_SECRET_SIZE_MIN 136 - -/*! - * @brief Calculates 64-bit variant of XXH3 with a custom "secret". - * - * @param data The block of data to be hashed, at least @p len bytes in size. - * @param len The length of @p data, in bytes. - * @param secret The secret data. - * @param secretSize The length of @p secret, in bytes. - * - * @return The calculated 64-bit XXH3 hash value. - * - * @pre - * The memory between @p data and @p data + @p len must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p data may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * It's possible to provide any blob of bytes as a "secret" to generate the hash. - * This makes it more difficult for an external actor to prepare an intentional collision. - * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). - * However, the quality of the secret impacts the dispersion of the hash algorithm. - * Therefore, the secret _must_ look like a bunch of random bytes. - * Avoid "trivial" or structured data such as repeated sequences or a text document. - * Whenever in doubt about the "randomness" of the blob of bytes, - * consider employing @ref XXH3_generateSecret() instead (see below). - * It will generate a proper high entropy secret derived from the blob of bytes. - * Another advantage of using XXH3_generateSecret() is that - * it guarantees that all bits within the initial blob of bytes - * will impact every bit of the output. 
- * This is not necessarily the case when using the blob of bytes directly
- * because, when hashing _small_ inputs, only a portion of the secret is employed.
- *
- * @see @ref single_shot_example "Single Shot Example" for an example.
- */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
-
-
-/*******   Streaming   *******/
-#ifndef XXH_NO_STREAM
-/*
- * Streaming requires state maintenance.
- * This operation costs memory and CPU.
- * As a consequence, streaming is slower than one-shot hashing.
- * For better performance, prefer one-shot functions whenever applicable.
- */
-
-/*!
- * @brief The opaque state struct for the XXH3 streaming API.
- *
- * @see XXH3_state_s for details.
- * @see @ref streaming_example "Streaming Example"
- */
-typedef struct XXH3_state_s XXH3_state_t;
-XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
-XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
-
-/*!
- * @brief Copies one @ref XXH3_state_t to another.
- *
- * @param dst_state The state to copy to.
- * @param src_state The state to copy from.
- * @pre
- *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
- */
-XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
-
-/*!
- * @brief Resets an @ref XXH3_state_t to begin a new hash.
- *
- * @param statePtr The state struct to reset.
- *
- * @pre
- *   @p statePtr must not be `NULL`.
- *
- * @return @ref XXH_OK on success.
- * @return @ref XXH_ERROR on failure.
- *
- * @note
- *   - This function resets `statePtr` and generates a secret with default parameters.
- *   - Call this function before @ref XXH3_64bits_update().
- *   - Digest will be equivalent to `XXH3_64bits()`.
- *
- * @see @ref streaming_example "Streaming Example"
- *
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
-
-/*!
- * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
- *
- * @param statePtr The state struct to reset.
- * @param seed The 64-bit seed to alter the hash result predictably.
- *
- * @pre
- *   @p statePtr must not be `NULL`.
- *
- * @return @ref XXH_OK on success.
- * @return @ref XXH_ERROR on failure.
- *
- * @note
- *   - This function resets `statePtr` and generates a secret from `seed`.
- *   - Call this function before @ref XXH3_64bits_update().
- *   - Digest will be equivalent to `XXH3_64bits_withSeed()`.
- *
- * @see @ref streaming_example "Streaming Example"
- *
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
-
-/*!
- * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
- *
- * @param statePtr The state struct to reset.
- * @param secret The secret data.
- * @param secretSize The length of @p secret, in bytes.
- *
- * @pre
- *   @p statePtr must not be `NULL`.
- *
- * @return @ref XXH_OK on success.
- * @return @ref XXH_ERROR on failure.
- *
- * @note
- *   `secret` is referenced; it _must outlive_ the hash streaming session.
- *
- *   Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
- *   and the quality of produced hash values depends on secret's entropy
- *   (secret's content should look like a bunch of random bytes).
- *   When in doubt about the randomness of a candidate `secret`,
- *   consider employing `XXH3_generateSecret()` instead (see below).
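- *
- *   A minimal streaming sketch (assuming `secret` holds at least
- *   @ref XXH3_SECRET_SIZE_MIN high-entropy bytes that outlive the session;
- *   `data` and `dataSize` stand in for the caller's input):
- * @code{.c}
- * XXH3_state_t* const st = XXH3_createState();
- * XXH3_64bits_reset_withSecret(st, secret, secretSize);
- * XXH3_64bits_update(st, data, dataSize);
- * XXH64_hash_t const h = XXH3_64bits_digest(st);
- * XXH3_freeState(st);
- * @endcode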
- *
- * @see @ref streaming_example "Streaming Example"
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
-
-/*!
- * @brief Consumes a block of @p input to an @ref XXH3_state_t.
- *
- * @param statePtr The state struct to update.
- * @param input The block of data to be hashed, at least @p length bytes in size.
- * @param length The length of @p input, in bytes.
- *
- * @pre
- *   @p statePtr must not be `NULL`.
- * @pre
- *   The memory between @p input and @p input + @p length must be valid,
- *   readable, contiguous memory. However, if @p length is `0`, @p input may be
- *   `NULL`. In C++, this also must be *TriviallyCopyable*.
- *
- * @return @ref XXH_OK on success.
- * @return @ref XXH_ERROR on failure.
- *
- * @note Call this to incrementally consume blocks of data.
- *
- * @see @ref streaming_example "Streaming Example"
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
-
-/*!
- * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.
- *
- * @param statePtr The state struct to calculate the hash from.
- *
- * @pre
- *   @p statePtr must not be `NULL`.
- *
- * @return The calculated XXH3 64-bit hash value from that state.
- *
- * @note
- *   Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,
- *   digest, and update again.
- *
- * @see @ref streaming_example "Streaming Example"
- */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
-#endif /* !XXH_NO_STREAM */
-
-/* note : canonical representation of XXH3 is the same as XXH64
- * since they both produce XXH64_hash_t values */
-
-
-/*-**********************************************************************
-*  XXH3 128-bit variant
-************************************************************************/
-
-/*!
- * @brief The return value from 128-bit hashes.
- *
- * Stored in little endian order, although the fields themselves are in native
- * endianness.
- */
-typedef struct {
-    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
-    XXH64_hash_t high64;  /*!< `value >> 64` */
-} XXH128_hash_t;
-
-/*!
- * @brief Calculates 128-bit unseeded variant of XXH3 of @p data.
- *
- * @param data The block of data to be hashed, at least @p len bytes in size.
- * @param len The length of @p data, in bytes.
- *
- * @return The calculated 128-bit variant of XXH3 value.
- *
- * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
- * for shorter inputs.
- *
- * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however
- * it may have slightly better performance due to constant propagation of the
- * defaults.
- *
- * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
- * @see @ref single_shot_example "Single Shot Example" for an example.
- */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
-/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
- *
- * @param data The block of data to be hashed, at least @p len bytes in size.
- * @param len The length of @p data, in bytes.
- * @param seed The 64-bit seed to alter the hash result predictably.
- *
- * @return The calculated 128-bit variant of XXH3 value.
- *
- * @note
- *   seed == 0 produces the same results as @ref XXH3_128bits().
- * - * This variant generates a custom secret on the fly based on default secret - * altered using the @p seed value. - * - * While this operation is decently fast, note that it's not completely free. - * - * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants - * @see @ref single_shot_example "Single Shot Example" for an example. - */ -XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); -/*! - * @brief Calculates 128-bit variant of XXH3 with a custom "secret". - * - * @param data The block of data to be hashed, at least @p len bytes in size. - * @param len The length of @p data, in bytes. - * @param secret The secret data. - * @param secretSize The length of @p secret, in bytes. - * - * @return The calculated 128-bit variant of XXH3 value. - * - * It's possible to provide any blob of bytes as a "secret" to generate the hash. - * This makes it more difficult for an external actor to prepare an intentional collision. - * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). - * However, the quality of the secret impacts the dispersion of the hash algorithm. - * Therefore, the secret _must_ look like a bunch of random bytes. - * Avoid "trivial" or structured data such as repeated sequences or a text document. - * Whenever in doubt about the "randomness" of the blob of bytes, - * consider employing @ref XXH3_generateSecret() instead (see below). - * It will generate a proper high entropy secret derived from the blob of bytes. - * Another advantage of using XXH3_generateSecret() is that - * it guarantees that all bits within the initial blob of bytes - * will impact every bit of the output. - * This is not necessarily the case when using the blob of bytes directly - * because, when hashing _small_ inputs, only a portion of the secret is employed. - * - * @see @ref single_shot_example "Single Shot Example" for an example. - */ -XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); - -/******* Streaming *******/ -#ifndef XXH_NO_STREAM -/* - * Streaming requires state maintenance. - * This operation costs memory and CPU. - * As a consequence, streaming is slower than one-shot hashing. - * For better performance, prefer one-shot functions whenever applicable. - * - * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). - * Use already declared XXH3_createState() and XXH3_freeState(). - * - * All reset and streaming functions have same meaning as their 64-bit counterpart. - */ - -/*! - * @brief Resets an @ref XXH3_state_t to begin a new hash. - * - * @param statePtr The state struct to reset. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return @ref XXH_OK on success. - * @return @ref XXH_ERROR on failure. - * - * @note - * - This function resets `statePtr` and generate a secret with default parameters. - * - Call it before @ref XXH3_128bits_update(). - * - Digest will be equivalent to `XXH3_128bits()`. - * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); - -/*! - * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. - * - * @param statePtr The state struct to reset. - * @param seed The 64-bit seed to alter the hash result predictably. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return @ref XXH_OK on success. 
- * @return @ref XXH_ERROR on failure.
- *
- * @note
- *   - This function resets `statePtr` and generates a secret from `seed`.
- *   - Call it before @ref XXH3_128bits_update().
- *   - Digest will be equivalent to `XXH3_128bits_withSeed()`.
- *
- * @see @ref streaming_example "Streaming Example"
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
-/*!
- * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
- *
- * @param statePtr The state struct to reset.
- * @param secret The secret data.
- * @param secretSize The length of @p secret, in bytes.
- *
- * @pre
- *   @p statePtr must not be `NULL`.
- *
- * @return @ref XXH_OK on success.
- * @return @ref XXH_ERROR on failure.
- *
- * `secret` is referenced; it _must outlive_ the hash streaming session.
- * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
- * and the quality of produced hash values depends on secret's entropy
- * (secret's content should look like a bunch of random bytes).
- * When in doubt about the randomness of a candidate `secret`,
- * consider employing `XXH3_generateSecret()` instead (see below).
- *
- * @see @ref streaming_example "Streaming Example"
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
-
-/*!
- * @brief Consumes a block of @p input to an @ref XXH3_state_t.
- *
- * Call this to incrementally consume blocks of data.
- *
- * @param statePtr The state struct to update.
- * @param input The block of data to be hashed, at least @p length bytes in size.
- * @param length The length of @p input, in bytes.
- *
- * @pre
- *   @p statePtr must not be `NULL`.
- *
- * @return @ref XXH_OK on success.
- * @return @ref XXH_ERROR on failure.
- *
- * @note
- *   The memory between @p input and @p input + @p length must be valid,
- *   readable, contiguous memory. However, if @p length is `0`, @p input may be
- *   `NULL`. In C++, this also must be *TriviallyCopyable*.
- *
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
-
-/*!
- * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.
- *
- * @param statePtr The state struct to calculate the hash from.
- *
- * @pre
- *   @p statePtr must not be `NULL`.
- *
- * @return The calculated XXH3 128-bit hash value from that state.
- *
- * @note
- *   Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,
- *   digest, and update again.
- *
- */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
-#endif /* !XXH_NO_STREAM */
-
-/* The following helper functions make it possible to compare XXH128_hash_t values.
- * Since XXH128_hash_t is a structure, this capability is not offered by the language.
- * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
-
-/*!
- * @brief Check equality of two XXH128_hash_t values
- *
- * @param h1 The 128-bit hash value.
- * @param h2 Another 128-bit hash value.
- *
- * @return `1` if `h1` and `h2` are equal.
- * @return `0` if they are not.
- */
-XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
-
-/*!
- * @brief Compares two @ref XXH128_hash_t
- *
- * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
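- *
- * e.g., a sorting sketch (requires stdlib.h; illustrative only):
- * @code{.c}
- * XXH128_hash_t hashes[16];
- * // ... fill hashes ...
- * qsort(hashes, 16, sizeof(hashes[0]), XXH128_cmp);
- * @endcode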
- * - * @param h128_1 Left-hand side value - * @param h128_2 Right-hand side value - * - * @return >0 if @p h128_1 > @p h128_2 - * @return =0 if @p h128_1 == @p h128_2 - * @return <0 if @p h128_1 < @p h128_2 - */ -XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2); - - -/******* Canonical representation *******/ -typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; - - -/*! - * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t. - * - * @param dst The @ref XXH128_canonical_t pointer to be stored to. - * @param hash The @ref XXH128_hash_t to be converted. - * - * @pre - * @p dst must not be `NULL`. - * @see @ref canonical_representation_example "Canonical Representation Example" - */ -XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); - -/*! - * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t. - * - * @param src The @ref XXH128_canonical_t to convert. - * - * @pre - * @p src must not be `NULL`. - * - * @return The converted hash. - * @see @ref canonical_representation_example "Canonical Representation Example" - */ -XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src); - - -#endif /* !XXH_NO_XXH3 */ -#endif /* XXH_NO_LONG_LONG */ - -/*! - * @} - */ -#endif /* XXHASH_H_5627135585666179 */ - - - -#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) -#define XXHASH_H_STATIC_13879238742 -/* **************************************************************************** - * This section contains declarations which are not guaranteed to remain stable. - * They may change in future versions, becoming incompatible with a different - * version of the library. - * These declarations should only be used with static linking. - * Never use them in association with dynamic linking! - ***************************************************************************** */ - -/* - * These definitions are only present to allow static allocation - * of XXH states, on stack or in a struct, for example. - * Never **ever** access their members directly. - */ - -/*! - * @internal - * @brief Structure for XXH32 streaming API. - * - * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, - * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is - * an opaque type. This allows fields to safely be changed. - * - * Typedef'd to @ref XXH32_state_t. - * Do not access the members of this struct directly. - * @see XXH64_state_s, XXH3_state_s - */ -struct XXH32_state_s { - XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ - XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ - XXH32_hash_t v[4]; /*!< Accumulator lanes */ - XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */ - XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ - XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ -}; /* typedef'd to XXH32_state_t */ - - -#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ - -/*! - * @internal - * @brief Structure for XXH64 streaming API. - * - * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, - * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is - * an opaque type. This allows fields to safely be changed. - * - * Typedef'd to @ref XXH64_state_t. 
- * Do not access the members of this struct directly.
- * @see XXH32_state_s, XXH3_state_s
- */
-struct XXH64_state_s {
-   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
-   XXH64_hash_t v[4];         /*!< Accumulator lanes */
-   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
-   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
-   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyway */
-   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
-};  /* typedef'd to XXH64_state_t */
-
-#ifndef XXH_NO_XXH3
-
-/* Windows SDK under 10.0.22000 is missing stdalign.h so we add a check
-   before allowing the windows compiler to use the C11 form.
-   Reference: https://github.com/Cyan4973/xxHash/issues/955 */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) \
-    && (defined(_MSC_VER) && (_MSC_VER >= 1000) || !defined(_MSC_VER)) /* >= C11 */
-#  include <stdalign.h>
-#  define XXH_ALIGN(n)      alignas(n)
-#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
-/* In C++ alignas() is a keyword */
-#  define XXH_ALIGN(n)      alignas(n)
-#elif defined(__GNUC__)
-#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
-#elif defined(_MSC_VER)
-#  define XXH_ALIGN(n)      __declspec(align(n))
-#else
-#  define XXH_ALIGN(n)   /* disabled */
-#endif
-
-/* Old GCC versions only accept the attribute after the type in structures. */
-#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
-    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
-    && defined(__GNUC__)
-#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
-#else
-#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
-#endif
-
-/*!
- * @brief The size of the internal XXH3 buffer.
- *
- * This is the optimal update size for incremental hashing.
- *
- * @see XXH3_64bits_update(), XXH3_128bits_update().
- */
-#define XXH3_INTERNALBUFFER_SIZE 256
-
-/*!
- * @internal
- * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
- *
- * This is the size used in @ref XXH3_kSecret and the seeded functions.
- *
- * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
- */
-#define XXH3_SECRET_DEFAULT_SIZE 192
-
-/*!
- * @internal
- * @brief Structure for XXH3 streaming API.
- *
- * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
- * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
- * Otherwise it is an opaque type.
- * Never use this definition in combination with a dynamic library.
- * This allows fields to safely be changed in the future.
- *
- * @note ** This structure has a strict alignment requirement of 64 bytes!! **
- * Do not allocate this with `malloc()` or `new`,
- * it will not be sufficiently aligned.
- * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
- *
- * Typedef'd to @ref XXH3_state_t.
- * Never access the members of this struct directly.
- *
- * @see XXH3_INITSTATE() for stack initialization.
- * @see XXH3_createState(), XXH3_freeState().
- * @see XXH32_state_s, XXH64_state_s
- */
-struct XXH3_state_s {
-   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
-       /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
-   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
-       /*!< Used to store a custom secret generated from a seed. */
-   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
-       /*!< The internal buffer.
 @see XXH32_state_s::mem32 */
-   XXH32_hash_t bufferedSize;
-       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
-   XXH32_hash_t useSeed;
-       /*!< Reserved field. Needed for padding on 64-bit. */
-   size_t nbStripesSoFar;
-       /*!< Number of stripes processed. */
-   XXH64_hash_t totalLen;
-       /*!< Total length hashed. 64-bit even on 32-bit targets. */
-   size_t nbStripesPerBlock;
-       /*!< Number of stripes per block. */
-   size_t secretLimit;
-       /*!< Size of @ref customSecret or @ref extSecret */
-   XXH64_hash_t seed;
-       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
-   XXH64_hash_t reserved64;
-       /*!< Reserved field. */
-   const unsigned char* extSecret;
-       /*!< Reference to an external secret for the _withSecret variants, NULL
-        *   for other variants. */
-   /* note: there may be some padding at the end due to alignment on 64 bytes */
-}; /* typedef'd to XXH3_state_t */
-
-#undef XXH_ALIGN_MEMBER
-
-/*!
- * @brief Initializes a stack-allocated `XXH3_state_s`.
- *
- * When the @ref XXH3_state_t structure is merely emplaced on stack,
- * it should be initialized with XXH3_INITSTATE() or a memset()
- * in case its first reset uses XXH3_NNbits_reset_withSeed().
- * This init can be omitted if the first reset uses default or _withSecret mode.
- * This operation isn't necessary when the state is created with XXH3_createState().
- * Note that this doesn't prepare the state for a streaming operation,
- * it's still necessary to use XXH3_NNbits_reset*() afterwards.
- */
-#define XXH3_INITSTATE(XXH3_state_ptr)                       \
-    do {                                                     \
-        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
-        tmp_xxh3_state_ptr->seed = 0;                        \
-        tmp_xxh3_state_ptr->extSecret = NULL;                \
-    } while(0)
-
-
-/*!
- * @brief Calculates the 128-bit hash of @p data using XXH3.
- *
- * @param data The block of data to be hashed, at least @p len bytes in size.
- * @param len The length of @p data, in bytes.
- * @param seed The 64-bit seed to alter the hash's output predictably.
- *
- * @pre
- *   The memory between @p data and @p data + @p len must be valid,
- *   readable, contiguous memory. However, if @p len is `0`, @p data may be
- *   `NULL`. In C++, this also must be *TriviallyCopyable*.
- *
- * @return The calculated 128-bit XXH3 value.
- *
- * @see @ref single_shot_example "Single Shot Example" for an example.
- */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
-
-
-/* ===   Experimental API   === */
-/* Symbols defined below must be considered tied to a specific library version. */
-
-/*!
- * @brief Derives a high-entropy secret from any user-defined content, named customSeed.
- *
- * @param secretBuffer    A writable buffer for derived high-entropy secret data.
- * @param secretSize      Size of secretBuffer, in bytes.  Must be >= XXH3_SECRET_SIZE_MIN.
- * @param customSeed      A user-defined content.
- * @param customSeedSize  Size of customSeed, in bytes.
- *
- * @return @ref XXH_OK on success.
- * @return @ref XXH_ERROR on failure.
- *
- * The generated secret can be used in combination with `*_withSecret()` functions.
- * The `_withSecret()` variants are useful to provide a higher level of protection
- * than a 64-bit seed, as it becomes much more difficult for an external actor to
- * guess how to impact the calculation logic.
- *
- * The function accepts as input a custom seed of any length and any content,
- * and derives from it a high-entropy secret of length @p secretSize into an
- * already allocated buffer @p secretBuffer.
- *
- * The generated secret can then be used with any `*_withSecret()` variant.
- * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
- * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
- * are part of this list. They all accept a `secret` parameter
- * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
- * _and_ feature very high entropy (consist of random-looking bytes).
- * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
- * be employed to ensure proper quality.
- *
- * @p customSeed can be anything. It can have any size, even small ones,
- * and its content can be anything, even "poor entropy" sources such as a bunch
- * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
- *
- * @pre
- *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
- *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
- *
- * Example code:
- * @code{.c}
- * #include <stdio.h>
- * #include <string.h>
- * #include <stdlib.h>
- * #define XXH_STATIC_LINKING_ONLY // expose unstable API
- * #include "xxhash.h"
- * // Hashes argv[2] using the entropy from argv[1].
- * int main(int argc, char* argv[])
- * {
- *     char secret[XXH3_SECRET_SIZE_MIN];
- *     if (argc != 3) { return 1; }
- *     XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
- *     XXH64_hash_t h = XXH3_64bits_withSecret(
- *          argv[2], strlen(argv[2]),
- *          secret, sizeof(secret)
- *     );
- *     printf("%016llx\n", (unsigned long long) h);
- * }
- * @endcode
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
-
-/*!
- * @brief Generates the same secret as the _withSeed() variants.
- *
- * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes
- * @param seed The 64-bit seed to alter the hash result predictably.
- *
- * The generated secret can be used in combination with
- * `*_withSecret()` and `_withSecretandSeed()` variants.
- *
- * Example C++ `std::string` hash class:
- * @code{.cpp}
- * #include <string>
- * #define XXH_STATIC_LINKING_ONLY // expose unstable API
- * #include "xxhash.h"
- * // Slow, seeds each time
- * class HashSlow {
- *     XXH64_hash_t seed;
- * public:
- *     HashSlow(XXH64_hash_t s) : seed{s} {}
- *     size_t operator()(const std::string& x) const {
- *         return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
- *     }
- * };
- * // Fast, caches the seeded secret for future uses.
- * class HashFast {
- *     unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
- * public:
- *     HashFast(XXH64_hash_t s) {
- *         XXH3_generateSecret_fromSeed(secret, s);
- *     }
- *     size_t operator()(const std::string& x) const {
- *         return size_t{
- *             XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
- *         };
- *     }
- * };
- * @endcode
- */
-XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
-
-/*!
- * @brief Maximum size of "short" key in bytes.
- */
-#define XXH3_MIDSIZE_MAX 240
-
-/*!
- * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data.
- *
- * @param data The block of data to be hashed, at least @p len bytes in size.
- * @param len The length of @p data, in bytes.
- * @param secret The secret data.
- * @param secretSize The length of @p secret, in bytes.
- * @param seed The 64-bit seed to alter the hash result predictably.
- *
- * These variants generate hash values using either:
- * - @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes)
- * - @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX).
- *
- * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
- * `_withSeed()` has to generate the secret on the fly for "large" keys.
- * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
- * `_withSecret()` has to generate the masks on the fly for "small" keys,
- * which requires more instructions than _withSeed() variants.
- * Therefore, the _withSecretandSeed() variant combines the best of both worlds.
- *
- * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
- * this variant produces *exactly* the same results as the `_withSeed()` variant,
- * hence offering only a pure speed benefit on "large" input,
- * by skipping the need to regenerate the secret for every large input.
- *
- * Another usage scenario is to hash the secret to a 64-bit hash value,
- * for example with XXH3_64bits(), which then becomes the seed,
- * and then employ both the seed and the secret in _withSecretandSeed().
- * On top of speed, an added benefit is that each bit in the secret
- * has a 50% chance to swap each bit in the output, via its impact to the seed.
- *
- * This is not guaranteed when using the secret directly in "small data" scenarios,
- * because only portions of the secret are employed for small data.
- */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
-XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
-                              XXH_NOESCAPE const void* secret, size_t secretSize,
-                              XXH64_hash_t seed);
-
-/*!
- * @brief Calculates 128-bit seeded variant of XXH3 hash of @p input.
- *
- * @param input The memory segment to be hashed, at least @p length bytes in size.
- * @param length The length of @p input, in bytes.
- * @param secret The secret used to alter the hash result predictably.
- * @param secretSize The length of @p secret, in bytes (must be >= XXH3_SECRET_SIZE_MIN)
- * @param seed64 The 64-bit seed to alter the hash result predictably.
- *
- * @return The calculated 128-bit XXH3 hash value.
- *
- * @see XXH3_64bits_withSecretandSeed(): contract is the same.
- */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
-XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
-                               XXH_NOESCAPE const void* secret, size_t secretSize,
-                               XXH64_hash_t seed64);
-
-#ifndef XXH_NO_STREAM
-/*!
- * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
- *
- * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
- * @param secret The secret data.
- * @param secretSize The length of @p secret, in bytes.
- * @param seed64 The 64-bit seed to alter the hash result predictably.
- *
- * @return @ref XXH_OK on success.
- * @return @ref XXH_ERROR on failure.
- *
- * @see XXH3_64bits_withSecretandSeed(). Contract is identical.
- */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
-                                    XXH_NOESCAPE const void* secret, size_t secretSize,
-                                    XXH64_hash_t seed64);
-
-/*!
- * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
- *
- * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
- * @param secret The secret data.
- * @param secretSize The length of @p secret, in bytes.
- * @param seed64 The 64-bit seed to alter the hash result predictably.
- *
- * @return @ref XXH_OK on success.
- * @return @ref XXH_ERROR on failure.
- *
- * @see XXH3_64bits_withSecretandSeed(). Contract is identical.
- *
- * Note: there was a bug in an earlier version of this function (<= v0.8.2)
- * that would make it generate an incorrect hash value
- * when @p seed == 0 and @p length < XXH3_MIDSIZE_MAX
- * and @p secret is different from XXH3_generateSecret_fromSeed().
- * As stated in the contract, the correct hash result must be
- * the same as XXH3_128bits_withSeed() when @p length <= XXH3_MIDSIZE_MAX.
- * Results generated by this older version are wrong, hence not comparable.
- */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
-                                     XXH_NOESCAPE const void* secret, size_t secretSize,
-                                     XXH64_hash_t seed64);
-
-#endif /* !XXH_NO_STREAM */
-
-#endif /* !XXH_NO_XXH3 */
-#endif /* XXH_NO_LONG_LONG */
-#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
-#  define XXH_IMPLEMENTATION
-#endif
-
-#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
-
-
-/* ======================================================================== */
-/* ======================================================================== */
-/* ======================================================================== */
-
-
-/*-**********************************************************************
- * xxHash implementation
- *-**********************************************************************
- * xxHash's implementation used to be hosted inside xxhash.c.
- *
- * However, inlining requires the implementation to be visible to the compiler,
- * hence be included alongside the header.
- * Previously, the implementation was hosted inside xxhash.c,
- * which was then #included when inlining was activated.
- * This construction created issues with a few build and install systems,
- * as it required xxhash.c to be stored in the /include directory.
- *
- * The xxHash implementation is now directly integrated within xxhash.h.
- * As a consequence, xxhash.c is no longer needed in /include.
- *
- * xxhash.c is still available and is still useful.
- * In a "normal" setup, when xxhash is not inlined,
- * xxhash.h only exposes the prototypes and public symbols,
- * while xxhash.c can be built into an object file xxhash.o
- * which can then be linked into the final binary.
- ************************************************************************/
-
-#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
-   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
-#  define XXH_IMPLEM_13a8737387
-
-/* *************************************
-*  Tuning parameters
-***************************************/
-
-/*!
- * @defgroup tuning Tuning parameters
- * @{
- *
- * Various macros to control xxHash's behavior.
- */
-#ifdef XXH_DOXYGEN
-/*!
- * @brief Define this to disable 64-bit code.
- *
- * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
- */
-#  define XXH_NO_LONG_LONG
-#  undef XXH_NO_LONG_LONG /* don't actually */
-/*!
- * @brief Controls how unaligned memory is accessed.
- *
- * By default, access to unaligned memory is controlled by `memcpy()`, which is
- * safe and portable.
- *
- * Unfortunately, on some target/compiler combinations, the generated assembly
- * is sub-optimal.
- *
- * The switch below allows selection of a different access method
- * in the search for improved performance.
- *
- * @par Possible options:
- *
- * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
- *   @par
- *     Use `memcpy()`.
Safe and portable. Note that most modern compilers will
- *     eliminate the function call and treat it as an unaligned access.
- *
- * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
- *   @par
- *     Depends on compiler extensions and is therefore not portable.
- *     This method is safe _if_ your compiler supports it,
- *     and *generally* as fast or faster than `memcpy`.
- *
- * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
- *   @par
- *     Casts directly and dereferences. This method doesn't depend on the
- *     compiler, but it violates the C standard as it directly dereferences an
- *     unaligned pointer. It can generate buggy code on targets which do not
- *     support unaligned memory accesses, but in some circumstances, it's the
- *     only known way to get the most performance.
- *
- * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
- *   @par
- *     Also portable. This can generate the best code on old compilers which don't
- *     inline small `memcpy()` calls, and it might also be faster on big-endian
- *     systems which lack a native byteswap instruction. However, some compilers
- *     will emit literal byteshifts even if the target supports unaligned access.
- *
- *
- * @warning
- *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
- *   care, as what works on one compiler/platform/optimization level may cause
- *   another to read garbage data or even crash.
- *
- * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
- *
- * Prefer these methods in priority order (0 > 3 > 1 > 2).
- */
-#  define XXH_FORCE_MEMORY_ACCESS 0
-
-/*!
- * @def XXH_SIZE_OPT
- * @brief Controls how much xxHash optimizes for size.
- *
- * xxHash, when compiled, tends to result in a rather large binary size. This
- * is mostly due to heavy usage of forced inlining and constant folding of the
- * @ref XXH3_family to increase performance.
- *
- * However, some developers prefer size over speed. This option can
- * significantly reduce the size of the generated code. When using the `-Os`
- * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
- * otherwise it is defined to 0.
- *
- * Most of these size optimizations can be controlled manually.
- *
- * This is a number from 0-2.
- * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
- *   comes first.
- * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
- *   conservative and disables hacks that increase code size. It implies the
- *   options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
- *   and @ref XXH3_NEON_LANES == 8 if they are not already defined.
- * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
- *   Performance may cry. For example, the single shot functions just use the
- *   streaming API.
- */
-#  define XXH_SIZE_OPT 0
-
-/*!
- * @def XXH_FORCE_ALIGN_CHECK
- * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
- * and XXH64() only).
- *
- * This is an important performance trick for architectures without decent
- * unaligned memory access performance.
- *
- * It checks for input alignment, and when conditions are met, uses a "fast
- * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
- * faster_ read speed.
- *
- * The check costs one initial branch per hash, which is generally negligible,
- * but not zero.
- *
- * Moreover, it's not useful to generate an additional code path if memory
- * access uses the same instruction for both aligned and unaligned
- * addresses (e.g.
x86 and aarch64).
- *
- * In these cases, the alignment check can be removed by setting this macro to 0.
- * Then the code will always use unaligned memory access.
- * The align check is automatically disabled on x86, x64, ARM64, and some ARM chips,
- * which are platforms known to offer good unaligned memory access performance.
- *
- * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
- *
- * This option does not affect XXH3 (only XXH32 and XXH64).
- */
-#  define XXH_FORCE_ALIGN_CHECK 0
-
-/*!
- * @def XXH_NO_INLINE_HINTS
- * @brief When non-zero, sets all functions to `static`.
- *
- * By default, xxHash tries to force the compiler to inline almost all internal
- * functions.
- *
- * This can usually improve performance due to reduced jumping and improved
- * constant folding, but significantly increases the size of the binary, which
- * might not be favorable.
- *
- * Additionally, sometimes the forced inlining can be detrimental to performance,
- * depending on the architecture.
- *
- * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
- * compiler full control over whether to inline or not.
- *
- * When not optimizing (-O0), when using `-fno-inline` with GCC or Clang, or if
- * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
- */
-#  define XXH_NO_INLINE_HINTS 0
-
-/*!
- * @def XXH3_INLINE_SECRET
- * @brief Determines whether to inline the XXH3 withSecret code.
- *
- * When the secret size is known, the compiler can improve the performance
- * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
- *
- * However, if the secret size is not known, it doesn't have any benefit. This
- * happens when xxHash is compiled into a global symbol. Therefore, if
- * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
- *
- * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
- * that are *sometimes* force inline on -Og, and it is impossible to automatically
- * detect this optimization level.
- */
-#  define XXH3_INLINE_SECRET 0
-
-/*!
- * @def XXH32_ENDJMP
- * @brief Whether to use a jump for `XXH32_finalize`.
- *
- * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
- * This is generally preferable for performance,
- * but depending on the exact architecture, a jmp may be preferable.
- *
- * This setting only has the potential to affect very small inputs.
- */
-#  define XXH32_ENDJMP 0
-
-/*!
- * @internal
- * @brief Redefines old internal names.
- *
- * For compatibility with code that uses xxHash's internals before the names
- * were changed to improve namespacing. There is no other reason to use this.
- */
-#  define XXH_OLD_NAMES
-#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
-
-/*!
- * @def XXH_NO_STREAM
- * @brief Disables the streaming API.
- *
- * When xxHash is not inlined and the streaming functions are not used, disabling
- * the streaming functions can improve code size significantly, especially with
- * the @ref XXH3_family, which tends to make constant folded copies of itself.
- */
-#  define XXH_NO_STREAM
-#  undef XXH_NO_STREAM /* don't actually */
-#endif /* XXH_DOXYGEN */
-/*!
- * @}
- */
-
-#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
-   /* prefer __packed__ structures (method 1) for GCC;
-    * however, GCC on ARM < v7 with unaligned access (e.g. Raspbian armhf) still
-    * emits byte shifting for method 1, so we keep `memcpy()`, which (for some
-    * reason) does emit unaligned loads there.
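-    *
-    * As a build-line sketch (the exact driver syntax varies by toolchain), the
-    * access method can also be pinned externally instead of editing this header:
-    *     cc -O3 -DXXH_FORCE_MEMORY_ACCESS=3 -c xxhash.c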
 */
-#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
-#    define XXH_FORCE_MEMORY_ACCESS 1
-#  endif
-#endif
-
-#ifndef XXH_SIZE_OPT
-   /* default to 1 for -Os or -Oz */
-#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
-#    define XXH_SIZE_OPT 1
-#  else
-#    define XXH_SIZE_OPT 0
-#  endif
-#endif
-
-#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
-   /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
-#  if XXH_SIZE_OPT >= 1 || \
-      defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
-   || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */
-#    define XXH_FORCE_ALIGN_CHECK 0
-#  else
-#    define XXH_FORCE_ALIGN_CHECK 1
-#  endif
-#endif
-
-#ifndef XXH_NO_INLINE_HINTS
-#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */
-#    define XXH_NO_INLINE_HINTS 1
-#  else
-#    define XXH_NO_INLINE_HINTS 0
-#  endif
-#endif
-
-#ifndef XXH3_INLINE_SECRET
-#  if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
-     || !defined(XXH_INLINE_ALL)
-#    define XXH3_INLINE_SECRET 0
-#  else
-#    define XXH3_INLINE_SECRET 1
-#  endif
-#endif
-
-#ifndef XXH32_ENDJMP
-/* generally preferable for performance */
-#  define XXH32_ENDJMP 0
-#endif
-
-/*!
- * @defgroup impl Implementation
- * @{
- */
-
-
-/* *************************************
-*  Includes & Memory related functions
-***************************************/
-#if defined(XXH_NO_STREAM)
-/* nothing */
-#elif defined(XXH_NO_STDLIB)
-
-/* When requesting to disable any mention of stdlib,
- * the library loses the ability to invoke malloc() / free().
- * In practice, it means that functions like `XXH*_createState()`
- * will always fail, and return NULL.
- * This flag is useful in situations where
- * xxhash.h is integrated into some kernel, embedded or limited environment
- * without access to dynamic allocation.
- */
-
-static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
-static void XXH_free(void* p) { (void)p; }
-
-#else
-
-/*
- * Modify the local functions below should you wish to use
- * different memory routines for malloc() and free()
- */
-#include <stdlib.h>
-
-/*!
- * @internal
- * @brief Modify this function to use a different routine than malloc().
- */
-static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
-
-/*!
- * @internal
- * @brief Modify this function to use a different routine than free().
- */
-static void XXH_free(void* p) { free(p); }
-
-#endif /* XXH_NO_STDLIB */
-
-#include <string.h>
-
-/*!
- * @internal
- * @brief Modify this function to use a different routine than memcpy().
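- *
- * For instance (an illustrative sketch, not the upstream default), a
- * freestanding build without libc could substitute a plain byte loop:
- * @code{.c}
- * static void* XXH_memcpy(void* dest, const void* src, size_t size)
- * {
- *     unsigned char* d = (unsigned char*)dest;
- *     const unsigned char* s = (const unsigned char*)src;
- *     while (size--) *d++ = *s++;  // no libc dependency
- *     return dest;
- * }
- * @endcode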
- */
-static void* XXH_memcpy(void* dest, const void* src, size_t size)
-{
-    return memcpy(dest,src,size);
-}
-
-#include <limits.h> /* ULLONG_MAX */
-
-
-/* *************************************
-*  Compiler Specific Options
-***************************************/
-#ifdef _MSC_VER /* Visual Studio warning fix */
-#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
-#endif
-
-#if XXH_NO_INLINE_HINTS /* disable inlining hints */
-#  if defined(__GNUC__) || defined(__clang__)
-#    define XXH_FORCE_INLINE static __attribute__((__unused__))
-#  else
-#    define XXH_FORCE_INLINE static
-#  endif
-#  define XXH_NO_INLINE static
-/* enable inlining hints */
-#elif defined(__GNUC__) || defined(__clang__)
-#  define XXH_FORCE_INLINE static __inline__ __attribute__((__always_inline__, __unused__))
-#  define XXH_NO_INLINE static __attribute__((__noinline__))
-#elif defined(_MSC_VER) /* Visual Studio */
-#  define XXH_FORCE_INLINE static __forceinline
-#  define XXH_NO_INLINE static __declspec(noinline)
-#elif defined (__cplusplus) \
-  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
-#  define XXH_FORCE_INLINE static inline
-#  define XXH_NO_INLINE static
-#else
-#  define XXH_FORCE_INLINE static
-#  define XXH_NO_INLINE static
-#endif
-
-#if XXH3_INLINE_SECRET
-#  define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
-#else
-#  define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
-#endif
-
-
-/* *************************************
-*  Debug
-***************************************/
-/*!
- * @ingroup tuning
- * @def XXH_DEBUGLEVEL
- * @brief Sets the debugging level.
- *
- * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
- * compiler's command line options. The value must be a number.
- */
-#ifndef XXH_DEBUGLEVEL
-#  ifdef DEBUGLEVEL /* backwards compat */
-#    define XXH_DEBUGLEVEL DEBUGLEVEL
-#  else
-#    define XXH_DEBUGLEVEL 0
-#  endif
-#endif
-
-#if (XXH_DEBUGLEVEL>=1)
-#  include <assert.h> /* note: can still be disabled with NDEBUG */
-#  define XXH_ASSERT(c) assert(c)
-#else
-#  if defined(__INTEL_COMPILER)
-#    define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c))
-#  else
-#    define XXH_ASSERT(c) XXH_ASSUME(c)
-#  endif
-#endif
-
-/* note: use after variable declarations */
-#ifndef XXH_STATIC_ASSERT
-#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */
-#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
-#  elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */
-#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
-#  else
-#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
-#  endif
-#  define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
-#endif
-
-/*!
- * @internal
- * @def XXH_COMPILER_GUARD(var)
- * @brief Used to prevent unwanted optimizations for @p var.
- *
- * It uses an empty GCC inline assembly statement with a register constraint
- * which forces @p var into a general purpose register (e.g. eax, ebx, ecx
- * on x86) and marks it as modified.
- *
- * This is used in a few places to avoid unwanted autovectorization (e.g.
- * XXH32_round()). All vectorization we want is explicit via intrinsics,
- * and _usually_ isn't wanted elsewhere.
- *
- * We also use it to prevent unwanted constant folding for AArch64 in
- * XXH3_initCustomSecret_scalar().
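- *
- * A minimal illustration (`f` is a hypothetical pure helper, not part of
- * this header):
- * @code{.c}
- * xxh_u32 acc = f(1, 2);   // may be constant folded or vectorized freely
- * XXH_COMPILER_GUARD(acc); // acc is now opaque and must sit in a register
- * acc = f(acc, 3);         // this step stays scalar and is computed at run time
- * @endcode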
- */
-#if defined(__GNUC__) || defined(__clang__)
-#  define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
-#else
-#  define XXH_COMPILER_GUARD(var) ((void)0)
-#endif
-
-/* Specifically for NEON vectors which use the "w" constraint, on
- * Clang. */
-#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)
-#  define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))
-#else
-#  define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)
-#endif
-
-/* *************************************
-*  Basic Types
-***************************************/
-#if !defined (__VMS) \
- && (defined (__cplusplus) \
 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
-#  ifdef _AIX
-#    include <inttypes.h>
-#  else
-#    include <stdint.h>
-#  endif
-   typedef uint8_t xxh_u8;
-#else
-   typedef unsigned char xxh_u8;
-#endif
-typedef XXH32_hash_t xxh_u32;
-
-#ifdef XXH_OLD_NAMES
-#  warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"
-#  define BYTE xxh_u8
-#  define U8 xxh_u8
-#  define U32 xxh_u32
-#endif
-
-/* ***   Memory access   *** */
-
-/*!
- * @internal
- * @fn xxh_u32 XXH_read32(const void* ptr)
- * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
- *
- * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
- *
- * @param ptr The pointer to read from.
- * @return The 32-bit native endian integer from the bytes at @p ptr.
- */
-
-/*!
- * @internal
- * @fn xxh_u32 XXH_readLE32(const void* ptr)
- * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
- *
- * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
- *
- * @param ptr The pointer to read from.
- * @return The 32-bit little endian integer from the bytes at @p ptr.
- */
-
-/*!
- * @internal
- * @fn xxh_u32 XXH_readBE32(const void* ptr)
- * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
- *
- * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
- *
- * @param ptr The pointer to read from.
- * @return The 32-bit big endian integer from the bytes at @p ptr.
- */
-
-/*!
- * @internal
- * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
- * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
- *
- * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
- * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
- * always @ref XXH_alignment::XXH_unaligned.
- *
- * @param ptr The pointer to read from.
- * @param align Whether @p ptr is aligned.
- * @pre
- *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
- *   aligned.
- * @return The 32-bit little endian integer from the bytes at @p ptr.
- */
-
-#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
-/*
- * Manual byteshift. Best for old compilers which don't inline memcpy.
- * We actually directly use XXH_readLE32 and XXH_readBE32.
- */
-#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
-
-/*
- * Force direct memory access. Only works on CPUs which support unaligned memory
- * access in hardware.
- */
-static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
-
-#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
-
-/*
- * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
- * documentation claimed that it only increased the alignment, but actually it
- * can decrease it on gcc, clang, and icc:
- * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
- * https://gcc.godbolt.org/z/xYez1j67Y.
- */
-#ifdef XXH_OLD_NAMES
-typedef union { xxh_u32 u32; } __attribute__((__packed__)) unalign;
-#endif
-static xxh_u32 XXH_read32(const void* ptr)
-{
-    typedef __attribute__((__aligned__(1))) xxh_u32 xxh_unalign32;
-    return *((const xxh_unalign32*)ptr);
-}
-
-#else
-
-/*
- * Portable and safe solution. Generally efficient.
- * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
- */
-static xxh_u32 XXH_read32(const void* memPtr)
-{
-    xxh_u32 val;
-    XXH_memcpy(&val, memPtr, sizeof(val));
-    return val;
-}
-
-#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
-
-
-/* ***   Endianness   *** */
-
-/*!
- * @ingroup tuning
- * @def XXH_CPU_LITTLE_ENDIAN
- * @brief Whether the target is little endian.
- *
- * Defined to 1 if the target is little endian, or 0 if it is big endian.
- * It can be defined externally, for example on the compiler command line.
- *
- * If it is not defined,
- * a runtime check (which is usually constant folded) is used instead.
- *
- * @note
- *   This is not necessarily defined to an integer constant.
- *
- * @see XXH_isLittleEndian() for the runtime check.
- */
-#ifndef XXH_CPU_LITTLE_ENDIAN
-/*
- * Try to detect endianness automatically, to avoid the nonstandard behavior
- * in `XXH_isLittleEndian()`
- */
-#  if defined(_WIN32) /* Windows is always little endian */ \
-   || defined(__LITTLE_ENDIAN__) \
-   || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#    define XXH_CPU_LITTLE_ENDIAN 1
-#  elif defined(__BIG_ENDIAN__) \
-   || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#    define XXH_CPU_LITTLE_ENDIAN 0
-#  else
-/*!
- * @internal
- * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
- *
- * Most compilers will constant fold this.
- */
-static int XXH_isLittleEndian(void)
-{
-    /*
-     * Portable and well-defined behavior.
-     * Don't use static: it is detrimental to performance.
-     */
-    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
-    return one.c[0];
-}
-#    define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
-#  endif
-#endif
-
-
-
-
-/* ****************************************
-*  Compiler-specific Functions and Macros
-******************************************/
-#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-
-#ifdef __has_builtin
-#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
-#else
-#  define XXH_HAS_BUILTIN(x) 0
-#endif
-
-
-
-/*
- * C23 and future versions have standard "unreachable()".
- * Once it has been implemented reliably we can add it as an
- * additional case:
- *
- * ```
- * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
- * #  include <stddef.h>
- * #  ifdef unreachable
- * #    define XXH_UNREACHABLE() unreachable()
- * #  endif
- * #endif
- * ```
- *
- * Note C++23 also has std::unreachable() which can be detected
- * as follows:
- * ```
- * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
- * #  include <utility>
- * #  define XXH_UNREACHABLE() std::unreachable()
- * #endif
- * ```
- * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
- * We don't use that as including `<utility>` in `extern "C"` blocks
- * doesn't work on GCC12
- */
-
-#if XXH_HAS_BUILTIN(__builtin_unreachable)
-#  define XXH_UNREACHABLE() __builtin_unreachable()
-
-#elif defined(_MSC_VER)
-#  define XXH_UNREACHABLE() __assume(0)
-
-#else
-#  define XXH_UNREACHABLE()
-#endif
-
-#if XXH_HAS_BUILTIN(__builtin_assume)
-#  define XXH_ASSUME(c) __builtin_assume(c)
-#else
-#  define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
-#endif
-
-/*!
- * @internal
- * @def XXH_rotl32(x,r)
- * @brief 32-bit rotate left.
- * - * @param x The 32-bit integer to be rotated. - * @param r The number of bits to rotate. - * @pre - * @p r > 0 && @p r < 32 - * @note - * @p x and @p r may be evaluated multiple times. - * @return The rotated result. - */ -#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ - && XXH_HAS_BUILTIN(__builtin_rotateleft64) -# define XXH_rotl32 __builtin_rotateleft32 -# define XXH_rotl64 __builtin_rotateleft64 -/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ -#elif defined(_MSC_VER) -# define XXH_rotl32(x,r) _rotl(x,r) -# define XXH_rotl64(x,r) _rotl64(x,r) -#else -# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) -# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) -#endif - -/*! - * @internal - * @fn xxh_u32 XXH_swap32(xxh_u32 x) - * @brief A 32-bit byteswap. - * - * @param x The 32-bit integer to byteswap. - * @return @p x, byteswapped. - */ -#if defined(_MSC_VER) /* Visual Studio */ -# define XXH_swap32 _byteswap_ulong -#elif XXH_GCC_VERSION >= 403 -# define XXH_swap32 __builtin_bswap32 -#else -static xxh_u32 XXH_swap32 (xxh_u32 x) -{ - return ((x << 24) & 0xff000000 ) | - ((x << 8) & 0x00ff0000 ) | - ((x >> 8) & 0x0000ff00 ) | - ((x >> 24) & 0x000000ff ); -} -#endif - - -/* *************************** -* Memory reads -*****************************/ - -/*! - * @internal - * @brief Enum to indicate whether a pointer is aligned. - */ -typedef enum { - XXH_aligned, /*!< Aligned */ - XXH_unaligned /*!< Possibly unaligned */ -} XXH_alignment; - -/* - * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. - * - * This is ideal for older compilers which don't inline memcpy. - */ -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) - -XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[0] - | ((xxh_u32)bytePtr[1] << 8) - | ((xxh_u32)bytePtr[2] << 16) - | ((xxh_u32)bytePtr[3] << 24); -} - -XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[3] - | ((xxh_u32)bytePtr[2] << 8) - | ((xxh_u32)bytePtr[1] << 16) - | ((xxh_u32)bytePtr[0] << 24); -} - -#else -XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); -} - -static xxh_u32 XXH_readBE32(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); -} -#endif - -XXH_FORCE_INLINE xxh_u32 -XXH_readLE32_align(const void* ptr, XXH_alignment align) -{ - if (align==XXH_unaligned) { - return XXH_readLE32(ptr); - } else { - return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); - } -} - - -/* ************************************* -* Misc -***************************************/ -/*! @ingroup public */ -XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } - - -/* ******************************************************************* -* 32-bit hash functions -*********************************************************************/ -/*! - * @} - * @defgroup XXH32_impl XXH32 implementation - * @ingroup impl - * - * Details on the XXH32 implementation. 
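- *
- * For orientation, a condensed one-shot restatement of the algorithm the
- * following sections implement (an illustrative sketch only: it assumes a
- * little-endian host and skips the streaming and alignment machinery):
- * @code{.c}
- * #include <stdint.h>
- * #include <string.h>
- * static uint32_t rotl32(uint32_t x, int r) { return (x << r) | (x >> (32 - r)); }
- * static uint32_t read32(const uint8_t* p) { uint32_t v; memcpy(&v, p, 4); return v; }
- * static uint32_t xxh32_sketch(const uint8_t* p, size_t len, uint32_t seed)
- * {
- *     const uint8_t* const end = p + len;
- *     uint32_t h;
- *     if (len >= 16) {   // four parallel lanes over 16-byte stripes
- *         uint32_t v1 = seed + 0x9E3779B1U + 0x85EBCA77U;
- *         uint32_t v2 = seed + 0x85EBCA77U;
- *         uint32_t v3 = seed;
- *         uint32_t v4 = seed - 0x9E3779B1U;
- *         do {
- *             v1 = rotl32(v1 + read32(p) * 0x85EBCA77U, 13) * 0x9E3779B1U; p += 4;
- *             v2 = rotl32(v2 + read32(p) * 0x85EBCA77U, 13) * 0x9E3779B1U; p += 4;
- *             v3 = rotl32(v3 + read32(p) * 0x85EBCA77U, 13) * 0x9E3779B1U; p += 4;
- *             v4 = rotl32(v4 + read32(p) * 0x85EBCA77U, 13) * 0x9E3779B1U; p += 4;
- *         } while (p <= end - 16);
- *         h = rotl32(v1, 1) + rotl32(v2, 7) + rotl32(v3, 12) + rotl32(v4, 18);
- *     } else {
- *         h = seed + 0x165667B1U;
- *     }
- *     h += (uint32_t)len;
- *     while (end - p >= 4) { h = rotl32(h + read32(p) * 0xC2B2AE3DU, 17) * 0x27D4EB2FU; p += 4; }
- *     while (p < end)      { h = rotl32(h + (*p++)  * 0x165667B1U, 11) * 0x9E3779B1U; }
- *     h ^= h >> 15;  h *= 0x85EBCA77U;
- *     h ^= h >> 13;  h *= 0xC2B2AE3DU;
- *     h ^= h >> 16;
- *     return h;
- * }
- * @endcode
- * On a little-endian machine this should reproduce XXH32() for the same input
- * and seed; the canonical code below additionally handles big-endian targets,
- * the aligned fast path, and incremental (streaming) updates.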
- * @{ - */ - /* #define instead of static const, to be used as initializers */ -#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ -#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ -#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ -#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ -#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ - -#ifdef XXH_OLD_NAMES -# define PRIME32_1 XXH_PRIME32_1 -# define PRIME32_2 XXH_PRIME32_2 -# define PRIME32_3 XXH_PRIME32_3 -# define PRIME32_4 XXH_PRIME32_4 -# define PRIME32_5 XXH_PRIME32_5 -#endif - -/*! - * @internal - * @brief Normal stripe processing routine. - * - * This shuffles the bits so that any bit from @p input impacts several bits in - * @p acc. - * - * @param acc The accumulator lane. - * @param input The stripe of input to mix. - * @return The mixed accumulator lane. - */ -static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) -{ - acc += input * XXH_PRIME32_2; - acc = XXH_rotl32(acc, 13); - acc *= XXH_PRIME32_1; -#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) - /* - * UGLY HACK: - * A compiler fence is used to prevent GCC and Clang from - * autovectorizing the XXH32 loop (pragmas and attributes don't work for some - * reason) without globally disabling SSE4.1. - * - * The reason we want to avoid vectorization is because despite working on - * 4 integers at a time, there are multiple factors slowing XXH32 down on - * SSE4: - * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on - * newer chips!) making it slightly slower to multiply four integers at - * once compared to four integers independently. Even when pmulld was - * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE - * just to multiply unless doing a long operation. - * - * - Four instructions are required to rotate, - * movqda tmp, v // not required with VEX encoding - * pslld tmp, 13 // tmp <<= 13 - * psrld v, 19 // x >>= 19 - * por v, tmp // x |= tmp - * compared to one for scalar: - * roll v, 13 // reliably fast across the board - * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason - * - * - Instruction level parallelism is actually more beneficial here because - * the SIMD actually serializes this operation: While v1 is rotating, v2 - * can load data, while v3 can multiply. SSE forces them to operate - * together. - * - * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing - * the loop. NEON is only faster on the A53, and with the newer cores, it is less - * than half the speed. - * - * Additionally, this is used on WASM SIMD128 because it JITs to the same - * SIMD instructions and has the same issue. - */ - XXH_COMPILER_GUARD(acc); -#endif - return acc; -} - -/*! - * @internal - * @brief Mixes all bits to finalize the hash. - * - * The final mix ensures that all input bits have a chance to impact any bit in - * the output digest, resulting in an unbiased distribution. - * - * @param hash The hash to avalanche. - * @return The avalanched hash. - */ -static xxh_u32 XXH32_avalanche(xxh_u32 hash) -{ - hash ^= hash >> 15; - hash *= XXH_PRIME32_2; - hash ^= hash >> 13; - hash *= XXH_PRIME32_3; - hash ^= hash >> 16; - return hash; -} - -#define XXH_get32bits(p) XXH_readLE32_align(p, align) - -/*! - * @internal - * @brief Processes the last 0-15 bytes of @p ptr. 
- * - * There may be up to 15 bytes remaining to consume from the input. - * This final stage will digest them to ensure that all input bytes are present - * in the final mix. - * - * @param hash The hash to finalize. - * @param ptr The pointer to the remaining input. - * @param len The remaining length, modulo 16. - * @param align Whether @p ptr is aligned. - * @return The finalized hash. - * @see XXH64_finalize(). - */ -static XXH_PUREF xxh_u32 -XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) -{ -#define XXH_PROCESS1 do { \ - hash += (*ptr++) * XXH_PRIME32_5; \ - hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ -} while (0) - -#define XXH_PROCESS4 do { \ - hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ - ptr += 4; \ - hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ -} while (0) - - if (ptr==NULL) XXH_ASSERT(len == 0); - - /* Compact rerolled version; generally faster */ - if (!XXH32_ENDJMP) { - len &= 15; - while (len >= 4) { - XXH_PROCESS4; - len -= 4; - } - while (len > 0) { - XXH_PROCESS1; - --len; - } - return XXH32_avalanche(hash); - } else { - switch(len&15) /* or switch(bEnd - p) */ { - case 12: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 8: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 4: XXH_PROCESS4; - return XXH32_avalanche(hash); - - case 13: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 9: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 5: XXH_PROCESS4; - XXH_PROCESS1; - return XXH32_avalanche(hash); - - case 14: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 10: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 6: XXH_PROCESS4; - XXH_PROCESS1; - XXH_PROCESS1; - return XXH32_avalanche(hash); - - case 15: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 11: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 7: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 3: XXH_PROCESS1; - XXH_FALLTHROUGH; /* fallthrough */ - case 2: XXH_PROCESS1; - XXH_FALLTHROUGH; /* fallthrough */ - case 1: XXH_PROCESS1; - XXH_FALLTHROUGH; /* fallthrough */ - case 0: return XXH32_avalanche(hash); - } - XXH_ASSERT(0); - return hash; /* reaching this point is deemed impossible */ - } -} - -#ifdef XXH_OLD_NAMES -# define PROCESS1 XXH_PROCESS1 -# define PROCESS4 XXH_PROCESS4 -#else -# undef XXH_PROCESS1 -# undef XXH_PROCESS4 -#endif - -/*! - * @internal - * @brief The implementation for @ref XXH32(). - * - * @param input , len , seed Directly passed from @ref XXH32(). - * @param align Whether @p input is aligned. - * @return The calculated hash. - */ -XXH_FORCE_INLINE XXH_PUREF xxh_u32 -XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) -{ - xxh_u32 h32; - - if (input==NULL) XXH_ASSERT(len == 0); - - if (len>=16) { - const xxh_u8* const bEnd = input + len; - const xxh_u8* const limit = bEnd - 15; - xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; - xxh_u32 v2 = seed + XXH_PRIME32_2; - xxh_u32 v3 = seed + 0; - xxh_u32 v4 = seed - XXH_PRIME32_1; - - do { - v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; - v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; - v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; - v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; - } while (input < limit); - - h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) - + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); - } else { - h32 = seed + XXH_PRIME32_5; - } - - h32 += (xxh_u32)len; - - return XXH32_finalize(h32, input, len&15, align); -} - -/*! 
@ingroup XXH32_family */ -XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) -{ -#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 - /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ - XXH32_state_t state; - XXH32_reset(&state, seed); - XXH32_update(&state, (const xxh_u8*)input, len); - return XXH32_digest(&state); -#else - if (XXH_FORCE_ALIGN_CHECK) { - if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ - return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); - } } - - return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); -#endif -} - - - -/******* Hash streaming *******/ -#ifndef XXH_NO_STREAM -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) -{ - return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); -} -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) -{ - XXH_free(statePtr); - return XXH_OK; -} - -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) -{ - XXH_memcpy(dstState, srcState, sizeof(*dstState)); -} - -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) -{ - XXH_ASSERT(statePtr != NULL); - memset(statePtr, 0, sizeof(*statePtr)); - statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; - statePtr->v[1] = seed + XXH_PRIME32_2; - statePtr->v[2] = seed + 0; - statePtr->v[3] = seed - XXH_PRIME32_1; - return XXH_OK; -} - - -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH_errorcode -XXH32_update(XXH32_state_t* state, const void* input, size_t len) -{ - if (input==NULL) { - XXH_ASSERT(len == 0); - return XXH_OK; - } - - { const xxh_u8* p = (const xxh_u8*)input; - const xxh_u8* const bEnd = p + len; - - state->total_len_32 += (XXH32_hash_t)len; - state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); - - if (state->memsize + len < 16) { /* fill in tmp buffer */ - XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); - state->memsize += (XXH32_hash_t)len; - return XXH_OK; - } - - if (state->memsize) { /* some data left from previous update */ - XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); - { const xxh_u32* p32 = state->mem32; - state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; - state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; - state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; - state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); - } - p += 16-state->memsize; - state->memsize = 0; - } - - if (p <= bEnd-16) { - const xxh_u8* const limit = bEnd - 16; - - do { - state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; - state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; - state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; - state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; - } while (p<=limit); - - } - - if (p < bEnd) { - XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); - } - } - - return XXH_OK; -} - - -/*! 
@ingroup XXH32_family */ -XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) -{ - xxh_u32 h32; - - if (state->large_len) { - h32 = XXH_rotl32(state->v[0], 1) - + XXH_rotl32(state->v[1], 7) - + XXH_rotl32(state->v[2], 12) - + XXH_rotl32(state->v[3], 18); - } else { - h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; - } - - h32 += state->total_len_32; - - return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); -} -#endif /* !XXH_NO_STREAM */ - -/******* Canonical representation *******/ - -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) -{ - XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); - if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); - XXH_memcpy(dst, &hash, sizeof(*dst)); -} -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) -{ - return XXH_readBE32(src); -} - - -#ifndef XXH_NO_LONG_LONG - -/* ******************************************************************* -* 64-bit hash functions -*********************************************************************/ -/*! - * @} - * @ingroup impl - * @{ - */ -/******* Memory access *******/ - -typedef XXH64_hash_t xxh_u64; - -#ifdef XXH_OLD_NAMES -# define U64 xxh_u64 -#endif - -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) -/* - * Manual byteshift. Best for old compilers which don't inline memcpy. - * We actually directly use XXH_readLE64 and XXH_readBE64. - */ -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) - -/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ -static xxh_u64 XXH_read64(const void* memPtr) -{ - return *(const xxh_u64*) memPtr; -} - -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) - -/* - * __attribute__((aligned(1))) is supported by gcc and clang. Originally the - * documentation claimed that it only increased the alignment, but actually it - * can decrease it on gcc, clang, and icc: - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, - * https://gcc.godbolt.org/z/xYez1j67Y. - */ -#ifdef XXH_OLD_NAMES -typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((__packed__)) unalign64; -#endif -static xxh_u64 XXH_read64(const void* ptr) -{ - typedef __attribute__((__aligned__(1))) xxh_u64 xxh_unalign64; - return *((const xxh_unalign64*)ptr); -} - -#else - -/* - * Portable and safe solution. Generally efficient. - * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html - */ -static xxh_u64 XXH_read64(const void* memPtr) -{ - xxh_u64 val; - XXH_memcpy(&val, memPtr, sizeof(val)); - return val; -} - -#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ - -#if defined(_MSC_VER) /* Visual Studio */ -# define XXH_swap64 _byteswap_uint64 -#elif XXH_GCC_VERSION >= 403 -# define XXH_swap64 __builtin_bswap64 -#else -static xxh_u64 XXH_swap64(xxh_u64 x) -{ - return ((x << 56) & 0xff00000000000000ULL) | - ((x << 40) & 0x00ff000000000000ULL) | - ((x << 24) & 0x0000ff0000000000ULL) | - ((x << 8) & 0x000000ff00000000ULL) | - ((x >> 8) & 0x00000000ff000000ULL) | - ((x >> 24) & 0x0000000000ff0000ULL) | - ((x >> 40) & 0x000000000000ff00ULL) | - ((x >> 56) & 0x00000000000000ffULL); -} -#endif - - -/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. 
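- * E.g. the little-endian read below assembles bytes with explicit weights,
- * b[0] | b[1]<<8 | ... | b[7]<<56, so it yields the same value on any host,
- * with no bswap required.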
*/ -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) - -XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[0] - | ((xxh_u64)bytePtr[1] << 8) - | ((xxh_u64)bytePtr[2] << 16) - | ((xxh_u64)bytePtr[3] << 24) - | ((xxh_u64)bytePtr[4] << 32) - | ((xxh_u64)bytePtr[5] << 40) - | ((xxh_u64)bytePtr[6] << 48) - | ((xxh_u64)bytePtr[7] << 56); -} - -XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[7] - | ((xxh_u64)bytePtr[6] << 8) - | ((xxh_u64)bytePtr[5] << 16) - | ((xxh_u64)bytePtr[4] << 24) - | ((xxh_u64)bytePtr[3] << 32) - | ((xxh_u64)bytePtr[2] << 40) - | ((xxh_u64)bytePtr[1] << 48) - | ((xxh_u64)bytePtr[0] << 56); -} - -#else -XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); -} - -static xxh_u64 XXH_readBE64(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); -} -#endif - -XXH_FORCE_INLINE xxh_u64 -XXH_readLE64_align(const void* ptr, XXH_alignment align) -{ - if (align==XXH_unaligned) - return XXH_readLE64(ptr); - else - return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); -} - - -/******* xxh64 *******/ -/*! - * @} - * @defgroup XXH64_impl XXH64 implementation - * @ingroup impl - * - * Details on the XXH64 implementation. - * @{ - */ -/* #define rather that static const, to be used as initializers */ -#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */ -#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */ -#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */ -#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */ -#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */ - -#ifdef XXH_OLD_NAMES -# define PRIME64_1 XXH_PRIME64_1 -# define PRIME64_2 XXH_PRIME64_2 -# define PRIME64_3 XXH_PRIME64_3 -# define PRIME64_4 XXH_PRIME64_4 -# define PRIME64_5 XXH_PRIME64_5 -#endif - -/*! @copydoc XXH32_round */ -static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) -{ - acc += input * XXH_PRIME64_2; - acc = XXH_rotl64(acc, 31); - acc *= XXH_PRIME64_1; -#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) - /* - * DISABLE AUTOVECTORIZATION: - * A compiler fence is used to prevent GCC and Clang from - * autovectorizing the XXH64 loop (pragmas and attributes don't work for some - * reason) without globally disabling AVX512. - * - * Autovectorization of XXH64 tends to be detrimental, - * though the exact outcome may change depending on exact cpu and compiler version. - * For information, it has been reported as detrimental for Skylake-X, - * but possibly beneficial for Zen4. - * - * The default is to disable auto-vectorization, - * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable. - */ - XXH_COMPILER_GUARD(acc); -#endif - return acc; -} - -static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) -{ - val = XXH64_round(0, val); - acc ^= val; - acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; - return acc; -} - -/*! 
@copydoc XXH32_avalanche */
-static xxh_u64 XXH64_avalanche(xxh_u64 hash)
-{
-    hash ^= hash >> 33;
-    hash *= XXH_PRIME64_2;
-    hash ^= hash >> 29;
-    hash *= XXH_PRIME64_3;
-    hash ^= hash >> 32;
-    return hash;
-}
-
-
-#define XXH_get64bits(p) XXH_readLE64_align(p, align)
-
-/*!
- * @internal
- * @brief Processes the last 0-31 bytes of @p ptr.
- *
- * There may be up to 31 bytes remaining to consume from the input.
- * This final stage will digest them to ensure that all input bytes are present
- * in the final mix.
- *
- * @param hash The hash to finalize.
- * @param ptr The pointer to the remaining input.
- * @param len The remaining length, modulo 32.
- * @param align Whether @p ptr is aligned.
- * @return The finalized hash.
- * @see XXH32_finalize().
- */
-static XXH_PUREF xxh_u64
-XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
-{
-    if (ptr==NULL) XXH_ASSERT(len == 0);
-    len &= 31;
-    while (len >= 8) {
-        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
-        ptr += 8;
-        hash ^= k1;
-        hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
-        len -= 8;
-    }
-    if (len >= 4) {
-        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
-        ptr += 4;
-        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
-        len -= 4;
-    }
-    while (len > 0) {
-        hash ^= (*ptr++) * XXH_PRIME64_5;
-        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
-        --len;
-    }
-    return XXH64_avalanche(hash);
-}
-
-#ifdef XXH_OLD_NAMES
-#  define PROCESS1_64 XXH_PROCESS1_64
-#  define PROCESS4_64 XXH_PROCESS4_64
-#  define PROCESS8_64 XXH_PROCESS8_64
-#else
-#  undef XXH_PROCESS1_64
-#  undef XXH_PROCESS4_64
-#  undef XXH_PROCESS8_64
-#endif
-
-/*!
- * @internal
- * @brief The implementation for @ref XXH64().
- *
- * @param input , len , seed Directly passed from @ref XXH64().
- * @param align Whether @p input is aligned.
- * @return The calculated hash.
- */
-XXH_FORCE_INLINE XXH_PUREF xxh_u64
-XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
-{
-    xxh_u64 h64;
-    if (input==NULL) XXH_ASSERT(len == 0);
-
-    if (len>=32) {
-        const xxh_u8* const bEnd = input + len;
-        const xxh_u8* const limit = bEnd - 31;
-        xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
-        xxh_u64 v2 = seed + XXH_PRIME64_2;
-        xxh_u64 v3 = seed + 0;
-        xxh_u64 v4 = seed - XXH_PRIME64_1;
-
-        do {
-            v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
-            v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
-            v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
-            v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
-        } while (input<limit);
-
-        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7)
-            + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
-        h64 = XXH64_mergeRound(h64, v1);
-        h64 = XXH64_mergeRound(h64, v2);
-        h64 = XXH64_mergeRound(h64, v3);
-        h64 = XXH64_mergeRound(h64, v4);
-
-    } else {
-        h64 = seed + XXH_PRIME64_5;
-    }
-
-    h64 += (xxh_u64) len;
-
-    return XXH64_finalize(h64, input, len, align);
-}
-
-
-/*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
-{
-#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
-    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
-    XXH64_state_t state;
-    XXH64_reset(&state, seed);
-    XXH64_update(&state, (const xxh_u8*)input, len);
-    return XXH64_digest(&state);
-#else
-    if (XXH_FORCE_ALIGN_CHECK) {
-        if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */
-            return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
-    }   }
-
-    return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
-
-#endif
-}
-
-/******* Hash Streaming *******/
-#ifndef XXH_NO_STREAM
-/*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
-{
-    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
-}
-/*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
-{
-    XXH_free(statePtr);
-    return XXH_OK;
-}
-
-/*!
@ingroup XXH64_family */ -XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) -{ - XXH_memcpy(dstState, srcState, sizeof(*dstState)); -} - -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) -{ - XXH_ASSERT(statePtr != NULL); - memset(statePtr, 0, sizeof(*statePtr)); - statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; - statePtr->v[1] = seed + XXH_PRIME64_2; - statePtr->v[2] = seed + 0; - statePtr->v[3] = seed - XXH_PRIME64_1; - return XXH_OK; -} - -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API XXH_errorcode -XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) -{ - if (input==NULL) { - XXH_ASSERT(len == 0); - return XXH_OK; - } - - { const xxh_u8* p = (const xxh_u8*)input; - const xxh_u8* const bEnd = p + len; - - state->total_len += len; - - if (state->memsize + len < 32) { /* fill in tmp buffer */ - XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); - state->memsize += (xxh_u32)len; - return XXH_OK; - } - - if (state->memsize) { /* tmp buffer is full */ - XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); - state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); - state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); - state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); - state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); - p += 32 - state->memsize; - state->memsize = 0; - } - - if (p+32 <= bEnd) { - const xxh_u8* const limit = bEnd - 32; - - do { - state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; - state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; - state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; - state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; - } while (p<=limit); - - } - - if (p < bEnd) { - XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); - } - } - - return XXH_OK; -} - - -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) -{ - xxh_u64 h64; - - if (state->total_len >= 32) { - h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); - h64 = XXH64_mergeRound(h64, state->v[0]); - h64 = XXH64_mergeRound(h64, state->v[1]); - h64 = XXH64_mergeRound(h64, state->v[2]); - h64 = XXH64_mergeRound(h64, state->v[3]); - } else { - h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; - } - - h64 += (xxh_u64) state->total_len; - - return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); -} -#endif /* !XXH_NO_STREAM */ - -/******* Canonical representation *******/ - -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) -{ - XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); - if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); - XXH_memcpy(dst, &hash, sizeof(*dst)); -} - -/*! 
@ingroup XXH64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
-{
-    return XXH_readBE64(src);
-}
-
-#ifndef XXH_NO_XXH3
-
-/* *********************************************************************
-*  XXH3
-*  New generation hash designed for speed on small keys and vectorization
-************************************************************************ */
-/*!
- * @}
- * @defgroup XXH3_impl XXH3 implementation
- * @ingroup impl
- * @{
- */
-
-/* ===   Compiler specifics   === */
-
-#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
-#  define XXH_RESTRICT /* disable */
-#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
-#  define XXH_RESTRICT restrict
-#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
-   || (defined (__clang__)) \
-   || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
-   || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
-/*
- * There are a LOT more compilers that recognize __restrict but this
- * covers the major ones.
- */
-#  define XXH_RESTRICT __restrict
-#else
-#  define XXH_RESTRICT /* disable */
-#endif
-
-#if (defined(__GNUC__) && (__GNUC__ >= 3)) \
-  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
-  || defined(__clang__)
-#  define XXH_likely(x) __builtin_expect(x, 1)
-#  define XXH_unlikely(x) __builtin_expect(x, 0)
-#else
-#  define XXH_likely(x) (x)
-#  define XXH_unlikely(x) (x)
-#endif
-
-#ifndef XXH_HAS_INCLUDE
-#  ifdef __has_include
-/*
- * Not defined as XXH_HAS_INCLUDE(x) (function-like) because
- * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion)
- */
-#    define XXH_HAS_INCLUDE __has_include
-#  else
-#    define XXH_HAS_INCLUDE(x) 0
-#  endif
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-#  if defined(__ARM_FEATURE_SVE)
-#    include <arm_sve.h>
-#  endif
-#  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
-   || (defined(_M_ARM) && _M_ARM >= 7) \
-   || defined(_M_ARM64) || defined(_M_ARM64EC) \
-   || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */
-#    define inline __inline__ /* circumvent a clang bug */
-#    include <arm_neon.h>
-#    undef inline
-#  elif defined(__AVX2__)
-#    include <immintrin.h>
-#  elif defined(__SSE2__)
-#    include <emmintrin.h>
-#  endif
-#endif
-
-#if defined(_MSC_VER)
-#  include <intrin.h>
-#endif
-
-/*
- * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
- * remaining a true 64-bit/128-bit hash function.
- *
- * This is done by prioritizing a subset of 64-bit operations that can be
- * emulated without too many steps on the average 32-bit machine.
- *
- * For example, these two lines seem similar, and run equally fast on 64-bit:
- *
- *   xxh_u64 x;
- *   x ^= (x >> 47); // good
- *   x ^= (x >> 13); // bad
- *
- * However, to a 32-bit machine, there is a major difference.
- *
- * x ^= (x >> 47) looks like this:
- *
- *   x.lo ^= (x.hi >> (47 - 32));
- *
- * while x ^= (x >> 13) looks like this:
- *
- *   // note: funnel shifts are not usually cheap.
- *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
- *   x.hi ^= (x.hi >> 13);
- *
- * The first one is significantly faster than the second, simply because the
- * shift is larger than 32. This means:
- * - All the bits we need are in the upper 32 bits, so we can ignore the lower
- *   32 bits in the shift.
- * - The shift result will always fit in the lower 32 bits, and therefore,
- *   we can ignore the upper 32 bits in the xor.
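- *
- * Concretely (an illustrative gloss on the two bullets above), with x kept
- * as two 32-bit halves x.hi:x.lo, `x ^= (x >> 47)` reduces to a single
- * 32-bit shift and a single 32-bit xor:
- *
- *   x.lo ^= (x.hi >> 15);   // 47 - 32 == 15; x.hi is left untouched
- *
- * with no funnel shift or carry chain anywhere.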
- * - * Thanks to this optimization, XXH3 only requires these features to be efficient: - * - * - Usable unaligned access - * - A 32-bit or 64-bit ALU - * - If 32-bit, a decent ADC instruction - * - A 32 or 64-bit multiply with a 64-bit result - * - For the 128-bit variant, a decent byteswap helps short inputs. - * - * The first two are already required by XXH32, and almost all 32-bit and 64-bit - * platforms which can run XXH32 can run XXH3 efficiently. - * - * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one - * notable exception. - * - * First of all, Thumb-1 lacks support for the UMULL instruction which - * performs the important long multiply. This means numerous __aeabi_lmul - * calls. - * - * Second of all, the 8 functional registers are just not enough. - * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need - * Lo registers, and this shuffling results in thousands more MOVs than A32. - * - * A32 and T32 don't have this limitation. They can access all 14 registers, - * do a 32->64 multiply with UMULL, and the flexible operand allowing free - * shifts is helpful, too. - * - * Therefore, we do a quick sanity check. - * - * If compiling Thumb-1 for a target which supports ARM instructions, we will - * emit a warning, as it is not a "sane" platform to compile for. - * - * Usually, if this happens, it is because of an accident and you probably need - * to specify -march, as you likely meant to compile for a newer architecture. - * - * Credit: large sections of the vectorial and asm source code paths - * have been contributed by @easyaspi314 - */ -#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) -# warning "XXH3 is highly inefficient without ARM or Thumb-2." -#endif - -/* ========================================== - * Vectorization detection - * ========================================== */ - -#ifdef XXH_DOXYGEN -/*! - * @ingroup tuning - * @brief Overrides the vectorization implementation chosen for XXH3. - * - * Can be defined to 0 to disable SIMD or any of the values mentioned in - * @ref XXH_VECTOR_TYPE. - * - * If this is not defined, it uses predefined macros to determine the best - * implementation. - */ -# define XXH_VECTOR XXH_SCALAR -/*! - * @ingroup tuning - * @brief Possible values for @ref XXH_VECTOR. - * - * Note that these are actually implemented as macros. - * - * If this is not defined, it is detected automatically. - * internal macro XXH_X86DISPATCH overrides this. - */ -enum XXH_VECTOR_TYPE /* fake enum */ { - XXH_SCALAR = 0, /*!< Portable scalar version */ - XXH_SSE2 = 1, /*!< - * SSE2 for Pentium 4, Opteron, all x86_64. - * - * @note SSE2 is also guaranteed on Windows 10, macOS, and - * Android x86. - */ - XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ - XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ - XXH_NEON = 4, /*!< - * NEON for most ARMv7-A, all AArch64, and WASM SIMD128 - * via the SIMDeverywhere polyfill provided with the - * Emscripten SDK. - */ - XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ - XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ -}; -/*! - * @ingroup tuning - * @brief Selects the minimum alignment for XXH3's accumulators. - * - * When using SIMD, this should match the alignment required for said vector - * type, so, for example, 32 for AVX2. - * - * Default: Auto detected. 
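- *
- * For example (flag syntax assumes a GCC/Clang-style driver; the value must
- * simply match the chosen vector width), a build that pins SSE2 could pin
- * the corresponding 16-byte alignment as well:
- *
- *   cc -O3 -msse2 -DXXH_VECTOR=1 -DXXH_ACC_ALIGN=16 -c xxhash.c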
- */
-#  define XXH_ACC_ALIGN 8
-#endif
-
-/* Actual definition */
-#ifndef XXH_DOXYGEN
-#  define XXH_SCALAR 0
-#  define XXH_SSE2   1
-#  define XXH_AVX2   2
-#  define XXH_AVX512 3
-#  define XXH_NEON   4
-#  define XXH_VSX    5
-#  define XXH_SVE    6
-#endif
-
-#ifndef XXH_VECTOR    /* can be defined on command line */
-#  if defined(__ARM_FEATURE_SVE)
-#    define XXH_VECTOR XXH_SVE
-#  elif ( \
-        defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
-     || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
-     || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \
-   ) && ( \
-        defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
-    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
-   )
-#    define XXH_VECTOR XXH_NEON
-#  elif defined(__AVX512F__)
-#    define XXH_VECTOR XXH_AVX512
-#  elif defined(__AVX2__)
-#    define XXH_VECTOR XXH_AVX2
-#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
-#    define XXH_VECTOR XXH_SSE2
-#  elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
-     || (defined(__s390x__) && defined(__VEC__)) \
-     && defined(__GNUC__) /* TODO: IBM XL */
-#    define XXH_VECTOR XXH_VSX
-#  else
-#    define XXH_VECTOR XXH_SCALAR
-#  endif
-#endif
-
-/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
-#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
-#  ifdef _MSC_VER
-#    pragma warning(once : 4606)
-#  else
-#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
-#  endif
-#  undef XXH_VECTOR
-#  define XXH_VECTOR XXH_SCALAR
-#endif
-
-/*
- * Controls the alignment of the accumulator,
- * for compatibility with aligned vector loads, which are usually faster.
- */
-#ifndef XXH_ACC_ALIGN
-#  if defined(XXH_X86DISPATCH)
-#    define XXH_ACC_ALIGN 64  /* for compatibility with avx512 */
-#  elif XXH_VECTOR == XXH_SCALAR  /* scalar */
-#    define XXH_ACC_ALIGN 8
-#  elif XXH_VECTOR == XXH_SSE2  /* sse2 */
-#    define XXH_ACC_ALIGN 16
-#  elif XXH_VECTOR == XXH_AVX2  /* avx2 */
-#    define XXH_ACC_ALIGN 32
-#  elif XXH_VECTOR == XXH_NEON  /* neon */
-#    define XXH_ACC_ALIGN 16
-#  elif XXH_VECTOR == XXH_VSX   /* vsx */
-#    define XXH_ACC_ALIGN 16
-#  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
-#    define XXH_ACC_ALIGN 64
-#  elif XXH_VECTOR == XXH_SVE   /* sve */
-#    define XXH_ACC_ALIGN 64
-#  endif
-#endif
-
-#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
-    || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
-#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
-#elif XXH_VECTOR == XXH_SVE
-#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
-#else
-#  define XXH_SEC_ALIGN 8
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-#  define XXH_ALIASING __attribute__((__may_alias__))
-#else
-#  define XXH_ALIASING /* nothing */
-#endif
-
-/*
- * UGLY HACK:
- * GCC usually generates the best code with -O3 for xxHash.
- *
- * However, when targeting AVX2, it is overzealous in its unrolling resulting
- * in code roughly 3/4 the speed of Clang.
- *
- * There are other issues, such as GCC splitting _mm256_loadu_si256 into
- * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
- * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
- *
- * That is why when compiling the AVX2 version, it is recommended to use either
- *   -O2 -mavx2 -march=haswell
- * or
- *   -O2 -mavx2 -mno-avx256-split-unaligned-load
- * for decent performance, or to use Clang instead.
- * - * Fortunately, we can control the first one with a pragma that forces GCC into - * -O2, but the other one we can't control without "failed to inline always - * inline function due to target mismatch" warnings. - */ -#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ - && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ -# pragma GCC push_options -# pragma GCC optimize("-O2") -#endif - -#if XXH_VECTOR == XXH_NEON - -/* - * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 - * optimizes out the entire hashLong loop because of the aliasing violation. - * - * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, - * so the only option is to mark it as aliasing. - */ -typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; - -/*! - * @internal - * @brief `vld1q_u64` but faster and alignment-safe. - * - * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only - * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). - * - * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it - * prohibits load-store optimizations. Therefore, a direct dereference is used. - * - * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe - * unaligned load. - */ -#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) -XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ -{ - return *(xxh_aliasing_uint64x2_t const *)ptr; -} -#else -XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) -{ - return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); -} -#endif - -/*! - * @internal - * @brief `vmlal_u32` on low and high halves of a vector. - * - * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with - * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32` - * with `vmlal_u32`. - */ -#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11 -XXH_FORCE_INLINE uint64x2_t -XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) -{ - /* Inline assembly is the only way */ - __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs)); - return acc; -} -XXH_FORCE_INLINE uint64x2_t -XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) -{ - /* This intrinsic works as expected */ - return vmlal_high_u32(acc, lhs, rhs); -} -#else -/* Portable intrinsic versions */ -XXH_FORCE_INLINE uint64x2_t -XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) -{ - return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs)); -} -/*! @copydoc XXH_vmlal_low_u32 - * Assume the compiler converts this to vmlal_high_u32 on aarch64 */ -XXH_FORCE_INLINE uint64x2_t -XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) -{ - return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs)); -} -#endif - -/*! - * @ingroup tuning - * @brief Controls the NEON to scalar ratio for XXH3 - * - * This can be set to 2, 4, 6, or 8. - * - * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used. - * - * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those - * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU - * bandwidth. 
- *
- * This is even more noticeable on the more advanced cores like the Cortex-A76 which
- * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
- *
- * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
- * and 2 scalar lanes, which is chosen by default.
- *
- * This does not apply to Apple processors or 32-bit processors, which run better with
- * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
- *
- * This change benefits CPUs with large micro-op buffers without negatively affecting
- * most other CPUs:
- *
- *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
- *  |:----------------------|:--------------------|----------:|-----------:|------:|
- *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
- *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
- *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
- *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
- *
- * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
- *
- * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes,
- * meaning it effectively becomes a worse 4.
- *
- * @see XXH3_accumulate_512_neon()
- */
-# ifndef XXH3_NEON_LANES
-#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
-   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
-#   define XXH3_NEON_LANES 6
-#  else
-#   define XXH3_NEON_LANES XXH_ACC_NB
-#  endif
-# endif
-#endif  /* XXH_VECTOR == XXH_NEON */
-
-/*
- * VSX and Z Vector helpers.
- *
- * This is very messy, and any pull requests to clean this up are welcome.
- *
- * There are a lot of problems with supporting VSX and s390x, due to
- * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
- */
-#if XXH_VECTOR == XXH_VSX
-/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
- * and `pixel`. This is a problem for obvious reasons.
- *
- * These keywords are unnecessary; the spec literally says they are
- * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
- * after including the header.
- *
- * We use pragma push_macro/pop_macro to keep the namespace clean. */
-#  pragma push_macro("bool")
-#  pragma push_macro("vector")
-#  pragma push_macro("pixel")
-/* silence potential macro redefined warnings */
-#  undef bool
-#  undef vector
-#  undef pixel
-
-#  if defined(__s390x__)
-#    include <s390intrin.h>
-#  else
-#    include <altivec.h>
-#  endif
-
-/* Restore the original macro values, if applicable. */
-#  pragma pop_macro("pixel")
-#  pragma pop_macro("vector")
-#  pragma pop_macro("bool")
-
-typedef __vector unsigned long long xxh_u64x2;
-typedef __vector unsigned char xxh_u8x16;
-typedef __vector unsigned xxh_u32x4;
-
-/*
- * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
- */
-typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
-
-# ifndef XXH_VSX_BE
-#  if defined(__BIG_ENDIAN__) \
-  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#    define XXH_VSX_BE 1
-#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
-#    warning "-maltivec=be is not recommended. Please use native endianness."
-#    define XXH_VSX_BE 1
-#  else
-#    define XXH_VSX_BE 0
-#  endif
-# endif /* !defined(XXH_VSX_BE) */
-
-# if XXH_VSX_BE
-#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
-#    define XXH_vec_revb vec_revb
-#  else
-/*!
- * A polyfill for POWER9's vec_revb().
- */
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
-{
-    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
-                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
-    return vec_perm(val, val, vByteSwap);
-}
-#  endif
-# endif /* XXH_VSX_BE */
-
-/*!
- * Performs an unaligned vector load and byte swaps it on big endian.
- */
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
-{
-    xxh_u64x2 ret;
-    XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
-# if XXH_VSX_BE
-    ret = XXH_vec_revb(ret);
-# endif
-    return ret;
-}
-
-/*
- * vec_mulo and vec_mule are very problematic intrinsics on PowerPC.
- *
- * These intrinsics weren't added until GCC 8, despite existing for a while,
- * and they are endian dependent. Also, their meanings swap depending on the version.
- * */
-# if defined(__s390x__)
- /* s390x is always big endian, no issue on this platform */
-#  define XXH_vec_mulo vec_mulo
-#  define XXH_vec_mule vec_mule
-# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
-/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
- /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */
-#  define XXH_vec_mulo __builtin_altivec_vmulouw
-#  define XXH_vec_mule __builtin_altivec_vmuleuw
-# else
-/* gcc needs inline assembly */
-/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
-{
-    xxh_u64x2 result;
-    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
-    return result;
-}
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
-{
-    xxh_u64x2 result;
-    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
-    return result;
-}
-# endif /* XXH_vec_mulo, XXH_vec_mule */
-#endif /* XXH_VECTOR == XXH_VSX */
-
-#if XXH_VECTOR == XXH_SVE
-#define ACCRND(acc, offset) \
-do { \
-    svuint64_t input_vec = svld1_u64(mask, xinput + offset);         \
-    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset);       \
-    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec);     \
-    svuint64_t swapped = svtbl_u64(input_vec, kSwap);                \
-    svuint64_t mixed_lo = svextw_u64_x(mask, mixed);                 \
-    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32);            \
-    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
-    acc = svadd_u64_x(mask, acc, mul);                               \
-} while (0)
-#endif /* XXH_VECTOR == XXH_SVE */
-
-/* prefetch
- * can be disabled by declaring the XXH_NO_PREFETCH build macro */
-#if defined(XXH_NO_PREFETCH)
-#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
-#else
-#  if XXH_SIZE_OPT >= 1
-#    define XXH_PREFETCH(ptr) (void)(ptr)
-#  elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
-#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
-#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
-#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
-#  else
-#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
-#  endif
-#endif  /* XXH_NO_PREFETCH */
-
-
-/* ==========================================
- * XXH3 default settings
- * ========================================== */
-
-#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
-
-#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
-#  error "default keyset is not large enough"
-#endif
-
-/*! Pseudorandom secret taken directly from FARSH. */
-XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
-    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
-    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
-    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
-    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
-    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
-    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
-    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
-    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
-    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
-    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
-    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
-    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
-};
-
-static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL;  /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */
-static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL;  /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */
-
-#ifdef XXH_OLD_NAMES
-#  define kSecret XXH3_kSecret
-#endif
-
-#ifdef XXH_DOXYGEN
-/*!
- * @brief Calculates a 32-bit to 64-bit long multiply.
- *
- * Implemented as a macro.
- *
- * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
- * need to (but it shouldn't need to anyway, it is about 7 instructions to do
- * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
- * use that instead of the normal method.
- *
- * If you are compiling for platforms like Thumb-1 and don't have a better option,
- * you may also want to write your own long multiply routine here.
- *
- * @param x, y Numbers to be multiplied
- * @return 64-bit product of the low 32 bits of @p x and @p y.
- */
-XXH_FORCE_INLINE xxh_u64
-XXH_mult32to64(xxh_u64 x, xxh_u64 y)
-{
-    return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
-}
-#elif defined(_MSC_VER) && defined(_M_IX86)
-#  define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
-#else
-/*
- * Downcast + upcast is usually better than masking on older compilers like
- * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
- *
- * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
- * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
- */
-#  define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
-#endif
-
-/*!
- * @brief Calculates a 64->128-bit long multiply.
- *
- * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
- * version.
- *
- * @param lhs , rhs The 64-bit integers to be multiplied
- * @return The 128-bit result represented in an @ref XXH128_hash_t.
- */
-static XXH128_hash_t
-XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
-{
-    /*
- * GCC/Clang __uint128_t method.
- *
- * On most 64-bit targets, GCC and Clang define a __uint128_t type.
- * This is usually the best way as it usually uses a native long 64-bit - * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. - * - * Usually. - * - * Despite being a 32-bit platform, Clang (and emscripten) define this type - * despite not having the arithmetic for it. This results in a laggy - * compiler builtin call which calculates a full 128-bit multiply. - * In that case it is best to use the portable one. - * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 - */ -#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ - && defined(__SIZEOF_INT128__) \ - || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - - __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; - XXH128_hash_t r128; - r128.low64 = (xxh_u64)(product); - r128.high64 = (xxh_u64)(product >> 64); - return r128; - - /* - * MSVC for x64's _umul128 method. - * - * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); - * - * This compiles to single operand MUL on x64. - */ -#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) - -#ifndef _MSC_VER -# pragma intrinsic(_umul128) -#endif - xxh_u64 product_high; - xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); - XXH128_hash_t r128; - r128.low64 = product_low; - r128.high64 = product_high; - return r128; - - /* - * MSVC for ARM64's __umulh method. - * - * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. - */ -#elif defined(_M_ARM64) || defined(_M_ARM64EC) - -#ifndef _MSC_VER -# pragma intrinsic(__umulh) -#endif - XXH128_hash_t r128; - r128.low64 = lhs * rhs; - r128.high64 = __umulh(lhs, rhs); - return r128; - -#else - /* - * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. - * - * This is a fast and simple grade school multiply, which is shown below - * with base 10 arithmetic instead of base 0x100000000. - * - * 9 3 // D2 lhs = 93 - * x 7 5 // D2 rhs = 75 - * ---------- - * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 - * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 - * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 - * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 - * --------- - * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 - * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 - * --------- - * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 - * - * The reasons for adding the products like this are: - * 1. It avoids manual carry tracking. Just like how - * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. - * This avoids a lot of complexity. - * - * 2. It hints for, and on Clang, compiles to, the powerful UMAAL - * instruction available in ARM's Digital Signal Processing extension - * in 32-bit ARMv6 and later, which is shown below: - * - * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) - * { - * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; - * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); - * *RdHi = (xxh_u32)(product >> 32); - * } - * - * This instruction was designed for efficient long multiplication, and - * allows this to be calculated in only 4 instructions at speeds - * comparable to some 64-bit ALUs. - * - * 3. It isn't terrible on other platforms. Usually this will be a couple - * of 32-bit ADD/ADCs. - */ - - /* First calculate all of the cross products. 
*/ - xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); - xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); - xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); - xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); - - /* Now add the products together. These will never overflow. */ - xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - XXH128_hash_t r128; - r128.low64 = lower; - r128.high64 = upper; - return r128; -#endif -} - -/*! - * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. - * - * The reason for the separate function is to prevent passing too many structs - * around by value. This will hopefully inline the multiply, but we don't force it. - * - * @param lhs , rhs The 64-bit integers to multiply - * @return The low 64 bits of the product XOR'd by the high 64 bits. - * @see XXH_mult64to128() - */ -static xxh_u64 -XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) -{ - XXH128_hash_t product = XXH_mult64to128(lhs, rhs); - return product.low64 ^ product.high64; -} - -/*! Seems to produce slightly better code on GCC for some reason. */ -XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) -{ - XXH_ASSERT(0 <= shift && shift < 64); - return v64 ^ (v64 >> shift); -} - -/* - * This is a fast avalanche stage, - * suitable when input bits are already partially mixed - */ -static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) -{ - h64 = XXH_xorshift64(h64, 37); - h64 *= PRIME_MX1; - h64 = XXH_xorshift64(h64, 32); - return h64; -} - -/* - * This is a stronger avalanche, - * inspired by Pelle Evensen's rrmxmx - * preferable when input has not been previously mixed - */ -static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) -{ - /* this mix is inspired by Pelle Evensen's rrmxmx */ - h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); - h64 *= PRIME_MX2; - h64 ^= (h64 >> 35) + len ; - h64 *= PRIME_MX2; - return XXH_xorshift64(h64, 28); -} - - -/* ========================================== - * Short keys - * ========================================== - * One of the shortcomings of XXH32 and XXH64 was that their performance was - * sub-optimal on short lengths. It used an iterative algorithm which strongly - * favored lengths that were a multiple of 4 or 8. - * - * Instead of iterating over individual inputs, we use a set of single shot - * functions which piece together a range of lengths and operate in constant time. - * - * Additionally, the number of multiplies has been significantly reduced. This - * reduces latency, especially when emulating 64-bit multiplies on 32-bit. - * - * Depending on the platform, this may or may not be faster than XXH32, but it - * is almost guaranteed to be faster than XXH64. - */ - -/* - * At very short lengths, there isn't enough input to fully hide secrets, or use - * the entire secret. - * - * There is also only a limited amount of mixing we can do before significantly - * impacting performance. - * - * Therefore, we use different sections of the secret and always mix two secret - * samples with an XOR. This should have no effect on performance on the - * seedless or withSeed variants because everything _should_ be constant folded - * by modern compilers. - * - * The XOR mixing hides individual parts of the secret and increases entropy. - * - * This adds an extra layer of strength for custom secrets. 
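Returning to XXH_mult64to128(): the portable path is easy to lift out and test on its own. A self-contained sketch (hypothetical helper name, not library code; the second check uses (2^64-1)^2 = 2^128 - 2^65 + 1):

    #include <assert.h>
    #include <stdint.h>

    /* Standalone copy of the grade-school multiply above. */
    static void mul64to128(uint64_t lhs, uint64_t rhs, uint64_t* lo, uint64_t* hi)
    {
        /* cross products, using the cast form of XXH_mult32to64 */
        uint64_t const lo_lo = (uint64_t)(uint32_t)lhs * (uint32_t)rhs;
        uint64_t const hi_lo = (lhs >> 32) * (uint32_t)rhs;
        uint64_t const lo_hi = (uint64_t)(uint32_t)lhs * (rhs >> 32);
        uint64_t const hi_hi = (lhs >> 32) * (rhs >> 32);
        /* add the products together; as explained above, these cannot overflow */
        uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
        *hi = (hi_lo >> 32) + (cross >> 32) + hi_hi;
        *lo = (cross << 32) | (lo_lo & 0xFFFFFFFF);
    }

    int main(void) {
        uint64_t lo, hi;
        mul64to128(93, 75, &lo, &hi);
        assert(lo == 6975 && hi == 0);           /* the base-10 example above */
        mul64to128(UINT64_MAX, UINT64_MAX, &lo, &hi);
        assert(lo == 1 && hi == UINT64_MAX - 1);
        return 0;
    }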
- */ -XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(1 <= len && len <= 3); - XXH_ASSERT(secret != NULL); - /* - * len = 1: combined = { input[0], 0x01, input[0], input[0] } - * len = 2: combined = { input[1], 0x02, input[0], input[1] } - * len = 3: combined = { input[2], 0x03, input[0], input[1] } - */ - { xxh_u8 const c1 = input[0]; - xxh_u8 const c2 = input[len >> 1]; - xxh_u8 const c3 = input[len - 1]; - xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) - | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); - xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; - xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; - return XXH64_avalanche(keyed); - } -} - -XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(4 <= len && len <= 8); - seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; - { xxh_u32 const input1 = XXH_readLE32(input); - xxh_u32 const input2 = XXH_readLE32(input + len - 4); - xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; - xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); - xxh_u64 const keyed = input64 ^ bitflip; - return XXH3_rrmxmx(keyed, len); - } -} - -XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(9 <= len && len <= 16); - { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; - xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; - xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; - xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; - xxh_u64 const acc = len - + XXH_swap64(input_lo) + input_hi - + XXH3_mul128_fold64(input_lo, input_hi); - return XXH3_avalanche(acc); - } -} - -XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(len <= 16); - { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); - if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); - if (len) return XXH3_len_1to3_64b(input, len, secret, seed); - return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); - } -} - -/* - * DISCLAIMER: There are known *seed-dependent* multicollisions here due to - * multiplication by zero, affecting hashes of lengths 17 to 240. - * - * However, they are very unlikely. - * - * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all - * unseeded non-cryptographic hashes, it does not attempt to defend itself - * against specially crafted inputs, only random inputs. - * - * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes - * cancelling out the secret is taken an arbitrary number of times (addressed - * in XXH3_accumulate_512), this collision is very unlikely with random inputs - * and/or proper seeding: - * - * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a - * function that is only called up to 16 times per hash with up to 240 bytes of - * input. 
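The byte-packing trick for the shortest inputs is easy to check in isolation; a standalone sketch mirroring the `combined` expression of XXH3_len_1to3_64b() above (hypothetical helper name):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Mirrors `combined` above: even a 1-byte input fills four
     * distinct byte positions, because the length is mixed in too. */
    static uint32_t pack_1to3(const uint8_t* input, size_t len) {
        uint8_t const c1 = input[0];
        uint8_t const c2 = input[len >> 1];
        uint8_t const c3 = input[len - 1];
        return ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24)
             | ((uint32_t)c3 <<  0) | ((uint32_t)len <<  8);
    }

    int main(void) {
        uint8_t const two[2] = { 0xAA, 0xBB };
        /* bytes, high to low: c2=input[1], c1=input[0], len, c3=input[1] */
        assert(pack_1to3(two, 2) == 0xBBAA02BBu);
        return 0;
    }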
- * - * This is not too bad for a non-cryptographic hash function, especially with - * only 64 bit outputs. - * - * The 128-bit variant (which trades some speed for strength) is NOT affected - * by this, although it is always a good idea to use a proper seed if you care - * about strength. - */ -XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, - const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) -{ -#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ - /* - * UGLY HACK: - * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in - * slower code. - * - * By forcing seed64 into a register, we disrupt the cost model and - * cause it to scalarize. See `XXH32_round()` - * - * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, - * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on - * GCC 9.2, despite both emitting scalar code. - * - * GCC generates much better scalar code than Clang for the rest of XXH3, - * which is why finding a more optimal codepath is an interest. - */ - XXH_COMPILER_GUARD(seed64); -#endif - { xxh_u64 const input_lo = XXH_readLE64(input); - xxh_u64 const input_hi = XXH_readLE64(input+8); - return XXH3_mul128_fold64( - input_lo ^ (XXH_readLE64(secret) + seed64), - input_hi ^ (XXH_readLE64(secret+8) - seed64) - ); - } -} - -/* For mid range keys, XXH3 uses a Mum-hash variant. */ -XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(16 < len && len <= 128); - - { xxh_u64 acc = len * XXH_PRIME64_1; -#if XXH_SIZE_OPT >= 1 - /* Smaller and cleaner, but slightly slower. 
*/ - unsigned int i = (unsigned int)(len - 1) / 32; - do { - acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); - acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); - } while (i-- != 0); -#else - if (len > 32) { - if (len > 64) { - if (len > 96) { - acc += XXH3_mix16B(input+48, secret+96, seed); - acc += XXH3_mix16B(input+len-64, secret+112, seed); - } - acc += XXH3_mix16B(input+32, secret+64, seed); - acc += XXH3_mix16B(input+len-48, secret+80, seed); - } - acc += XXH3_mix16B(input+16, secret+32, seed); - acc += XXH3_mix16B(input+len-32, secret+48, seed); - } - acc += XXH3_mix16B(input+0, secret+0, seed); - acc += XXH3_mix16B(input+len-16, secret+16, seed); -#endif - return XXH3_avalanche(acc); - } -} - -XXH_NO_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); - - #define XXH3_MIDSIZE_STARTOFFSET 3 - #define XXH3_MIDSIZE_LASTOFFSET 17 - - { xxh_u64 acc = len * XXH_PRIME64_1; - xxh_u64 acc_end; - unsigned int const nbRounds = (unsigned int)len / 16; - unsigned int i; - XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); - for (i=0; i<8; i++) { - acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); - } - /* last bytes */ - acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); - XXH_ASSERT(nbRounds >= 8); - acc = XXH3_avalanche(acc); -#if defined(__clang__) /* Clang */ \ - && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ - /* - * UGLY HACK: - * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. - * In everywhere else, it uses scalar code. - * - * For 64->128-bit multiplies, even if the NEON was 100% optimal, it - * would still be slower than UMAAL (see XXH_mult64to128). - * - * Unfortunately, Clang doesn't handle the long multiplies properly and - * converts them to the nonexistent "vmulq_u64" intrinsic, which is then - * scalarized into an ugly mess of VMOV.32 instructions. - * - * This mess is difficult to avoid without turning autovectorization - * off completely, but they are usually relatively minor and/or not - * worth it to fix. - * - * This loop is the easiest to fix, as unlike XXH32, this pragma - * _actually works_ because it is a loop vectorization instead of an - * SLP vectorization. - */ - #pragma clang loop vectorize(disable) -#endif - for (i=8 ; i < nbRounds; i++) { - /* - * Prevents clang for unrolling the acc loop and interleaving with this one. - */ - XXH_COMPILER_GUARD(acc); - acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); - } - return XXH3_avalanche(acc + acc_end); - } -} - - -/* ======= Long Keys ======= */ - -#define XXH_STRIPE_LEN 64 -#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ -#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) - -#ifdef XXH_OLD_NAMES -# define STRIPE_LEN XXH_STRIPE_LEN -# define ACC_NB XXH_ACC_NB -#endif - -#ifndef XXH_PREFETCH_DIST -# ifdef __clang__ -# define XXH_PREFETCH_DIST 320 -# else -# if (XXH_VECTOR == XXH_AVX512) -# define XXH_PREFETCH_DIST 512 -# else -# define XXH_PREFETCH_DIST 384 -# endif -# endif /* __clang__ */ -#endif /* XXH_PREFETCH_DIST */ - -/* - * These macros are to generate an XXH3_accumulate() function. 
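As an aside on XXH3_len_17to128_64b() above: both branches mix the same 16-byte blocks, pairing offsets symmetrically from the front and back of the input. A throwaway sketch (hypothetical name) that prints the schedule produced by the compact loop:

    #include <stddef.h>
    #include <stdio.h>

    static void show_mix16B_pairs(size_t len)   /* expects 17 <= len <= 128 */
    {
        unsigned int i = (unsigned int)(len - 1) / 32;
        do {
            printf("mix16B(input+%u, secret+%u)\n", 16 * i, 32 * i);
            printf("mix16B(input+%zu, secret+%u)\n",
                   len - 16 * (i + 1), 32 * i + 16);
        } while (i-- != 0);
    }

    int main(void) { show_mix16B_pairs(100); return 0; }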
- * The two arguments select the name suffix and target attribute. - * - * The name of this symbol is XXH3_accumulate_() and it calls - * XXH3_accumulate_512_(). - * - * It may be useful to hand implement this function if the compiler fails to - * optimize the inline function. - */ -#define XXH3_ACCUMULATE_TEMPLATE(name) \ -void \ -XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ - const xxh_u8* XXH_RESTRICT input, \ - const xxh_u8* XXH_RESTRICT secret, \ - size_t nbStripes) \ -{ \ - size_t n; \ - for (n = 0; n < nbStripes; n++ ) { \ - const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ - XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ - XXH3_accumulate_512_##name( \ - acc, \ - in, \ - secret + n*XXH_SECRET_CONSUME_RATE); \ - } \ -} - - -XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) -{ - if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); - XXH_memcpy(dst, &v64, sizeof(v64)); -} - -/* Several intrinsic functions below are supposed to accept __int64 as argument, - * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . - * However, several environments do not define __int64 type, - * requiring a workaround. - */ -#if !defined (__VMS) \ - && (defined (__cplusplus) \ - || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) - typedef int64_t xxh_i64; -#else - /* the following type must have a width of 64-bit */ - typedef long long xxh_i64; -#endif - - -/* - * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. - * - * It is a hardened version of UMAC, based off of FARSH's implementation. - * - * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD - * implementations, and it is ridiculously fast. - * - * We harden it by mixing the original input to the accumulators as well as the product. - * - * This means that in the (relatively likely) case of a multiply by zero, the - * original input is preserved. - * - * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve - * cross-pollination, as otherwise the upper and lower halves would be - * essentially independent. - * - * This doesn't matter on 64-bit hashes since they all get merged together in - * the end, so we skip the extra step. - * - * Both XXH3_64bits and XXH3_128bits use this subroutine. 
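Before the SIMD variants, the whole 64-byte stripe operation can be stated in a few lines of scalar C. A simplified model (assumes a little-endian host; the real code reads lanes with XXH_readLE64):

    #include <stdint.h>
    #include <string.h>

    /* One stripe of XXH3_accumulate_512, per 8-byte lane: XOR input with
     * secret, multiply the 32-bit halves of the result together, and add
     * the raw input to the neighboring lane (the "swap" described above). */
    static void accumulate_512_model(uint64_t acc[8],
                                     const uint8_t* input,
                                     const uint8_t* secret)
    {
        for (size_t i = 0; i < 8; i++) {
            uint64_t data_val, key64;
            memcpy(&data_val, input  + 8 * i, 8);
            memcpy(&key64,    secret + 8 * i, 8);
            {   uint64_t const data_key = data_val ^ key64;
                acc[i ^ 1] += data_val;   /* swap adjacent lanes */
                acc[i] += (data_key & 0xFFFFFFFF) * (data_key >> 32);
            }
        }
    }

    int main(void) {
        uint64_t acc[8] = { 0 };
        uint8_t stripe[64] = { 1 }, secret[64] = { 2 };
        accumulate_512_model(acc, stripe, secret);
        return 0;
    }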
- */ - -#if (XXH_VECTOR == XXH_AVX512) \ - || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) - -#ifndef XXH_TARGET_AVX512 -# define XXH_TARGET_AVX512 /* disable attribute target */ -#endif - -XXH_FORCE_INLINE XXH_TARGET_AVX512 void -XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - __m512i* const xacc = (__m512i *) acc; - XXH_ASSERT((((size_t)acc) & 63) == 0); - XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); - - { - /* data_vec = input[0]; */ - __m512i const data_vec = _mm512_loadu_si512 (input); - /* key_vec = secret[0]; */ - __m512i const key_vec = _mm512_loadu_si512 (secret); - /* data_key = data_vec ^ key_vec; */ - __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); - /* data_key_lo = data_key >> 32; */ - __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); - /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); - /* xacc[0] += swap(data_vec); */ - __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); - __m512i const sum = _mm512_add_epi64(*xacc, data_swap); - /* xacc[0] += product; */ - *xacc = _mm512_add_epi64(product, sum); - } -} -XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) - -/* - * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. - * - * Multiplication isn't perfect, as explained by Google in HighwayHash: - * - * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to - * // varying degrees. In descending order of goodness, bytes - * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. - * // As expected, the upper and lower bytes are much worse. - * - * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 - * - * Since our algorithm uses a pseudorandom secret to add some variance into the - * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. - * - * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid - * extraction. - * - * Both XXH3_64bits and XXH3_128bits use this subroutine. 
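The same scalar restatement works for the scramble step just described (a sketch; assumes a little-endian host, and 0x9E3779B1 is the value of XXH_PRIME32_1):

    #include <stdint.h>
    #include <string.h>

    /* One pass of XXH3_scrambleAcc, per 8-byte lane: xorshift by 47,
     * XOR in the secret, then multiply by the 32-bit prime. */
    static void scramble_model(uint64_t acc[8], const uint8_t* secret)
    {
        for (size_t i = 0; i < 8; i++) {
            uint64_t key64, v = acc[i];
            memcpy(&key64, secret + 8 * i, 8);
            v ^= v >> 47;
            v ^= key64;
            acc[i] = v * 0x9E3779B1U;   /* XXH_PRIME32_1 */
        }
    }

    int main(void) {
        uint64_t acc[8] = { 123 };
        uint8_t secret[64] = { 0 };
        scramble_model(acc, secret);
        return 0;
    }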
- */ - -XXH_FORCE_INLINE XXH_TARGET_AVX512 void -XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 63) == 0); - XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); - { __m512i* const xacc = (__m512i*) acc; - const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); - - /* xacc[0] ^= (xacc[0] >> 47) */ - __m512i const acc_vec = *xacc; - __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); - /* xacc[0] ^= secret; */ - __m512i const key_vec = _mm512_loadu_si512 (secret); - __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); - - /* xacc[0] *= XXH_PRIME32_1; */ - __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); - __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); - __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); - *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); - } -} - -XXH_FORCE_INLINE XXH_TARGET_AVX512 void -XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) -{ - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); - XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); - XXH_ASSERT(((size_t)customSecret & 63) == 0); - (void)(&XXH_writeLE64); - { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); - __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); - __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); - - const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); - __m512i* const dest = ( __m512i*) customSecret; - int i; - XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ - XXH_ASSERT(((size_t)dest & 63) == 0); - for (i=0; i < nbRounds; ++i) { - dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); - } } -} - -#endif - -#if (XXH_VECTOR == XXH_AVX2) \ - || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) - -#ifndef XXH_TARGET_AVX2 -# define XXH_TARGET_AVX2 /* disable attribute target */ -#endif - -XXH_FORCE_INLINE XXH_TARGET_AVX2 void -XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 31) == 0); - { __m256i* const xacc = (__m256i *) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xinput = (const __m256i *) input; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ - const __m256i* const xsecret = (const __m256i *) secret; - - size_t i; - for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { - /* data_vec = xinput[i]; */ - __m256i const data_vec = _mm256_loadu_si256 (xinput+i); - /* key_vec = xsecret[i]; */ - __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); - /* data_key = data_vec ^ key_vec; */ - __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); - /* data_key_lo = data_key >> 32; */ - __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); - /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); - /* xacc[i] += swap(data_vec); */ - __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); - __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); - /* xacc[i] += product; */ - xacc[i] = _mm256_add_epi64(product, sum); - } } -} -XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) - -XXH_FORCE_INLINE XXH_TARGET_AVX2 void -XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 31) == 0); - { __m256i* const xacc = (__m256i*) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xsecret = (const __m256i *) secret; - const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); - - size_t i; - for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { - /* xacc[i] ^= (xacc[i] >> 47) */ - __m256i const acc_vec = xacc[i]; - __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); - __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); - /* xacc[i] ^= xsecret; */ - __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); - __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); - - /* xacc[i] *= XXH_PRIME32_1; */ - __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); - __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); - __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); - xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); - } - } -} - -XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) -{ - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); - XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); - (void)(&XXH_writeLE64); - XXH_PREFETCH(customSecret); - { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); - - const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); - __m256i* dest = ( __m256i*) customSecret; - -# if defined(__GNUC__) || defined(__clang__) - /* - * On GCC & Clang, marking 'dest' as modified will cause the compiler: - * - do not extract the secret from sse registers in the internal loop - * - use less common registers, and avoid pushing these reg into stack - */ - XXH_COMPILER_GUARD(dest); -# endif - XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ - XXH_ASSERT(((size_t)dest & 31) == 0); - - /* GCC -O2 need unroll loop manually */ - dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); - dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed); - dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); - dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); - dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); - dest[5] 
= _mm256_add_epi64(_mm256_load_si256(src+5), seed); - } -} - -#endif - -/* x86dispatch always generates SSE2 */ -#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) - -#ifndef XXH_TARGET_SSE2 -# define XXH_TARGET_SSE2 /* disable attribute target */ -#endif - -XXH_FORCE_INLINE XXH_TARGET_SSE2 void -XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - /* SSE2 is just a half-scale version of the AVX2 version. */ - XXH_ASSERT((((size_t)acc) & 15) == 0); - { __m128i* const xacc = (__m128i *) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xinput = (const __m128i *) input; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xsecret = (const __m128i *) secret; - - size_t i; - for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { - /* data_vec = xinput[i]; */ - __m128i const data_vec = _mm_loadu_si128 (xinput+i); - /* key_vec = xsecret[i]; */ - __m128i const key_vec = _mm_loadu_si128 (xsecret+i); - /* data_key = data_vec ^ key_vec; */ - __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); - /* data_key_lo = data_key >> 32; */ - __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); - /* xacc[i] += swap(data_vec); */ - __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); - __m128i const sum = _mm_add_epi64(xacc[i], data_swap); - /* xacc[i] += product; */ - xacc[i] = _mm_add_epi64(product, sum); - } } -} -XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) - -XXH_FORCE_INLINE XXH_TARGET_SSE2 void -XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 15) == 0); - { __m128i* const xacc = (__m128i*) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ - const __m128i* const xsecret = (const __m128i *) secret; - const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); - - size_t i; - for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { - /* xacc[i] ^= (xacc[i] >> 47) */ - __m128i const acc_vec = xacc[i]; - __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); - __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); - /* xacc[i] ^= xsecret[i]; */ - __m128i const key_vec = _mm_loadu_si128 (xsecret+i); - __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); - - /* xacc[i] *= XXH_PRIME32_1; */ - __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); - __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); - xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); - } - } -} - -XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) -{ - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); - (void)(&XXH_writeLE64); - { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); - -# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ - XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; - __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); -# else - __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); -# endif - int i; - - const void* const src16 = XXH3_kSecret; - __m128i* dst16 = (__m128i*) customSecret; -# if defined(__GNUC__) || defined(__clang__) - /* - * On GCC & Clang, marking 'dest' as modified will cause the compiler: - * - do not extract the secret from sse registers in the internal loop - * - use less common registers, and avoid pushing these reg into stack - */ - XXH_COMPILER_GUARD(dst16); -# endif - XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ - XXH_ASSERT(((size_t)dst16 & 15) == 0); - - for (i=0; i < nbRounds; ++i) { - dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); - } } -} - -#endif - -#if (XXH_VECTOR == XXH_NEON) - -/* forward declarations for the scalar routines */ -XXH_FORCE_INLINE void -XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, - void const* XXH_RESTRICT secret, size_t lane); - -XXH_FORCE_INLINE void -XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, - void const* XXH_RESTRICT secret, size_t lane); - -/*! - * @internal - * @brief The bulk processing loop for NEON and WASM SIMD128. - * - * The NEON code path is actually partially scalar when running on AArch64. This - * is to optimize the pipelining and can have up to 15% speedup depending on the - * CPU, and it also mitigates some GCC codegen issues. - * - * @see XXH3_NEON_LANES for configuring this and details about this optimization. - * - * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit - * integers instead of the other platforms which mask full 64-bit vectors, - * so the setup is more complicated than just shifting right. - * - * Additionally, there is an optimization for 4 lanes at once noted below. - * - * Since, as stated, the most optimal amount of lanes for Cortexes is 6, - * there needs to be *three* versions of the accumulate operation used - * for the remaining 2 lanes. - * - * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap - * nearly perfectly. 
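The de-interleave step that this comment describes can be restated in scalar terms (illustration only; the real code below handles four lanes in a single vuzpq_u32):

    #include <assert.h>
    #include <stdint.h>

    /* From two 64-bit lanes, gather all low 32-bit halves into one
     * vector and all high halves into another. */
    int main(void) {
        uint64_t const lanes[2] = { 0x1111222233334444ULL, 0x5555666677778888ULL };
        uint32_t lo[2], hi[2];
        for (int i = 0; i < 2; i++) {
            lo[i] = (uint32_t)lanes[i];          /* data_key & 0xFFFFFFFF */
            hi[i] = (uint32_t)(lanes[i] >> 32);  /* data_key >> 32        */
        }
        assert(lo[0] == 0x33334444u && hi[0] == 0x11112222u);
        assert(lo[1] == 0x77778888u && hi[1] == 0x55556666u);
        return 0;
    }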
- */ - -XXH_FORCE_INLINE void -XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 15) == 0); - XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); - { /* GCC for darwin arm64 does not like aliasing here */ - xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc; - /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ - uint8_t const* xinput = (const uint8_t *) input; - uint8_t const* xsecret = (const uint8_t *) secret; - - size_t i; -#ifdef __wasm_simd128__ - /* - * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret - * is constant propagated, which results in it converting it to this - * inside the loop: - * - * a = v128.load(XXH3_kSecret + 0 + $secret_offset, offset = 0) - * b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0) - * ... - * - * This requires a full 32-bit address immediate (and therefore a 6 byte - * instruction) as well as an add for each offset. - * - * Putting an asm guard prevents it from folding (at the cost of losing - * the alignment hint), and uses the free offset in `v128.load` instead - * of adding secret_offset each time which overall reduces code size by - * about a kilobyte and improves performance. - */ - XXH_COMPILER_GUARD(xsecret); -#endif - /* Scalar lanes use the normal scalarRound routine */ - for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { - XXH3_scalarRound(acc, input, secret, i); - } - i = 0; - /* 4 NEON lanes at a time. */ - for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { - /* data_vec = xinput[i]; */ - uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16)); - uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); - /* key_vec = xsecret[i]; */ - uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16)); - uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); - /* data_swap = swap(data_vec) */ - uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); - uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); - /* data_key = data_vec ^ key_vec; */ - uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1); - uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2); - - /* - * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a - * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to - * get one vector with the low 32 bits of each lane, and one vector - * with the high 32 bits of each lane. - * - * The intrinsic returns a double vector because the original ARMv7-a - * instruction modified both arguments in place. AArch64 and SIMD128 emit - * two instructions from this intrinsic. - * - * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] - * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] - */ - uint32x4x2_t unzipped = vuzpq_u32( - vreinterpretq_u32_u64(data_key_1), - vreinterpretq_u32_u64(data_key_2) - ); - /* data_key_lo = data_key & 0xFFFFFFFF */ - uint32x4_t data_key_lo = unzipped.val[0]; - /* data_key_hi = data_key >> 32 */ - uint32x4_t data_key_hi = unzipped.val[1]; - /* - * Then, we can split the vectors horizontally and multiply which, as for most - * widening intrinsics, have a variant that works on both high half vectors - * for free on AArch64. A similar instruction is available on SIMD128. 
- * - * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi - */ - uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi); - uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi); - /* - * Clang reorders - * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s - * c += a; // add acc.2d, acc.2d, swap.2d - * to - * c += a; // add acc.2d, acc.2d, swap.2d - * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s - * - * While it would make sense in theory since the addition is faster, - * for reasons likely related to umlal being limited to certain NEON - * pipelines, this is worse. A compiler guard fixes this. - */ - XXH_COMPILER_GUARD_CLANG_NEON(sum_1); - XXH_COMPILER_GUARD_CLANG_NEON(sum_2); - /* xacc[i] = acc_vec + sum; */ - xacc[i] = vaddq_u64(xacc[i], sum_1); - xacc[i+1] = vaddq_u64(xacc[i+1], sum_2); - } - /* Operate on the remaining NEON lanes 2 at a time. */ - for (; i < XXH3_NEON_LANES / 2; i++) { - /* data_vec = xinput[i]; */ - uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); - /* key_vec = xsecret[i]; */ - uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); - /* acc_vec_2 = swap(data_vec) */ - uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1); - /* data_key = data_vec ^ key_vec; */ - uint64x2_t data_key = veorq_u64(data_vec, key_vec); - /* For two lanes, just use VMOVN and VSHRN. */ - /* data_key_lo = data_key & 0xFFFFFFFF; */ - uint32x2_t data_key_lo = vmovn_u64(data_key); - /* data_key_hi = data_key >> 32; */ - uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32); - /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */ - uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi); - /* Same Clang workaround as before */ - XXH_COMPILER_GUARD_CLANG_NEON(sum); - /* xacc[i] = acc_vec + sum; */ - xacc[i] = vaddq_u64 (xacc[i], sum); - } - } -} -XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) - -XXH_FORCE_INLINE void -XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 15) == 0); - - { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc; - uint8_t const* xsecret = (uint8_t const*) secret; - - size_t i; - /* WASM uses operator overloads and doesn't need these. 
*/ -#ifndef __wasm_simd128__ - /* { prime32_1, prime32_1 } */ - uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1); - /* { 0, prime32_1, 0, prime32_1 } */ - uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32)); -#endif - - /* AArch64 uses both scalar and neon at the same time */ - for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { - XXH3_scalarScrambleRound(acc, secret, i); - } - for (i=0; i < XXH3_NEON_LANES / 2; i++) { - /* xacc[i] ^= (xacc[i] >> 47); */ - uint64x2_t acc_vec = xacc[i]; - uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); - uint64x2_t data_vec = veorq_u64(acc_vec, shifted); - - /* xacc[i] ^= xsecret[i]; */ - uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); - uint64x2_t data_key = veorq_u64(data_vec, key_vec); - /* xacc[i] *= XXH_PRIME32_1 */ -#ifdef __wasm_simd128__ - /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */ - xacc[i] = data_key * XXH_PRIME32_1; -#else - /* - * Expanded version with portable NEON intrinsics - * - * lo(x) * lo(y) + (hi(x) * lo(y) << 32) - * - * prod_hi = hi(data_key) * lo(prime) << 32 - * - * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector - * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits - * and avoid the shift. - */ - uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi); - /* Extract low bits for vmlal_u32 */ - uint32x2_t data_key_lo = vmovn_u64(data_key); - /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ - xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); -#endif - } - } -} -#endif - -#if (XXH_VECTOR == XXH_VSX) - -XXH_FORCE_INLINE void -XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - /* presumed aligned */ - xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; - xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */ - xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */ - xxh_u64x2 const v32 = { 32, 32 }; - size_t i; - for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { - /* data_vec = xinput[i]; */ - xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i); - /* key_vec = xsecret[i]; */ - xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); - xxh_u64x2 const data_key = data_vec ^ key_vec; - /* shuffled = (data_key << 32) | (data_key >> 32); */ - xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); - /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ - xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); - /* acc_vec = xacc[i]; */ - xxh_u64x2 acc_vec = xacc[i]; - acc_vec += product; - - /* swap high and low halves */ -#ifdef __s390x__ - acc_vec += vec_permi(data_vec, data_vec, 2); -#else - acc_vec += vec_xxpermdi(data_vec, data_vec, 2); -#endif - xacc[i] = acc_vec; - } -} -XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) - -XXH_FORCE_INLINE void -XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 15) == 0); - - { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; - const xxh_u8* const xsecret = (const xxh_u8*) secret; - /* constants */ - xxh_u64x2 const v32 = { 32, 32 }; - xxh_u64x2 const v47 = { 47, 47 }; - xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; - size_t i; - for (i = 0; i < XXH_STRIPE_LEN / 
sizeof(xxh_u64x2); i++) { - /* xacc[i] ^= (xacc[i] >> 47); */ - xxh_u64x2 const acc_vec = xacc[i]; - xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); - - /* xacc[i] ^= xsecret[i]; */ - xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); - xxh_u64x2 const data_key = data_vec ^ key_vec; - - /* xacc[i] *= XXH_PRIME32_1 */ - /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ - xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); - /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ - xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); - xacc[i] = prod_odd + (prod_even << v32); - } } -} - -#endif - -#if (XXH_VECTOR == XXH_SVE) - -XXH_FORCE_INLINE void -XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - uint64_t *xacc = (uint64_t *)acc; - const uint64_t *xinput = (const uint64_t *)(const void *)input; - const uint64_t *xsecret = (const uint64_t *)(const void *)secret; - svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); - uint64_t element_count = svcntd(); - if (element_count >= 8) { - svbool_t mask = svptrue_pat_b64(SV_VL8); - svuint64_t vacc = svld1_u64(mask, xacc); - ACCRND(vacc, 0); - svst1_u64(mask, xacc, vacc); - } else if (element_count == 2) { /* sve128 */ - svbool_t mask = svptrue_pat_b64(SV_VL2); - svuint64_t acc0 = svld1_u64(mask, xacc + 0); - svuint64_t acc1 = svld1_u64(mask, xacc + 2); - svuint64_t acc2 = svld1_u64(mask, xacc + 4); - svuint64_t acc3 = svld1_u64(mask, xacc + 6); - ACCRND(acc0, 0); - ACCRND(acc1, 2); - ACCRND(acc2, 4); - ACCRND(acc3, 6); - svst1_u64(mask, xacc + 0, acc0); - svst1_u64(mask, xacc + 2, acc1); - svst1_u64(mask, xacc + 4, acc2); - svst1_u64(mask, xacc + 6, acc3); - } else { - svbool_t mask = svptrue_pat_b64(SV_VL4); - svuint64_t acc0 = svld1_u64(mask, xacc + 0); - svuint64_t acc1 = svld1_u64(mask, xacc + 4); - ACCRND(acc0, 0); - ACCRND(acc1, 4); - svst1_u64(mask, xacc + 0, acc0); - svst1_u64(mask, xacc + 4, acc1); - } -} - -XXH_FORCE_INLINE void -XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, - const xxh_u8* XXH_RESTRICT input, - const xxh_u8* XXH_RESTRICT secret, - size_t nbStripes) -{ - if (nbStripes != 0) { - uint64_t *xacc = (uint64_t *)acc; - const uint64_t *xinput = (const uint64_t *)(const void *)input; - const uint64_t *xsecret = (const uint64_t *)(const void *)secret; - svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); - uint64_t element_count = svcntd(); - if (element_count >= 8) { - svbool_t mask = svptrue_pat_b64(SV_VL8); - svuint64_t vacc = svld1_u64(mask, xacc + 0); - do { - /* svprfd(svbool_t, void *, enum svfprop); */ - svprfd(mask, xinput + 128, SV_PLDL1STRM); - ACCRND(vacc, 0); - xinput += 8; - xsecret += 1; - nbStripes--; - } while (nbStripes != 0); - - svst1_u64(mask, xacc + 0, vacc); - } else if (element_count == 2) { /* sve128 */ - svbool_t mask = svptrue_pat_b64(SV_VL2); - svuint64_t acc0 = svld1_u64(mask, xacc + 0); - svuint64_t acc1 = svld1_u64(mask, xacc + 2); - svuint64_t acc2 = svld1_u64(mask, xacc + 4); - svuint64_t acc3 = svld1_u64(mask, xacc + 6); - do { - svprfd(mask, xinput + 128, SV_PLDL1STRM); - ACCRND(acc0, 0); - ACCRND(acc1, 2); - ACCRND(acc2, 4); - ACCRND(acc3, 6); - xinput += 8; - xsecret += 1; - nbStripes--; - } while (nbStripes != 0); - - svst1_u64(mask, xacc + 0, acc0); - svst1_u64(mask, xacc + 2, acc1); - svst1_u64(mask, xacc + 4, acc2); - svst1_u64(mask, xacc + 6, acc3); - } else { - svbool_t mask = 
svptrue_pat_b64(SV_VL4); - svuint64_t acc0 = svld1_u64(mask, xacc + 0); - svuint64_t acc1 = svld1_u64(mask, xacc + 4); - do { - svprfd(mask, xinput + 128, SV_PLDL1STRM); - ACCRND(acc0, 0); - ACCRND(acc1, 4); - xinput += 8; - xsecret += 1; - nbStripes--; - } while (nbStripes != 0); - - svst1_u64(mask, xacc + 0, acc0); - svst1_u64(mask, xacc + 4, acc1); - } - } -} - -#endif - -/* scalar variants - universal */ - -#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) -/* - * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they - * emit an excess mask and a full 64-bit multiply-add (MADD X-form). - * - * While this might not seem like much, as AArch64 is a 64-bit architecture, only - * big Cortex designs have a full 64-bit multiplier. - * - * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit - * multiplies expand to 2-3 multiplies in microcode. This has a major penalty - * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline. - * - * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does - * not have this penalty and does the mask automatically. - */ -XXH_FORCE_INLINE xxh_u64 -XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) -{ - xxh_u64 ret; - /* note: %x = 64-bit register, %w = 32-bit register */ - __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc)); - return ret; -} -#else -XXH_FORCE_INLINE xxh_u64 -XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) -{ - return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc; -} -#endif - -/*! - * @internal - * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). - * - * This is extracted to its own function because the NEON path uses a combination - * of NEON and scalar. - */ -XXH_FORCE_INLINE void -XXH3_scalarRound(void* XXH_RESTRICT acc, - void const* XXH_RESTRICT input, - void const* XXH_RESTRICT secret, - size_t lane) -{ - xxh_u64* xacc = (xxh_u64*) acc; - xxh_u8 const* xinput = (xxh_u8 const*) input; - xxh_u8 const* xsecret = (xxh_u8 const*) secret; - XXH_ASSERT(lane < XXH_ACC_NB); - XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); - { - xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); - xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); - xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ - xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); - } -} - -/*! - * @internal - * @brief Processes a 64 byte block of data using the scalar path. - */ -XXH_FORCE_INLINE void -XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - size_t i; - /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ -#if defined(__GNUC__) && !defined(__clang__) \ - && (defined(__arm__) || defined(__thumb2__)) \ - && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ - && XXH_SIZE_OPT <= 0 -# pragma GCC unroll 8 -#endif - for (i=0; i < XXH_ACC_NB; i++) { - XXH3_scalarRound(acc, input, secret, i); - } -} -XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) - -/*! - * @internal - * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). - * - * This is extracted to its own function because the NEON path uses a combination - * of NEON and scalar. 
- */
-XXH_FORCE_INLINE void
-XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
-                         void const* XXH_RESTRICT secret,
-                         size_t lane)
-{
-    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
-    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
-    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
-    XXH_ASSERT(lane < XXH_ACC_NB);
-    {
-        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
-        xxh_u64 acc64 = xacc[lane];
-        acc64 = XXH_xorshift64(acc64, 47);
-        acc64 ^= key64;
-        acc64 *= XXH_PRIME32_1;
-        xacc[lane] = acc64;
-    }
-}
-
-/*!
- * @internal
- * @brief Scrambles the accumulators after a large chunk has been read
- */
-XXH_FORCE_INLINE void
-XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
-{
-    size_t i;
-    for (i=0; i < XXH_ACC_NB; i++) {
-        XXH3_scalarScrambleRound(acc, secret, i);
-    }
-}
-
-XXH_FORCE_INLINE void
-XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
-{
-    /*
-     * We need a separate pointer for the hack below,
-     * which requires a non-const pointer.
-     * Any decent compiler will optimize this out otherwise.
-     */
-    const xxh_u8* kSecretPtr = XXH3_kSecret;
-    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
-
-#if defined(__GNUC__) && defined(__aarch64__)
-    /*
-     * UGLY HACK:
-     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
-     * placed sequentially, in order, at the top of the unrolled loop.
-     *
-     * While MOVK is great for generating constants (2 cycles for a 64-bit
-     * constant compared to 4 cycles for LDR), it fights for bandwidth with
-     * the arithmetic instructions.
-     *
-     *   I   L   S
-     *  MOVK
-     *  MOVK
-     *  MOVK
-     *  MOVK
-     * ADD
-     * SUB      STR
-     *          STR
-     * By forcing loads from memory (as the asm line causes the compiler to assume
-     * that XXH3_kSecretPtr has been changed), the pipelines are used more
-     * efficiently:
-     *   I   L   S
-     *      LDR
-     *  ADD LDR
-     *  SUB     STR
-     *          STR
-     *
-     * See XXH3_NEON_LANES for details on the pipeline.
-     *
-     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
-     *   without hack: 2654.4 MB/s
-     *   with hack:    3202.9 MB/s
-     */
-    XXH_COMPILER_GUARD(kSecretPtr);
-#endif
-    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
-        int i;
-        for (i=0; i < nbRounds; i++) {
-            /*
-             * The asm hack causes the compiler to assume that kSecretPtr aliases with
-             * customSecret, and on aarch64, this prevented LDP from merging two
-             * loads together for free. Putting the loads together before the stores
-             * properly generates LDP.
- */ - xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; - xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; - XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); - XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); - } } -} - - -typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); -typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); -typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); - - -#if (XXH_VECTOR == XXH_AVX512) - -#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 -#define XXH3_accumulate XXH3_accumulate_avx512 -#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 -#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 - -#elif (XXH_VECTOR == XXH_AVX2) - -#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 -#define XXH3_accumulate XXH3_accumulate_avx2 -#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 -#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 - -#elif (XXH_VECTOR == XXH_SSE2) - -#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 -#define XXH3_accumulate XXH3_accumulate_sse2 -#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 -#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 - -#elif (XXH_VECTOR == XXH_NEON) - -#define XXH3_accumulate_512 XXH3_accumulate_512_neon -#define XXH3_accumulate XXH3_accumulate_neon -#define XXH3_scrambleAcc XXH3_scrambleAcc_neon -#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - -#elif (XXH_VECTOR == XXH_VSX) - -#define XXH3_accumulate_512 XXH3_accumulate_512_vsx -#define XXH3_accumulate XXH3_accumulate_vsx -#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx -#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - -#elif (XXH_VECTOR == XXH_SVE) -#define XXH3_accumulate_512 XXH3_accumulate_512_sve -#define XXH3_accumulate XXH3_accumulate_sve -#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar -#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - -#else /* scalar */ - -#define XXH3_accumulate_512 XXH3_accumulate_512_scalar -#define XXH3_accumulate XXH3_accumulate_scalar -#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar -#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - -#endif - -#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ -# undef XXH3_initCustomSecret -# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar -#endif - -XXH_FORCE_INLINE void -XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, - const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble) -{ - size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; - size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; - size_t const nb_blocks = (len - 1) / block_len; - - size_t n; - - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); - - for (n = 0; n < nb_blocks; n++) { - f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); - f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); - } - - /* last partial block */ - XXH_ASSERT(len > XXH_STRIPE_LEN); - { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; - XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); - f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); - - /* last stripe */ - { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; -#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ - XXH3_accumulate_512(acc, p, 
secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
-    }   }
-}
-
-XXH_FORCE_INLINE xxh_u64
-XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
-{
-    return XXH3_mul128_fold64(
-               acc[0] ^ XXH_readLE64(secret),
-               acc[1] ^ XXH_readLE64(secret+8) );
-}
-
-static XXH64_hash_t
-XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
-{
-    xxh_u64 result64 = start;
-    size_t i = 0;
-
-    for (i = 0; i < 4; i++) {
-        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
-#if defined(__clang__)                                /* Clang */ \
-    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
-    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
-    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
-        /*
-         * UGLY HACK:
-         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
-         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
-         * XXH3_64bits, len == 256, Snapdragon 835:
-         *   without hack: 2063.7 MB/s
-         *   with hack:    2560.7 MB/s
-         */
-        XXH_COMPILER_GUARD(result64);
-#endif
-    }
-
-    return XXH3_avalanche(result64);
-}
-
-#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
-                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
-
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
-                           const void* XXH_RESTRICT secret, size_t secretSize,
-                           XXH3_f_accumulate f_acc,
-                           XXH3_f_scrambleAcc f_scramble)
-{
-    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
-
-    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
-
-    /* converge into final hash */
-    XXH_STATIC_ASSERT(sizeof(acc) == 64);
-    /* do not align on 8, so that the secret is different from the accumulator */
-#define XXH_SECRET_MERGEACCS_START 11
-    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
-    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
-}
-
-/*
- * It's important for performance to transmit the secret's size (when it's static)
- * so that the compiler can properly optimize the vectorized loop.
- * This makes a big performance difference for "medium" keys (<1 KB) when using the AVX instruction set.
- * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
- * breaks -Og, this is XXH_NO_INLINE.
- */
-XXH3_WITH_SECRET_INLINE XXH64_hash_t
-XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
-                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
-{
-    (void)seed64;
-    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
-}
-
-/*
- * It's preferable for performance that XXH3_hashLong is not inlined,
- * as it results in a smaller function for small data, easier on the instruction cache.
- * Note that inside this no_inline function, we do inline the internal loop,
- * and provide a statically defined secret size to allow optimization of the vector loop.
- */ -XXH_NO_INLINE XXH_PUREF XXH64_hash_t -XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) -{ - (void)seed64; (void)secret; (void)secretLen; - return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); -} - -/* - * XXH3_hashLong_64b_withSeed(): - * Generate a custom key based on alteration of default XXH3_kSecret with the seed, - * and then use this key for long mode hashing. - * - * This operation is decently fast but nonetheless costs a little bit of time. - * Try to avoid it whenever possible (typically when seed==0). - * - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. - */ -XXH_FORCE_INLINE XXH64_hash_t -XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, - XXH64_hash_t seed, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble, - XXH3_f_initCustomSecret f_initSec) -{ -#if XXH_SIZE_OPT <= 0 - if (seed == 0) - return XXH3_hashLong_64b_internal(input, len, - XXH3_kSecret, sizeof(XXH3_kSecret), - f_acc, f_scramble); -#endif - { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; - f_initSec(secret, seed); - return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), - f_acc, f_scramble); - } -} - -/* - * It's important for performance that XXH3_hashLong is not inlined. - */ -XXH_NO_INLINE XXH64_hash_t -XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) -{ - (void)secret; (void)secretLen; - return XXH3_hashLong_64b_withSeed_internal(input, len, seed, - XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); -} - - -typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, - XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); - -XXH_FORCE_INLINE XXH64_hash_t -XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, - XXH3_hashLong64_f f_hashLong) -{ - XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); - /* - * If an action is to be taken if `secretLen` condition is not respected, - * it should be done here. - * For now, it's a contract pre-condition. - * Adding a check and a branch here would cost performance at every hash. - * Also, note that function signature doesn't offer room to return an error. - */ - if (len <= 16) - return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); - if (len <= 128) - return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); - return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); -} - - -/* === Public entry point === */ - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) -{ - return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) -{ - return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); -} - -/*! 
@ingroup XXH3_family */ -XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed) -{ - return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); -} - -XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) -{ - if (length <= XXH3_MIDSIZE_MAX) - return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); - return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); -} - - -/* === XXH3 streaming === */ -#ifndef XXH_NO_STREAM -/* - * Malloc's a pointer that is always aligned to align. - * - * This must be freed with `XXH_alignedFree()`. - * - * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte - * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 - * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. - * - * This underalignment previously caused a rather obvious crash which went - * completely unnoticed due to XXH3_createState() not actually being tested. - * Credit to RedSpah for noticing this bug. - * - * The alignment is done manually: Functions like posix_memalign or _mm_malloc - * are avoided: To maintain portability, we would have to write a fallback - * like this anyways, and besides, testing for the existence of library - * functions without relying on external build tools is impossible. - * - * The method is simple: Overallocate, manually align, and store the offset - * to the original behind the returned pointer. - * - * Align must be a power of 2 and 8 <= align <= 128. - */ -static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) -{ - XXH_ASSERT(align <= 128 && align >= 8); /* range check */ - XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ - XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ - { /* Overallocate to make room for manual realignment and an offset byte */ - xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); - if (base != NULL) { - /* - * Get the offset needed to align this pointer. - * - * Even if the returned pointer is aligned, there will always be - * at least one byte to store the offset to the original pointer. - */ - size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ - /* Add the offset for the now-aligned pointer */ - xxh_u8* ptr = base + offset; - - XXH_ASSERT((size_t)ptr % align == 0); - - /* Store the offset immediately before the returned pointer. */ - ptr[-1] = (xxh_u8)offset; - return ptr; - } - return NULL; - } -} -/* - * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass - * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. - */ -static void XXH_alignedFree(void* p) -{ - if (p != NULL) { - xxh_u8* ptr = (xxh_u8*)p; - /* Get the offset byte we added in XXH_malloc. */ - xxh_u8 offset = ptr[-1]; - /* Free the original malloc'd pointer */ - xxh_u8* base = ptr - offset; - XXH_free(base); - } -} -/*! @ingroup XXH3_family */ -/*! - * @brief Allocate an @ref XXH3_state_t. - * - * @return An allocated pointer of @ref XXH3_state_t on success. - * @return `NULL` on failure. - * - * @note Must be freed with XXH3_freeState(). 
- * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) -{ - XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); - if (state==NULL) return NULL; - XXH3_INITSTATE(state); - return state; -} - -/*! @ingroup XXH3_family */ -/*! - * @brief Frees an @ref XXH3_state_t. - * - * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). - * - * @return @ref XXH_OK. - * - * @note Must be allocated with XXH3_createState(). - * - * @see @ref streaming_example "Streaming Example" - */ -XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) -{ - XXH_alignedFree(statePtr); - return XXH_OK; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API void -XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state) -{ - XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); -} - -static void -XXH3_reset_internal(XXH3_state_t* statePtr, - XXH64_hash_t seed, - const void* secret, size_t secretSize) -{ - size_t const initStart = offsetof(XXH3_state_t, bufferedSize); - size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; - XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); - XXH_ASSERT(statePtr != NULL); - /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ - memset((char*)statePtr + initStart, 0, initLength); - statePtr->acc[0] = XXH_PRIME32_3; - statePtr->acc[1] = XXH_PRIME64_1; - statePtr->acc[2] = XXH_PRIME64_2; - statePtr->acc[3] = XXH_PRIME64_3; - statePtr->acc[4] = XXH_PRIME64_4; - statePtr->acc[5] = XXH_PRIME32_2; - statePtr->acc[6] = XXH_PRIME64_5; - statePtr->acc[7] = XXH_PRIME32_1; - statePtr->seed = seed; - statePtr->useSeed = (seed != 0); - statePtr->extSecret = (const unsigned char*)secret; - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); - statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; - statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) -{ - if (statePtr == NULL) return XXH_ERROR; - XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); - return XXH_OK; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) -{ - if (statePtr == NULL) return XXH_ERROR; - XXH3_reset_internal(statePtr, 0, secret, secretSize); - if (secret == NULL) return XXH_ERROR; - if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; - return XXH_OK; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) -{ - if (statePtr == NULL) return XXH_ERROR; - if (seed==0) return XXH3_64bits_reset(statePtr); - if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) - XXH3_initCustomSecret(statePtr->customSecret, seed); - XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); - return XXH_OK; -} - -/*! 
@ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64) -{ - if (statePtr == NULL) return XXH_ERROR; - if (secret == NULL) return XXH_ERROR; - if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; - XXH3_reset_internal(statePtr, seed64, secret, secretSize); - statePtr->useSeed = 1; /* always, even if seed64==0 */ - return XXH_OK; -} - -/*! - * @internal - * @brief Processes a large input for XXH3_update() and XXH3_digest_long(). - * - * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block. - * - * @param acc Pointer to the 8 accumulator lanes - * @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block* - * @param nbStripesPerBlock Number of stripes in a block - * @param input Input pointer - * @param nbStripes Number of stripes to process - * @param secret Secret pointer - * @param secretLimit Offset of the last block in @p secret - * @param f_acc Pointer to an XXH3_accumulate implementation - * @param f_scramble Pointer to an XXH3_scrambleAcc implementation - * @return Pointer past the end of @p input after processing - */ -XXH_FORCE_INLINE const xxh_u8 * -XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, - size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, - const xxh_u8* XXH_RESTRICT input, size_t nbStripes, - const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble) -{ - const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE; - /* Process full blocks */ - if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) { - /* Process the initial partial block... */ - size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr; - - do { - /* Accumulate and scramble */ - f_acc(acc, input, initialSecret, nbStripesThisIter); - f_scramble(acc, secret + secretLimit); - input += nbStripesThisIter * XXH_STRIPE_LEN; - nbStripes -= nbStripesThisIter; - /* Then continue the loop with the full block size */ - nbStripesThisIter = nbStripesPerBlock; - initialSecret = secret; - } while (nbStripes >= nbStripesPerBlock); - *nbStripesSoFarPtr = 0; - } - /* Process a partial block */ - if (nbStripes > 0) { - f_acc(acc, input, initialSecret, nbStripes); - input += nbStripes * XXH_STRIPE_LEN; - *nbStripesSoFarPtr += nbStripes; - } - /* Return end pointer */ - return input; -} - -#ifndef XXH3_STREAM_USE_STACK -# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ -# define XXH3_STREAM_USE_STACK 1 -# endif -#endif -/* - * Both XXH3_64bits_update and XXH3_128bits_update use this routine. - */ -XXH_FORCE_INLINE XXH_errorcode -XXH3_update(XXH3_state_t* XXH_RESTRICT const state, - const xxh_u8* XXH_RESTRICT input, size_t len, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble) -{ - if (input==NULL) { - XXH_ASSERT(len == 0); - return XXH_OK; - } - - XXH_ASSERT(state != NULL); - { const xxh_u8* const bEnd = input + len; - const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; -#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 - /* For some reason, gcc and MSVC seem to suffer greatly - * when operating accumulators directly into state. - * Operating into stack space seems to enable proper optimization. 
- * clang, on the other hand, doesn't seem to need this trick */ - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; - XXH_memcpy(acc, state->acc, sizeof(acc)); -#else - xxh_u64* XXH_RESTRICT const acc = state->acc; -#endif - state->totalLen += len; - XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); - - /* small input : just fill in tmp buffer */ - if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { - XXH_memcpy(state->buffer + state->bufferedSize, input, len); - state->bufferedSize += (XXH32_hash_t)len; - return XXH_OK; - } - - /* total input is now > XXH3_INTERNALBUFFER_SIZE */ - #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) - XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ - - /* - * Internal buffer is partially filled (always, except at beginning) - * Complete it, then consume it. - */ - if (state->bufferedSize) { - size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; - XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); - input += loadSize; - XXH3_consumeStripes(acc, - &state->nbStripesSoFar, state->nbStripesPerBlock, - state->buffer, XXH3_INTERNALBUFFER_STRIPES, - secret, state->secretLimit, - f_acc, f_scramble); - state->bufferedSize = 0; - } - XXH_ASSERT(input < bEnd); - if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { - size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; - input = XXH3_consumeStripes(acc, - &state->nbStripesSoFar, state->nbStripesPerBlock, - input, nbStripes, - secret, state->secretLimit, - f_acc, f_scramble); - XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); - - } - /* Some remaining input (always) : buffer it */ - XXH_ASSERT(input < bEnd); - XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); - XXH_ASSERT(state->bufferedSize == 0); - XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); - state->bufferedSize = (XXH32_hash_t)(bEnd-input); -#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 - /* save stack accumulators into state */ - XXH_memcpy(state->acc, acc, sizeof(acc)); -#endif - } - - return XXH_OK; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) -{ - return XXH3_update(state, (const xxh_u8*)input, len, - XXH3_accumulate, XXH3_scrambleAcc); -} - - -XXH_FORCE_INLINE void -XXH3_digest_long (XXH64_hash_t* acc, - const XXH3_state_t* state, - const unsigned char* secret) -{ - xxh_u8 lastStripe[XXH_STRIPE_LEN]; - const xxh_u8* lastStripePtr; - - /* - * Digest on a local copy. This way, the state remains unaltered, and it can - * continue ingesting more input afterwards. 
- */ - XXH_memcpy(acc, state->acc, sizeof(state->acc)); - if (state->bufferedSize >= XXH_STRIPE_LEN) { - /* Consume remaining stripes then point to remaining data in buffer */ - size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; - size_t nbStripesSoFar = state->nbStripesSoFar; - XXH3_consumeStripes(acc, - &nbStripesSoFar, state->nbStripesPerBlock, - state->buffer, nbStripes, - secret, state->secretLimit, - XXH3_accumulate, XXH3_scrambleAcc); - lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN; - } else { /* bufferedSize < XXH_STRIPE_LEN */ - /* Copy to temp buffer */ - size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; - XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ - XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); - XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); - lastStripePtr = lastStripe; - } - /* Last stripe */ - XXH3_accumulate_512(acc, - lastStripePtr, - secret + state->secretLimit - XXH_SECRET_LASTACC_START); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) -{ - const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; - if (state->totalLen > XXH3_MIDSIZE_MAX) { - XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; - XXH3_digest_long(acc, state, secret); - return XXH3_mergeAccs(acc, - secret + XXH_SECRET_MERGEACCS_START, - (xxh_u64)state->totalLen * XXH_PRIME64_1); - } - /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ - if (state->useSeed) - return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); - return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), - secret, state->secretLimit + XXH_STRIPE_LEN); -} -#endif /* !XXH_NO_STREAM */ - - -/* ========================================== - * XXH3 128 bits (a.k.a XXH128) - * ========================================== - * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, - * even without counting the significantly larger output size. - * - * For example, extra steps are taken to avoid the seed-dependent collisions - * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). - * - * This strength naturally comes at the cost of some speed, especially on short - * lengths. Note that longer hashes are about as fast as the 64-bit version - * due to it using only a slight modification of the 64-bit loop. - * - * XXH128 is also more oriented towards 64-bit machines. It is still extremely - * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). - */ - -XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - /* A doubled version of 1to3_64b with different constants. 
*/ - XXH_ASSERT(input != NULL); - XXH_ASSERT(1 <= len && len <= 3); - XXH_ASSERT(secret != NULL); - /* - * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } - * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } - * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } - */ - { xxh_u8 const c1 = input[0]; - xxh_u8 const c2 = input[len >> 1]; - xxh_u8 const c3 = input[len - 1]; - xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) - | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); - xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); - xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; - xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; - xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; - xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; - XXH128_hash_t h128; - h128.low64 = XXH64_avalanche(keyed_lo); - h128.high64 = XXH64_avalanche(keyed_hi); - return h128; - } -} - -XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(4 <= len && len <= 8); - seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; - { xxh_u32 const input_lo = XXH_readLE32(input); - xxh_u32 const input_hi = XXH_readLE32(input + len - 4); - xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); - xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; - xxh_u64 const keyed = input_64 ^ bitflip; - - /* Shift len to the left to ensure it is even, this avoids even multiplies. */ - XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); - - m128.high64 += (m128.low64 << 1); - m128.low64 ^= (m128.high64 >> 3); - - m128.low64 = XXH_xorshift64(m128.low64, 35); - m128.low64 *= PRIME_MX2; - m128.low64 = XXH_xorshift64(m128.low64, 28); - m128.high64 = XXH3_avalanche(m128.high64); - return m128; - } -} - -XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(9 <= len && len <= 16); - { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; - xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; - xxh_u64 const input_lo = XXH_readLE64(input); - xxh_u64 input_hi = XXH_readLE64(input + len - 8); - XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); - /* - * Put len in the middle of m128 to ensure that the length gets mixed to - * both the low and high bits in the 128x64 multiply below. - */ - m128.low64 += (xxh_u64)(len - 1) << 54; - input_hi ^= bitfliph; - /* - * Add the high 32 bits of input_hi to the high 32 bits of m128, then - * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to - * the high 64 bits of m128. - * - * The best approach to this operation is different on 32-bit and 64-bit. - */ - if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ - /* - * 32-bit optimized version, which is more readable. - * - * On 32-bit, it removes an ADC and delays a dependency between the two - * halves of m128.high64, but it generates an extra mask on 64-bit. - */ - m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); - } else { - /* - * 64-bit optimized (albeit more confusing) version. 
- * - * Uses some properties of addition and multiplication to remove the mask: - * - * Let: - * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) - * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) - * c = XXH_PRIME32_2 - * - * a + (b * c) - * Inverse Property: x + y - x == y - * a + (b * (1 + c - 1)) - * Distributive Property: x * (y + z) == (x * y) + (x * z) - * a + (b * 1) + (b * (c - 1)) - * Identity Property: x * 1 == x - * a + b + (b * (c - 1)) - * - * Substitute a, b, and c: - * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) - * - * Since input_hi.hi + input_hi.lo == input_hi, we get this: - * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) - */ - m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); - } - /* m128 ^= XXH_swap64(m128 >> 64); */ - m128.low64 ^= XXH_swap64(m128.high64); - - { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ - XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); - h128.high64 += m128.high64 * XXH_PRIME64_2; - - h128.low64 = XXH3_avalanche(h128.low64); - h128.high64 = XXH3_avalanche(h128.high64); - return h128; - } } -} - -/* - * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN - */ -XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(len <= 16); - { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); - if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); - if (len) return XXH3_len_1to3_128b(input, len, secret, seed); - { XXH128_hash_t h128; - xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); - xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); - h128.low64 = XXH64_avalanche(seed ^ bitflipl); - h128.high64 = XXH64_avalanche( seed ^ bitfliph); - return h128; - } } -} - -/* - * A bit slower than XXH3_mix16B, but handles multiply by zero better. - */ -XXH_FORCE_INLINE XXH128_hash_t -XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, - const xxh_u8* secret, XXH64_hash_t seed) -{ - acc.low64 += XXH3_mix16B (input_1, secret+0, seed); - acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); - acc.high64 += XXH3_mix16B (input_2, secret+16, seed); - acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); - return acc; -} - - -XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(16 < len && len <= 128); - - { XXH128_hash_t acc; - acc.low64 = len * XXH_PRIME64_1; - acc.high64 = 0; - -#if XXH_SIZE_OPT >= 1 - { - /* Smaller, but slightly slower. 
*/
-        unsigned int i = (unsigned int)(len - 1) / 32;
-        do {
-            acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
-        } while (i-- != 0);
-    }
-#else
-    if (len > 32) {
-        if (len > 64) {
-            if (len > 96) {
-                acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
-            }
-            acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
-        }
-        acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
-    }
-    acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
-#endif
-    { XXH128_hash_t h128;
-      h128.low64  = acc.low64 + acc.high64;
-      h128.high64 = (acc.low64    * XXH_PRIME64_1)
-                  + (acc.high64   * XXH_PRIME64_4)
-                  + ((len - seed) * XXH_PRIME64_2);
-      h128.low64  = XXH3_avalanche(h128.low64);
-      h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
-      return h128;
-    }
-    }
-}
-
-XXH_NO_INLINE XXH_PUREF XXH128_hash_t
-XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
-                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-                       XXH64_hash_t seed)
-{
-    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
-    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
-
-    { XXH128_hash_t acc;
-      unsigned i;
-      acc.low64 = len * XXH_PRIME64_1;
-      acc.high64 = 0;
-      /*
-       * We set `i` to offset + 32. We do this so that unchanged
-       * `len` can be used as the upper bound. This reaches a sweet spot
-       * where both x86 and aarch64 get simple agen and good codegen
-       * for the loop.
-       */
-      for (i = 32; i < 160; i += 32) {
-          acc = XXH128_mix32B(acc,
-                              input  + i - 32,
-                              input  + i - 16,
-                              secret + i - 32,
-                              seed);
-      }
-      acc.low64 = XXH3_avalanche(acc.low64);
-      acc.high64 = XXH3_avalanche(acc.high64);
-      /*
-       * NB: `i <= len` will duplicate the last 32 bytes if
-       * len % 32 was zero. This is an unfortunate necessity to keep
-       * the hash result stable.
-       */
-      for (i=160; i <= len; i += 32) {
-          acc = XXH128_mix32B(acc,
-                              input + i - 32,
-                              input + i - 16,
-                              secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
-                              seed);
-      }
-      /* last bytes */
-      acc = XXH128_mix32B(acc,
-                          input + len - 16,
-                          input + len - 32,
-                          secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
-                          (XXH64_hash_t)0 - seed);
-
-      { XXH128_hash_t h128;
-        h128.low64  = acc.low64 + acc.high64;
-        h128.high64 = (acc.low64    * XXH_PRIME64_1)
-                    + (acc.high64   * XXH_PRIME64_4)
-                    + ((len - seed) * XXH_PRIME64_2);
-        h128.low64  = XXH3_avalanche(h128.low64);
-        h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
-        return h128;
-      }
-    }
-}
-
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
-                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-                            XXH3_f_accumulate f_acc,
-                            XXH3_f_scrambleAcc f_scramble)
-{
-    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
-
-    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
-
-    /* converge into final hash */
-    XXH_STATIC_ASSERT(sizeof(acc) == 64);
-    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
-    { XXH128_hash_t h128;
-      h128.low64  = XXH3_mergeAccs(acc,
-                                   secret + XXH_SECRET_MERGEACCS_START,
-                                   (xxh_u64)len * XXH_PRIME64_1);
-      h128.high64 = XXH3_mergeAccs(acc,
-                                   secret + secretSize
-                                          - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
-                                   ~((xxh_u64)len * XXH_PRIME64_2));
-      return h128;
-    }
-}
-
-/*
- * It's important for performance that XXH3_hashLong() is not inlined.
- */ -XXH_NO_INLINE XXH_PUREF XXH128_hash_t -XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, - const void* XXH_RESTRICT secret, size_t secretLen) -{ - (void)seed64; (void)secret; (void)secretLen; - return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), - XXH3_accumulate, XXH3_scrambleAcc); -} - -/* - * It's important for performance to pass @p secretLen (when it's static) - * to the compiler, so that it can properly optimize the vectorized loop. - * - * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE - * breaks -Og, this is XXH_NO_INLINE. - */ -XXH3_WITH_SECRET_INLINE XXH128_hash_t -XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, - const void* XXH_RESTRICT secret, size_t secretLen) -{ - (void)seed64; - return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, - XXH3_accumulate, XXH3_scrambleAcc); -} - -XXH_FORCE_INLINE XXH128_hash_t -XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble, - XXH3_f_initCustomSecret f_initSec) -{ - if (seed64 == 0) - return XXH3_hashLong_128b_internal(input, len, - XXH3_kSecret, sizeof(XXH3_kSecret), - f_acc, f_scramble); - { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; - f_initSec(secret, seed64); - return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), - f_acc, f_scramble); - } -} - -/* - * It's important for performance that XXH3_hashLong is not inlined. - */ -XXH_NO_INLINE XXH128_hash_t -XXH3_hashLong_128b_withSeed(const void* input, size_t len, - XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) -{ - (void)secret; (void)secretLen; - return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, - XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); -} - -typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, - XXH64_hash_t, const void* XXH_RESTRICT, size_t); - -XXH_FORCE_INLINE XXH128_hash_t -XXH3_128bits_internal(const void* input, size_t len, - XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, - XXH3_hashLong128_f f_hl128) -{ - XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); - /* - * If an action is to be taken if `secret` conditions are not respected, - * it should be done here. - * For now, it's a contract pre-condition. - * Adding a check and a branch here would cost performance at every hash. - */ - if (len <= 16) - return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); - if (len <= 128) - return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); - return f_hl128(input, len, seed64, secret, secretLen); -} - - -/* === Public XXH128 API === */ - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) -{ - return XXH3_128bits_internal(input, len, 0, - XXH3_kSecret, sizeof(XXH3_kSecret), - XXH3_hashLong_128b_default); -} - -/*! 
@ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize) -{ - return XXH3_128bits_internal(input, len, 0, - (const xxh_u8*)secret, secretSize, - XXH3_hashLong_128b_withSecret); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) -{ - return XXH3_128bits_internal(input, len, seed, - XXH3_kSecret, sizeof(XXH3_kSecret), - XXH3_hashLong_128b_withSeed); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) -{ - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); - return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t -XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) -{ - return XXH3_128bits_withSeed(input, len, seed); -} - - -/* === XXH3 128-bit streaming === */ -#ifndef XXH_NO_STREAM -/* - * All initialization and update functions are identical to 64-bit streaming variant. - * The only difference is the finalization routine. - */ - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) -{ - return XXH3_64bits_reset(statePtr); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) -{ - return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) -{ - return XXH3_64bits_reset_withSeed(statePtr, seed); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) -{ - return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) -{ - return XXH3_64bits_update(state, input, len); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) -{ - const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret;
-    if (state->totalLen > XXH3_MIDSIZE_MAX) {
-        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
-        XXH3_digest_long(acc, state, secret);
-        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
-        { XXH128_hash_t h128;
-          h128.low64  = XXH3_mergeAccs(acc,
-                                       secret + XXH_SECRET_MERGEACCS_START,
-                                       (xxh_u64)state->totalLen * XXH_PRIME64_1);
-          h128.high64 = XXH3_mergeAccs(acc,
-                                       secret + state->secretLimit + XXH_STRIPE_LEN
-                                              - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
-                                       ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
-          return h128;
-        }
-    }
-    /* len <= XXH3_MIDSIZE_MAX : short code */
-    if (state->useSeed)
-        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
-    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
-                                   secret, state->secretLimit + XXH_STRIPE_LEN);
-}
-#endif /* !XXH_NO_STREAM */
-/* 128-bit utility functions */
-
-#include <string.h>   /* memcmp, memcpy */
-
-/* return : 1 is equal, 0 if different */
-/*! @ingroup XXH3_family */
-XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
-{
-    /* note : XXH128_hash_t is compact, it has no padding byte */
-    return !(memcmp(&h1, &h2, sizeof(h1)));
-}
-
-/* This prototype is compatible with stdlib's qsort().
- * @return : >0 if *h128_1 > *h128_2
- *           <0 if *h128_1 < *h128_2
- *           =0 if *h128_1 == *h128_2 */
-/*! @ingroup XXH3_family */
-XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
-{
-    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
-    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
-    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
-    /* note : bets that, in most cases, hash values are different */
-    if (hcmp) return hcmp;
-    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
-}
-
-
-/*====== Canonical representation ======*/
-/*! @ingroup XXH3_family */
-XXH_PUBLIC_API void
-XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
-{
-    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
-    if (XXH_CPU_LITTLE_ENDIAN) {
-        hash.high64 = XXH_swap64(hash.high64);
-        hash.low64  = XXH_swap64(hash.low64);
-    }
-    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
-    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
-}
-
-/*! @ingroup XXH3_family */
-XXH_PUBLIC_API XXH128_hash_t
-XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
-{
-    XXH128_hash_t h;
-    h.high64 = XXH_readBE64(src);
-    h.low64  = XXH_readBE64(src->digest + 8);
-    return h;
-}
-
-
-
-/* ==========================================
- * Secret generators
- * ==========================================
- */
-#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
-
-XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
-{
-    XXH_writeLE64( dst,          XXH_readLE64(dst)          ^ h128.low64 );
-    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
-}
-
-/*! @ingroup XXH3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
-{
-#if (XXH_DEBUGLEVEL >= 1)
-    XXH_ASSERT(secretBuffer != NULL);
-    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-#else
-    /* production mode, assert() are disabled */
-    if (secretBuffer == NULL) return XXH_ERROR;
-    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
-#endif
-
-    if (customSeedSize == 0) {
-        customSeed = XXH3_kSecret;
-        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
-    }
-#if (XXH_DEBUGLEVEL >= 1)
-    XXH_ASSERT(customSeed != NULL);
-#else
-    if (customSeed == NULL) return XXH_ERROR;
-#endif
-
-    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
-    { size_t pos = 0;
-      while (pos < secretSize) {
-          size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
-          memcpy((char*)secretBuffer + pos, customSeed, toCopy);
-          pos += toCopy;
-    } }
-
-    { size_t const nbSeg16 = secretSize / 16;
-      size_t n;
-      XXH128_canonical_t scrambler;
-      XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
-      for (n=0; n<nbSeg16; n++) {
-          XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
-          XXH3_combine16((char*)secretBuffer + n*16, h128);
-      }
-      /* last segment */
-      XXH3_combine16((char*)secretBuffer + secretSize - 16, scrambler);
-    }
-    return XXH_OK;
-}
-
-/*! @ingroup XXH3_family */
-XXH_PUBLIC_API void
-XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
-{
-    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
-    XXH3_initCustomSecret(secret, seed);
-    XXH_ASSERT(secretBuffer != NULL);
-    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
-}
diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
deleted file mode 100644
--- a/examples/gguf-hash/gguf-hash.cpp
+++ /dev/null
-#include "ggml.h"
-
-#include <cstdlib>   /* abort() */
-#include <cstddef>
-#include <cstdio>
-#include <string>
-#include <stdexcept>
-#include <algorithm>
-#include <cstring>
-
-#include <sstream>
-#include <fstream>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "xxhash/xxhash.h"
-#include "sha1/sha1.h"
-#include "sha256/sha256.h"
-
-#ifdef __cplusplus
-}
-#endif
-
-
-// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
-#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
-#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
-
-
-#define HASH_TYPE_SHA256_STR "sha256"
-#define HASH_TYPE_SHA1_STR   "sha1"
-#define HASH_TYPE_XXH64_STR  "xxh64"
-#define HASH_TYPE_UUID_STR   "uuid"
-
-
-typedef enum {
-    HASH_EXIT_SUCCESS = 0,                // All hashes have been generated or validated
-    HASH_EXIT_FAILURE = 1,                // Generic failure
-    HASH_EXIT_MISMATCH = 2,               // Hash mismatched during validation
-    HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash attempted validation but missing entry in manifest
-    HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4,  // Manifest is present, but we do not know any hash format within it
-    HASH_EXIT_MANIFEST_FILE_ERROR = 5     // Manifest is either missing or not a known format
-} hash_exit_code_t;
-
-
-typedef enum {
-    HASH_MANIFEST_NOT_FOUND,
-    HASH_MANIFEST_MISMATCH,
-    HASH_MANIFEST_OK,
-} hash_manifest_result_t;
-
-
-struct hash_params {
-    std::string input;
-    bool xxh64 = false;
-    bool sha1 = false;
-    bool sha256 = false;
-    bool uuid = false;
-
-    bool no_layer = false;
-
-    bool manifest_is_usable = false;
-    std::string manifest_file;
-};
-
-struct manifest_check_params {
-    bool xxh64 = false;
-    bool sha1 = false;
-    bool sha256 = false;
-    bool uuid = false;
-};
-
-static char const * hash_manifest_result_to_str(hash_manifest_result_t value) {
-    switch (value) {
-        case HASH_MANIFEST_NOT_FOUND: return "Not Found";
-        case HASH_MANIFEST_MISMATCH: return "Mismatch";
-        case HASH_MANIFEST_OK: return "Ok";
-    }
-    return "?";
-}
-
-static char const * hash_exit_code_to_str(hash_exit_code_t value) {
-    switch (value) {
-        case HASH_EXIT_SUCCESS: return "Success";
-        case HASH_EXIT_FAILURE: return "Failure";
-        case HASH_EXIT_MISMATCH: return "Mismatch";
-        case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry";
-        case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash";
-        case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error";
-    }
-    return "?";
-}
-
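[Editor's note] The two enums above define the tool's separate result domains: hash_manifest_result_t is the outcome of a single manifest lookup, while hash_exit_code_t is the whole-run process exit status. As a quick illustration of how the two are meant to correspond, here is a minimal sketch; summarize() is a hypothetical helper, not code from the deleted file:

    // Hypothetical helper (not part of gguf-hash.cpp): collapse one manifest
    // lookup result into the process-level exit code domain defined above.
    static hash_exit_code_t summarize(hash_manifest_result_t r) {
        switch (r) {
            case HASH_MANIFEST_OK:        return HASH_EXIT_SUCCESS;                // digest matched
            case HASH_MANIFEST_MISMATCH:  return HASH_EXIT_MISMATCH;               // digest differed
            case HASH_MANIFEST_NOT_FOUND: return HASH_EXIT_MANIFEST_MISSING_ENTRY; // no manifest line
        }
        return HASH_EXIT_FAILURE; // unreachable with the enumerators above
    }

The real gguf_hash() further below does not use such a helper directly; it tracks mismatch/missing flags across all tensors (and the whole model) and picks the final exit code at the end of the run. The sketch only shows the intended mapping between the two enums.
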
-static void hash_print_usage(const char * executable) { - const hash_params default_params; - printf("\n"); - printf("usage: %s [options] GGUF_IN\n", executable); - printf("\n"); - printf("Hash a GGUF file"); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" --xxh64 use xxh64 hash\n"); - printf(" --sha1 use sha1 hash\n"); - printf(" --sha256 use sha256 hash\n"); - printf(" --all use all hash\n"); - printf(" --no-layer exclude per layer hash\n"); - printf(" --uuid generate UUIDv5 ID\n"); - printf(" -c, --check verify against a manifest\n"); - printf("\n"); -} - -static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) { - std::string arg; - bool invalid_param = false; - const std::string arg_prefix = "--"; - - int arg_idx = 1; - for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { - arg = argv[arg_idx]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - bool arg_found = false; - if (arg == "-h" || arg == "--help") { - hash_print_usage(argv[0]); - exit(0); - } - - if (arg == "--xxh64") { - arg_found = true; - params.xxh64 = true; - } - - if (arg == "--sha1") { - arg_found = true; - params.sha1 = true; - } - - if (arg == "--uuid") { - arg_found = true; - params.uuid = true; - } - - if (arg == "--sha256") { - arg_found = true; - params.sha256 = true; - } - - if (arg == "--all") { - arg_found = true; - params.sha256 = true; - params.sha1 = true; - params.xxh64 = true; - } - - if (arg == "--no-layer") { - arg_found = true; - params.no_layer = true; - } - - if (arg == "-c" || arg == "--check") { - if (++arg_idx >= argc) { - invalid_param = true; - break; - } - arg_found = true; - params.manifest_file = argv[arg_idx]; - } - - if (!arg_found) { - throw std::invalid_argument("error: unknown argument: " + arg); - } - } - - if (invalid_param) { - throw std::invalid_argument("error: invalid parameter for argument:" + arg); - } - - if (argc - arg_idx < 1) { - throw std::invalid_argument("error: bad arguments"); - } - - params.input = argv[arg_idx++]; -} - -static bool hash_params_parse(int argc, const char ** argv, hash_params & params) { - bool result = true; - try { - hash_params_parse_ex(argc, argv, params); - } - catch (const std::invalid_argument & ex) { - fprintf(stderr, "%s\n", ex.what()); - hash_print_usage(argv[0]); - exit(EXIT_FAILURE); - } - return result; -} - -static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) { - if (manifest_file.empty()) { - return false; - } - - std::ifstream file(manifest_file); - if (!file.is_open()) { - return false; - } - - std::string manifest_entry_line; - while (getline(file, manifest_entry_line)) { - // hash_type_str hash_str tensor_name - // e.g. 
'xxh64 f66e9cd66a4396a0 test.gguf:tensor_0' - std::istringstream line_stream(manifest_entry_line); - std::string file_hash_type; - if (line_stream >> file_hash_type) { - if (file_hash_type == HASH_TYPE_SHA256_STR) { - manifest_check.sha256 = true; - } else if (file_hash_type == HASH_TYPE_SHA1_STR) { - manifest_check.sha1 = true; - } else if (file_hash_type == HASH_TYPE_XXH64_STR) { - manifest_check.xxh64 = true; - } else if (file_hash_type == HASH_TYPE_UUID_STR) { - manifest_check.uuid = true; - } - } - } - - return true; -} - -static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) { - if (manifest_file.empty()) { - return HASH_MANIFEST_NOT_FOUND; - } - - std::ifstream file(manifest_file); - if (!file.is_open()) { - return HASH_MANIFEST_NOT_FOUND; - } - - std::string manifest_entry_line; - while (getline(file, manifest_entry_line)) { - std::istringstream line_stream(manifest_entry_line); - std::string file_hash_type; - std::string file_hash; - std::string file_tensor_name; - if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) { - // Line parsed. Check hash validity - - if (file_hash_type != hash_type_str) { - continue; - } - - if (file_tensor_name != tensor_name) { - continue; - } - - return (file_hash == hash_str) ? HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH; - } - } - - return HASH_MANIFEST_NOT_FOUND; -} - -static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) { - // Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5 - // Assumes that digest was processed correctly with the expected namespace - for (int i = 0; i < 16; i++) { - uuid[i] = sha1_digest[i]; - } - - // Set bits corresponding to UUID ver 5 - uuid[ 6] &= ~(0xF << 4); - uuid[ 6] |= (5 << 4); - - // Set bits corresponding to UUID variant 0b10XX - uuid[ 8] &= ~(0xc << 4); - uuid[ 8] |= (0x8 << 4); -} - -static hash_exit_code_t gguf_hash(const hash_params & hash_params) { - const std::string & fname = hash_params.input; - struct ggml_context * ctx_data = NULL; - - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx_data, - }; - - // xxh64 init - XXH64_state_t* xxh64_model_hash_state = NULL; - if (hash_params.xxh64) { - xxh64_model_hash_state = XXH64_createState(); - if (xxh64_model_hash_state==NULL) { - abort(); - } - - XXH64_hash_t const seed = 0; - if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) { - abort(); - } - } - - // sha1 init - SHA1_CTX sha1_model_hash_ctx; - if (hash_params.sha1) { - SHA1Init(&sha1_model_hash_ctx); - } - - // sha256 init - sha256_t sha256_model_hash_ctx; - if (hash_params.sha256) { - sha256_init(&sha256_model_hash_ctx); - } - - // sha1 for uuid init - SHA1_CTX sha1_for_uuid_ctx; - if (hash_params.uuid) { - unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX}; - SHA1Init(&sha1_for_uuid_ctx); - SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace)); - } - - struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); - const int n_tensors = gguf_get_n_tensors(ctx); - bool tensor_layer_in_manifest = false; - bool model_in_manifest = false; - bool tensor_layer_has_mismatch = false; - bool model_has_mismatch = false; - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - auto n_bytes = ggml_nbytes(cur); - auto *raw_data = 
cur->data; - const std::string tensor_layer_name = fname + ":" + name; - - if (hash_params.xxh64) { - - if (!hash_params.no_layer) { - // Per Layer Hash - XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0); - - char hex_result[17]; - for (int offset = 0; offset < 8; offset++) { - unsigned int shift_bits_by = (8 * (8 - offset - 1)); - snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff); - } - - if (hash_params.manifest_is_usable) { - hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name); - - switch (verify_result) { - case HASH_MANIFEST_NOT_FOUND: - break; - case HASH_MANIFEST_MISMATCH: - tensor_layer_in_manifest = true; - tensor_layer_has_mismatch = true; - break; - case HASH_MANIFEST_OK: - tensor_layer_in_manifest = true; - break; - } - - printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result)); - } else { - printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str()); - } - } - - // Overall Model Hash - if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort(); - } - - if (hash_params.sha1) { - - if (!hash_params.no_layer) { - // Per Layer Hash - char result[21]; // sha1 outputs 20 bytes - SHA1( result, (const char *)raw_data, n_bytes); - - char hex_result[41] = {0}; - for (int offset = 0; offset < 20; offset++) { - snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff); - } - - if (hash_params.manifest_is_usable) { - hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name); - - switch (verify_result) { - case HASH_MANIFEST_NOT_FOUND: - break; - case HASH_MANIFEST_MISMATCH: - tensor_layer_in_manifest = true; - tensor_layer_has_mismatch = true; - break; - case HASH_MANIFEST_OK: - tensor_layer_in_manifest = true; - break; - } - - printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result)); - } else { - printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str()); - } - } - - // Overall Model Hash - SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes); - } - - if (hash_params.sha256) { - - if (!hash_params.no_layer) { - // Per Layer Hash - unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes - sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes); - - char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0}; - for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) { - snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff); - } - - if (hash_params.manifest_is_usable) { - hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name); - - switch (verify_result) { - case HASH_MANIFEST_NOT_FOUND: - break; - case HASH_MANIFEST_MISMATCH: - tensor_layer_in_manifest = true; - tensor_layer_has_mismatch = true; - break; - case HASH_MANIFEST_OK: - tensor_layer_in_manifest = true; - break; - } - - printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result)); - } else { - printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str()); - } - } - - // 
Overall Model Hash - sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes); - } - - if (hash_params.uuid) { - SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes); - } - } - - if (hash_params.xxh64) { - XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state); - - char hex_result[17]; - for (int offset = 0; offset < 8; offset++) { - unsigned int shift_bits_by = (8 * (8 - offset - 1)); - snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff); - } - - if (hash_params.manifest_is_usable) { - hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname); - - switch (verify_result) { - case HASH_MANIFEST_NOT_FOUND: - break; - case HASH_MANIFEST_MISMATCH: - model_in_manifest = true; - model_has_mismatch = true; - break; - case HASH_MANIFEST_OK: - model_in_manifest = true; - break; - } - - printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result)); - } else { - printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str()); - } - } - - if (hash_params.sha1) { - unsigned char result[21]; - SHA1Final(result, &sha1_model_hash_ctx); - - char hex_result[41]; - for (int offset = 0; offset < 20; offset++) { - snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff); - } - - if (hash_params.manifest_is_usable) { - hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname); - - switch (verify_result) { - case HASH_MANIFEST_NOT_FOUND: - break; - case HASH_MANIFEST_MISMATCH: - model_in_manifest = true; - model_has_mismatch = true; - break; - case HASH_MANIFEST_OK: - model_in_manifest = true; - break; - } - - printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result)); - } else { - printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str()); - } - } - - if (hash_params.sha256) { - unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes - sha256_final( &sha256_model_hash_ctx, result); - - char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0}; - for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) { - snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff); - } - - if (hash_params.manifest_is_usable) { - hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname); - - switch (verify_result) { - case HASH_MANIFEST_NOT_FOUND: - break; - case HASH_MANIFEST_MISMATCH: - model_in_manifest = true; - model_has_mismatch = true; - break; - case HASH_MANIFEST_OK: - model_in_manifest = true; - break; - } - - printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result)); - } else { - printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str()); - } - } - - if (hash_params.uuid) { - unsigned char result[21]; - SHA1Final(result, &sha1_for_uuid_ctx); - - unsigned char uuid[16]; - generate_uuidv5(result, uuid); - - char string_buffer[37] = {0}; - snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - uuid[0], uuid[1], uuid[2], uuid[3], - uuid[4], uuid[5], uuid[6], uuid[7], - uuid[8], uuid[9], uuid[10], uuid[11], - uuid[12], uuid[13], uuid[14], 
uuid[15]);
-
-        if (hash_params.manifest_is_usable) {
-            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_UUID_STR, string_buffer, fname);
-
-            switch (verify_result) {
-                case HASH_MANIFEST_NOT_FOUND:
-                    break;
-                case HASH_MANIFEST_MISMATCH:
-                    model_in_manifest = true;
-                    model_has_mismatch = true;
-                    break;
-                case HASH_MANIFEST_OK:
-                    model_in_manifest = true;
-                    break;
-            }
-
-            printf("%-8s %-s %s - %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result));
-        } else {
-            printf("%-8s %-s %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str());
-        }
-    }
-
-
-    ggml_free(ctx_data);
-    gguf_free(ctx);
-
-
-    if (hash_params.manifest_is_usable) {
-        // In hash verification mode
-
-        if (!model_in_manifest) {
-            // model missing in manifest?
-
-            // Check tensor layer...
-            if (!tensor_layer_in_manifest) {
-                // Still missing? Maybe we are reading the wrong manifest.
-                return HASH_EXIT_MANIFEST_MISSING_ENTRY;
-            }
-
-            if (tensor_layer_has_mismatch) {
-                // Per tensor check found error
-                return HASH_EXIT_FAILURE;
-            }
-
-            // All per tensor layer checks passed? Sounds good enough.
-            return HASH_EXIT_SUCCESS;
-        }
-
-        // Overall model check passed, but let's check per layer just in case
-        // If missing, we don't care too much as the overall model checked
-        if (tensor_layer_in_manifest && tensor_layer_has_mismatch) {
-            return HASH_EXIT_FAILURE;
-        }
-
-        if (model_has_mismatch) {
-            // model has failed hash somewhere in the model
-            return HASH_EXIT_FAILURE;
-        }
-
-        // All checks appear to be fine
-        return HASH_EXIT_SUCCESS;
-    }
-
-    // In hash generation mode
-    return HASH_EXIT_SUCCESS;
-}
-
-int main(int argc, const char ** argv) {
-    hash_params params;
-    manifest_check_params manifest_check;
-    hash_params_parse(argc, argv, params);
-
-    if (!params.manifest_file.empty()) {
-        if (!manifest_type(params.manifest_file, manifest_check)) {
-            printf("ERROR cannot open manifest %s\n", params.manifest_file.c_str());
-            return HASH_EXIT_MANIFEST_FILE_ERROR;
-        }
-
-        if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
-            printf("ERROR manifest does not have any known hash format in %s\n", params.manifest_file.c_str());
-            return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
-        }
-
-        printf("manifest %s", params.manifest_file.c_str());
-
-        if (manifest_check.sha256) {
-            printf(" sha256");
-        }
-
-        if (manifest_check.sha1) {
-            printf(" sha1");
-        }
-
-        if (manifest_check.xxh64) {
-            printf(" xxh64");
-        }
-
-        if (manifest_check.uuid) {
-            printf(" uuid");
-        }
-
-        printf("\n");
-
-        // Autoselect the highest security hash if manifest is provided but
-        // the user has not specifically defined the hash they care about
-        if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
-            // User has not selected a specific value, pick the most secure hash
-            if (manifest_check.sha256) {
-                params.sha256 = true;
-            } else if (manifest_check.sha1) {
-                params.sha1 = true;
-            } else if (manifest_check.xxh64) {
-                params.xxh64 = true;
-            } else if (manifest_check.uuid) {
-                params.uuid = true;
-            }
-        }
-
-        params.manifest_is_usable = true;
-    }
-
-    // By default, if no switch argument is provided, assume xxh64
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
-        params.xxh64 = true;
-    }
-
-    hash_exit_code_t exit_code = gguf_hash(params);
-
-    if (params.manifest_is_usable) {
-        printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code));
-    }
-
-    return exit_code;
-}
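For reference, the version/variant masking done by generate_uuidv5() earlier in this file can be sanity-checked in isolation. A minimal sketch (the helper name is mine, not part of the tool):

```cpp
#include <cassert>
#include <cstdint>

// RFC 9562 section 5.5: byte 6 carries the version in its high nibble,
// byte 8 carries the 0b10xx variant in its top two bits.
static void set_uuidv5_bits(uint8_t uuid[16]) {
    uuid[6] = (uint8_t)((uuid[6] & 0x0F) | 0x50); // version 5
    uuid[8] = (uint8_t)((uuid[8] & 0x3F) | 0x80); // variant 0b10xx
}

int main(void) {
    uint8_t u[16] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
    set_uuidv5_bits(u);
    assert((u[6] >> 4) == 0x5);     // version bits set
    assert((u[8] & 0xC0) == 0x80);  // variant bits set
    return 0;
}
```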
diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt
deleted file mode 100644
index f63887da7..000000000
--- a/examples/gguf-split/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-gguf-split)
-add_executable(${TARGET} gguf-split.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/gguf-split/README.md b/examples/gguf-split/README.md
deleted file mode 100644
index ad1d86651..000000000
--- a/examples/gguf-split/README.md
+++ /dev/null
@@ -1,10 +0,0 @@
-## GGUF split Example
-
-CLI to split / merge GGUF files.
-
-**Command line options:**
-
-- `--split`: split a GGUF file into multiple GGUF files; the default operation.
-- `--split-max-size`: max size per split in `M` or `G`, e.g. `500M` or `2G`.
-- `--split-max-tensors`: maximum number of tensors in each split (default: 128).
-- `--merge`: merge multiple GGUF files into a single GGUF file.
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
deleted file mode 100644
index 881f0451c..000000000
--- a/examples/gguf-split/gguf-split.cpp
+++ /dev/null
@@ -1,564 +0,0 @@
-#include "llama.h"
-#include "common.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdlib>
-#include <fstream>
-#include <string>
-#include <vector>
-
-#include <stdio.h>
-#include <string.h>
-#include <climits>
-#include <stdexcept>
-
-#if defined(_WIN32)
-    #include <windows.h>
-    #ifndef PATH_MAX
-        #define PATH_MAX MAX_PATH
-    #endif
-    #include <io.h>
-#endif
-
-enum split_operation : uint8_t {
-    SPLIT_OP_SPLIT,
-    SPLIT_OP_MERGE,
-};
-
-struct split_params {
-    split_operation operation = SPLIT_OP_SPLIT;
-    size_t n_bytes_split = 0;
-    int n_split_tensors = 128;
-    std::string input;
-    std::string output;
-    bool no_tensor_first_split = false;
-    bool dry_run = false;
-};
-
-static void split_print_usage(const char * executable) {
-    const split_params default_params;
-    printf("\n");
-    printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable);
-    printf("\n");
-    printf("Apply a GGUF operation on IN to OUT.");
-    printf("\n");
-    printf("options:\n");
-    printf("  -h, --help              show this help message and exit\n");
-    printf("  --version               show version and build info\n");
-    printf("  --split                 split GGUF to multiple GGUF (enabled by default)\n");
-    printf("  --merge                 merge multiple GGUF to a single GGUF\n");
-    printf("  --split-max-tensors     max tensors in each split (default: %d)\n", default_params.n_split_tensors);
-    printf("  --split-max-size N(M|G) max size per split\n");
-    printf("  --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
-    printf("  --dry-run               only print out a split plan and exit, without writing any new files\n");
-    printf("\n");
-}
-
-// convert a size string such as "128M" or "4G" to a number of bytes
-static size_t split_str_to_n_bytes(std::string str) {
-    size_t n_bytes = 0;
-    int n;
-    if (str.back() == 'M') {
-        sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1000 * 1000; // megabytes
-    } else if (str.back() == 'G') {
-        sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
-    } else {
-        throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
-    }
-    if (n <= 0) {
-        throw std::invalid_argument("error: size must be a positive value");
-    }
-    return n_bytes;
-}
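Note that the unit parsing above uses decimal (SI) multipliers, not binary ones. A standalone re-sketch of the same idea (hypothetical helper, not the tool's code), in case similar CLI parsing is needed elsewhere:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

// "500M" -> 500 * 1000 * 1000, "2G" -> 2 * 1000^3 (decimal units, as above)
static size_t size_str_to_bytes(const std::string & str) {
    if (str.size() < 2) throw std::invalid_argument("expected <number><M|G>");
    const unsigned long n = std::stoul(str); // parses the leading digits
    switch (str.back()) {
        case 'M': return (size_t) n * 1000 * 1000;
        case 'G': return (size_t) n * 1000 * 1000 * 1000;
        default:  throw std::invalid_argument("supported units are M or G");
    }
}

int main(void) {
    printf("%zu\n", size_str_to_bytes("2G")); // 2000000000
    return 0;
}
```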
-
-static void split_params_parse_ex(int argc, const char ** argv, split_params & params) {
-    std::string arg;
-    const std::string arg_prefix = "--";
-    bool invalid_param = false;
-
-    int arg_idx = 1;
-    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
-        arg = argv[arg_idx];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        bool arg_found = false;
-        bool is_op_set = false;
-        bool is_mode_set = false;
-        if (arg == "-h" || arg == "--help") {
-            split_print_usage(argv[0]);
-            exit(0);
-        }
-        if (arg == "--version") {
-            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-            exit(0);
-        }
-        if (arg == "--dry-run") {
-            arg_found = true;
-            params.dry_run = true;
-        }
-        if (arg == "--no-tensor-first-split") {
-            arg_found = true;
-            params.no_tensor_first_split = true;
-        }
-
-        if (is_op_set) {
-            throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
-        }
-        if (arg == "--merge") {
-            arg_found = true;
-            is_op_set = true;
-            params.operation = SPLIT_OP_MERGE;
-        }
-        if (arg == "--split") {
-            arg_found = true;
-            is_op_set = true;
-            params.operation = SPLIT_OP_SPLIT;
-        }
-
-        if (is_mode_set) {
-            throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
-        }
-        if (arg == "--split-max-tensors") {
-            if (++arg_idx >= argc) {
-                invalid_param = true;
-                break;
-            }
-            arg_found = true;
-            is_mode_set = true;
-            params.n_split_tensors = atoi(argv[arg_idx]);
-        }
-        if (arg == "--split-max-size") {
-            if (++arg_idx >= argc) {
-                invalid_param = true;
-                break;
-            }
-            arg_found = true;
-            is_mode_set = true;
-            params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
-        }
-
-        if (!arg_found) {
-            throw std::invalid_argument("error: unknown argument: " + arg);
-        }
-    }
-
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
-    }
-
-    if (argc - arg_idx < 2) {
-        throw std::invalid_argument("error: bad arguments");
-    }
-
-    params.input  = argv[arg_idx++];
-    params.output = argv[arg_idx++];
-}
-
-static bool split_params_parse(int argc, const char ** argv, split_params & params) {
-    bool result = true;
-    try {
-        split_params_parse_ex(argc, argv, params);
-    }
-    catch (const std::invalid_argument & ex) {
-        fprintf(stderr, "%s\n", ex.what());
-        split_print_usage(argv[0]);
-        exit(EXIT_FAILURE);
-    }
-    return result;
-}
-
-static void zeros(std::ofstream & file, size_t n) {
-    char zero = 0;
-    for (size_t i = 0; i < n; ++i) {
-        file.write(&zero, 1);
-    }
-}
-
-struct split_strategy {
-    const split_params params;
-    std::ifstream & f_input;
-    struct gguf_context * ctx_gguf;
-    struct ggml_context * ctx_meta = NULL;
-    const int n_tensors;
-
-    // one ctx_out per output file
-    std::vector<struct gguf_context *> ctx_outs;
-
-    // temporary buffer for reading in tensor data
-    std::vector<uint8_t> read_buf;
-
-    split_strategy(const split_params & params,
-            std::ifstream & f_input,
-            struct gguf_context * ctx_gguf,
-            struct ggml_context * ctx_meta) :
-        params(params),
-        f_input(f_input),
-        ctx_gguf(ctx_gguf),
-        ctx_meta(ctx_meta),
-        n_tensors(gguf_get_n_tensors(ctx_gguf)) {
-
-        // because we need to know the list of tensors for each file in advance, we build all the ctx_out for all output splits
-        int i_split = -1;
-        struct gguf_context * ctx_out = NULL;
-        auto new_ctx_out = [&](bool allow_no_tensors) {
-            i_split++;
-            if (ctx_out != NULL) {
-                if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
-                    fprintf(stderr, "error: one of the splits has 0 tensors.
Maybe size or tensors limit is too small\n"); - exit(EXIT_FAILURE); - } - ctx_outs.push_back(ctx_out); - } - ctx_out = gguf_init_empty(); - // Save all metadata in first split only - if (i_split == 0) { - gguf_set_kv(ctx_out, ctx_gguf); - } - gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split); - gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder - gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors); - }; - - // initialize ctx_out for the first split - new_ctx_out(false); - - // skip first split if no_tensor_first_split is set - if (params.no_tensor_first_split) { - new_ctx_out(true); - } - - // process tensors one by one - size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata) - for (int i = 0; i < n_tensors; ++i) { - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - // calculate the "imaginary" size = the current size + next tensor size - size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT); - size_t next_tensors_size = curr_tensors_size + n_bytes; - if (should_split(i, next_tensors_size)) { - new_ctx_out(false); - curr_tensors_size = n_bytes; - } else { - curr_tensors_size = next_tensors_size; - } - gguf_add_tensor(ctx_out, t); - } - - // push the last ctx_out - ctx_outs.push_back(ctx_out); - - // set the correct n_split for all ctx_out - for (auto & ctx : ctx_outs) { - gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size()); - } - } - - ~split_strategy() { - for (auto & ctx_out : ctx_outs) { - gguf_free(ctx_out); - } - } - - bool should_split(int i_tensor, size_t next_size) { - if (params.n_bytes_split > 0) { - // split by max size per file - return next_size > params.n_bytes_split; - } else { - // split by number of tensors per file - return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; - } - } - - void print_info() { - printf("n_split: %ld\n", ctx_outs.size()); - int i_split = 0; - for (auto & ctx_out : ctx_outs) { - // re-calculate the real gguf size for each split (= metadata size + total size of all tensors) - size_t total_size = gguf_get_meta_size(ctx_out); - for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i)); - total_size += ggml_nbytes(t); - } - total_size = total_size / 1000 / 1000; // convert to megabytes - printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); - i_split++; - } - } - - void write() { - int i_split = 0; - int n_split = ctx_outs.size(); - for (auto & ctx_out : ctx_outs) { - // construct file path - char split_path[PATH_MAX] = {0}; - llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); - - // open the output file - printf("Writing file %s ... 
", split_path); - fflush(stdout); - std::ofstream fout = std::ofstream(split_path, std::ios::binary); - fout.exceptions(std::ofstream::failbit); // fail fast on write errors - - // write metadata - std::vector data(gguf_get_meta_size(ctx_out)); - gguf_get_meta_data(ctx_out, data.data()); - fout.write((const char *)data.data(), data.size()); - - // write tensors - for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { - // read tensor meta and prepare buffer - const char * t_name = gguf_get_tensor_name(ctx_out, i); - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); - auto n_bytes = ggml_nbytes(t); - read_buf.resize(n_bytes); - - // calculate offset - auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file - auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); - - // copy tensor from input to output file - copy_file_to_file(f_input, fout, offset, n_bytes); - zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); - } - - printf("done\n"); - // close the file - fout.close(); - i_split++; - } - } - - void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) { - // TODO: detect OS and use copy_file_range() here for better performance - if (read_buf.size() < len) { - read_buf.resize(len); - } - f_in.seekg(in_offset); - f_in.read((char *)read_buf.data(), len); - f_out.write((const char *)read_buf.data(), len); - } -}; - -static void gguf_split(const split_params & split_params) { - struct ggml_context * ctx_meta = NULL; - - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, - }; - - std::ifstream f_input(split_params.input.c_str(), std::ios::binary); - if (!f_input.is_open()) { - fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(EXIT_FAILURE); - } - - auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); - if (!ctx_gguf) { - fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(EXIT_FAILURE); - } - - // prepare the strategy - split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); - int n_split = strategy.ctx_outs.size(); - strategy.print_info(); - - if (!split_params.dry_run) { - // write all output splits - strategy.write(); - } - - // done, clean up - gguf_free(ctx_gguf); - f_input.close(); - - fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n", - __func__, n_split, strategy.n_tensors); -} - -static void gguf_merge(const split_params & split_params) { - fprintf(stderr, "%s: %s -> %s\n", - __func__, split_params.input.c_str(), - split_params.output.c_str()); - int n_split = 1; - int total_tensors = 0; - - auto * ctx_out = gguf_init_empty(); - std::ofstream fout(split_params.output.c_str(), std::ios::binary); - fout.exceptions(std::ofstream::failbit); // fail fast on write errors - - std::vector read_data; - std::vector ctx_metas; - std::vector ctx_ggufs; - - char split_path[PATH_MAX] = {0}; - strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1); - char split_prefix[PATH_MAX] = {0}; - - // First pass to find KV and tensors metadata - for (int i_split = 0; i_split < n_split; i_split++) { - struct ggml_context * ctx_meta = NULL; - - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, - }; - - if (i_split > 0) { - llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); - } - 
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); - - auto * ctx_gguf = gguf_init_from_file(split_path, params); - if (!ctx_gguf) { - fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(EXIT_FAILURE); - } - ctx_ggufs.push_back(ctx_gguf); - ctx_metas.push_back(ctx_meta); - - if (i_split == 0) { - auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); - if (key_n_split < 0) { - fprintf(stderr, - "\n%s: input file does not contain %s metadata\n", - __func__, - LLM_KV_SPLIT_COUNT); - gguf_free(ctx_gguf); - ggml_free(ctx_meta); - gguf_free(ctx_out); - fout.close(); - exit(EXIT_FAILURE); - } - - n_split = gguf_get_val_u16(ctx_gguf, key_n_split); - if (n_split < 1) { - fprintf(stderr, - "\n%s: input file does not contain a valid split count %d\n", - __func__, - n_split); - gguf_free(ctx_gguf); - ggml_free(ctx_meta); - gguf_free(ctx_out); - fout.close(); - exit(EXIT_FAILURE); - } - - // Verify the file naming and extract split_prefix - if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { - fprintf(stderr, "\n%s: unexpected input file name: %s" - " i_split=%d" - " n_split=%d\n", __func__, - split_path, i_split, n_split); - gguf_free(ctx_gguf); - ggml_free(ctx_meta); - gguf_free(ctx_out); - fout.close(); - exit(EXIT_FAILURE); - } - - // Do not trigger merge if we try to merge again the output - gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0); - - // Set metadata from the first split - gguf_set_kv(ctx_out, ctx_gguf); - } - - auto n_tensors = gguf_get_n_tensors(ctx_gguf); - for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { - const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); - gguf_add_tensor(ctx_out, t); - } - total_tensors += n_tensors; - - fprintf(stderr, "\033[3Ddone\n"); - } - - // placeholder for the meta data - { - auto meta_size = gguf_get_meta_size(ctx_out); - ::zeros(fout, meta_size); - } - - // Write tensors data - for (int i_split = 0; i_split < n_split; i_split++) { - llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); - std::ifstream f_input(split_path, std::ios::binary); - if (!f_input.is_open()) { - fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); - for (uint32_t i = 0; i < ctx_ggufs.size(); i++) { - gguf_free(ctx_ggufs[i]); - ggml_free(ctx_metas[i]); - } - gguf_free(ctx_out); - fout.close(); - exit(EXIT_FAILURE); - } - fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path); - - auto * ctx_gguf = ctx_ggufs[i_split]; - auto * ctx_meta = ctx_metas[i_split]; - - auto n_tensors = gguf_get_n_tensors(ctx_gguf); - for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { - const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); - - auto n_bytes = ggml_nbytes(t); - - if (read_data.size() < n_bytes) { - read_data.resize(n_bytes); - } - - auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); - f_input.seekg(offset); - f_input.read((char *)read_data.data(), n_bytes); - - // write tensor data + padding - fout.write((const char *)read_data.data(), n_bytes); - zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); - } - - gguf_free(ctx_gguf); - ggml_free(ctx_meta); - f_input.close(); - fprintf(stderr, "\033[3Ddone\n"); - } - - { - // go back to beginning of file and write the updated metadata - 
fout.seekp(0);
-        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-        gguf_get_meta_data(ctx_out, data.data());
-        fout.write((const char *)data.data(), data.size());
-
-        fout.close();
-        gguf_free(ctx_out);
-    }
-
-    fprintf(stderr, "%s: %s merged from %d splits with %d tensors.\n",
-            __func__, split_params.output.c_str(), n_split, total_tensors);
-}
-
-int main(int argc, const char ** argv) {
-    split_params params;
-    split_params_parse(argc, argv, params);
-
-    switch (params.operation) {
-        case SPLIT_OP_SPLIT: gguf_split(params);
-            break;
-        case SPLIT_OP_MERGE: gguf_merge(params);
-            break;
-        default: split_print_usage(argv[0]);
-            exit(EXIT_FAILURE);
-    }
-
-    return 0;
-}
diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh
deleted file mode 100755
index d5a92d605..000000000
--- a/examples/gguf-split/tests.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-if [ $# -lt 1 ]
-then
-    echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
-    echo "example: $0 ../../build/bin ../../tmp"
-    exit 1
-fi
-
-if [ $# -gt 1 ]
-then
-    TMP_DIR=$2
-else
-    TMP_DIR=/tmp
-fi
-
-set -x
-
-SPLIT=$1/llama-gguf-split
-MAIN=$1/llama-cli
-WORK_PATH=$TMP_DIR/gguf-split
-ROOT_DIR=$(realpath $(dirname $0)/../../)
-
-mkdir -p "$WORK_PATH"
-
-# Clean up in case of previously failed test
-rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
-
-# 1. Get a model
-(
-cd $WORK_PATH
-"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
-)
-echo PASS
-
-# 2. Split with max tensors strategy
-$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
-echo PASS
-echo
-
-# 2b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
-echo PASS
-echo
-
-# 3. Merge
-$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
-echo PASS
-echo
-
-# 3b. Test the merged model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
-echo PASS
-echo
-
-# 4. Split with no tensors in the first split
-$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
-echo PASS
-echo
-
-# 4b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
-echo PASS
-echo
-
-# 5. Merge
-#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
-#echo PASS
-#echo
-
-# 5b. Test the merged model is loading properly
-#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
-#echo PASS
-#echo
-
-# 6. Split with size strategy
-$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
-echo PASS
-echo
-
-# 6b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
-echo PASS
-echo
-
-# Clean up
-rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt
deleted file mode 100644
index a9569b411..000000000
--- a/examples/gguf/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-gguf)
-add_executable(${TARGET} gguf.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
deleted file mode 100644
index 7498f85ef..000000000
--- a/examples/gguf/gguf.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-#include "ggml.h"
-
-#include <cstdio>
-#include <cinttypes>
-#include <string>
-#include <sstream>
-#include <fstream>
-#include <vector>
-
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-template <typename T>
-static std::string to_string(const T & val) {
-    std::stringstream ss;
-    ss << val;
-    return ss.str();
-}
-
-static bool gguf_ex_write(const std::string & fname) {
-    struct gguf_context * ctx = gguf_init_empty();
-
-    gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
-    gguf_set_val_i8  (ctx, "some.parameter.int8",    -0x13);
-    gguf_set_val_u16 (ctx, "some.parameter.uint16",   0x1234);
-    gguf_set_val_i16 (ctx, "some.parameter.int16",   -0x1235);
-    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
-    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
-    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
-    gguf_set_val_u64 (ctx, "some.parameter.uint64",   0x123456789abcdef0ull);
-    gguf_set_val_i64 (ctx, "some.parameter.int64",   -0x123456789abcdef1ll);
-    gguf_set_val_f64 (ctx, "some.parameter.float64",  0.1234567890123456789);
-    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
-    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
-
-    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16,   std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
-    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
-    gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ 128ull*1024ull*1024ull,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx_data = ggml_init(params);
-
-    const int n_tensors = 10;
-
-    // tensor infos
-    for (int i = 0; i < n_tensors; ++i) {
-        const std::string name = "tensor_" + to_string(i);
-
-        int64_t ne[GGML_MAX_DIMS] = { 1 };
-        int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
-
-        for (int j = 0; j < n_dims; ++j) {
-            ne[j] = rand() % 10 + 1;
-        }
-
-        struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
-        ggml_set_name(cur, name.c_str());
-
-        {
-            float * data = (float *) cur->data;
-            for (int j = 0; j < ggml_nelements(cur); ++j) {
-                data[j] = 100 + i;
-            }
-        }
-
-        gguf_add_tensor(ctx, cur);
-    }
-
-    gguf_write_to_file(ctx, fname.c_str(), false);
-
-    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
-
-    ggml_free(ctx_data);
-    gguf_free(ctx);
-
-    return true;
-}
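A compressed sketch of the same GGUF KV round trip this example demonstrates (write one string key, re-open the file, read it back); the filename is illustrative:

```cpp
#include <cstdio>
#include "ggml.h"

int main(void) {
    struct gguf_context * ctx = gguf_init_empty();
    gguf_set_val_str(ctx, "some.parameter.string", "hello world");
    gguf_write_to_file(ctx, "kv-only.gguf", /*only_meta =*/ true); // no tensor data
    gguf_free(ctx);

    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ NULL,
    };
    struct gguf_context * ctx2 = gguf_init_from_file("kv-only.gguf", params);
    const int keyidx = gguf_find_key(ctx2, "some.parameter.string");
    if (keyidx >= 0) {
        printf("%s\n", gguf_get_val_str(ctx2, keyidx)); // hello world
    }
    gguf_free(ctx2);
    return 0;
}
```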
-
-// just read tensor info
-static bool gguf_ex_read_0(const std::string & fname) {
-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ NULL,
-    };
-
-    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-
-    if (!ctx) {
-        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    printf("%s: version:     %d\n",  __func__, gguf_get_version(ctx));
-    printf("%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
-    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
-
-    // kv
-    {
-        const int n_kv = gguf_get_n_kv(ctx);
-
-        printf("%s: n_kv: %d\n", __func__, n_kv);
-
-        for (int i = 0; i < n_kv; ++i) {
-            const char * key = gguf_get_key(ctx, i);
-
-            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
-        }
-    }
-
-    // find kv string
-    {
-        const char * findkey = "some.parameter.string";
-
-        const int keyidx = gguf_find_key(ctx, findkey);
-        if (keyidx == -1) {
-            printf("%s: find key: %s not found.\n", __func__, findkey);
-        } else {
-            const char * key_value = gguf_get_val_str(ctx, keyidx);
-            printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
-        }
-    }
-
-    // tensor info
-    {
-        const int n_tensors = gguf_get_n_tensors(ctx);
-
-        printf("%s: n_tensors: %d\n", __func__, n_tensors);
-
-        for (int i = 0; i < n_tensors; ++i) {
-            const char * name   = gguf_get_tensor_name  (ctx, i);
-            const size_t offset = gguf_get_tensor_offset(ctx, i);
-
-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
-        }
-    }
-
-    gguf_free(ctx);
-
-    return true;
-}
-
-// read and create ggml_context containing the tensors and their data
-static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
-    struct ggml_context * ctx_data = NULL;
-
-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx_data,
-    };
-
-    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-
-    printf("%s: version:     %d\n",  __func__, gguf_get_version(ctx));
-    printf("%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
-    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
-
-    // kv
-    {
-        const int n_kv = gguf_get_n_kv(ctx);
-
-        printf("%s: n_kv: %d\n", __func__, n_kv);
-
-        for (int i = 0; i < n_kv; ++i) {
-            const char * key = gguf_get_key(ctx, i);
-
-            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
-        }
-    }
-
-    // tensor info
-    {
-        const int n_tensors = gguf_get_n_tensors(ctx);
-
-        printf("%s: n_tensors: %d\n", __func__, n_tensors);
-
-        for (int i = 0; i < n_tensors; ++i) {
-            const char * name   = gguf_get_tensor_name  (ctx, i);
-            const size_t offset = gguf_get_tensor_offset(ctx, i);
-
-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
-        }
-    }
-
-    // data
-    {
-        const int n_tensors = gguf_get_n_tensors(ctx);
-
-        for (int i = 0; i < n_tensors; ++i) {
-            printf("%s: reading tensor %d data\n", __func__, i);
-
-            const char * name = gguf_get_tensor_name(ctx, i);
-
-            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-
-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
-
-            // print first 10 elements
-            const float * data = (const float *) cur->data;
-
-            printf("%s data[:10] : ", name);
-            for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
-                printf("%f ", data[j]);
-            }
-            printf("\n\n");
-
-            // check data
-            if (check_data) {
-                const float * data = (const float *) cur->data;
-                for (int j = 0; j < ggml_nelements(cur); ++j) {
-                    if (data[j] != 100 + i) {
-                        fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
-                        gguf_free(ctx);
-                        return false;
-                    }
-                }
-            }
-        }
-    }
-
-    printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
-
-    ggml_free(ctx_data);
-    gguf_free(ctx);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    if (argc < 3) {
-        printf("usage: %s data.gguf r|w [n]\n", argv[0]);
-        printf("r: read data.gguf file\n");
-        printf("w: write data.gguf file\n");
-        printf("n: no check of tensor data\n");
-        return -1;
-    }
-    bool check_data = true;
-    if (argc == 4) {
-        check_data = false;
-    }
-
-    const std::string fname(argv[1]);
-    const std::string mode (argv[2]);
-
-    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
-
-    if (mode == "w") {
-        GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
-    } else if (mode == "r") {
-        GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
-        GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
-    }
-
-    return 0;
-}
diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt
deleted file mode 100644
index 86dfddca3..000000000
--- a/examples/gritlm/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-gritlm)
-add_executable(${TARGET} gritlm.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md
deleted file mode 100644
index 786ba5736..000000000
--- a/examples/gritlm/README.md
+++ /dev/null
@@ -1,62 +0,0 @@
-## Generative Representational Instruction Tuning (GRIT) Example
-[gritlm] is a model which can produce embeddings as well as "normal" generated
-text, depending on the instructions in the prompt.
-
-* Paper: https://arxiv.org/pdf/2402.09906.pdf
-
-### Retrieval-Augmented Generation (RAG) use case
-One use case for `gritlm` is to use it with RAG. Recall how RAG works: we take
-documents that we want to use as context to ground the large language model
-(LLM), and we create token embeddings for them. We then store these token
-embeddings in a vector database.
-
-When we perform a query (prompt the LLM), we first create token embeddings
-for the query and then search the vector database to retrieve the most
-similar vectors, and return those documents so they can be passed to the LLM
-as context.
-Then the query and the context are passed to the LLM, which would normally
-have to create token embeddings for the query _again_. But because gritlm is
-used, the embeddings from the first pass can be reused, so that second
-embedding step does not have to be performed at all.
-
-### Running the example
-Download a Grit model:
-```console
-$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
-```
-
-Run the example using the downloaded model:
-```console
-$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
-
-Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
-Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
-Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
-Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
-
-Oh, brave adventurer, who dared to climb
-The lofty peak of Mt. Fuji in the night,
-When shadows lurk and ghosts do roam,
-And darkness reigns, a fearsome sight.
-
-Thou didst set out, with heart aglow,
-To conquer this mountain, so high,
-And reach the summit, where the stars do glow,
-And the moon shines bright, up in the sky.
-
-Through the mist and fog, thou didst press on,
-With steadfast courage, and a steadfast will,
-Through the darkness, thou didst not be gone,
-But didst climb on, with a steadfast skill.
-
-At last, thou didst reach the summit's crest,
-And gazed upon the world below,
-And saw the beauty of the night's best,
-And felt the peace, that only nature knows.
-
-Oh, brave adventurer, who dared to climb
-The lofty peak of Mt. Fuji in the night,
-Thou art a hero, in the eyes of all,
-For thou didst conquer this mountain, so bright.
-```
-
-[gritlm]: https://github.com/ContextualAI/gritlm
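To make the retrieval step described in this README concrete, here is a small self-contained sketch of ranking documents by cosine similarity over embedding vectors (plain C++, independent of the llama.cpp API; names and values are illustrative):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

static float cosine_sim(const std::vector<float> & a, const std::vector<float> & b) {
    float dot = 0.0f, na = 0.0f, nb = 0.0f;
    for (size_t i = 0; i < a.size(); i++) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    return dot / (std::sqrt(na) * std::sqrt(nb));
}

int main(void) {
    std::vector<float> query = { 0.1f, 0.9f, 0.2f };
    std::vector<std::vector<float>> docs = {
        { 0.1f, 0.8f, 0.3f },   // similar to the query
        { 0.9f, 0.1f, 0.0f },   // dissimilar
    };
    for (size_t i = 0; i < docs.size(); i++) {
        printf("doc %zu: %.3f\n", i, cosine_sim(query, docs[i]));
    }
    return 0;
}
```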
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
deleted file mode 100644
index 2c61c2e1e..000000000
--- a/examples/gritlm/gritlm.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <string>
-#include <vector>
-
-// #define GRIT_DEBUG
-
-static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
-    std::vector<std::vector<float>> result;
-
-    const llama_model * mdl = llama_get_model(ctx);
-
-    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
-
-    for (uint64_t i = 0; i < sentences.size(); i++) {
-        llama_batch_clear(batch);
-
-        const std::string input_string = instruction + sentences[i];
-
-        std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
-
-        const int32_t n_toks = inputs.size();
-
-        // GritLM seems to have EOS = ""
-        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
-        // inputs.push_back(llama_token_eos(mdl));
-
-        // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
-
-#ifdef GRIT_DEBUG
-        // debug tokens - should be matching as referenced in the GritLM sample
-        std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
-            std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
-        });
-        std::printf("\n");
-#endif
-
-        // add input to batch (this increments n_tokens)
-        for (int32_t j = 0; j < n_toks; j++) {
-            llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
-        }
-
-        // clear previous kv_cache values (irrelevant for embeddings)
-        llama_kv_cache_clear(ctx);
-        llama_set_embeddings(ctx, true);
-        llama_set_causal_attn(ctx, false);
-
-        // run model
-        llama_decode(ctx, batch);
-
-        // get embedding dimensions
-        uint64_t n_embd = llama_n_embd(mdl);
-
-        // allocate embedding output
-        std::vector<float> emb_unorm(n_embd, 0.0f);
-
-        // sum up all token embeddings
-        for (int32_t k = n_inst; k < n_toks; k++) {
-            float * emb = llama_get_embeddings_ith(ctx, k);
-            for (uint64_t j = 0; j < n_embd; j++) {
-                emb_unorm[j] += emb[j];
-            }
-        }
-
-        // divide by number of tokens (mean pooling)
-        {
-            const uint64_t n_sent = n_toks - n_inst;
-
-            for (uint64_t j = 0; j < n_embd; j++) {
-                emb_unorm[j] /= n_sent;
-            }
-        }
-
-        std::vector<float> emb_norm(emb_unorm.size());
-        llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
-        result.push_back(emb_norm);
-
-#ifdef GRIT_DEBUG
-        // print out emb_norm
-        std::printf("embedding %ld: ", i);
-        for (uint64_t j = 0; j < n_embd; j++) {
-            std::printf("%.5f ", emb_norm[j]);
-        }
-        std::printf("\n\n");
-#endif
-    }
-
-    llama_batch_free(batch);
-
-    return result;
-}
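The pooling in encode() above sums the non-instruction token embeddings and divides by their count. The same idea in isolation (a sketch; shapes and names are illustrative):

```cpp
#include <cstdio>
#include <vector>

// mean-pool rows [first, n) of an (n x dim) token-embedding matrix
static std::vector<float> mean_pool(const std::vector<std::vector<float>> & tok_emb, size_t first) {
    const size_t dim = tok_emb[0].size();
    std::vector<float> out(dim, 0.0f);
    for (size_t t = first; t < tok_emb.size(); t++) {
        for (size_t j = 0; j < dim; j++) {
            out[j] += tok_emb[t][j];
        }
    }
    const float n = (float)(tok_emb.size() - first);
    for (size_t j = 0; j < dim; j++) {
        out[j] /= n;
    }
    return out;
}

int main(void) {
    std::vector<std::vector<float>> tok_emb = { { 2.0f, 0.0f }, { 0.0f, 2.0f }, { 2.0f, 2.0f } };
    std::vector<float> pooled = mean_pool(tok_emb, 1); // skip one "instruction" token
    printf("%.1f %.1f\n", pooled[0], pooled[1]);       // 1.0 2.0
    return 0;
}
```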
-
-static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
-    std::string result;
-
-    const llama_model * mdl = llama_get_model(ctx);
-    llama_token eos_token = llama_token_eos(mdl);
-
-    llama_kv_cache_clear(ctx);
-    llama_set_embeddings(ctx, false);
-    llama_set_causal_attn(ctx, true);
-
-    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
-
-    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
-    int32_t i_current_token = 0;
-
-    while (true) {
-        llama_batch_clear(bat);
-        auto n_inputs = (int32_t)inputs.size();
-        for (int32_t i = 0; i < n_inputs; i++) {
-            llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
-        }
-        inputs.clear();
-
-        llama_decode(ctx, bat);
-        auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
-
-        auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
-        auto n_candidates = (int32_t)candidates.size();
-        for (int32_t token = 0; token < n_candidates; token++) {
-            candidates[token] = llama_token_data{ token, logits[token], 0.0f };
-        }
-        auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
-
-        llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
-        if (token == eos_token) {
-            break;
-        }
-
-        std::string piece = llama_token_to_piece(ctx, token);
-        if (stream) {
-            std::printf("%s", piece.c_str());
-            std::fflush(stdout);
-        }
-
-        inputs.push_back(token);
-
-        result += piece;
-    }
-
-    if (stream) {
-        std::printf("\n");
-    }
-
-    llama_batch_free(bat);
-
-    return result;
-}
-
-static std::string gritlm_instruction(const std::string & instruction) {
-    return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
-}
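A quick self-contained check of the prompt template that gritlm_instruction() above builds (the function is reproduced here so the snippet compiles on its own):

```cpp
#include <cassert>
#include <string>

static std::string gritlm_instruction(const std::string & instruction) {
    return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
}

int main(void) {
    // no instruction: embedding-only marker
    assert(gritlm_instruction("") == "<|embed|>\n");
    // with instruction: user turn followed by the embed marker
    assert(gritlm_instruction("Find the abstract") == "<|user|>\nFind the abstract\n<|embed|>\n");
    return 0;
}
```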
-
-int main(int argc, char * argv[]) {
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
-        return 1;
-    }
-
-    llama_model_params mparams = llama_model_params_from_gpt_params(params);
-    llama_context_params cparams = llama_context_params_from_gpt_params(params);
-
-    llama_backend_init();
-
-    llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
-
-    // create generation context
-    llama_context * ctx = llama_new_context_with_model(mdl, cparams);
-
-    // ### Embedding/Representation ###
-    // samples taken from: https://github.com/ContextualAI/gritlm#basic
-    {
-        const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
-
-        const std::vector<std::string> queries = {
-            "Bitcoin: A Peer-to-Peer Electronic Cash System",
-            "Generative Representational Instruction Tuning",
-        };
-
-        const std::vector<std::string> documents = {
-            "A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
-            "All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
-        };
-
-        // No need to add instruction for retrieval documents
-        const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
-        const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
-
-        const int n_embd = llama_n_embd(mdl);
-
-        const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
-        const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
-        const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
-        const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
-
-        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
-        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
-        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
-        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
-    }
-
-    // ### Generation ###
-    // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
-    {
-        const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
-        std::string response = generate(ctx, prompt, true);
-    }
-
-    llama_free(ctx);
-    llama_free_model(mdl);
-    llama_backend_free();
-
-    return 0;
-}
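The generate() loop above relies on llama_sample_token_greedy(), which amounts to an argmax over the vocabulary logits. A reduced sketch of that selection step (standalone, no llama.cpp types):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// pick the highest-logit token id, as greedy sampling does
static int argmax_token(const std::vector<float> & logits) {
    return (int)(std::max_element(logits.begin(), logits.end()) - logits.begin());
}

int main(void) {
    std::vector<float> logits = { -1.2f, 3.4f, 0.7f };
    printf("next token id: %d\n", argmax_token(logits)); // 1
    return 0;
}
```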
-The parameters in square brackets are optional and have the following meaning: -* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing, `imatrix.dat` is used. -* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. -* `--output-frequency` specifies how often the results computed so far are saved to disk. Default is 10 (i.e., every 10 chunks). -* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never). -* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. - -For faster computation, make sure to use GPU offloading via the `-ngl` argument. - -## Example - -```bash -GGML_CUDA=1 make -j - -# generate importance matrix (imatrix.dat) -./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 - -# use the imatrix to perform a Q4_K_M quantization -./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m -``` diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp deleted file mode 100644 index 574f5ed9c..000000000 --- a/examples/imatrix/imatrix.cpp +++ /dev/null @@ -1,649 +0,0 @@ -#include "common.h" -#include "llama.h" - -#include <cmath> -#include <cstdio> -#include <cstring> -#include <ctime> -#include <sstream> -#include <thread> -#include <mutex> -#include <vector> -#include <fstream> -#include <unordered_map> -#include <algorithm> - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s \\\n" - " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" - " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" - " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); - LOG_TEE("\n"); -} - -struct Stats { - std::vector<float> values; - std::vector<int> counts; - int ncall = 0; -}; - -class IMatrixCollector { -public: - IMatrixCollector() = default; - void set_params(gpt_params params) { m_params = std::move(params); } - bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); - void save_imatrix(int ncall = -1) const; - bool load_imatrix(const char * file_name); -private: - std::unordered_map<std::string, Stats> m_stats; - gpt_params m_params; - std::mutex m_mutex; - int m_last_call = 0; - std::vector<float> m_src1_data; - std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id -}; - -// remove any prefix and suffixes from the name - // CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight -static std::string filter_tensor_name(const char * name) { - std::string wname; - const char * p = strchr(name, '#'); - if (p != NULL) { - p = p + 1; - const char * q = strchr(p, '#'); - if (q != NULL) { - wname = std::string(p, q - p); - } else { - wname = p; - } - } else { - wname = name; - } - return wname; -} - -bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { - GGML_UNUSED(user_data); - - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - std::string
wname = filter_tensor_name(src0->name); - - // when ask is true, the scheduler wants to know if we are interested in data from this tensor - // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection - if (ask) { - if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications - if (t->op != GGML_OP_MUL_MAT) return false; - // why are small batches ignored (<16 tokens)? - if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false; - return true; - } - - std::lock_guard<std::mutex> lock(m_mutex); - - // copy the data from the GPU memory if needed - const bool is_host = ggml_backend_buffer_is_host(src1->buffer); - - if (!is_host) { - m_src1_data.resize(ggml_nelements(src1)); - ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); - } - - const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); - - // this has been adapted to the new format of storing merged experts in a single 3d tensor - // ref: https://github.com/ggerganov/llama.cpp/pull/6387 - if (t->op == GGML_OP_MUL_MAT_ID) { - // ids -> [n_experts_used, n_tokens] - // src1 -> [cols, n_expert_used, n_tokens] - const ggml_tensor * ids = t->src[2]; - const int n_as = src0->ne[2]; - const int n_ids = ids->ne[0]; - - // the top-k selected expert ids are stored in the ids tensor - // for simplicity, always copy ids to host, because it is small - // take into account that ids is not contiguous! - - GGML_ASSERT(ids->ne[1] == src1->ne[2]); - - m_ids.resize(ggml_nbytes(ids)); - ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); - - auto & e = m_stats[wname]; - - ++e.ncall; - - if (e.values.empty()) { - e.values.resize(src1->ne[0]*n_as, 0); - e.counts.resize(src1->ne[0]*n_as, 0); - } - else if (e.values.size() != (size_t)src1->ne[0]*n_as) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); - exit(1); //GGML_ASSERT(false); - } - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); - } - // loop over all possible experts, regardless of whether they are used in the batch - for (int ex = 0; ex < n_as; ++ex) { - size_t e_start = ex*src1->ne[0]; - - for (int idx = 0; idx < n_ids; ++idx) { - for (int row = 0; row < (int)src1->ne[2]; ++row) { - const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]); - - GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check - - if (excur != ex) continue; - - const int64_t i11 = idx % src1->ne[1]; - const int64_t i12 = row; - const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]); - - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[e_start + j] += x[j]*x[j]; - e.counts[e_start + j]++; - if (!std::isfinite(e.values[e_start + j])) { - fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str()); - exit(1); - } - } - } - } - if (e.ncall > m_last_call) { - m_last_call = e.ncall; - if (m_last_call % m_params.n_out_freq == 0) { - save_imatrix(); - } - if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { - save_imatrix(m_last_call); - } - } - } - } else { - auto & e = m_stats[wname]; - if (e.values.empty()) { - e.values.resize(src1->ne[0], 0); - e.counts.resize(src1->ne[0],
0); - } - else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ASSERT(false); - } - ++e.ncall; - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); - } - for (int row = 0; row < (int)src1->ne[1]; ++row) { - const float * x = data + row * src1->ne[0]; - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[j] += x[j]*x[j]; - e.counts[j]++; - if (!std::isfinite(e.values[j])) { - fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str()); - exit(1); - } - } - } - if (e.ncall > m_last_call) { - m_last_call = e.ncall; - if (m_last_call % m_params.n_out_freq == 0) { - save_imatrix(); - } - if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { - save_imatrix(m_last_call); - } - } - } - - return true; -} - -void IMatrixCollector::save_imatrix(int ncall) const { - auto fname = m_params.out_file; - if (fname.empty()) { - fname = "imatrix.dat"; - } - - if (ncall > 0) { - fname += ".at_"; - fname += std::to_string(ncall); - } - - // avoid writing imatrix entries that do not have full data - // this can happen with MoE models where some of the experts end up not being exercised by the provided training data - - int n_entries = 0; - std::vector<std::string> to_store; - - bool is_first = true; // for printing - for (const auto & kv : m_stats) { - const int n_all = kv.second.counts.size(); - - if (n_all == 0) { - continue; - } - - int n_zeros = 0; - for (const int c : kv.second.counts) { - if (c == 0) { - n_zeros++; - } - } - - if (n_zeros != 0 && is_first) { - fprintf(stderr, "\n"); - is_first = false; - } - - if (n_zeros == n_all) { - fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); - continue; - } - - if (n_zeros > 0) { - fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); - continue; - } - - n_entries++; - to_store.push_back(kv.first); - } - - if (to_store.size() < m_stats.size()) { - fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); - } - - std::ofstream out(fname, std::ios::binary); - out.write((const char *) &n_entries, sizeof(n_entries)); - for (const auto & name : to_store) { - const auto & stat = m_stats.at(name); - int len = name.size(); - out.write((const char *) &len, sizeof(len)); - out.write(name.c_str(), len); - out.write((const char *) &stat.ncall, sizeof(stat.ncall)); - int nval = stat.values.size(); - out.write((const char *) &nval, sizeof(nval)); - if (nval > 0) { - std::vector<float> tmp(nval); - for (int i = 0; i < nval; i++) { - tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall); - } - out.write((const char*)tmp.data(), nval*sizeof(float)); - } - } - - // Write the number of calls the matrix was computed with - out.write((const char *) &m_last_call, sizeof(m_last_call)); - - // Write the input filename at the end of the file so it can later be specified in quantize - { - int len = m_params.prompt_file.size(); - out.write((const char *) &len, sizeof(len)); - out.write(m_params.prompt_file.c_str(), len); - } - - if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); - } -} - -bool
IMatrixCollector::load_imatrix(const char * fname) { - std::ifstream in(fname, std::ios::binary); - if (!in) { - printf("%s: failed to open %s\n",__func__, fname); - return false; - } - int n_entries; - in.read((char*)&n_entries, sizeof(n_entries)); - if (in.fail() || n_entries < 1) { - printf("%s: no data in file %s\n", __func__, fname); - return false; - } - for (int i = 0; i < n_entries; ++i) { - int len; in.read((char *)&len, sizeof(len)); - std::vector<char> name_as_vec(len+1); - in.read((char *)name_as_vec.data(), len); - if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); - return false; - } - name_as_vec[len] = 0; - std::string name{name_as_vec.data()}; - auto & e = m_stats[std::move(name)]; - int ncall; - in.read((char*)&ncall, sizeof(ncall)); - int nval; - in.read((char *)&nval, sizeof(nval)); - if (in.fail() || nval < 1) { - printf("%s: failed reading number of values for entry %d\n",__func__,i); - m_stats = {}; - return false; - } - - if (e.values.empty()) { - e.values.resize(nval, 0); - e.counts.resize(nval, 0); - } - - std::vector<float> tmp(nval); - in.read((char*)tmp.data(), nval*sizeof(float)); - if (in.fail()) { - printf("%s: failed reading data for entry %d\n",__func__,i); - m_stats = {}; - return false; - } - - // Recreate the state as expected by save_imatrix(), and correct for the weighted sum. - for (int i = 0; i < nval; i++) { - e.values[i] += tmp[i]; - e.counts[i] += ncall; - } - e.ncall += ncall; - - } - return true; -} - -static IMatrixCollector g_collector; - -static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { - return g_collector.collect_imatrix(t, ask, user_data); -} - - -struct results_log_softmax { - double log_softmax; - float logit; - float prob; -}; - -static std::vector<float> softmax(const std::vector<float> & logits) { - std::vector<float> probs(logits.size()); - float max_logit = logits[0]; - for (float v : logits) { - max_logit = std::max(max_logit, v); - } - double sum_exp = 0.0; - for (size_t i = 0; i < logits.size(); i++) { - // Subtract the maximum logit value from the current logit value for numerical stability - const float logit = logits[i] - max_logit; - const float exp_logit = expf(logit); - sum_exp += exp_logit; - probs[i] = exp_logit; - } - for (size_t i = 0; i < probs.size(); i++) { - probs[i] /= sum_exp; - } - return probs; -} - -static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { - float max_logit = logits[0]; - for (int i = 1; i < n_vocab; ++i) { - max_logit = std::max(max_logit, logits[i]); - } - double sum_exp = 0.0; - for (int i = 0; i < n_vocab; ++i) { - sum_exp += expf(logits[i] - max_logit); - } - return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; -} - -static void process_logits( - int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers, - double & nll, double & nll2, float * logit_history, float * prob_history) { - std::mutex mutex; - int counter = 0; - auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { - double local_nll = 0; - double local_nll2 = 0; - while (true) { - std::unique_lock<std::mutex> lock(mutex); - int i = counter++; - if (i >= n_token) { - nll += local_nll; nll2 += local_nll2; - break; - } - lock.unlock(); - const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); - const double v = -results.log_softmax; - local_nll += v; - local_nll2 += v*v; - - logit_history[i]
= results.logit; - prob_history[i] = results.prob; - } - }; - for (auto & w : workers) { - w = std::thread(compute); - } - compute(); - for (auto & w : workers) { - w.join(); - } -} - -static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); - const int n_ctx = llama_n_ctx(ctx); - - auto tim1 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - - std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true); - - auto tim2 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); - - if (params.i_chunk > 0) { - if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { - fprintf(stderr, "%s: there will not be enough tokens left after removing %d chunks\n", __func__, params.i_chunk); - return false; - } - fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); - tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); - } - - if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, - n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); - return false; - } - - std::vector<float> logit_history; - std::vector<float> prob_history; - - if (params.compute_ppl) { - logit_history.resize(tokens.size()); - prob_history.resize(tokens.size()); - } - - const int n_chunk_max = tokens.size() / n_ctx; - - const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - const int n_batch = params.n_batch; - - int count = 0; - double nll = 0.0; - double nll2 = 0.0; - - fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); - - std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); - - const int num_batches = (n_ctx + n_batch - 1) / n_batch; - - std::vector<float> logits; - if (params.compute_ppl && num_batches > 1) { - logits.reserve((size_t)n_ctx * n_vocab); - } - - for (int i = 0; i < n_chunk; ++i) { - const int start = i * n_ctx; - const int end = start + n_ctx; - - std::vector<float> logits; - - const auto t_start = std::chrono::high_resolution_clock::now(); - - // clear the KV cache - llama_kv_cache_clear(ctx); - - for (int j = 0; j < num_batches; ++j) { - const int batch_start = start + j * n_batch; - const int batch_size = std::min(end - batch_start, n_batch); - - // save original token and restore it after eval - const auto token_org = tokens[batch_start]; - - // add BOS token for the first batch of each chunk - if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); - } - - // TODO: use batch.logits to save computations instead of relying on logits_all == true - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - - // restore the original token in case it was set to BOS - tokens[batch_start] = token_org; - - if (params.compute_ppl && num_batches > 1) { - const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); - } - } - - const auto t_end =
std::chrono::high_resolution_clock::now(); - - if (i == 0) { - const float t_total = std::chrono::duration<float>(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); - int total_seconds = (int)(t_total * n_chunk); - if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); - total_seconds = total_seconds % (60*60); - } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); - } - - if (params.compute_ppl) { - const int first = n_ctx/2; - const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); - count += n_ctx - first - 1; - - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); - fflush(stdout); - - logits.clear(); - } - } - printf("\n"); - - if (params.compute_ppl) { - nll2 /= count; - nll /= count; - const double ppl = exp(nll); - nll2 -= nll * nll; - if (nll2 > 0) { - nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); - } else { - printf("Unexpected negative standard deviation of log(prob)\n"); - } - } - - return true; -} - -int main(int argc, char ** argv) { - gpt_params params; - - params.n_ctx = 512; - params.logits_all = true; - params.verbosity = 1; - - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); - return 1; - } - - params.n_batch = std::min(params.n_batch, params.n_ctx); - - g_collector.set_params(params); - - for (const auto & in_file : params.in_files) { - printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); - if (!g_collector.load_imatrix(in_file.c_str())) { - fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str()); - return 1; - } - } - - if (params.in_files.size() > 1) { - printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); - g_collector.save_imatrix(); - } - - llama_backend_init(); - llama_numa_init(params.numa); - - // pass the callback to the backend scheduler - // it will be executed for each node during the graph computation - params.cb_eval = ik_collect_imatrix; - params.cb_eval_user_data = NULL; - params.warmup = false; - - // init - llama_model * model; - llama_context * ctx; - - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; - } - - const int n_ctx_train = llama_n_ctx_train(model); - if (params.n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, params.n_ctx); - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); - } - - if (!compute_imatrix(ctx, params)) { - return 1; - } - - g_collector.save_imatrix(); - - llama_print_timings(ctx); - - llama_free(ctx); - llama_free_model(model); - - llama_backend_free(); - - return 0; -} diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt deleted file mode 100644 index 9b1aa3b63..000000000 --- a/examples/infill/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-infill) -add_executable(${TARGET} infill.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE
cxx_std_11) diff --git a/examples/infill/README.md b/examples/infill/README.md deleted file mode 100644 index 810a0c5e7..000000000 --- a/examples/infill/README.md +++ /dev/null @@ -1,47 +0,0 @@ -# llama.cpp/example/infill - -This example shows how to use infill mode with Code Llama models that support it. -Currently the 7B and 13B models support infill mode. - -Infill supports most of the options available in the main example. - -For further information, have a look at the main example's README in llama.cpp/examples/main/README.md - -## Common Options - -In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models: - -- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). -- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. -- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. -- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. -- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. - -## Input Prompts - -The `infill` program provides several ways to interact with the LLaMA models using input prompts: - -- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option. -- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option. -- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) - -## Interaction - -The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive` or `--interactive-first`. - -### Interaction Options - -- `-i, --interactive`: Run the program in interactive mode, allowing users to get real-time code suggestions from the model. -- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. -- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
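For reference, here is a condensed sketch of how the prefix/suffix options above are turned into the actual model prompt. It mirrors the prompt-assembly logic in `infill.cpp` (removed below); `build_infill_prompt` is an illustrative helper name, not part of the codebase:

```cpp
// Build a fill-in-the-middle (FIM) prompt. The default token layout is
// <PRE> prefix <SUF> suffix <MID>; with --spm-infill the prefix and suffix
// blocks are swapped. The model then generates the "middle" after <MID>.
static std::vector<llama_token> build_infill_prompt(
        llama_context * ctx, const llama_model * model,
        const std::string & prefix, const std::string & suffix,
        bool spm_infill, bool add_bos) {
    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, prefix, false);
    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, suffix, false);

    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); // <PRE>
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); // <SUF>

    std::vector<llama_token> inp = spm_infill ? inp_sfx : inp_pfx;
    std::vector<llama_token> end = spm_infill ? inp_pfx : inp_sfx;
    if (add_bos) {
        inp.insert(inp.begin(), llama_token_bos(model));
    }
    inp.insert(inp.end(), end.begin(), end.end());

    const llama_token middle_token = llama_token_middle(model);
    if (middle_token >= 0) {
        inp.push_back(middle_token);                            // <MID>
    }
    return inp;
}
```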
- -### Example - -Download a model that supports infill, for example CodeLlama: -```console -scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models -``` - -```bash -./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " -``` diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp deleted file mode 100644 index dc93d2301..000000000 --- a/examples/infill/infill.cpp +++ /dev/null @@ -1,652 +0,0 @@ -#include "common.h" - -#include "console.h" -#include "llama.h" -#include "grammar-parser.h" - -#include <cassert> -#include <cinttypes> -#include <cmath> -#include <cstdio> -#include <cstring> -#include <ctime> -#include <fstream> -#include <iostream> -#include <sstream> -#include <string> -#include <vector> - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include <signal.h> -#include <unistd.h> -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include <windows.h> -#include <signal.h> -#endif - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -static llama_context ** g_ctx; -static llama_model ** g_model; -static gpt_params * g_params; -static std::vector<llama_token> * g_input_tokens; -static std::ostringstream * g_output_ss; -static std::vector<llama_token> * g_output_tokens; - -static bool is_interacting = false; - -static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, - const std::vector<llama_token> & input_tokens, const std::string & output, - const std::vector<llama_token> & output_tokens -) { - if (params.logdir.empty()) { - return; - } - - const std::string timestamp = string_get_sortable_timestamp(); - - const bool success = fs_create_directory_with_parents(params.logdir); - if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: infill\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Generation Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - yaml_dump_string_multiline(logfile, "output", output.c_str()); - yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - - llama_dump_timing_info_yaml(logfile, ctx); - fclose(logfile); -} - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -static void sigint_handler(int signo) { - if (signo == SIGINT) { - if (!is_interacting) { - is_interacting = true; - } else { - console::cleanup(); - printf("\n"); - llama_print_timings(*g_ctx); - write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); - _exit(130); - } - } -} -#endif - -int main(int argc, char ** argv) { - gpt_params params; - llama_sampling_params & sparams = params.sparams; - g_params = &params; - - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); - return 1; - } - -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("infill", "log")); -
LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - - console::init(params.simple_io, params.use_color); - atexit([]() { console::cleanup(); }); - - if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); - - return 0; - } - - if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); - - return 0; - } - - if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; - } - if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { - printf("\n************\n"); - printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); - printf("************\n\n"); - - return 0; - } - - if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); - } - - if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); - } - - LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - LOG_TEE("%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - - LOG("%s: llama backend init\n", __func__); - llama_backend_init(); - llama_numa_init(params.numa); - - llama_model * model; - llama_context * ctx; - - g_model = &model; - g_ctx = &ctx; - - // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); - std::tie(model, ctx) = llama_init_from_gpt_params(params); - - if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); - return 1; - } - - const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); - - if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); - } - - // print system information - { - LOG_TEE("\n"); - LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); - } - const bool add_bos = llama_should_add_bos_token(model); - GGML_ASSERT(llama_add_eos_token(model) != 1); - LOG("add_bos: %d\n", add_bos); - - std::vector<llama_token> embd_inp; - std::vector<llama_token> embd_end; - std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); - std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); - - GGML_ASSERT(llama_token_prefix(model) >= 0); - GGML_ASSERT(llama_token_suffix(model) >= 0); - - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); - - embd_inp = params.spm_infill ? inp_sfx : inp_pfx; - embd_end = params.spm_infill ?
inp_pfx : inp_sfx; - if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); - } - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - - const llama_token middle_token = llama_token_middle(model); - if (middle_token >= 0) { - embd_inp.push_back(middle_token); - } - - LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); - LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); - - // Should not run without any tokens - if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); - } - - if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); - return 1; - } - - // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) { - params.n_keep = (int)embd_inp.size(); - } - - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); - - - // enable interactive mode if interactive start is specified - if (params.interactive_first) { - params.interactive = true; - } - - if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); - } - - if (params.n_keep > 0) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); - for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); - } - LOG_TEE("'\n"); - } - LOG_TEE("\n"); - } - - if (params.interactive) { -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = sigint_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); -#elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? 
(sigint_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true); -#endif - - LOG_TEE("%s: interactive mode on.\n", __func__); - - if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); - } - - if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); - } - - if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); - } - } - LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - LOG_TEE("\n\n"); - - LOG_TEE("\n##### Infill mode #####\n\n"); - if (params.infill) { - printf("\n************\n"); - printf("no need to specify '--infill', always running infill\n"); - printf("************\n\n"); - } - if (params.interactive) { - const char *control_message; - if (params.multiline_input) { - control_message = " - To return control to LLaMA, end your input with '\\'.\n" - " - To return control without starting a new line, end your input with '/'.\n"; - } else { - control_message = " - Press Return to return control to LLaMA.\n" - " - To return control without starting a new line, end your input with '/'.\n" - " - If you want to submit another line, end your input with '\\'.\n"; - } - LOG_TEE("== Running in interactive mode. ==\n"); -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); -#endif - LOG_TEE( "%s\n", control_message); - - is_interacting = params.interactive_first; - } - - bool input_echo = true; - - int n_past = 0; - int n_remain = params.n_predict; - int n_consumed = 0; - - std::vector<llama_token> input_tokens; g_input_tokens = &input_tokens; - std::vector<llama_token> output_tokens; g_output_tokens = &output_tokens; - std::ostringstream output_ss; g_output_ss = &output_ss; - - // the first thing we will do is to output the prompt, so set color accordingly - console::set_display(console::prompt); - - std::vector<llama_token> embd; - - struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - - while (n_remain != 0 || params.interactive) { - // predict - if (!embd.empty()) { - // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via - // --prompt or --file which uses the same value. - int max_embd_size = n_ctx - 4; - - // Ensure the input doesn't exceed the context size by truncating embd if necessary. - if ((int) embd.size() > max_embd_size) { - const int skipped_tokens = (int) embd.size() - max_embd_size; - embd.resize(max_embd_size); - - console::set_display(console::error); - printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); - console::set_display(console::reset); - fflush(stdout); - } - - // infinite text generation via context swapping - // if we run out of context: - // - take the n_keep first tokens from the original prompt (via n_past) - // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() > n_ctx) { - if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); - break; - } - - const int n_left = n_past - params.n_keep - 1; - const int n_discard = n_left/2; - - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - LOG("after swap: n_past = %d\n", n_past); - - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - - } - - // evaluate tokens in batches - // embd is typically prepared beforehand to fit within a batch, but not always - for (int i = 0; i < (int) embd.size(); i += params.n_batch) { - int n_eval = (int) embd.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); - return 1; - } - - n_past += n_eval; - - LOG("n_past = %d\n", n_past); - } - - } - - embd.clear(); - - if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr); - - llama_sampling_accept(ctx_sampling, ctx, id, true); - - LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); - - embd.push_back(id); - - // echo this to console - input_echo = true; - - // decrement remaining sampling budget - --n_remain; - - LOG("n_remain: %d\n", n_remain); - } else { - // some user input remains from prompt or interaction, forward it to processing - LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); - while ((int) embd_inp.size() > n_consumed) { - embd.push_back(embd_inp[n_consumed]); - - // push the prompt in the sampling context in order to apply repetition penalties later - // for the prompt, we don't apply grammar rules - llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); - - ++n_consumed; - if ((int) embd.size() >= params.n_batch) { - break; - } - } - } - - // display text - if (input_echo) { - for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id); - printf("%s", token_str.c_str()); - - if (embd.size() > 1) { - input_tokens.push_back(id); - } else { - output_tokens.push_back(id); - output_ss << token_str; - } - } - fflush(stdout); - } - // reset color to default if there is no pending user input - if (input_echo && (int) embd_inp.size() == n_consumed) { - console::set_display(console::reset); - } - - // if not currently processing queued inputs; - if ((int) embd_inp.size() <= n_consumed) { - // deal with eot token in infill mode - if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ - if (is_interacting && !params.interactive_first) { - // print an eot token - printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); - } - fflush(stdout); -
printf("\n"); - console::set_display(console::user_input); - std::string buffer; - std::string line; - bool another_line = true; - // set a new prefix via stdin - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - // check if we got an empty line, if so we use the old input - if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { - params.input_prefix = buffer; - } - buffer.clear(); - // set a new suffix via stdin - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - // check if we got an empty line - if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { - params.input_suffix = buffer; - } - buffer.clear(); - // done taking input, reset color - console::set_display(console::reset); - - if (params.escape) { - //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here - string_process_escapes(params.input_prefix); - string_process_escapes(params.input_suffix); - } - - // tokenize new prefix and suffix - std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); - std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); - - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); - - embd_inp = params.spm_infill ? inp_sfx : inp_pfx; - embd_end = params.spm_infill ? inp_pfx : inp_sfx; - if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); - } - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - - if (middle_token >= 0) { - embd_inp.push_back(middle_token); - } - - embd.clear(); - n_remain = params.n_predict; - n_past = 0; - n_consumed = 0; - // LOG_TEE("took new input\n"); - is_interacting = false; - } - // deal with end of generation tokens in interactive mode - else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { - LOG("found EOS token\n"); - - if (params.interactive) { - - is_interacting = true; - printf("\n"); - console::set_display(console::user_input); - fflush(stdout); - } - } - - if (n_past > 0 && is_interacting && !params.interactive) { - LOG("waiting for user input\n"); - - if (params.input_prefix_bos) { - LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(model)); - } - - std::string buffer; - if (!params.input_prefix.empty()) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - buffer += params.input_prefix; - printf("%s", buffer.c_str()); - } - - std::string line; - bool another_line = true; - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - - // done taking input, reset color - console::set_display(console::reset); - - // Add tokens to embd only if the input buffer is non-empty - // Entering an empty line lets the user pass control back - if (buffer.length() > 1) { - // append input suffix if any - if (!params.input_suffix.empty()) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - buffer += params.input_suffix; - printf("%s", params.input_suffix.c_str()); - } - - LOG("buffer: '%s'\n", buffer.c_str()); - - const size_t original_size = embd_inp.size(); - - const auto line_inp = ::llama_tokenize(ctx, buffer, false); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); - - embd_inp.insert(embd_inp.end(),
line_inp.begin(), line_inp.end()); - - for (size_t i = original_size; i < embd_inp.size(); ++i) { - const llama_token token = embd_inp[i]; - output_tokens.push_back(token); - output_ss << llama_token_to_piece(ctx, token); - } - - n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); - } else { - LOG("empty line, passing control back\n"); - } - - input_echo = false; // do not echo this again - } - - if (n_past > 0) { - if (is_interacting) { - llama_sampling_reset(ctx_sampling); - } - is_interacting = false; - } - } - - // end of generation - if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) { - break; - } - - // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. - // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). - if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { - n_remain = params.n_predict; - is_interacting = true; - } - } - if (!params.interactive && n_remain <= 0) { - printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); - fflush(stdout); - } - - llama_print_timings(ctx); - write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - - llama_free(ctx); - llama_free_model(model); - - llama_sampling_free(ctx_sampling); - llama_backend_free(); - -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); -#endif // LOG_DISABLE_LOGS - - return 0; -} diff --git a/examples/jeopardy/README.md b/examples/jeopardy/README.md deleted file mode 100644 index ffa13cbf3..000000000 --- a/examples/jeopardy/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# llama.cpp/example/jeopardy - -This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer. - -The jeopardy test can be used to compare the factual knowledge of different models against each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc. - - -Step 1: Open jeopardy.sh and modify the following: -``` -MODEL=(path to your model) -MODEL_NAME=(name of your model) -prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc) -opts=(add -instruct here if needed for your model, or anything else you want to test out) -``` -Step 2: Run `jeopardy.sh` from the llama.cpp folder. - -Step 3: Repeat steps 1 and 2 until you have all the results you need. - -Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph. - -Note: The Human bar is based on the full, original 100 sample questions. If you modify the question count or questions, it will not be valid.
diff --git a/examples/jeopardy/graph.py b/examples/jeopardy/graph.py deleted file mode 100755 index 8bc0706b8..000000000 --- a/examples/jeopardy/graph.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -import matplotlib.pyplot as plt -import os -import csv - -labels = [] -numbers = [] -numEntries = 1 - -rows = [] - - -def bar_chart(numbers, labels, pos): - plt.bar(pos, numbers, color='blue') - plt.xticks(ticks=pos, labels=labels) - plt.title("Jeopardy Results by Model") - plt.xlabel("Model") - plt.ylabel("Questions Correct") - plt.show() - - -def calculatecorrect(): - directory = os.fsencode("./examples/jeopardy/results/") - csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',') - for row in csv_reader: - global rows - rows.append(row) - for listing in os.listdir(directory): - filename = os.fsdecode(listing) - if filename.endswith(".txt"): - file = open("./examples/jeopardy/results/" + filename, "rt") - global labels - global numEntries - global numbers - labels.append(filename[:-4]) - numEntries += 1 - i = 1 - totalcorrect = 0 - for line in file.readlines(): - if line.strip() != "------": - print(line) - else: - print("Correct answer: " + rows[i][2] + "\n") - i += 1 - print("Did the AI get the question right? (y/n)") - if input() == "y": - totalcorrect += 1 - numbers.append(totalcorrect) - - -if __name__ == '__main__': - calculatecorrect() - pos = list(range(numEntries)) - labels.append("Human") - numbers.append(48.11) - bar_chart(numbers, labels, pos) - print(labels) - print(numbers) diff --git a/examples/jeopardy/jeopardy.sh b/examples/jeopardy/jeopardy.sh deleted file mode 100755 index 07bcb3b8d..000000000 --- a/examples/jeopardy/jeopardy.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -set -e - -MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin -MODEL_NAME=Vicuna - -# exec options -prefix="Human: " # Ex. Vicuna uses "Human: " -opts="--temp 0 -n 80" # additional flags -nl=' -' -introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)." - -# file options -question_file=./examples/jeopardy/questions.txt -touch ./examples/jeopardy/results/$MODEL_NAME.txt -output_file=./examples/jeopardy/results/$MODEL_NAME.txt - -counter=1 - -echo 'Running' -while IFS= read -r question -do - exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\"" - echo $counter - echo "Current Question: $question" - eval "$exe_cmd" - echo -e "\n------" >> $output_file - counter=$((counter+1)) -done < "$question_file" diff --git a/examples/jeopardy/qasheet.csv b/examples/jeopardy/qasheet.csv deleted file mode 100644 index 35b084189..000000000 --- a/examples/jeopardy/qasheet.csv +++ /dev/null @@ -1,103 +0,0 @@ -Index,Original Category,Original Correct Question,Model Prompt -1,The Oscars,Who is John Williams?,Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars? -2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?" -3,Writers’ Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?" 
-4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?" -5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution? -6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?" -7,Famous Names,Who is Walt Disney?,"In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?" -8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?" -9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?" -10,Movies of the ’80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated? -11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?" -12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?" -13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?" -14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?" -15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?" -16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society? -17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?" -18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?" -19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?" -20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?" -21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women? -22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?" -23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation? 
-24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?" -25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'? -26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?" -27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?" -28,Geographic Name’s the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?" -29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?" -30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?" -31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?" -32,Nonfiction,What is The Communist Manifesto,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?" -33, a new version was passed 81 years later,Laws in U.S. History,What is the Civil Rights Act?,,,,,,,,,,,,,,,,,,0, 2/3 -34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?" -35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?" -36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?" -37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?" -38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?" -39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?" -40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South? -41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?" -42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?" -43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969? -44,Art & Science,What is Halley’s Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?" 
-45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?" -46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?" -47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?" -48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?" -49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'? -50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'? -51,World War II,What is Schindler’s list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?" -52, their offspring was the source of this mythical object,Mythology,What is the Golden Fleece? -53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?" -54, only these 2 west of the Mississippi River border each other,U.S. State Names,What are Oregon & Nevada? -55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?" -56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?" -57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?" -58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what? -59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'? -60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?" -61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?" -62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?" -63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?" -64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?" -65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?" -66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot? 
-67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what? -68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s? -69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?" -70,USA,What is Jack Daniel’s?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?" -71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?" -72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022? -73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?" -74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?" -75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?" -76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?" -77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?" -78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?" -79,Children’s Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?" -80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?" -81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?" -82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?" -83,Children’s Lit,What is Charlotte’s Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'? -84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite? -85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?" -86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?" -87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?" 
-88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last? -89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?" -90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?" -91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience? -92,Name’s The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?" -93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?" -94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play? -95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?" -96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?" -97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?" -98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?" -99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951? -100,Children’s Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?" -,,, -TOTALS,,, diff --git a/examples/jeopardy/questions.txt b/examples/jeopardy/questions.txt deleted file mode 100644 index eea78a057..000000000 --- a/examples/jeopardy/questions.txt +++ /dev/null @@ -1,100 +0,0 @@ -Which man born in 1932, the son of a percussionist in the CBS radio orchestra, has been nominated for 53 Oscars? -What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'? -Known for more philosophical works, who wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions? -James Cook's account of a 1774 visit where he records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'? -England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution? -Which Nobel Peace Prize winners each lived at times on Vilakazi St. in Soweto, so it claims to be the world's only street home to 2 Nobel Peace Prize winners? -In 1966, the year of whose death did he share plans for an experimental prototype community in Florida? -Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
-Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany? -What 1980s movie is based on an off-Broadway play with just 3 characters; it won the Best Picture Oscar & the actors in all 3 roles were nominated? -A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'? -A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'? -Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response? -The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis? -In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'? -What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society? -Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness? -What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service? -In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'? -At one time a province of the Roman Empire, what African kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'? -Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women? -A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'? -In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation? -Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country? -A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'? -Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'? -After its completion in the late 19th c., what landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'? -The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states? -This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book? -An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland? -Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic? -What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'? -A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later? -Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life? -Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
-The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned? -For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk? -Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group? -In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'? -In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South? -What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'? -In 2010 who introduced the 4-point shot, 35 feet from the basket? -Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969? -A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem? -In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable? -Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage? -In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage? -Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II? -This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'? -1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'? -Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers? -Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object? -Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier? -5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other? -Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind? -The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title? -Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast? -Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what? -In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'? -At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name? -Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990? -Like Sir Thomas More, 3 16th century English queens are buried at what British location? -In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person be condemned'? -The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states? -What was first sold in 1908, at a price equivalent to about $27,000 today? 
-The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot? -The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what? -In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s? -In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times? -Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county? -After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'? -Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022? -Until a 1903 secession, what country's contiguous territory spanned 2 continents? -Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter? -Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911? -Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled? -Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios? -Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday? -Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'? -In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022? -In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death? -In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'? -The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'? -The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite? -Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product? -What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon? -What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'? -Only 3 presidents have married while in office— John Tyler was the first & which one was the last? -Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century? -Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another? -The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience? -A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name? -Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President? -A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play? 
-In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'? -Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'? -Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'? -The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star? -What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951? -Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England? diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt deleted file mode 100644 index 5bdbea4e2..000000000 --- a/examples/llama-bench/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-bench) -add_executable(${TARGET} llama-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md deleted file mode 100644 index 52b0e74d3..000000000 --- a/examples/llama-bench/README.md +++ /dev/null @@ -1,281 +0,0 @@ -# llama.cpp/examples/llama-bench - -Performance testing tool for llama.cpp. - -## Table of contents - -1. [Syntax](#syntax) -2. [Examples](#examples) - 1. [Text generation with different models](#text-generation-with-different-models) - 2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes) - 3. [Different numbers of threads](#different-numbers-of-threads) - 4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu) -3. [Output formats](#output-formats) - 1. [Markdown](#markdown) - 2. [CSV](#csv) - 3. [JSON](#json) - 4. [SQL](#sql) - -## Syntax - -``` -usage: ./llama-bench [options] - -options: - -h, --help - -m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt <n> (default: 512) - -n, --n-gen <n> (default: 128) - -pg <pp,tg> (default: 512,128) - -b, --batch-size <n> (default: 2048) - -ub, --ubatch-size <n> (default: 512) - -ctk, --cache-type-k <t> (default: f16) - -ctv, --cache-type-v <t> (default: f16) - -t, --threads <n> (default: 16) - -ngl, --n-gpu-layers <n> (default: 99) - -sm, --split-mode <none|layer|row> (default: layer) - -mg, --main-gpu <i> (default: 0) - -nkvo, --no-kv-offload <0|1> (default: 0) - -fa, --flash-attn <0|1> (default: 0) - -mmp, --mmap <0|1> (default: 1) - --numa <distribute|isolate|numactl> (default: disabled) - -embd, --embeddings <0|1> (default: 0) - -ts, --tensor-split <ts0/ts1/..> (default: 0) - -r, --repetitions <n> (default: 5) - -o, --output <csv|json|md|sql> (default: md) - -v, --verbose (default: 0) - -Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. -``` - -llama-bench can perform three types of tests: - -- Prompt processing (pp): processing a prompt in batches (`-p`) -- Text generation (tg): generating a sequence of tokens (`-n`) -- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`) - -With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options.
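For example, a hypothetical invocation (model path taken from the defaults above) that sweeps two prompt lengths and two thread counts:

```sh
# (2 pp tests + 1 tg test) x 2 thread counts = 6 result rows
$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -p 256,512 -n 128 -t 8,16
```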
To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`). - -Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition. - -For a description of the other options, see the [main example](../main/README.md). - -Note: - -- When using the SYCL backend, there can be a hang issue in some cases. Please set `--mmp 0`. - -## Examples - -### Text generation with different models - -```sh -$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 -``` - -| model | size | params | backend | ngl | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | - -### Prompt processing with different batch sizes - -```sh -$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 -``` - -| model | size | params | backend | ngl | n_batch | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | - -### Different numbers of threads - -```sh -$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 -``` - -| model | size | params | backend | threads | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 | - -### Different numbers of layers offloaded to the GPU - -```sh -$ ./llama-bench -ngl 10,20,30,31,32,33,34,35 -``` - -| model | size | params | backend | ngl | test | t/s | -|
------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | - -## Output formats - -By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option. - -### Markdown - -```sh -$ ./llama-bench -o md -``` - -| model | size | params | backend | ngl | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | - -### CSV - -```sh -$ ./llama-bench -o csv -``` - -```csv -build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" -``` - -### JSON - -```sh -$ ./llama-bench -o json -``` - -```json -[ - { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, - "n_gpu_layers": 99, - "main_gpu": 0, - "mul_mat_q": true, - "tensor_split": "0.00", - "n_prompt": 512, - "n_gen": 0, - "test_time": 
"2023-09-23T12:09:57Z", - "avg_ns": 212365953, - "stddev_ns": 985423, - "avg_ts": 2410.974041, - "stddev_ts": 11.163766, - "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ], - "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ] - }, - { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, - "n_gpu_layers": 99, - "main_gpu": 0, - "mul_mat_q": true, - "tensor_split": "0.00", - "n_prompt": 0, - "n_gen": 128, - "test_time": "2023-09-23T12:09:59Z", - "avg_ns": 977425219, - "stddev_ns": 9268593, - "avg_ts": 130.965708, - "stddev_ts": 1.238924, - "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ], - "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ] - } -] -``` - -### SQL - -SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database. - -```sh -$ ./llama-bench -o sql -``` - -```sql -CREATE TABLE IF NOT EXISTS test ( - build_commit TEXT, - build_number INTEGER, - cuda INTEGER, - metal INTEGER, - gpu_blas INTEGER, - blas INTEGER, - cpu_info TEXT, - gpu_info TEXT, - model_filename TEXT, - model_type TEXT, - model_size INTEGER, - model_n_params INTEGER, - n_batch INTEGER, - n_threads INTEGER, - f16_kv INTEGER, - n_gpu_layers INTEGER, - main_gpu INTEGER, - mul_mat_q INTEGER, - tensor_split TEXT, - n_prompt INTEGER, - n_gen INTEGER, - test_time TEXT, - avg_ns INTEGER, - stddev_ns INTEGER, - avg_ts REAL, - stddev_ts REAL -); - -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); -``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp deleted file mode 100644 index a6497b6e0..000000000 --- a/examples/llama-bench/llama-bench.cpp +++ /dev/null @@ -1,1455 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include 
"ggml.h" -#include "llama.h" -#include "common.h" -#include "ggml-cuda.h" -#include "ggml-sycl.h" - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif - -// utils -static uint64_t get_time_ns() { - using clock = std::chrono::high_resolution_clock; - return std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); -} - -template -static std::string join(const std::vector & values, const std::string & delim) { - std::ostringstream str; - for (size_t i = 0; i < values.size(); i++) { - str << values[i]; - if (i < values.size() - 1) { - str << delim; - } - } - return str.str(); -} - -template -static std::vector transform_to_str(const std::vector & values, F f) { - std::vector str_values; - std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); - return str_values; -} - -template -static T avg(const std::vector & v) { - if (v.empty()) { - return 0; - } - T sum = std::accumulate(v.begin(), v.end(), T(0)); - return sum / (T)v.size(); -} - -template -static T stdev(const std::vector & v) { - if (v.size() <= 1) { - return 0; - } - T mean = avg(v); - T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0)); - T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)); - return stdev; -} - -static std::string get_cpu_info() { - std::string id; -#ifdef __linux__ - FILE * f = fopen("/proc/cpuinfo", "r"); - if (f) { - char buf[1024]; - while (fgets(buf, sizeof(buf), f)) { - if (strncmp(buf, "model name", 10) == 0) { - char * p = strchr(buf, ':'); - if (p) { - p++; - while (std::isspace(*p)) { - p++; - } - while (std::isspace(p[strlen(p) - 1])) { - p[strlen(p) - 1] = '\0'; - } - id = p; - break; - } - } - } - fclose(f); - } -#endif - // TODO: other platforms - return id; -} - -static std::string get_gpu_info() { - std::string id; -#ifdef GGML_USE_CUDA - int count = ggml_backend_cuda_get_device_count(); - for (int i = 0; i < count; i++) { - char buf[128]; - ggml_backend_cuda_get_device_description(i, buf, sizeof(buf)); - id += buf; - if (i < count - 1) { - id += "/"; - } - } -#endif -#ifdef GGML_USE_SYCL - int count = ggml_backend_sycl_get_device_count(); - for (int i = 0; i < count; i++) { - char buf[128]; - ggml_sycl_get_device_description(i, buf, sizeof(buf)); - id += buf; - if (i < count - 1) { - id += "/"; - } - } -#endif -#ifdef GGML_USE_CANN - uint32_t count = ggml_backend_cann_get_device_count(); - for (uint32_t i = 0; i < count; i++) { - char buf[128]; - ggml_backend_cann_get_device_description(i, buf, sizeof(buf)); - id += buf; - if (i < count - 1) { - id += "/"; - } - } -#endif - // TODO: other backends - return id; -} - -// command line params -enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL}; - -static const char * output_format_str(output_formats format) { - switch (format) { - case NONE: return "none"; - case CSV: return "csv"; - case JSON: return "json"; - case MARKDOWN: return "md"; - case SQL: return "sql"; - default: GGML_ASSERT(!"invalid output format"); - } -} - -static bool output_format_from_str(const std::string & s, output_formats & format) { - if (s == "none") { - format = NONE; - } else if (s == "csv") { - format = CSV; - } else if (s == "json") { - format = JSON; - } else if (s == "md") { - format = MARKDOWN; - } else if (s == "sql") { - format = SQL; - } else { - return false; - } - return true; -} - -static const char * split_mode_str(llama_split_mode mode) { - switch (mode) { - case LLAMA_SPLIT_MODE_NONE: return "none"; - case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case 
LLAMA_SPLIT_MODE_ROW: return "row"; - default: GGML_ASSERT(!"invalid split mode"); - } -} - -static std::string pair_str(const std::pair & p) { - static char buf[32]; - snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second); - return buf; -} - -struct cmd_params { - std::vector model; - std::vector n_prompt; - std::vector n_gen; - std::vector> n_pg; - std::vector n_batch; - std::vector n_ubatch; - std::vector type_k; - std::vector type_v; - std::vector n_threads; - std::vector n_gpu_layers; - std::vector rpc_servers; - std::vector split_mode; - std::vector main_gpu; - std::vector no_kv_offload; - std::vector flash_attn; - std::vector> tensor_split; - std::vector use_mmap; - std::vector embeddings; - ggml_numa_strategy numa; - int reps; - bool verbose; - output_formats output_format; - output_formats output_format_stderr; -}; - -static const cmd_params cmd_params_defaults = { - /* model */ {"models/7B/ggml-model-q4_0.gguf"}, - /* n_prompt */ {512}, - /* n_gen */ {128}, - /* n_pg */ {}, - /* n_batch */ {2048}, - /* n_ubatch */ {512}, - /* type_k */ {GGML_TYPE_F16}, - /* type_v */ {GGML_TYPE_F16}, - /* n_threads */ {cpu_get_num_math()}, - /* n_gpu_layers */ {99}, - /* rpc_servers */ {""}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, - /* main_gpu */ {0}, - /* no_kv_offload */ {false}, - /* flash_attn */ {false}, - /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, - /* use_mmap */ {true}, - /* embeddings */ {false}, - /* numa */ GGML_NUMA_STRATEGY_DISABLED, - /* reps */ 5, - /* verbose */ false, - /* output_format */ MARKDOWN, - /* output_format_stderr */ NONE, -}; - -static void print_usage(int /* argc */, char ** argv) { - printf("usage: %s [options]\n", argv[0]); - printf("\n"); - printf("options:\n"); - printf(" -h, --help\n"); - printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); - printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" -ub, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); - printf(" -ctk, --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); - printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); - printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); - printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); - printf(" --numa (default: disabled)\n"); - printf(" -embd, --embeddings <0|1> (default: %s)\n", 
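/* every default is itself a vector, since each flag accepts multiple values; join() renders it comma-separated */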
join(cmd_params_defaults.embeddings, ",").c_str()); - printf(" -ts, --tensor-split (default: 0)\n"); - printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); - printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); - printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); - printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); - printf("\n"); - printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); -} - -static ggml_type ggml_type_from_name(const std::string & s) { - if (s == "f16") { - return GGML_TYPE_F16; - } - if (s == "q8_0") { - return GGML_TYPE_Q8_0; - } - if (s == "q4_0") { - return GGML_TYPE_Q4_0; - } - if (s == "q4_1") { - return GGML_TYPE_Q4_1; - } - if (s == "q5_0") { - return GGML_TYPE_Q5_0; - } - if (s == "q5_1") { - return GGML_TYPE_Q5_1; - } - if (s == "iq4_nl") { - return GGML_TYPE_IQ4_NL; - } - - return GGML_TYPE_COUNT; -} - - -static cmd_params parse_cmd_params(int argc, char ** argv) { - cmd_params params; - std::string arg; - bool invalid_param = false; - const std::string arg_prefix = "--"; - const char split_delim = ','; - - params.verbose = cmd_params_defaults.verbose; - params.output_format = cmd_params_defaults.output_format; - params.output_format_stderr = cmd_params_defaults.output_format_stderr; - params.reps = cmd_params_defaults.reps; - params.numa = cmd_params_defaults.numa; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (arg == "-h" || arg == "--help") { - print_usage(argc, argv); - exit(0); - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.model.insert(params.model.end(), p.begin(), p.end()); - } else if (arg == "-p" || arg == "--n-prompt") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); - } else if (arg == "-n" || arg == "--n-gen") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); - } else if (arg == "-pg") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], ','); - if (p.size() != 2) { - invalid_param = true; - break; - } - params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])}); - } else if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); - } else if (arg == "-ub" || arg == "--ubatch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); - } else if (arg == "-ctk" || arg == "--cache-type-k") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - std::vector types; - for (const auto & t : p) { - ggml_type gt = ggml_type_from_name(t); - if (gt == GGML_TYPE_COUNT) { - invalid_param = true; - break; - } - types.push_back(gt); - } - params.type_k.insert(params.type_k.end(), 
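/* append rather than overwrite: repeated -ctk flags and comma-separated lists accumulate into one list of cache types */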
types.begin(), types.end()); - } else if (arg == "-ctv" || arg == "--cache-type-v") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - std::vector types; - for (const auto & t : p) { - ggml_type gt = ggml_type_from_name(t); - if (gt == GGML_TYPE_COUNT) { - invalid_param = true; - break; - } - types.push_back(gt); - } - params.type_v.insert(params.type_v.end(), types.begin(), types.end()); - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); - } else if (arg == "-ngl" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); - } else if (arg == "-rpc" || arg == "--rpc") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rpc_servers.push_back(argv[i]); - } else if (arg == "-sm" || arg == "--split-mode") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - std::vector modes; - for (const auto & m : p) { - llama_split_mode mode; - if (m == "none") { - mode = LLAMA_SPLIT_MODE_NONE; - } else if (m == "layer") { - mode = LLAMA_SPLIT_MODE_LAYER; - } else if (m == "row") { - mode = LLAMA_SPLIT_MODE_ROW; - } else { - invalid_param = true; - break; - } - modes.push_back(mode); - } - params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); - } else if (arg == "-mg" || arg == "--main-gpu") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.main_gpu = string_split(argv[i], split_delim); - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); - } else if (arg == "--numa") { - if (++i >= argc) { - invalid_param = true; - break; - } else { - std::string value(argv[i]); - /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; break; } - } - } else if (arg == "-fa" || arg == "--flash-attn") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); - } else if (arg == "-mmp" || arg == "--mmap") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); - } else if (arg == "-embd" || arg == "--embeddings") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); - } else if (arg == "-ts" || arg == "--tensor-split") { - if (++i >= argc) { - invalid_param = true; - break; - } - for (auto ts : string_split(argv[i], split_delim)) { - // split string by ; and / - const std::regex regex{R"([;/]+)"}; - std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; - std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= llama_max_devices()); - - 
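/* pad the parsed proportions to llama_max_devices() entries, filling unspecified devices with 0.0f */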
std::vector tensor_split(llama_max_devices()); - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - tensor_split[i] = std::stof(split_arg[i]); - } else { - tensor_split[i] = 0.0f; - } - } - params.tensor_split.push_back(tensor_split); - } - } else if (arg == "-r" || arg == "--repetitions") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.reps = std::stoi(argv[i]); - } else if (arg == "-o" || arg == "--output") { - if (++i >= argc) { - invalid_param = true; - break; - } - invalid_param = !output_format_from_str(argv[i], params.output_format); - } else if (arg == "-oe" || arg == "--output-err") { - if (++i >= argc) { - invalid_param = true; - break; - } - invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); - } else if (arg == "-v" || arg == "--verbose") { - params.verbose = true; - } else { - invalid_param = true; - break; - } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - print_usage(argc, argv); - exit(1); - } - - // set defaults - if (params.model.empty()) { params.model = cmd_params_defaults.model; } - if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } - if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } - if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; } - if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } - if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } - if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } - if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } - if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } - if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } - if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } - if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } - if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } - if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; } - if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } - if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } - if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } - if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } - - return params; -} - -struct cmd_params_instance { - std::string model; - int n_prompt; - int n_gen; - int n_batch; - int n_ubatch; - ggml_type type_k; - ggml_type type_v; - int n_threads; - int n_gpu_layers; - std::string rpc_servers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; - std::vector tensor_split; - bool use_mmap; - bool embeddings; - - llama_model_params to_llama_mparams() const { - llama_model_params mparams = llama_model_default_params(); - - mparams.n_gpu_layers = n_gpu_layers; - if (!rpc_servers.empty()) { - mparams.rpc_servers = rpc_servers.c_str(); - } - mparams.split_mode = split_mode; - mparams.main_gpu = main_gpu; - mparams.tensor_split = tensor_split.data(); - mparams.use_mmap = use_mmap; - - return mparams; - } - - bool equal_mparams(const cmd_params_instance & other) const { - return model == other.model && - n_gpu_layers == other.n_gpu_layers && - rpc_servers == 
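/* only model-level settings are compared here; consecutive instances that differ just in context-level settings (batch size, cache types, ...) can reuse the already-loaded model */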
other.rpc_servers && - split_mode == other.split_mode && - main_gpu == other.main_gpu && - use_mmap == other.use_mmap && - tensor_split == other.tensor_split; - } - - llama_context_params to_llama_cparams() const { - llama_context_params cparams = llama_context_default_params(); - - cparams.n_ctx = n_prompt + n_gen; - cparams.n_batch = n_batch; - cparams.n_ubatch = n_ubatch; - cparams.type_k = type_k; - cparams.type_v = type_v; - cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn = flash_attn; - cparams.embeddings = embeddings; - - return cparams; - } -}; - -static std::vector get_cmd_params_instances(const cmd_params & params) { - std::vector instances; - - // this ordering minimizes the number of times that each model needs to be reloaded - for (const auto & m : params.model) - for (const auto & nl : params.n_gpu_layers) - for (const auto & rpc : params.rpc_servers) - for (const auto & sm : params.split_mode) - for (const auto & mg : params.main_gpu) - for (const auto & ts : params.tensor_split) - for (const auto & mmp : params.use_mmap) - for (const auto & embd : params.embeddings) - for (const auto & nb : params.n_batch) - for (const auto & nub : params.n_ubatch) - for (const auto & tk : params.type_k) - for (const auto & tv : params.type_v) - for (const auto & nkvo : params.no_kv_offload) - for (const auto & fa : params.flash_attn) - for (const auto & nt : params.n_threads) { - for (const auto & n_prompt : params.n_prompt) { - if (n_prompt == 0) { - continue; - } - cmd_params_instance instance = { - /* .model = */ m, - /* .n_prompt = */ n_prompt, - /* .n_gen = */ 0, - /* .n_batch = */ nb, - /* .n_ubatch = */ nub, - /* .type_k = */ tk, - /* .type_v = */ tv, - /* .n_threads = */ nt, - /* .n_gpu_layers = */ nl, - /* .rpc_servers = */ rpc, - /* .split_mode = */ sm, - /* .main_gpu = */ mg, - /* .no_kv_offload= */ nkvo, - /* .flash_attn = */ fa, - /* .tensor_split = */ ts, - /* .use_mmap = */ mmp, - /* .embeddings = */ embd, - }; - instances.push_back(instance); - } - - for (const auto & n_gen : params.n_gen) { - if (n_gen == 0) { - continue; - } - cmd_params_instance instance = { - /* .model = */ m, - /* .n_prompt = */ 0, - /* .n_gen = */ n_gen, - /* .n_batch = */ nb, - /* .n_ubatch = */ nub, - /* .type_k = */ tk, - /* .type_v = */ tv, - /* .n_threads = */ nt, - /* .n_gpu_layers = */ nl, - /* .rpc_servers = */ rpc, - /* .split_mode = */ sm, - /* .main_gpu = */ mg, - /* .no_kv_offload= */ nkvo, - /* .flash_attn = */ fa, - /* .tensor_split = */ ts, - /* .use_mmap = */ mmp, - /* .embeddings = */ embd, - }; - instances.push_back(instance); - } - - for (const auto & n_pg : params.n_pg) { - if (n_pg.first == 0 && n_pg.second == 0) { - continue; - } - cmd_params_instance instance = { - /* .model = */ m, - /* .n_prompt = */ n_pg.first, - /* .n_gen = */ n_pg.second, - /* .n_batch = */ nb, - /* .n_ubatch = */ nub, - /* .type_k = */ tk, - /* .type_v = */ tv, - /* .n_threads = */ nt, - /* .n_gpu_layers = */ nl, - /* .rpc_servers = */ rpc, - /* .split_mode = */ sm, - /* .main_gpu = */ mg, - /* .no_kv_offload= */ nkvo, - /* .flash_attn = */ fa, - /* .tensor_split = */ ts, - /* .use_mmap = */ mmp, - /* .embeddings = */ embd, - }; - instances.push_back(instance); - } - } - - return instances; -} - -struct test { - static const std::string build_commit; - static const int build_number; - static const bool cuda; - static const bool vulkan; - static const bool kompute; - static const bool metal; - static const bool sycl; - static const bool gpu_blas; - static const bool blas; - static const 
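/* everything up to gpu_info is fixed for a given build/host and therefore static; the fields below vary per test instance */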
std::string cpu_info; - static const std::string gpu_info; - std::string model_filename; - std::string model_type; - uint64_t model_size; - uint64_t model_n_params; - int n_batch; - int n_ubatch; - int n_threads; - bool has_rpc; - ggml_type type_k; - ggml_type type_v; - int n_gpu_layers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; - std::vector tensor_split; - bool use_mmap; - bool embeddings; - int n_prompt; - int n_gen; - std::string test_time; - std::vector samples_ns; - - test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { - model_filename = inst.model; - char buf[128]; - llama_model_desc(lmodel, buf, sizeof(buf)); - model_type = buf; - model_size = llama_model_size(lmodel); - model_n_params = llama_model_n_params(lmodel); - n_batch = inst.n_batch; - n_ubatch = inst.n_ubatch; - n_threads = inst.n_threads; - has_rpc = !inst.rpc_servers.empty(); - type_k = inst.type_k; - type_v = inst.type_v; - n_gpu_layers = inst.n_gpu_layers; - split_mode = inst.split_mode; - main_gpu = inst.main_gpu; - no_kv_offload = inst.no_kv_offload; - flash_attn = inst.flash_attn; - tensor_split = inst.tensor_split; - use_mmap = inst.use_mmap; - embeddings = inst.embeddings; - n_prompt = inst.n_prompt; - n_gen = inst.n_gen; - // RFC 3339 date-time format - time_t t = time(NULL); - std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); - test_time = buf; - - (void) ctx; - } - - uint64_t avg_ns() const { - return ::avg(samples_ns); - } - - uint64_t stdev_ns() const { - return ::stdev(samples_ns); - } - - std::vector get_ts() const { - int n_tokens = n_prompt + n_gen; - std::vector ts; - std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); - return ts; - } - - double avg_ts() const { - return ::avg(get_ts()); - } - - double stdev_ts() const { - return ::stdev(get_ts()); - } - - static std::string get_backend() { - if (cuda) { - return GGML_CUDA_NAME; - } - if (vulkan) { - return "Vulkan"; - } - if (kompute) { - return "Kompute"; - } - if (metal) { - return "Metal"; - } - if (sycl) { - return GGML_SYCL_NAME; - } - if (gpu_blas) { - return "GPU BLAS"; - } - if (blas) { - return "BLAS"; - } - - return "CPU"; - } - - static const std::vector & get_fields() { - static const std::vector fields = { - "build_commit", "build_number", - "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas", - "cpu_info", "gpu_info", - "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_ubatch", - "n_threads", "type_k", "type_v", - "n_gpu_layers", "split_mode", - "main_gpu", "no_kv_offload", "flash_attn", - "tensor_split", "use_mmap", "embeddings", - "n_prompt", "n_gen", "test_time", - "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts" - }; - return fields; - } - - enum field_type {STRING, BOOL, INT, FLOAT}; - - static field_type get_field_type(const std::string & field) { - if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || - field == "n_threads" || - field == "model_size" || field == "model_n_params" || - field == "n_gpu_layers" || field == "main_gpu" || - field == "n_prompt" || field == "n_gen" || - field == "avg_ns" || field == "stddev_ns") { - return INT; - } - if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" || - field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || - field == "flash_attn" || field == "use_mmap" || 
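/* note: "f16_kv" is still typed as BOOL here even though get_fields() no longer emits it */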
field == "embeddings") { - return BOOL; - } - if (field == "avg_ts" || field == "stddev_ts") { - return FLOAT; - } - return STRING; - } - - std::vector get_values() const { - std::string tensor_split_str; - int max_nonzero = 0; - for (size_t i = 0; i < llama_max_devices(); i++) { - if (tensor_split[i] > 0) { - max_nonzero = i; - } - } - for (int i = 0; i <= max_nonzero; i++) { - char buf[32]; - snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]); - tensor_split_str += buf; - if (i < max_nonzero) { - tensor_split_str += "/"; - } - } - std::vector values = { - build_commit, std::to_string(build_number), - std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan), - std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas), - cpu_info, gpu_info, - model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_ubatch), - std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), - std::to_string(n_gpu_layers), split_mode_str(split_mode), - std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), - tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), - std::to_string(n_prompt), std::to_string(n_gen), test_time, - std::to_string(avg_ns()), std::to_string(stdev_ns()), - std::to_string(avg_ts()), std::to_string(stdev_ts()) - }; - return values; - } - - std::map get_map() const { - std::map map; - auto fields = get_fields(); - auto values = get_values(); - std::transform(fields.begin(), fields.end(), values.begin(), - std::inserter(map, map.end()), std::make_pair); - return map; - } -}; - -const std::string test::build_commit = LLAMA_COMMIT; -const int test::build_number = LLAMA_BUILD_NUMBER; -const bool test::cuda = !!ggml_cpu_has_cuda(); -const bool test::vulkan = !!ggml_cpu_has_vulkan(); -const bool test::kompute = !!ggml_cpu_has_kompute(); -const bool test::metal = !!ggml_cpu_has_metal(); -const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); -const bool test::blas = !!ggml_cpu_has_blas(); -const bool test::sycl = !!ggml_cpu_has_sycl(); -const std::string test::cpu_info = get_cpu_info(); -const std::string test::gpu_info = get_gpu_info(); - -struct printer { - virtual ~printer() {} - - FILE * fout; - virtual void print_header(const cmd_params & params) { (void) params; } - virtual void print_test(const test & t) = 0; - virtual void print_footer() { } -}; - -struct csv_printer : public printer { - static std::string escape_csv(const std::string & field) { - std::string escaped = "\""; - for (auto c : field) { - if (c == '"') { - escaped += "\""; - } - escaped += c; - } - escaped += "\""; - return escaped; - } - - void print_header(const cmd_params & params) override { - std::vector fields = test::get_fields(); - fprintf(fout, "%s\n", join(fields, ",").c_str()); - (void) params; - } - - void print_test(const test & t) override { - std::vector values = t.get_values(); - std::transform(values.begin(), values.end(), values.begin(), escape_csv); - fprintf(fout, "%s\n", join(values, ",").c_str()); - } -}; - -struct json_printer : public printer { - bool first = true; - - static std::string escape_json(const std::string & value) { - std::string escaped; - for (auto c : value) { - if (c == '"') { - escaped += "\\\""; - } else if (c == '\\') { - escaped += "\\\\"; - } else if (c <= 0x1f) { - char buf[8]; - snprintf(buf, sizeof(buf), "\\u%04x", c); - escaped += buf; - } else { - escaped += c; - } - } - 
return escaped; - } - - static std::string format_value(const std::string & field, const std::string & value) { - switch (test::get_field_type(field)) { - case test::STRING: - return "\"" + escape_json(value) + "\""; - case test::BOOL: - return value == "0" ? "false" : "true"; - default: - return value; - } - } - - void print_header(const cmd_params & params) override { - fprintf(fout, "[\n"); - (void) params; - } - - void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) { - assert(fields.size() == values.size()); - for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str()); - } - } - - void print_test(const test & t) override { - if (first) { - first = false; - } else { - fprintf(fout, ",\n"); - } - fprintf(fout, " {\n"); - print_fields(test::get_fields(), t.get_values()); - fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str()); - fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str()); - fprintf(fout, " }"); - fflush(fout); - } - - void print_footer() override { - fprintf(fout, "\n]\n"); - } -}; - -struct markdown_printer : public printer { - std::vector<std::string> fields; - - static int get_field_width(const std::string & field) { - if (field == "model") { - return -30; - } - if (field == "t/s") { - return 16; - } - if (field == "size" || field == "params") { - return 10; - } - if (field == "n_gpu_layers") { - return 3; - } - if (field == "n_threads") { - return 7; - } - if (field == "n_batch") { - return 7; - } - if (field == "n_ubatch") { - return 8; - } - if (field == "type_k" || field == "type_v") { - return 6; - } - if (field == "split_mode") { - return 5; - } - if (field == "flash_attn") { - return 2; - } - if (field == "use_mmap") { - return 4; - } - if (field == "test") { - return 13; - } - - int width = std::max((int)field.length(), 10); - - if (test::get_field_type(field) == test::STRING) { - return -width; - } - return width; - } - - static std::string get_field_display_name(const std::string & field) { - if (field == "n_gpu_layers") { - return "ngl"; - } - if (field == "split_mode") { - return "sm"; - } - if (field == "n_threads") { - return "threads"; - } - if (field == "no_kv_offload") { - return "nkvo"; - } - if (field == "flash_attn") { - return "fa"; - } - if (field == "use_mmap") { - return "mmap"; - } - if (field == "embeddings") { - return "embd"; - } - if (field == "tensor_split") { - return "ts"; - } - return field; - } - - void print_header(const cmd_params & params) override { - // select fields to print - fields.emplace_back("model"); - fields.emplace_back("size"); - fields.emplace_back("params"); - fields.emplace_back("backend"); - bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; - if (!is_cpu_backend) { - fields.emplace_back("n_gpu_layers"); - } - if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { - fields.emplace_back("n_threads"); - } - if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { - fields.emplace_back("n_batch"); - } - if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) { - fields.emplace_back("n_ubatch"); - } - if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { - fields.emplace_back("type_k"); - } - if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { - fields.emplace_back("type_v"); - } - if
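get_field_width above encodes column alignment in the sign of the width: string columns get a negative value, which printf's %*s treats as left justification, while numeric columns right-justify, and the markdown separator row then ends in ':' for right-aligned columns. A sketch of that convention with hypothetical column data:

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>

int main() {
    // width < 0 => left-aligned (strings); width > 0 => right-aligned (numbers)
    const int w_model = -30, w_ts = 16;

    printf("|%*s |%*s |\n", w_model, "model", w_ts, "t/s");
    // separator row: '-' fill, then ':' only on right-aligned columns
    printf("|%s- |%s: |\n",
           std::string(std::abs(w_model) - 1, '-').c_str(),
           std::string(std::abs(w_ts)   - 1, '-').c_str());
    printf("|%*s |%*s |\n", w_model, "llama 7B Q4_0", w_ts, "35.42 +/- 0.18");
    return 0;
}
```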
(params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { - fields.emplace_back("main_gpu"); - } - if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { - fields.emplace_back("split_mode"); - } - if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { - fields.emplace_back("no_kv_offload"); - } - if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) { - fields.emplace_back("flash_attn"); - } - if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { - fields.emplace_back("tensor_split"); - } - if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { - fields.emplace_back("use_mmap"); - } - if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) { - fields.emplace_back("embeddings"); - } - fields.emplace_back("test"); - fields.emplace_back("t/s"); - - fprintf(fout, "|"); - for (const auto & field : fields) { - fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str()); - } - fprintf(fout, "\n"); - fprintf(fout, "|"); - for (const auto & field : fields) { - int width = get_field_width(field); - fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-"); - } - fprintf(fout, "\n"); - } - - void print_test(const test & t) override { - std::map<std::string, std::string> vmap = t.get_map(); - - fprintf(fout, "|"); - for (const auto & field : fields) { - std::string value; - char buf[128]; - if (field == "model") { - value = t.model_type; - } else if (field == "size") { - if (t.model_size < 1024*1024*1024) { - snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0); - } else { - snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0); - } - value = buf; - } else if (field == "params") { - if (t.model_n_params < 1000*1000*1000) { - snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6); - } else { - snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9); - } - value = buf; - } else if (field == "backend") { - value = test::get_backend(); - if (t.has_rpc) { - value += "+RPC"; - } - } else if (field == "test") { - if (t.n_prompt > 0 && t.n_gen == 0) { - snprintf(buf, sizeof(buf), "pp%d", t.n_prompt); - } else if (t.n_gen > 0 && t.n_prompt == 0) { - snprintf(buf, sizeof(buf), "tg%d", t.n_gen); - } else { - snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); - } - value = buf; - } else if (field == "t/s") { - snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); - value = buf; - } else if (vmap.find(field) != vmap.end()) { - value = vmap.at(field); - } else { - assert(false); - exit(1); - } - - int width = get_field_width(field); - if (field == "t/s") { - // HACK: the utf-8 character is 2 bytes - width += 1; - } - fprintf(fout, " %*s |", width, value.c_str()); - } - fprintf(fout, "\n"); - } - - void print_footer() override { - fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number); - } -}; - -struct sql_printer : public printer { - static std::string get_sql_field_type(const std::string & field) { - switch (test::get_field_type(field)) { - case test::STRING: - return "TEXT"; - case test::BOOL: - case test::INT: - return "INTEGER"; - case test::FLOAT: - return "REAL"; - default: - assert(false); - exit(1); - } - } - - void print_header(const cmd_params & params) override { - std::vector<std::string> fields = test::get_fields(); -
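The SQL printer emits plain SQL text rather than talking to a database: one CREATE TABLE IF NOT EXISTS derived from the field list (STRING maps to TEXT, BOOL and INT to INTEGER, FLOAT to REAL, matching SQLite's type affinities), then one INSERT per test. A reduced sketch of the same generation over a hypothetical three-field schema:

```cpp
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

enum field_type { STRING, BOOL, INT, FLOAT };

// Map a bench field type onto an SQL column type, as the printer above does.
static const char * sql_type(field_type t) {
    switch (t) {
        case STRING:         return "TEXT";
        case BOOL: case INT: return "INTEGER";
        case FLOAT:          return "REAL";
    }
    return "TEXT";
}

int main() {
    const std::vector<std::pair<std::string, field_type>> fields = {
        { "model_type", STRING }, { "n_gpu_layers", INT }, { "avg_ts", FLOAT },
    };

    printf("CREATE TABLE IF NOT EXISTS test (\n");
    for (size_t i = 0; i < fields.size(); i++) {
        printf("  %s %s%s\n", fields[i].first.c_str(), sql_type(fields[i].second),
               i + 1 < fields.size() ? "," : "");
    }
    printf(");\n");
    return 0;
}
```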
fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n"); - for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : ""); - } - fprintf(fout, ");\n"); - fprintf(fout, "\n"); - (void) params; - } - - void print_test(const test & t) override { - fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str()); - fprintf(fout, "VALUES ("); - std::vector<std::string> values = t.get_values(); - for (size_t i = 0; i < values.size(); i++) { - fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : ""); - } - fprintf(fout, ");\n"); - } -}; - -static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { - llama_set_n_threads(ctx, n_threads, n_threads); - - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); - - std::vector<llama_token> tokens(n_batch); - - int n_processed = 0; - - while (n_processed < n_prompt) { - int n_tokens = std::min(n_prompt - n_processed, n_batch); - tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; - for (int i = 1; i < n_tokens; i++) { - tokens[i] = std::rand() % n_vocab; - } - llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); - n_processed += n_tokens; - } - - llama_synchronize(ctx); -} - -static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { - llama_set_n_threads(ctx, n_threads, n_threads); - - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); - - llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; - - for (int i = 0; i < n_gen; i++) { - llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); - llama_synchronize(ctx); - token = std::rand() % n_vocab; - } -} - -static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) text; - (void) user_data; -} - -static std::unique_ptr<printer> create_printer(output_formats format) { - switch (format) { - case NONE: - return nullptr; - case CSV: - return std::unique_ptr<printer>(new csv_printer()); - case JSON: - return std::unique_ptr<printer>(new json_printer()); - case MARKDOWN: - return std::unique_ptr<printer>(new markdown_printer()); - case SQL: - return std::unique_ptr<printer>(new sql_printer()); - } - GGML_ASSERT(false); -} - -int main(int argc, char ** argv) { - // try to set locale for unicode characters in markdown - setlocale(LC_CTYPE, ".UTF-8"); - -#if !defined(NDEBUG) - fprintf(stderr, "warning: asserts enabled, performance may be affected\n"); -#endif - -#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__)) - fprintf(stderr, "warning: debug build, performance may be affected\n"); -#endif - -#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__) - fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n"); -#endif - - cmd_params params = parse_cmd_params(argc, argv); - - // initialize llama.cpp - if (!params.verbose) { - llama_log_set(llama_null_log_callback, NULL); - } - llama_backend_init(); - llama_numa_init(params.numa); - - // initialize printer - std::unique_ptr<printer> p = create_printer(params.output_format); - std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr); - - if (p) { - p->fout = stdout; - p->print_header(params); - } - - if (p_err) { - p_err->fout = stderr; -
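test_prompt and test_gen above capture the two regimes llama-bench reports: prompt processing decodes batches of random tokens, while generation decodes one token at a time, and both synchronize before the caller reads the clock so device-side work is included in the sample. A backend-agnostic sketch of the surrounding warmup-plus-reps timing loop (the workload lambda is only a stand-in for the llama.cpp calls):

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

static uint64_t get_time_ns() {
    return (uint64_t) std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::high_resolution_clock::now().time_since_epoch()).count();
}

// Run `reps` timed repetitions of a workload, collecting one sample each.
static std::vector<uint64_t> benchmark(int reps, const std::function<void()> & workload) {
    std::vector<uint64_t> samples_ns;
    workload();  // warmup run, not recorded
    for (int i = 0; i < reps; i++) {
        const uint64_t t_start = get_time_ns();
        workload();  // must block until all work has finished
        samples_ns.push_back(get_time_ns() - t_start);
    }
    return samples_ns;
}

int main() {
    auto samples = benchmark(5, [] {
        volatile double acc = 0.0;  // stand-in for test_prompt/test_gen
        for (int i = 0; i < 1000000; i++) acc += i * 0.5;
    });
    for (uint64_t s : samples) printf("%llu ns\n", (unsigned long long) s);
    return 0;
}
```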
p_err->print_header(params); - } - - std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params); - - llama_model * lmodel = nullptr; - const cmd_params_instance * prev_inst = nullptr; - - for (const auto & inst : params_instances) { - // keep the same model between tests when possible - if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { - if (lmodel) { - llama_free_model(lmodel); - } - - lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams()); - if (lmodel == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); - return 1; - } - prev_inst = &inst; - } - - llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams()); - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); - llama_free_model(lmodel); - return 1; - } - - test t(inst, lmodel, ctx); - - llama_kv_cache_clear(ctx); - - // warmup run - if (t.n_prompt > 0) { - //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); - test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); - } - if (t.n_gen > 0) { - test_gen(ctx, 1, 0, t.n_threads); - } - - for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(ctx); - - uint64_t t_start = get_time_ns(); - - if (t.n_prompt > 0) { - test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); - } - if (t.n_gen > 0) { - test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads); - } - - uint64_t t_ns = get_time_ns() - t_start; - t.samples_ns.push_back(t_ns); - } - - if (p) { - p->print_test(t); - fflush(p->fout); - } - - if (p_err) { - p_err->print_test(t); - fflush(p_err->fout); - } - - llama_print_timings(ctx); - - llama_free(ctx); - } - - llama_free_model(lmodel); - - if (p) { - p->print_footer(); - } - - if (p_err) { - p_err->print_footer(); - } - - llama_backend_free(); - - return 0; -} diff --git a/examples/llama.android/.gitignore b/examples/llama.android/.gitignore deleted file mode 100644 index 347e252ef..000000000 --- a/examples/llama.android/.gitignore +++ /dev/null @@ -1,33 +0,0 @@ -# Gradle files -.gradle/ -build/ - -# Local configuration file (sdk path, etc) -local.properties - -# Log/OS Files -*.log - -# Android Studio generated files and folders -captures/ -.externalNativeBuild/ -.cxx/ -*.apk -output.json - -# IntelliJ -*.iml -.idea/ -misc.xml -deploymentTargetDropDown.xml -render.experimental.xml - -# Keystore files -*.jks -*.keystore - -# Google Services (e.g.
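One detail of the driver loop above worth highlighting: the model is reloaded only when model-level parameters change (inst.equal_mparams), so sweeping context-level settings such as n_batch or n_threads reuses the multi-gigabyte weights already in memory. A minimal sketch of that caching pattern, with hypothetical Model/Params types standing in for the llama.cpp ones:

```cpp
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

struct Params {
    std::string model;
    int         n_gpu_layers;
    int         n_batch;  // context-level: applied per context, no reload needed

    // Compare model-level parameters only.
    bool equal_mparams(const Params & o) const {
        return model == o.model && n_gpu_layers == o.n_gpu_layers;
    }
};

struct Model {
    explicit Model(const Params & p) { printf("loading %s\n", p.model.c_str()); }
};

int main() {
    const std::vector<Params> instances = {
        { "7b.gguf", 99, 512 },
        { "7b.gguf", 99, 1024 },  // same model params -> weights reused
        { "7b.gguf",  0, 512 },   // different offload -> reload required
    };

    std::unique_ptr<Model> model;
    const Params * prev = nullptr;
    for (const auto & inst : instances) {
        if (!model || !prev || !inst.equal_mparams(*prev)) {
            model = std::make_unique<Model>(inst);
            prev  = &inst;
        }
        // ... create a context with inst.n_batch and run the tests ...
    }
    return 0;
}
```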
APIs or Firebase) -google-services.json - -# Android Profiling -*.hprof diff --git a/examples/llama.android/README.md b/examples/llama.android/README.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/llama.android/app/.gitignore b/examples/llama.android/app/.gitignore deleted file mode 100644 index 796b96d1c..000000000 --- a/examples/llama.android/app/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/build diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts deleted file mode 100644 index 8d1b37195..000000000 --- a/examples/llama.android/app/build.gradle.kts +++ /dev/null @@ -1,65 +0,0 @@ -plugins { - id("com.android.application") - id("org.jetbrains.kotlin.android") -} - -android { - namespace = "com.example.llama" - compileSdk = 34 - - defaultConfig { - applicationId = "com.example.llama" - minSdk = 33 - targetSdk = 34 - versionCode = 1 - versionName = "1.0" - - testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" - vectorDrawables { - useSupportLibrary = true - } - } - - buildTypes { - release { - isMinifyEnabled = false - proguardFiles( - getDefaultProguardFile("proguard-android-optimize.txt"), - "proguard-rules.pro" - ) - } - } - compileOptions { - sourceCompatibility = JavaVersion.VERSION_1_8 - targetCompatibility = JavaVersion.VERSION_1_8 - } - kotlinOptions { - jvmTarget = "1.8" - } - buildFeatures { - compose = true - } - composeOptions { - kotlinCompilerExtensionVersion = "1.5.1" - } -} - -dependencies { - - implementation("androidx.core:core-ktx:1.12.0") - implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2") - implementation("androidx.activity:activity-compose:1.8.2") - implementation(platform("androidx.compose:compose-bom:2023.08.00")) - implementation("androidx.compose.ui:ui") - implementation("androidx.compose.ui:ui-graphics") - implementation("androidx.compose.ui:ui-tooling-preview") - implementation("androidx.compose.material3:material3") - implementation(project(":llama")) - testImplementation("junit:junit:4.13.2") - androidTestImplementation("androidx.test.ext:junit:1.1.5") - androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") - androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00")) - androidTestImplementation("androidx.compose.ui:ui-test-junit4") - debugImplementation("androidx.compose.ui:ui-tooling") - debugImplementation("androidx.compose.ui:ui-test-manifest") -} diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/llama.android/app/proguard-rules.pro deleted file mode 100644 index f1b424510..000000000 --- a/examples/llama.android/app/proguard-rules.pro +++ /dev/null @@ -1,21 +0,0 @@ -# Add project specific ProGuard rules here. -# You can control the set of applied configuration files using the -# proguardFiles setting in build.gradle. -# -# For more details, see -# http://developer.android.com/guide/developing/tools/proguard.html - -# If your project uses WebView with JS, uncomment the following -# and specify the fully qualified class name to the JavaScript interface -# class: -#-keepclassmembers class fqcn.of.javascript.interface.for.webview { -# public *; -#} - -# Uncomment this to preserve the line number information for -# debugging stack traces. -#-keepattributes SourceFile,LineNumberTable - -# If you keep the line number information, uncomment this to -# hide the original source file name. 
-#-renamesourcefileattribute SourceFile diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/llama.android/app/src/main/AndroidManifest.xml deleted file mode 100644 index 41a358a29..000000000 --- a/examples/llama.android/app/src/main/AndroidManifest.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - - - - - - - diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt deleted file mode 100644 index 78c231ae5..000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt +++ /dev/null @@ -1,119 +0,0 @@ -package com.example.llama - -import android.app.DownloadManager -import android.net.Uri -import android.util.Log -import androidx.compose.material3.Button -import androidx.compose.material3.Text -import androidx.compose.runtime.Composable -import androidx.compose.runtime.getValue -import androidx.compose.runtime.mutableDoubleStateOf -import androidx.compose.runtime.mutableStateOf -import androidx.compose.runtime.remember -import androidx.compose.runtime.rememberCoroutineScope -import androidx.compose.runtime.setValue -import androidx.core.database.getLongOrNull -import androidx.core.net.toUri -import kotlinx.coroutines.delay -import kotlinx.coroutines.launch -import java.io.File - -data class Downloadable(val name: String, val source: Uri, val destination: File) { - companion object { - @JvmStatic - private val tag: String? = this::class.qualifiedName - - sealed interface State - data object Ready: State - data class Downloading(val id: Long): State - data class Downloaded(val downloadable: Downloadable): State - data class Error(val message: String): State - - @JvmStatic - @Composable - fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) { - var status: State by remember { - mutableStateOf( - if (item.destination.exists()) Downloaded(item) - else Ready - ) - } - var progress by remember { mutableDoubleStateOf(0.0) } - - val coroutineScope = rememberCoroutineScope() - - suspend fun waitForDownload(result: Downloading, item: Downloadable): State { - while (true) { - val cursor = dm.query(DownloadManager.Query().setFilterById(result.id)) - - if (cursor == null) { - Log.e(tag, "dm.query() returned null") - return Error("dm.query() returned null") - } - - if (!cursor.moveToFirst() || cursor.count < 1) { - cursor.close() - Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?") - return Ready - } - - val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR) - val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES) - val sofar = cursor.getLongOrNull(pix) ?: 0 - val total = cursor.getLongOrNull(tix) ?: 1 - cursor.close() - - if (sofar == total) { - return Downloaded(item) - } - - progress = (sofar * 1.0) / total - - delay(1000L) - } - } - - fun onClick() { - when (val s = status) { - is Downloaded -> { - viewModel.load(item.destination.path) - } - - is Downloading -> { - coroutineScope.launch { - status = waitForDownload(s, item) - } - } - - else -> { - item.destination.delete() - - val request = DownloadManager.Request(item.source).apply { - setTitle("Downloading model") - setDescription("Downloading model: ${item.name}") - setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI) - setDestinationUri(item.destination.toUri()) - } - - viewModel.log("Saving ${item.name} to ${item.destination.path}") - Log.i(tag, "Saving ${item.name} 
to ${item.destination.path}") - - val id = dm.enqueue(request) - status = Downloading(id) - onClick() - } - } - } - - Button(onClick = { onClick() }, enabled = status !is Downloading) { - when (status) { - is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%") - is Downloaded -> Text("Load ${item.name}") - is Ready -> Text("Download ${item.name}") - is Error -> Text("Download ${item.name}") - } - } - } - - } -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt deleted file mode 100644 index 9da04f7d3..000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt +++ /dev/null @@ -1,154 +0,0 @@ -package com.example.llama - -import android.app.ActivityManager -import android.app.DownloadManager -import android.content.ClipData -import android.content.ClipboardManager -import android.net.Uri -import android.os.Bundle -import android.os.StrictMode -import android.os.StrictMode.VmPolicy -import android.text.format.Formatter -import androidx.activity.ComponentActivity -import androidx.activity.compose.setContent -import androidx.activity.viewModels -import androidx.compose.foundation.layout.Box -import androidx.compose.foundation.layout.Column -import androidx.compose.foundation.layout.Row -import androidx.compose.foundation.layout.fillMaxSize -import androidx.compose.foundation.layout.padding -import androidx.compose.foundation.lazy.LazyColumn -import androidx.compose.foundation.lazy.items -import androidx.compose.foundation.lazy.rememberLazyListState -import androidx.compose.material3.Button -import androidx.compose.material3.LocalContentColor -import androidx.compose.material3.MaterialTheme -import androidx.compose.material3.OutlinedTextField -import androidx.compose.material3.Surface -import androidx.compose.material3.Text -import androidx.compose.runtime.Composable -import androidx.compose.ui.Modifier -import androidx.compose.ui.unit.dp -import androidx.core.content.getSystemService -import com.example.llama.ui.theme.LlamaAndroidTheme -import java.io.File - -class MainActivity( - activityManager: ActivityManager? = null, - downloadManager: DownloadManager? = null, - clipboardManager: ClipboardManager? = null, -): ComponentActivity() { - private val tag: String? = this::class.simpleName - - private val activityManager by lazy { activityManager ?: getSystemService()!! } - private val downloadManager by lazy { downloadManager ?: getSystemService()!! } - private val clipboardManager by lazy { clipboardManager ?: getSystemService()!! } - - private val viewModel: MainViewModel by viewModels() - - // Get a MemoryInfo object for the device's current memory status. - private fun availableMemory(): ActivityManager.MemoryInfo { - return ActivityManager.MemoryInfo().also { memoryInfo -> - activityManager.getMemoryInfo(memoryInfo) - } - } - - override fun onCreate(savedInstanceState: Bundle?) 
{ - super.onCreate(savedInstanceState) - - StrictMode.setVmPolicy( - VmPolicy.Builder(StrictMode.getVmPolicy()) - .detectLeakedClosableObjects() - .build() - ) - - val free = Formatter.formatFileSize(this, availableMemory().availMem) - val total = Formatter.formatFileSize(this, availableMemory().totalMem) - - viewModel.log("Current memory: $free / $total") - viewModel.log("Downloads directory: ${getExternalFilesDir(null)}") - - val extFilesDir = getExternalFilesDir(null) - - val models = listOf( - Downloadable( - "Phi-2 7B (Q4_0, 1.6 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"), - File(extFilesDir, "phi-2-q4_0.gguf"), - ), - Downloadable( - "TinyLlama 1.1B (f16, 2.2 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), - File(extFilesDir, "tinyllama-1.1-f16.gguf"), - ), - Downloadable( - "Phi 2 DPO (Q3_K_M, 1.48 GiB)", - Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"), - File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf") - ), - ) - - setContent { - LlamaAndroidTheme { - // A surface container using the 'background' color from the theme - Surface( - modifier = Modifier.fillMaxSize(), - color = MaterialTheme.colorScheme.background - ) { - MainCompose( - viewModel, - clipboardManager, - downloadManager, - models, - ) - } - - } - } - } -} - -@Composable -fun MainCompose( - viewModel: MainViewModel, - clipboard: ClipboardManager, - dm: DownloadManager, - models: List<Downloadable> -) { - Column { - val scrollState = rememberLazyListState() - - Box(modifier = Modifier.weight(1f)) { - LazyColumn(state = scrollState) { - items(viewModel.messages) { - Text( - it, - style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current), - modifier = Modifier.padding(16.dp) - ) - } - } - } - OutlinedTextField( - value = viewModel.message, - onValueChange = { viewModel.updateMessage(it) }, - label = { Text("Message") }, - ) - Row { - Button({ viewModel.send() }) { Text("Send") } - Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") } - Button({ viewModel.clear() }) { Text("Clear") } - Button({ - viewModel.messages.joinToString("\n").let { - clipboard.setPrimaryClip(ClipData.newPlainText("", it)) - } - }) { Text("Copy") } - } - - Column { - for (model in models) { - Downloadable.Button(viewModel, dm, model) - } - } - } -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt deleted file mode 100644 index 45ac29938..000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt +++ /dev/null @@ -1,105 +0,0 @@ -package com.example.llama - -import android.llama.cpp.LLamaAndroid -import android.util.Log -import androidx.compose.runtime.getValue -import androidx.compose.runtime.mutableStateOf -import androidx.compose.runtime.setValue -import androidx.lifecycle.ViewModel -import androidx.lifecycle.viewModelScope -import kotlinx.coroutines.flow.catch -import kotlinx.coroutines.launch - -class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() { - companion object { - @JvmStatic - private val NanosPerSecond = 1_000_000_000.0 - } - - private val tag: String?
= this::class.simpleName - - var messages by mutableStateOf(listOf("Initializing...")) - private set - - var message by mutableStateOf("") - private set - - override fun onCleared() { - super.onCleared() - - viewModelScope.launch { - try { - llamaAndroid.unload() - } catch (exc: IllegalStateException) { - messages += exc.message!! - } - } - } - - fun send() { - val text = message - message = "" - - // Add to messages console. - messages += text - messages += "" - - viewModelScope.launch { - llamaAndroid.send(text) - .catch { - Log.e(tag, "send() failed", it) - messages += it.message!! - } - .collect { messages = messages.dropLast(1) + (messages.last() + it) } - } - } - - fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) { - viewModelScope.launch { - try { - val start = System.nanoTime() - val warmupResult = llamaAndroid.bench(pp, tg, pl, nr) - val end = System.nanoTime() - - messages += warmupResult - - val warmup = (end - start).toDouble() / NanosPerSecond - messages += "Warm up time: $warmup seconds, please wait..." - - if (warmup > 5.0) { - messages += "Warm up took too long, aborting benchmark" - return@launch - } - - messages += llamaAndroid.bench(512, 128, 1, 3) - } catch (exc: IllegalStateException) { - Log.e(tag, "bench() failed", exc) - messages += exc.message!! - } - } - } - - fun load(pathToModel: String) { - viewModelScope.launch { - try { - llamaAndroid.load(pathToModel) - messages += "Loaded $pathToModel" - } catch (exc: IllegalStateException) { - Log.e(tag, "load() failed", exc) - messages += exc.message!! - } - } - } - - fun updateMessage(newMessage: String) { - message = newMessage - } - - fun clear() { - messages = listOf() - } - - fun log(message: String) { - messages += message - } -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt deleted file mode 100644 index 40c30e8d9..000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt +++ /dev/null @@ -1,11 +0,0 @@ -package com.example.llama.ui.theme - -import androidx.compose.ui.graphics.Color - -val Purple80 = Color(0xFFD0BCFF) -val PurpleGrey80 = Color(0xFFCCC2DC) -val Pink80 = Color(0xFFEFB8C8) - -val Purple40 = Color(0xFF6650a4) -val PurpleGrey40 = Color(0xFF625b71) -val Pink40 = Color(0xFF7D5260) diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt deleted file mode 100644 index e742220a8..000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt +++ /dev/null @@ -1,70 +0,0 @@ -package com.example.llama.ui.theme - -import android.app.Activity -import android.os.Build -import androidx.compose.foundation.isSystemInDarkTheme -import androidx.compose.material3.MaterialTheme -import androidx.compose.material3.darkColorScheme -import androidx.compose.material3.dynamicDarkColorScheme -import androidx.compose.material3.dynamicLightColorScheme -import androidx.compose.material3.lightColorScheme -import androidx.compose.runtime.Composable -import androidx.compose.runtime.SideEffect -import androidx.compose.ui.graphics.toArgb -import androidx.compose.ui.platform.LocalContext -import androidx.compose.ui.platform.LocalView -import androidx.core.view.WindowCompat - -private val DarkColorScheme = darkColorScheme( - primary = Purple80, - secondary = PurpleGrey80, - tertiary = Pink80 -) - -private val LightColorScheme = 
lightColorScheme( - primary = Purple40, - secondary = PurpleGrey40, - tertiary = Pink40 - - /* Other default colors to override - background = Color(0xFFFFFBFE), - surface = Color(0xFFFFFBFE), - onPrimary = Color.White, - onSecondary = Color.White, - onTertiary = Color.White, - onBackground = Color(0xFF1C1B1F), - onSurface = Color(0xFF1C1B1F), - */ -) - -@Composable -fun LlamaAndroidTheme( - darkTheme: Boolean = isSystemInDarkTheme(), - // Dynamic color is available on Android 12+ - dynamicColor: Boolean = true, - content: @Composable () -> Unit -) { - val colorScheme = when { - dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { - val context = LocalContext.current - if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) - } - - darkTheme -> DarkColorScheme - else -> LightColorScheme - } - val view = LocalView.current - if (!view.isInEditMode) { - SideEffect { - val window = (view.context as Activity).window - window.statusBarColor = colorScheme.primary.toArgb() - WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme - } - } - - MaterialTheme( - colorScheme = colorScheme, - typography = Typography, - content = content - ) -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt deleted file mode 100644 index 0b87946ca..000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt +++ /dev/null @@ -1,34 +0,0 @@ -package com.example.llama.ui.theme - -import androidx.compose.material3.Typography -import androidx.compose.ui.text.TextStyle -import androidx.compose.ui.text.font.FontFamily -import androidx.compose.ui.text.font.FontWeight -import androidx.compose.ui.unit.sp - -// Set of Material typography styles to start with -val Typography = Typography( - bodyLarge = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Normal, - fontSize = 16.sp, - lineHeight = 24.sp, - letterSpacing = 0.5.sp - ) - /* Other default text styles to override - titleLarge = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Normal, - fontSize = 22.sp, - lineHeight = 28.sp, - letterSpacing = 0.sp - ), - labelSmall = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Medium, - fontSize = 11.sp, - lineHeight = 16.sp, - letterSpacing = 0.5.sp - ) - */ -) diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml deleted file mode 100644 index 07d5da9cb..000000000 --- a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml +++ /dev/null @@ -1,170 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml deleted file mode 100644 index 7706ab9e6..000000000 --- a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml deleted file mode 100644 index b3e26b4c6..000000000 --- a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - diff --git 
a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml deleted file mode 100644 index b3e26b4c6..000000000 --- a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp deleted file mode 100644 index c209e78ec..000000000 Binary files a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp and /dev/null differ diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp deleted file mode 100644 index b2dfe3d1b..000000000 Binary files a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp and /dev/null differ diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp deleted file mode 100644 index 4f0f1d64e..000000000 Binary files a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp and /dev/null differ diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp deleted file mode 100644 index 62b611da0..000000000 Binary files a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp and /dev/null differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp deleted file mode 100644 index 948a3070f..000000000 Binary files a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp and /dev/null differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp deleted file mode 100644 index 1b9a6956b..000000000 Binary files a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp and /dev/null differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp deleted file mode 100644 index 28d4b77f9..000000000 Binary files a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp and /dev/null differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp deleted file mode 100644 index 9287f5083..000000000 Binary files a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp and /dev/null differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp deleted file mode 100644 index aa7d6427e..000000000 Binary files a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp and /dev/null differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp deleted file mode 100644 index 9126ae37c..000000000 Binary files a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp and /dev/null 
differ diff --git a/examples/llama.android/app/src/main/res/values/colors.xml b/examples/llama.android/app/src/main/res/values/colors.xml deleted file mode 100644 index ca1931bca..000000000 --- a/examples/llama.android/app/src/main/res/values/colors.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - #FFBB86FC - #FF6200EE - #FF3700B3 - #FF03DAC5 - #FF018786 - #FF000000 - #FFFFFFFF - diff --git a/examples/llama.android/app/src/main/res/values/strings.xml b/examples/llama.android/app/src/main/res/values/strings.xml deleted file mode 100644 index 7a9d314e2..000000000 --- a/examples/llama.android/app/src/main/res/values/strings.xml +++ /dev/null @@ -1,3 +0,0 @@ - - LlamaAndroid - diff --git a/examples/llama.android/app/src/main/res/values/themes.xml b/examples/llama.android/app/src/main/res/values/themes.xml deleted file mode 100644 index 8a24fda56..000000000 --- a/examples/llama.android/app/src/main/res/values/themes.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - - - - - -
- - - diff --git a/examples/server/public/index.js b/examples/server/public/index.js deleted file mode 100644 index 670960939..000000000 --- a/examples/server/public/index.js +++ /dev/null @@ -1 +0,0 @@ -const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 
0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,D,T,M={},F=[],A=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,V=Array.isArray;function W(t,n){for(var e in n)t[e]=n[e];return t}function L(t){var n=t.parentNode;n&&n.removeChild(t)}function O(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return R(t,r,_,i,null)}function R(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function I(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||F,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?R(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i)?(i.__=t,i.__b=t.__b+1,u=Z(i,e,r,f),i.__i=u,o=null,-1!==u&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u===r+1?c++:u>r?f>l-r?c+=u-r:c--:u(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),R(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+T++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=[],(_={})[n]=this,this.getChildContext=function(){return _},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=F.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=W({},this.state),"function"==typeof t&&(t=t(W({},e),this.props)),t&&W(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof 
Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),D=et(!0),T=0;var at,pt,dt,vt,yt=0,mt=[],gt=[],bt=S,kt=bt.__b,wt=bt.__r,St=bt.diffed,xt=bt.__c,Ct=bt.unmount,Ut=bt.__;function Et(t,n){bt.__h&&bt.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({__V:gt}),e.__[t]}function Ht(t){return yt=1,Pt(zt,t)}function Pt(t,n,e){var _=Et(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):zt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Nt(t,n){var e=Et(at++,3);!bt.__s&&Bt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function $t(t,n){var e=Et(at++,4);!bt.__s&&Bt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function Dt(t){return yt=5,Mt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,$t((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Mt(t,n){var e=Et(at++,7);return Bt(e.__H,n)?(e.__V=t(),e.i=n,e.__h=t,e.__V):e.__}function Ft(t,n){return yt=8,Mt((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Et(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Vt(t,n){bt.useDebugValue&&bt.useDebugValue(n?n(t):t)}function Wt(t){var n=Et(at++,10),e=Ht();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Et(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(jt),t.__H.__h.forEach(qt),t.__H.__h=[]}catch(n){t.__H.__h=[],bt.__e(n,t.__v)}}bt.__b=function(t){pt=null,kt&&kt(t)},bt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ut&&Ut(t,n)},bt.__r=function(t){wt&&wt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.__V=gt,t.__N=t.i=void 0}))):(n.__h.forEach(jt),n.__h.forEach(qt),n.__h=[],at=0)),dt=pt},bt.diffed=function(t){St&&St(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===bt.requestAnimationFrame||((vt=bt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.__V!==gt&&(t.__=t.__V),t.i=void 0,t.__V=gt}))),dt=pt=null},bt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(jt),t.__h=t.__h.filter((function(t){return!t.__||qt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],bt.__e(r,t.__v)}})),xt&&xt(t,n)},bt.unmount=function(t){Ct&&Ct(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{jt(t)}catch(t){n=t}})),e.__H=void 0,n&&bt.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var 
n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function jt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function qt(t){var n=pt;t.__c=t.__(),pt=n}function Bt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function zt(t,n){return"function"==typeof n?n(t):n}function Gt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Jt,Kt;function Qt(t){if(Kt)Kt();Kt=t&&t.S()}function Xt({data:t}){const n=Zt(t);n.value=t;const e=Mt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Xt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Xt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});Gt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});Gt("__r",(t,n)=>{Qt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Jt=_;Qt(e);t(n)});Gt("__e",(t,n,e,_)=>{Qt();Jt=void 0;t(n,e,_)});Gt("diffed",(t,n)=>{Qt();Jt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Yt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Yt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}Gt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});Gt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Zt(t){return Mt(()=>c(t),[])}function tn(t){const n=Dt(t);n.current=t;Jt.__$f|=4;return Mt(()=>v(()=>n.current()),[])}function nn(t){const n=Dt(t);n.current=t;Nt(()=>k(()=>n.current()),[])}var en=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var rn=on.bind(O);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,O as createElement,I as createRef,k as effect,O as h,rn as html,ft as hydrate,C as isValidElement,S as options,st as render,c as signal,Y as toChildArray,o as untracked,Ft as useCallback,tn as useComputed,At as 
useContext,Vt as useDebugValue,Nt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,$t as useLayoutEffect,Mt as useMemo,Pt as useReducer,Dt as useRef,Zt as useSignal,nn as useSignalEffect,Ht as useState}; diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs deleted file mode 100644 index 7267f3f9c..000000000 --- a/examples/server/public/json-schema-to-grammar.mjs +++ /dev/null @@ -1,835 +0,0 @@ -// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first. -const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'; - -function _buildRepetition(itemRule, minItems, maxItems, opts={}) { - if (minItems === 0 && maxItems === 1) { - return `${itemRule}?`; - } - - - const separatorRule = opts.separatorRule ?? ''; - const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false - - if (separatorRule === '') { - if (minItems === 1 && maxItems === undefined) { - return `${itemRule}+`; - } else if (minItems === 0 && maxItems === undefined) { - return `${itemRule}*`; - } else { - return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`; - } - } - - const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined); - return minItems === 0 ? `(${result})?` : result; -} - -function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) { - const hasMin = minValue !== null; - const hasMax = maxValue !== null; - - function digitRange(fromChar, toChar) { - out.push("["); - if (fromChar === toChar) { - out.push(fromChar); - } else { - out.push(fromChar); - out.push("-"); - out.push(toChar); - } - out.push("]"); - } - - function moreDigits(minDigits, maxDigits) { - out.push("[0-9]"); - if (minDigits === maxDigits && minDigits === 1) { - return; - } - out.push("{"); - out.push(minDigits.toString()); - if (maxDigits !== minDigits) { - out.push(","); - if (maxDigits !== Number.MAX_SAFE_INTEGER) { - out.push(maxDigits.toString()); - } - } - out.push("}"); - } - - function uniformRange(fromStr, toStr) { - let i = 0; - while (i < fromStr.length && fromStr[i] === toStr[i]) { - i++; - } - if (i > 0) { - out.push("\""); - out.push(fromStr.slice(0, i)); - out.push("\""); - } - if (i < fromStr.length) { - if (i > 0) { - out.push(" "); - } - const subLen = fromStr.length - i - 1; - if (subLen > 0) { - const fromSub = fromStr.slice(i + 1); - const toSub = toStr.slice(i + 1); - const subZeros = "0".repeat(subLen); - const subNines = "9".repeat(subLen); - - let toReached = false; - out.push("("); - if (fromSub === subZeros) { - digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1)); - out.push(" "); - moreDigits(subLen, subLen); - } else { - out.push("["); - out.push(fromStr[i]); - out.push("] "); - out.push("("); - uniformRange(fromSub, subNines); - out.push(")"); - if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) { - out.push(" | "); - if (toSub === subNines) { - digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]); - toReached = true; - } else { - digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1)); - } - out.push(" "); - moreDigits(subLen, subLen); - } - } - if (!toReached) { - out.push(" | "); - digitRange(toStr[i], toStr[i]); - out.push(" "); - uniformRange(subZeros, toSub); - } - out.push(")"); - } else { - out.push("["); - out.push(fromStr[i]); - out.push("-"); - 
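_buildRepetition above maps JSON Schema minItems/maxItems onto GBNF repetition operators: '?', '+', '*' for the common cases and '{m,n}' otherwise, recursing once when a separator rule must be threaded between items. The separator-free cases as a C++ sketch (the file's own header notes the Python port is the reference; this only mirrors the logic shown above):

```cpp
#include <cstdio>
#include <string>

// Render a GBNF repetition of item_rule; max_items < 0 means unbounded.
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items) {
    if (min_items == 0 && max_items == 1) return item_rule + "?";
    if (min_items == 1 && max_items <  0) return item_rule + "+";
    if (min_items == 0 && max_items <  0) return item_rule + "*";
    return item_rule + "{" + std::to_string(min_items) + "," +
           (max_items < 0 ? "" : std::to_string(max_items)) + "}";
}

int main() {
    printf("%s\n", build_repetition("value", 0,  1).c_str());  // value?
    printf("%s\n", build_repetition("value", 1, -1).c_str());  // value+
    printf("%s\n", build_repetition("value", 2,  5).c_str());  // value{2,5}
    return 0;
}
```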
out.push(toStr[i]); - out.push("]"); - } - } - } - - if (hasMin && hasMax) { - if (minValue < 0 && maxValue < 0) { - out.push("\"-\" ("); - _generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true); - out.push(")"); - return; - } - - if (minValue < 0) { - out.push("\"-\" ("); - _generateMinMaxInt(0, -minValue, out, decimalsLeft, true); - out.push(") | "); - minValue = 0; - } - - let minS = minValue.toString(); - const maxS = maxValue.toString(); - const minDigits = minS.length; - const maxDigits = maxS.length; - - for (let digits = minDigits; digits < maxDigits; digits++) { - uniformRange(minS, "9".repeat(digits)); - minS = "1" + "0".repeat(digits); - out.push(" | "); - } - uniformRange(minS, maxS); - return; - } - - const lessDecimals = Math.max(decimalsLeft - 1, 1); - - if (hasMin) { - if (minValue < 0) { - out.push("\"-\" ("); - _generateMinMaxInt(null, -minValue, out, decimalsLeft, false); - out.push(") | [0] | [1-9] "); - moreDigits(0, decimalsLeft - 1); - } else if (minValue === 0) { - if (topLevel) { - out.push("[0] | [1-9] "); - moreDigits(0, lessDecimals); - } else { - moreDigits(1, decimalsLeft); - } - } else if (minValue <= 9) { - const c = minValue.toString(); - const range_start = topLevel ? '1' : '0'; - if (c > range_start) { - digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1)); - out.push(" "); - moreDigits(1, lessDecimals); - out.push(" | "); - } - digitRange(c, "9"); - out.push(" "); - moreDigits(0, lessDecimals); - } else { - const minS = minValue.toString(); - const length = minS.length; - const c = minS[0]; - - if (c > "1") { - digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1)); - out.push(" "); - moreDigits(length, lessDecimals); - out.push(" | "); - } - digitRange(c, c); - out.push(" ("); - _generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false); - out.push(")"); - if (c < "9") { - out.push(" | "); - digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9"); - out.push(" "); - moreDigits(length - 1, lessDecimals); - } - } - return; - } - - if (hasMax) { - if (maxValue >= 0) { - if (topLevel) { - out.push("\"-\" [1-9] "); - moreDigits(0, lessDecimals); - out.push(" | "); - } - _generateMinMaxInt(0, maxValue, out, decimalsLeft, true); - } else { - out.push("\"-\" ("); - _generateMinMaxInt(-maxValue, null, out, decimalsLeft, false); - out.push(")"); - } - return; - } - - throw new Error("At least one of minValue or maxValue must be set"); -} - -class BuiltinRule { - constructor(content, deps) { - this.content = content; - this.deps = deps || []; - } -} - -const PRIMITIVE_RULES = { - boolean : new BuiltinRule('("true" | "false") space', []), - 'decimal-part' : new BuiltinRule('[0-9]{1,16}', []), - 'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), - number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), - integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']), - value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), - object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), - array : new BuiltinRule('"[" space ( value ("," space value)* )? 
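The integer-range generator above is built from two small emitters: digitRange, which produces a "[a-b]" character class, and moreDigits, which produces "[0-9]{m,n}" with the upper bound left empty when unbounded; uniformRange then stitches these together digit by digit. The two emitters in isolation, as a C++ sketch:

```cpp
#include <climits>
#include <cstdio>
#include <string>

// "[a-b]" (or "[a]" when the endpoints coincide).
static std::string digit_range(char from, char to) {
    return from == to ? std::string("[") + from + "]"
                      : std::string("[") + from + "-" + to + "]";
}

// "[0-9]{min,max}"; an unbounded max (INT_MAX here) leaves the bound empty.
static std::string more_digits(int min_digits, int max_digits) {
    if (min_digits == max_digits && min_digits == 1) return "[0-9]";
    std::string out = "[0-9]{" + std::to_string(min_digits);
    if (max_digits != min_digits) {
        out += ",";
        if (max_digits != INT_MAX) out += std::to_string(max_digits);
    }
    return out + "}";
}

int main() {
    printf("%s\n", digit_range('2', '8').c_str());    // [2-8]
    printf("%s\n", more_digits(1, 3).c_str());        // [0-9]{1,3}
    printf("%s\n", more_digits(0, INT_MAX).c_str());  // [0-9]{0,}
    return 0;
}
```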
"]" space', ['value']), - uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []), - char : new BuiltinRule(`[^"\\\\\\x7F\\x00-\\x1F] | [\\\\] (["\\\\bfnrt] | "u" [0-9a-fA-F]{4})`, []), - string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']), - null : new BuiltinRule('"null" space', []), -}; - -// TODO: support "uri", "email" string formats -const STRING_FORMAT_RULES = { - 'date' : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), - 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), - 'date-time' : new BuiltinRule('date "T" time', ['date', 'time']), - 'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']), - 'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']), - 'date-time-string': new BuiltinRule('"\\"" date-time "\\"" space', ['date-time']), -} - -const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...STRING_FORMAT_RULES}; - -const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g; -const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g; -const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g; -const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' }; - -const NON_LITERAL_SET = new Set('|.()[]{}*+?'); -const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?'); - -export class SchemaConverter { - constructor(options) { - this._propOrder = options.prop_order || {}; - this._allowFetch = options.allow_fetch || false; - this._dotall = options.dotall || false; - this._rules = {'space': SPACE_RULE}; - this._refs = {}; - this._refsBeingResolved = new Set(); - } - - _formatLiteral(literal) { - const escaped = literal.replace( - GRAMMAR_LITERAL_ESCAPE_RE, - m => GRAMMAR_LITERAL_ESCAPES[m] - ); - return `"${escaped}"`; - } - - _formatRangeChar(literal) { - return JSON.stringify(literal).slice(1, -1).replace( - GRAMMAR_RANGE_LITERAL_ESCAPE_RE, - m => GRAMMAR_LITERAL_ESCAPES[m] - ); - } - - _addRule(name, rule) { - let escName = name.replace(INVALID_RULE_CHARS_RE, '-'); - let key = escName; - - if (escName in this._rules) { - if (this._rules[escName] === rule) { - return key; - } - - let i = 0; - while ((`${escName}${i}` in this._rules) && (this._rules[`${escName}${i}`] !== rule)) { - i += 1; - } - key = `${escName}${i}`; - } - - this._rules[key] = rule; - return key; - } - - async resolveRefs(schema, url) { - const visit = async (n) => { - if (Array.isArray(n)) { - return Promise.all(n.map(visit)); - } else if (typeof n === 'object' && n !== null) { - let ref = n.$ref; - let target; - if (ref !== undefined && !this._refs[ref]) { - if (ref.startsWith('https://')) { - if (!this._allowFetch) { - throw new Error('Fetching remote schemas is not allowed (use --allow-fetch for force)'); - } - const fetch = (await import('node-fetch')).default; - - const fragSplit = ref.split('#'); - const baseUrl = fragSplit[0]; - - target = this._refs[baseUrl]; - if (!target) { - target = await this.resolveRefs(await fetch(ref).then(res => res.json()), baseUrl); - this._refs[baseUrl] = target; - } - - if (fragSplit.length === 1 || fragSplit[fragSplit.length - 1] === '') { - return target; - } - } else if (ref.startsWith('#/')) { - target = schema; - ref = `${url}${ref}`; - n.$ref = ref; - } else { - throw new Error(`Unsupported ref ${ref}`); - } - - const selectors = 
ref.split('#')[1].split('/').slice(1); - for (const sel of selectors) { - if (!target || !(sel in target)) { - throw new Error(`Error resolving ref ${ref}: ${sel} not in ${JSON.stringify(target)}`); - } - target = target[sel]; - } - - this._refs[ref] = target; - } else { - await Promise.all(Object.values(n).map(visit)); - } - } - - return n; - }; - - return visit(schema); - } - - _generateUnionRule(name, altSchemas) { - return altSchemas - .map((altSchema, i) => this.visit(altSchema, `${name ?? ''}${name ? '-' : 'alternative-'}${i}`)) - .join(' | '); - } - - _visitPattern(pattern, name) { - if (!pattern.startsWith('^') || !pattern.endsWith('$')) { - throw new Error('Pattern must start with "^" and end with "$"'); - } - pattern = pattern.slice(1, -1); - const subRuleIds = {}; - - let i = 0; - const length = pattern.length; - - const getDot = () => { - let rule; - if (this._dotall) { - rule = '[\\U00000000-\\U0010FFFF]'; - } else { - // Accept any character... except \n and \r line break chars (\x0A and \x0D) - rule = '[^\\x0A\\x0D]'; - } - return this._addRule('dot', rule); - }; - - - const toRule = ([s, isLiteral]) => isLiteral ? "\"" + s + "\"" : s; - - const transform = () => { - const start = i; - // For each component of this sequence, store its string representation and whether it's a literal. - // We only need a flat structure here to apply repetition operators to the last item, and - // to merge literals at the end (we're parsing grouped ( sequences ) recursively and don't treat '|' specially) - // (GBNF's syntax is luckily very close to regular expressions!) - const seq = []; - - const joinSeq = () => { - const ret = []; - for (const [isLiteral, g] of groupBy(seq, x => x[1])) { - if (isLiteral) { - ret.push([[...g].map(x => x[0]).join(''), true]); - } else { - ret.push(...g); - } - } - if (ret.length === 1) { - return ret[0]; - } - return [ret.map(x => toRule(x)).join(' '), false]; - }; - - while (i < length) { - const c = pattern[i]; - if (c === '.') { - seq.push([getDot(), false]); - i += 1; - } else if (c === '(') { - i += 1; - if (i < length) { - if (pattern[i] === '?') { - throw new Error(`Unsupported pattern syntax "${pattern[i]}" at index ${i} of /${pattern}/`); - } - } - seq.push([`(${toRule(transform())})`, false]); - } else if (c === ')') { - i += 1; - if (start <= 0 || pattern[start - 1] !== '(') { - throw new Error(`Unbalanced parentheses; start = ${start}, i = ${i}, pattern = ${pattern}`); - } - return joinSeq(); - } else if (c === '[') { - let squareBrackets = c; - i += 1; - while (i < length && pattern[i] !== ']') { - if (pattern[i] === '\\') { - squareBrackets += pattern.slice(i, i + 2); - i += 2; - } else { - squareBrackets += pattern[i]; - i += 1; - } - } - if (i >= length) { - throw new Error(`Unbalanced square brackets; start = ${start}, i = ${i}, pattern = ${pattern}`); - } - squareBrackets += ']'; - i += 1; - seq.push([squareBrackets, false]); - } else if (c === '|') { - seq.push(['|', false]); - i += 1; - } else if (c === '*' || c === '+' || c === '?') { - seq[seq.length - 1] = [toRule(seq[seq.length - 1]) + c, false]; - i += 1; - } else if (c === '{') { - let curlyBrackets = c; - i += 1; - while (i < length && pattern[i] !== '}') { - curlyBrackets += pattern[i]; - i += 1; - } - if (i >= length) { - throw new Error(`Unbalanced curly brackets; start = ${start}, i = ${i}, pattern = ${pattern}`); - } - curlyBrackets += '}'; - i += 1; - const nums = curlyBrackets.slice(1, -1).split(',').map(s => s.trim()); - let minTimes, maxTimes; - if (nums.length === 1) { -
minTimes = parseInt(nums[0], 10); - maxTimes = minTimes; - } else { - if (nums.length !== 2) { - throw new Error(`Invalid quantifier ${curlyBrackets}`); - } - minTimes = nums[0] ? parseInt(nums[0], 10) : 0; - maxTimes = nums[1] ? parseInt(nums[1], 10) : Infinity; - } - - let [sub, subIsLiteral] = seq[seq.length - 1]; - - if (!subIsLiteral) { - let id = subRuleIds[sub]; - if (id === undefined) { - id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub); - subRuleIds[sub] = id; - } - sub = id; - } - - seq[seq.length - 1] = [ - _buildRepetition(subIsLiteral ? `"${sub}"` : sub, minTimes, maxTimes, {itemRuleIsLiteral: subIsLiteral}), - false - ]; - } else { - let literal = ''; - while (i < length) { - if (pattern[i] === '\\' && i < length - 1) { - const next = pattern[i + 1]; - if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(next)) { - i += 1; - literal += pattern[i]; - i += 1; - } else { - literal += pattern.slice(i, i + 2); - i += 2; - } - } else if (pattern[i] === '"') { - literal += '\\"'; - i += 1; - } else if (!NON_LITERAL_SET.has(pattern[i]) && - (i === length - 1 || literal === '' || pattern[i + 1] === '.' || !NON_LITERAL_SET.has(pattern[i+1]))) { - literal += pattern[i]; - i += 1; - } else { - break; - } - } - if (literal !== '') { - seq.push([literal, true]); - } - } - } - - return joinSeq(); - }; - - return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space") - } - - _notStrings(strings) { - class TrieNode { - constructor() { - this.children = {}; - this.isEndOfString = false; - } - - insert(str) { - let node = this; - for (const c of str) { - node = node.children[c] = node.children[c] || new TrieNode(); - } - node.isEndOfString = true; - } - } - - const trie = new TrieNode(); - for (const s of strings) { - trie.insert(s); - } - - const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']); - const out = ['["] ( ']; - - const visit = (node) => { - const rejects = []; - let first = true; - for (const c of Object.keys(node.children).sort()) { - const child = node.children[c]; - rejects.push(c); - if (first) { - first = false; - } else { - out.push(' | '); - } - out.push(`[${c}]`); - if (Object.keys(child.children).length > 0) { - out.push(' ('); - visit(child); - out.push(')'); - } else if (child.isEndOfString) { - out.push(` ${charRuleName}+`); - } - } - if (Object.keys(node.children).length > 0) { - if (!first) { - out.push(' | '); - } - out.push(`[^"${rejects.join('')}] ${charRuleName}*`); - } - }; - - visit(trie); - - out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`); - return out.join(''); - } - - _resolveRef(ref) { - let refName = ref.split('/').pop(); - if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) { - this._refsBeingResolved.add(ref); - const resolved = this._refs[ref]; - refName = this.visit(resolved, refName); - this._refsBeingResolved.delete(ref); - } - return refName; - } - - _generateConstantRule(value) { - return this._formatLiteral(JSON.stringify(value)); - } - - visit(schema, name) { - const schemaType = schema.type; - const schemaFormat = schema.format; - const ruleName = name in RESERVED_NAMES ? name + '-' : name == '' ? 
'root' : name; - - const ref = schema.$ref; - if (ref !== undefined) { - return this._addRule(ruleName, this._resolveRef(ref)); - } else if (schema.oneOf || schema.anyOf) { - return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf)); - } else if (Array.isArray(schemaType)) { - return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t})))); - } else if ('const' in schema) { - return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space'); - } else if ('enum' in schema) { - const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space'; - return this._addRule(ruleName, rule); - } else if ((schemaType === undefined || schemaType === 'object') && - ('properties' in schema || - ('additionalProperties' in schema && schema.additionalProperties !== true))) { - const required = new Set(schema.required || []); - const properties = Object.entries(schema.properties ?? {}); - return this._addRule(ruleName, this._buildObjectRule(properties, required, name, schema.additionalProperties)); - } else if ((schemaType === undefined || schemaType === 'object') && 'allOf' in schema) { - const required = new Set(); - const properties = []; - const addComponent = (compSchema, isRequired) => { - const ref = compSchema.$ref; - if (ref !== undefined) { - compSchema = this._refs[ref]; - } - - if ('properties' in compSchema) { - for (const [propName, propSchema] of Object.entries(compSchema.properties)) { - properties.push([propName, propSchema]); - if (isRequired) { - required.add(propName); - } - } - } - }; - - for (const t of schema.allOf) { - if ('anyOf' in t) { - for (const tt of t.anyOf) { - addComponent(tt, false); - } - } else { - addComponent(t, true); - } - } - - return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null)); - } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) { - const items = schema.items ?? schema.prefixItems; - if (Array.isArray(items)) { - return this._addRule( - ruleName, - '"[" space ' + - items.map((item, i) => this.visit(item, `${name ?? ''}${name ? '-' : ''}tuple-${i}`)).join(' "," space ') + - ' "]" space' - ); - } else { - const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`); - const minItems = schema.minItems || 0; - const maxItems = schema.maxItems; - return this._addRule(ruleName, '"[" space ' + _buildRepetition(itemRuleName, minItems, maxItems, {separatorRule: '"," space'}) + ' "]" space'); - } - } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) { - return this._visitPattern(schema.pattern, ruleName); - } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) { - return this._addPrimitive( - ruleName === 'root' ? 
'root' : schemaFormat, - PRIMITIVE_RULES['uuid'] - ); - } else if ((schemaType === undefined || schemaType === 'string') && `${schema.format}-string` in STRING_FORMAT_RULES) { - const primName = `${schema.format}-string` - return this._addRule(ruleName, this._addPrimitive(primName, STRING_FORMAT_RULES[primName])); - } else if (schemaType === 'string' && ('minLength' in schema || 'maxLength' in schema)) { - const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']); - const minLen = schema.minLength || 0; - const maxLen = schema.maxLength; - return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space'); - } else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) { - let minValue = null; - let maxValue = null; - if ('minimum' in schema) { - minValue = schema.minimum; - } else if ('exclusiveMinimum' in schema) { - minValue = schema.exclusiveMinimum + 1; - } - if ('maximum' in schema) { - maxValue = schema.maximum; - } else if ('exclusiveMaximum' in schema) { - maxValue = schema.exclusiveMaximum - 1; - } - - const out = ["("]; - _generateMinMaxInt(minValue, maxValue, out); - out.push(") space"); - return this._addRule(ruleName, out.join('')); - } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) { - return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object'])); - } else { - if (!(schemaType in PRIMITIVE_RULES)) { - throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`); - } - // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero - return this._addPrimitive(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]); - } - } - - _addPrimitive(name, rule) { - let n = this._addRule(name, rule.content); - for (const dep of rule.deps) { - const depRule = PRIMITIVE_RULES[dep] || STRING_FORMAT_RULES[dep]; - if (!depRule) { - throw new Error(`Rule ${dep} not known`); - } - if (!(dep in this._rules)) { - this._addPrimitive(dep, depRule); - } - } - return n; - } - - _buildObjectRule(properties, required, name, additionalProperties) { - const propOrder = this._propOrder; - // sort by position in prop_order (if specified) then by original order - const sortedProps = properties.map(([k]) => k).sort((a, b) => { - const orderA = propOrder[a] || Infinity; - const orderB = propOrder[b] || Infinity; - return orderA - orderB || properties.findIndex(([k]) => k === a) - properties.findIndex(([k]) => k === b); - }); - - const propKvRuleNames = {}; - for (const [propName, propSchema] of properties) { - const propRuleName = this.visit(propSchema, `${name ?? ''}${name ? '-' : ''}${propName}`); - propKvRuleNames[propName] = this._addRule( - `${name ?? ''}${name ? '-' : ''}${propName}-kv`, - `${this._formatLiteral(JSON.stringify(propName))} space ":" space ${propRuleName}` - ); - } - const requiredProps = sortedProps.filter(k => required.has(k)); - const optionalProps = sortedProps.filter(k => !required.has(k)); - - if (additionalProperties) { - const subName = `${name ?? ''}${name ? '-' : ''}additional`; - const valueRule = - additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`) - : this._addPrimitive('value', PRIMITIVE_RULES['value']); - - const key_rule = - sortedProps.length === 0 ? 
this._addPrimitive('string', PRIMITIVE_RULES['string']) - : this._addRule(`${subName}-k`, this._notStrings(sortedProps)); - - propKvRuleNames['*'] = this._addRule( - `${subName}-kv`, - `${key_rule} ":" space ${valueRule}`); - optionalProps.push('*'); - } - - let rule = '"{" space '; - rule += requiredProps.map(k => propKvRuleNames[k]).join(' "," space '); - - if (optionalProps.length > 0) { - rule += ' ('; - if (requiredProps.length > 0) { - rule += ' "," space ( '; - } - - const getRecursiveRefs = (ks, firstIsOptional) => { - const [k, ...rest] = ks; - const kvRuleName = propKvRuleNames[k]; - let res; - const commaRef = `( "," space ${kvRuleName} )`; - if (firstIsOptional) { - res = commaRef + (k === '*' ? '*' : '?'); - } else { - res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : ''); - } - if (rest.length > 0) { - res += ' ' + this._addRule( - `${name ?? ''}${name ? '-' : ''}${k}-rest`, - getRecursiveRefs(rest, true) - ); - } - return res; - }; - - rule += optionalProps.map((_, i) => getRecursiveRefs(optionalProps.slice(i), false)).join(' | '); - if (requiredProps.length > 0) { - rule += ' )'; - } - rule += ' )?'; - } - - rule += ' "}" space'; - - return rule; - } - - formatGrammar() { - let grammar = ''; - for (const [name, rule] of Object.entries(this._rules).sort(([a], [b]) => a.localeCompare(b))) { - grammar += `${name} ::= ${rule}\n`; - } - return grammar; - } -} - -// Helper function to group elements by a key function -function* groupBy(iterable, keyFn) { - let lastKey = null; - let group = []; - for (const element of iterable) { - const key = keyFn(element); - if (lastKey !== null && key !== lastKey) { - yield [lastKey, group]; - group = []; - } - group.push(element); - lastKey = key; - } - if (group.length > 0) { - yield [lastKey, group]; - } -} diff --git a/examples/server/public/prompt-formats.js b/examples/server/public/prompt-formats.js deleted file mode 100644 index 73ddb7187..000000000 --- a/examples/server/public/prompt-formats.js +++ /dev/null @@ -1,331 +0,0 @@ -// extended list -export const promptFormats = { - "alpaca": { - template: `{{prompt}}\n\n{{history}}\n\n{{char}}:`, - - historyTemplate: `### {{name}}:\n{{message}}`, - - char: "Response", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "Instruction", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - }, - - // ---------------------------- - - "chatml": { - template: `<|im_start|>system\n{{prompt}}<|im_end|>\n{{history}}{{char}}`, - - historyTemplate: `<|im_start|>{{name}}\n{{message}}`, - - char: "assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "user", - userMsgPrefix: "", - userMsgSuffix: "<|im_end|>\n", - - stops: "" - }, - - // ---------------------------- - - "commandr": { - template: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{prompt}}\n<|END_OF_TURN_TOKEN|>{{history}}{{char}}`, - - historyTemplate: `<|START_OF_TURN_TOKEN|><|{{name}}|> {{message}}`, - - char: "CHATBOT_TOKEN", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "USER_TOKEN", - userMsgPrefix: "", - userMsgSuffix: "<|END_OF_TURN_TOKEN|>", - - stops: "" - }, - // ref: https://docs.cohere.com/docs/prompting-command-r - - // ---------------------------- - - "llama2": { - template: `[INST] <<SYS>>\n{{prompt}}\n<</SYS>>\n\nTest Message [/INST] Test Successful {{history}}{{char}}`, - - historyTemplate: `{{name}}: {{message}}`, - - char: "Assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "User", - userMsgPrefix: "[INST] ", - userMsgSuffix: " [/INST]", - - stops: "" - }, - // ref:
https://huggingface.co/blog/llama2#how-to-prompt-llama-2 - - // ---------------------------- - - "llama3": { - template: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{prompt}}{{history}}{{char}}`, - - historyTemplate: `<|start_header_id|>{{name}}<|end_header_id|>\n\n{{message}}<|eot_id|>`, - - char: "assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "user", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "<|eot_id|>" - }, - // ref: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/#special-tokens-used-with-meta-llama-3 - - // ---------------------------- - - "openchat": { - template: `{{history}}{{char}}`, - - historyTemplate: `GPT4 Correct {{name}}: {{message}}<|end_of_turn|>`, - - char: "Assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "User", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - }, - - // ---------------------------- - - "phi3": { - template: `{{history}}{{char}}`, - - historyTemplate: `<|{{name}}|>\n{{message}}<|end|>\n`, - - char: "assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "user", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "<|end|>" - }, - // ref: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct#chat-format - - // ---------------------------- - - "vicuna": { - template: `{{prompt}}\n{{history}}{{char}}`, - - historyTemplate: `{{name}}: {{message}}\n`, - - char: "ASSISTANT", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "USER", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - }, - // ref: https://huggingface.co/lmsys/vicuna-33b-v1.3/discussions/1 - - // ---------------------------- - - "deepseekCoder": { - template: `{{prompt}}{{history}}{{char}}:`, - - historyTemplate: `### {{name}}:\n{{message}}`, - - char: "Response", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "Instruction", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "<|EOT|>" - }, - - // ---------------------------- - - "med42": { - template: `<|system|>: {{prompt}}\n{{history}}{{char}}`, - - historyTemplate: `<|{{name}}|>: {{message}}\n`, - - char: "assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "prompter", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - }, - - // ---------------------------- - - "neuralchat": { - template: `### System:\n{{prompt}}\n{{history}}{{char}}:`, - - historyTemplate: `### {{name}}:\n{{message}}\n`, - - char: "Assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "User", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - }, - - // ---------------------------- - - "nousHermes": { - template: `### Instruction: {{prompt}}\n\n{{history}}\n\n{{char}}:`, - - historyTemplate: `### {{name}}:\n{{message}}`, - - char: "Response", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "Input", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - }, - - // ---------------------------- - - "openchatMath": { - template: `{{history}}{{char}}`, - - historyTemplate: `Math Correct {{name}}: {{message}}<|end_of_turn|>`, - - char: "Assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - - user: "User", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - }, - - // ---------------------------- - - "orion": { - template: `Human: Test Message\n\nAssistant: Test Successful{{history}}{{char}}:`, - - historyTemplate: `{{name}}: {{message}}`, - - char: "Assistant ", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "Human", - userMsgPrefix: "", - userMsgSuffix: "\n\n", - - stops: "" - }, - - // 
---------------------------- - - "sauerkraut": { - template: `{{prompt}}\n{{history}}{{char}}`, - - historyTemplate: ` - {{name}}: {{message}}\n`, - - char: "Assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "User", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - }, - - // ---------------------------- - - "starlingCode": { - template: `{{history}}{{char}}`, - - historyTemplate: `Code {{name}}: {{message}}<|end_of_turn|>`, - - char: "Assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "User", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - }, - - // ---------------------------- - - "yi34b": { - template: `{{history}} {{char}}`, - - historyTemplate: `{{name}}: {{message}}`, - - char: "Assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "Human", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - }, - - // ---------------------------- - - "zephyr": { - template: `<|system|>\n{{prompt}}\n{{history}}{{char}}`, - - historyTemplate: `<|{{name}}|>\n{{message}}\n`, - - char: "assistant", - charMsgPrefix: "", - charMsgSuffix: "", - - user: "user", - userMsgPrefix: "", - userMsgSuffix: "", - - stops: "" - } - }; diff --git a/examples/server/public/style.css b/examples/server/public/style.css deleted file mode 100755 index 087cc62da..000000000 --- a/examples/server/public/style.css +++ /dev/null @@ -1,954 +0,0 @@ -@import url("colorthemes.css"); - -body { - font-family: 'Arial', sans-serif; - font-size: 90%; - background-color: var(--background-color-1); - color: var(--text-color-subtile-1); /* head 1 llama.cpp & triangle options for some reason */ - max-width: 600px; - min-width: 300px; - line-height: 1.2; - margin: 0 auto; - padding: 0 0.5em; - transition: background-color 0.3s; -} - -::selection { - color: var(--button-primary-text) ; - background: var(--button-primary-color); -} - -code, pre code { - font-family: 'Courier New', monospace; -} - -#container { - margin: 0em auto; - display: flex; - flex-direction: column; - justify-content: space-between; - height: 100%; -} - -main { - margin: 3px; - display: flex; - flex-direction: column; - justify-content: space-between; - gap: 1em; - flex-grow: 1; - overflow-y: auto; - border: 1px solid var(--border-color-3); - border-radius: 5px; - padding: 0.5em; -} - -p { - overflow-wrap: break-word; - word-wrap: break-word; - hyphens: auto; - margin-top: 0.5em; - margin-bottom: 0.5em; -} - -#write form { - margin: 1em 0 0 0; - display: flex; - flex-direction: column; - gap: 0.5em; - align-items: stretch; -} - -.right { - display: flex; - flex-direction: row; - gap: 0.5em; - justify-content: flex-end; - margin-bottom: 30px; -} - -.two-columns { - width: 97%; - max-width: 97%; - display: grid; - grid-template-columns: 1fr 1fr; - gap: 1em; - position: relative; -} - -.json-schema-controls { - margin-top: 10px; - width: 100%; - max-width: 100%; - display: grid; - grid-template: "a a"; - gap: 1em; - font-size: x-small; - color: var(--theme-nuance-color-3); - padding-top: 16px; - padding-bottom: 16px; - text-transform: uppercase; - font-weight: 600; -} - -.json-schema-controls > * { - flex: 1; -} - -/* titles of the details-summary boxes */ -.summary-title { - font-weight: 600; - font-size: x-small; - color: var(--text-color-subtile-1); - text-transform: uppercase; - /* transition: ; */ -} - -fieldset { - border: none; - padding: 0; - margin: 0; - color: var(--text-color-plain); -} - -fieldset.two { - display: grid; - grid-template: "a a a"; - gap: 1em; - align-items: center; - font-size: x-small; - 
color: var(--text-color-plain); -} - -fieldset.three { - display: grid; - grid-template: "a a a"; - gap: 1em; - font-size: x-small; - color: var(--text-color-plain); -} - -/* titles of name fields*/ -fieldset.names { - display: grid; - grid-template: "a a"; - gap: 1em; - font-size: x-small; - color: var(--theme-nuance-color-3); - padding-top: 16px; - padding-bottom: 16px; - text-transform: uppercase; - font-weight: 600; -} - -/* titles of params fields*/ -fieldset.params { - display: grid; - grid-template: "a a"; - gap: 1em; - font-size: x-small; - color: var(--theme-nuance-color-4); - padding-top: 16px; - padding-bottom: 16px; - text-transform: uppercase; - font-weight: 600; -} - -fieldset.dropdowns { - -webkit-appearance: none; - display: flex; - grid-template: "a a"; - gap: 1em; - font-size: x-small; - color: red; - padding-top: 16px; - padding-bottom: 16px; - text-transform: uppercase; - font-weight: 600; -} - -/* input of name fields*/ -.names input[type="text"] { - font-family: Arial, sans-serif; - font-size: medium; - font-weight: 500; - padding: 5px; - border: 1px solid var(--border-color-2); -} - -.chat-id-color { - color: var(--chat-id-color); -} - -details { - border: 1px solid var(--border-color-2); - border-radius: 5px; - padding: 0.5em 0.5em 0; - margin-top: 0.5em; -} - -summary { - font-weight: bold; - margin: -0.5em -0.5em 0; - padding: 0.5em; - cursor: pointer; -} - -details[open] { - padding: 0.5em; -} - -textarea-sec, input-sec, button-sec { - padding: 10px; - height: 40px; - align-items: center; -} - -textarea-sec::placeholder, input-sec::placeholder { - padding-left: 10px; -} - -.toggleCheckbox { - display: none; -} - -.toggleContainer { - position: relative; - display: grid; - grid-template-columns: repeat(2, 1fr); - width: fit-content; - border: 3px solid var(--border-color-2); - border-radius: 20px; - background: var(--border-color-2); - font-size: small; - cursor: pointer; - overflow: hidden; -} - -/* toggle button current state */ -.toggleContainer::before { - color: var(--button-primary-text); - background-color: var(--button-primary-color); - content: ''; - position: absolute; - width: 50%; - height: 100%; - left: 0%; - border-radius: 20px; - transition: all 0.3s; -} - -.toggleContainer div { - padding: 6px; - text-align: center; - z-index: 1; - transition: color 0.3s; -} - -.toggleCheckbox:checked + .toggleContainer::before { - left: 50%; -} - -.toggleCheckbox:checked + .toggleContainer div:first-child { - color: var(--text-color-subtile-2); -} - -.toggleCheckbox:checked + .toggleContainer div:last-child { - color: var(--button-primary-text); -} - -.toggleCheckbox + .toggleContainer div:first-child { - color: var(--button-primary-text); -} - -.toggleCheckbox + .toggleContainer div:last-child { - color: var(--text-color-subtile-2); -} - -select { - padding: 5px; - margin-right: 5px; - border-radius: 4px; - border: 1px solid var(--secondary-color-4); - background-color: var(--primary-color-3); - color: var(--secondary-color-4); - cursor: pointer; -} - -select:focus { - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 1px var(--border-focus-shadow); -} - -.button-container { - display: flex; - justify-content: flex-end; -} - -button { - color: var(--button-primary-text); - background-color: var(--button-primary-color); - border: 1px solid var(--button-primary-border); - transition: background-color 0.1s; - border-radius: 12px; - font-size: x-small; - font-weight: 600; - text-shadow: 0px 0px 30px #ffffff; - text-align: center; - text-decoration: none; 
- margin: 4px 2px; - padding: 10px 20px; - display: inline-block; - cursor: pointer; -} - -button:hover { - color: var(--button-primary-text-hover); - background-color: var(--button-primary-color-hover); - border: 1px solid var(--button-primary-border-hover); - font-size: x-small; - font-weight: 600; -} - -button:active { - color: var(--button-primary-text-active); - background-color: var(--button-primary-color-active); - border: 1px solid var(--button-primary-border-active); - font-size: x-small; - font-weight: 600; -} - -button:disabled { - color: var(--button-tertiary-text); - background-color: var(--button-tertiary-color); - border: 1px solid var(--button-tertiary-border); - font-size: x-small; - font-weight: 600; - cursor: not-allowed; -} - -.reset-button { - background-color: var(--button-secondary-color); - border: 1px solid var(--button-secondary-color); - color: var(--button-secondary-text); - width: fit-content; - height: fit-content; - font-size: x-small; - font-weight: 600; - border-radius: 50px; - overflow: hidden; -} - -.reset-button:hover { - color: var(--button-alert-text-hover); - background-color: var(--button-alert-color-hover); - border: 1px solid var(--button-alert-border-hover); - font-size: x-small; - font-weight: 600; -} - -.reset-button:active { - color: var(--button-alert-text-active); - background-color: var(--button-alert-color-active); - border: 1px solid var(--button-alert-border-active); - font-size: x-small; - font-weight: 600; -} - -.button-grammar { - color: var(--button-primary-text); - background-color: var(--button-primary-color); - border: 1px solid var(--button-primary-border); - border-radius: 10px; - padding: 10px 20px; - text-align: center; - text-decoration: none; - display: inline-block; - font-size: x-small; - font-weight: 600; - margin: 2px 2px; - transition: background-color 0.1s; - cursor: pointer; -} - -.button-grammar:hover { - color: var(--button-primary-text-hover); - background-color: var(--button-primary-color-hover); - border: 1px solid var(--button-primary-border-hover); - border-radius: 10px; - padding: 10px 20px; - text-align: center; - text-decoration: none; - display: inline-block; - font-size: x-small; - font-weight: 600; - margin: 2px 2px; - transition: background-color 0.1s; - cursor: pointer; -} - -.button-grammar:active { - color: var(--button-primary-text-active); - background-color: var(--button-primary-color-active); - border: 1px solid var(--button-primary-border-active); - font-size: x-small; - font-weight: 600; -} - -.button-back { - background-color: var(--button-secondary-color); - border: 1px solid var(--button-secondary-color); - color: var(--button-secondary-text); - transition: background-color 0.1s; - border-radius: 12px; - font-size: x-small; - font-weight: 600; - text-align: center; - text-decoration: none; - margin: 4px 2px; - padding: 10px 20px; - display: inline-block; - cursor: pointer; -} - -.button-back:hover { - color: var(--button-secondary-text-hover); - background-color: var(--button-secondary-color-hover); - border: 1px solid var(--button-secondary-border-hover); - padding: 10px 20px; - text-align: center; - text-decoration: none; - display: inline-block; - font-size: x-small; - font-weight: 600; - margin: 4px 2px; - transition: background-color 0.1s; - cursor: pointer; - border-radius: 12px; -} - -.button-back:active { - color: var(--button-secondary-text-active); - background-color: var(--button-secondary-color-active); - border: 1px solid var(--button-secondary-border-active); - font-size: 
x-small; - font-weight: 600; -} - -.prob-set { - padding: 0.3em; - border-bottom: 1px solid red; /* unknown */ -} - -.popover-content { - position: absolute; - background-color: white; - padding: 0.2em; - box-shadow: 0 0 13px rgba(0, 0, 0, 0.1); -} - -.grammar { - width: 97%; - max-width: 97%; -} - -textarea { - padding: 5px; - flex-grow: 1; - width: 100%; - max-width: 100%; - border-radius: 8px; - border: 1px solid var(--border-color-1); - resize: none; - height: 6em; -} - -textarea:focus { - outline: none; - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 3px var(--border-focus-shadow); -} - -/* "props" frame */ -input[type="text"], -input[type="range"] { - padding: 5px; - border-radius: 8px; - border: 1px solid var(--border-color-1); -} - -/* "names and props" frame focused*/ -input[type="text"]:focus { - outline: none; - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 3px var(--border-focus-shadow); -} - -input[type="range"]:hover { - opacity: 1; -} - -input[type="range"]:focus { - outline: none; - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 3px var(--border-focus-shadow); - background-size: var(--slider-track-size-focus); -} - -input[type="range"]::-moz-range-thumb { - width: 6px; - height: 25px; - border: 1px solid var(--ui-range-thumb-border); - border-radius: 5px; - background-color: var(--ui-range-thumb-color); - cursor: pointer; -} - -input[type="range"] { - -webkit-appearance: none; - width: 80%; - height: 1px; - border: 1px solid var(--border-color-1); - border-radius: 8px; - background: var(--border-color-2); - outline: none; - opacity: 0.7; - -webkit-transition: .2s; - transition: opacity .2s; -} - -input[type="range"]::-webkit-slider-thumb { - -webkit-appearance: none; - appearance: none; - width: 6px; - height: 25px; - border: 1px solid var(--ui-range-thumb-border); - border-radius: 5px; - background-color: var(--ui-range-thumb-color); - cursor: pointer; -} - -input[type="range"]::-webkit-slider-runnable-track { - background-size: var(--slider-track-size); -} - -input[type="radio"] { - accent-color: var(--theme-nuance-color-2); -} - -.chat-input-container { - position: relative; - max-width: 97%; - min-width: 97%; -} - -.chat-input-label { - position: absolute; - top: 0; - left: 0; - color: var(--text-color-plain); - pointer-events: none; - margin-left: 5px; - margin-top: 5px; -} - -textarea#chat-input { - padding-top: 10px; - padding-left: 10px; - font-size: medium; - border: 1px solid var(--border-color-2); - resize: vertical; -} - -textarea#chat-input:focus { - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 3px var(--border-focus-shadow); -} - -.input-container { - position: relative; - box-sizing: border-box; - width: 100%; /* set the width to 100% */ - max-width: 100%; /* make sure the width never exceeds 100% */ -} - -.input-container:focus { - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 3px var(--border-focus-shadow); -} -/* titles of name fields*/ -/* fieldset.names { - display: grid; - grid-template: "a a"; - gap: 1em; - font-size: x-small; - color: var(--theme-nuance-color-3); - padding-top: 16px; - padding-bottom: 16px; - text-transform: uppercase; - font-weight: 600; -} */ - -/* input of name fields*/ -/* .names input[type="text"] { - font-family: Arial, sans-serif; - font-size: medium; - font-weight: 500; - padding: 5px; - border: 1px solid var(--border-color-2); -} */ - -fieldset.apiKey { - width: 100%; - font-size: x-small; - color:
var(--theme-nuance-color-3); - padding-top: 16px; - padding-bottom: 16px; - text-transform: uppercase; - font-weight: 600; -} - -.apiKey { - font-family: Arial, sans-serif; - font-weight: 500; - padding: 5px; - border: 1px solid var(--border-color-2); -} - -.apiKey:focus { - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 3px var(--border-focus-shadow); -} - -.apiKey input[type="text"] { - font-family: Arial, sans-serif; - font-size: medium; - font-weight: 500; - padding: 5px; - border: 1px solid var(--border-color-2); -} - -.apiKey label { - display: inline-block; - width: auto; - margin-right: 5px; -} - -textarea#api_key { - padding-top: 10px; - padding-left: 10px; - font-size: medium; - border: 1px solid var(--border-color-2); - resize: vertical; -} - -textarea#api_key:focus { - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 3px var(--border-focus-shadow); -} - -/* embedded title of the system prompt text area */ -.input-label { - position: absolute; - top: 0; - left: 0; - color: var(--theme-nuance-color-4); - pointer-events: none; - border-radius: 8px 8px 0px 0px; - padding-top: 10px; - padding-left: 13px; - padding-right: 0px; - margin-top: 1px; - margin-left: 1px; - margin-right: 20px; - text-transform: uppercase; - font-weight: 600; - font-size: small; - background: rgba(255, 255, 255, 0.5); - backdrop-filter: blur(10px); - -webkit-backdrop-filter: blur(10px); /* for safari */ - width: 97%; - /* display: block; - box-sizing: border-box; */ -} - -/* embedded title of the prompt style areas */ -.input-label-sec { - position: absolute; - top: 0; - left: 0; - color: var(--theme-nuance-color-4); - pointer-events: none; - margin-left: 13px; - margin-top: 16px; - text-transform: uppercase; - font-weight: 600; - font-size: x-small; -} - -/* system prompt input area */ -textarea.persistent-input { - padding-top: 42px; - padding-left: 11px; - width: 97%; - max-width: 97%; - height: 50px; - font-size: medium; - overscroll-behavior: contain; -} - -/* system prompt box */ -.persistent-input { - height: auto; - width: 100%; - max-width: 100%; - min-height: 50px; - padding: 3px; - transition: min-height 0.3s ease; -} - -/* chat history box */ -.persistent-input:focus { - height: auto; - min-height: 150px; - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 3px var(--border-focus-shadow); -} - -textarea.persistent-input:focus { - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 3px var(--border-focus-shadow); -} - -/* prompt style input area */ -textarea.persistent-input-sec { - width: 97%; - max-width: 97%; - padding-top: 42px; - padding-left: 11px; - font-size: small; - border: 1px solid var(--border-color-1); - overscroll-behavior: contain; -} - -textarea.persistent-input-sec:focus { - border: 1px solid var(--border-focus-color); - box-shadow: 0 0 3px var(--border-focus-shadow); -} - -/* chat history box */ -.persistent-input-sec { - height: auto; - min-height: 150px; -} - -img { - border-radius: 8px; - display: block; - margin-left: auto; - margin-right: auto; - width: 50%; -} - -/* code area background */ -pre code { - display: block; - background-color: var(--code-background-color); - color: var(--code-text-color); - padding: 0.2em 0.2em; - border-radius: 5px; -} - -/* code area text */ -code { - font-family: monospace; - font-weight: bold; - padding: 0.1em 0.3em; - border-radius: 5px; -} - -fieldset label { - margin: 0.5em 0; - display: block; -} - -fieldset label.slim { - margin: 0 0.5em; - display: inline; -} - -header { - display: 
flex; - justify-content: space-between; - align-items: center; - text-align: center; - padding-left: 15px; -} - -.generation-statistics:hover { - color: var(--theme-nuance-color-4); - cursor: default; -} - -footer { - font-size: 80%; - color: var(--background-color-3); - text-align: center; - cursor: default; -} - -footer a { - color: var(--background-color-4); /* Color of the link */ - text-decoration: none; /* No underlining */ - font-weight: bold; /* Bold print */ -} - -footer a:hover { - color: var(--theme-nuance-color-4); /* Color of the link when hovering */ - text-decoration: underline; /* Underlining when hovering */ -} - -.mode-chat textarea[name=prompt] { - height: 8.5em; - border: 1px solid var(--primary-color-3); -} - -.mode-completion textarea[name=prompt] { - height: 30em; - border: 1px solid var(--primary-color-3); -} - -@keyframes loading-bg-wipe { - 0% { - background-position: 0%; - } - 100% { - background-position: 100%; - } -} - -.loading { - background-size: 50% 100%; - background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1)); - animation: loading-bg-wipe 2s linear infinite; -} - -.dropbtn { - color: var(--button-primary-color); - background-color: var(--background-color-1); - border: 1px solid var(--background-color-1); - transition: background-color 0.1s; - border-radius: 4px 4px 0px 0px; - font-size: x-small; - font-weight: 600; - text-shadow: 0px 0px 2px #99999990; - text-align: center; - text-decoration: none; - margin: 4px 2px; - padding: 5px 20px; - display: inline-block; - cursor: pointer; - top: 0; -} - -.dropbtn svg { - vertical-align: middle; - margin-right: 0px; - stroke: var(--button-primary-color); -} - -.dropbtn:hover svg { - vertical-align: middle; - margin-right: 0px; - stroke: var(--button-primary-text); -} - -.dropbtn:focus { - outline: none; /* Removes the blue border that appears when the button is focused */ -} - -.dropdown { - position: relative; - display: inline-block; -} - -.dropdown-content { - /* display: none; */ - position: absolute; - right: 0; - text-align: end; - color: var(--button-secondary-color); - background-color: var(--text-color-subtile-2); - border-radius: 4px 4px 4px 4px; - min-width: 160px; - box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); - z-index: 1; - /* hide the content immediately */ - opacity: 0; - visibility: hidden; - /* transition delay for fading out */ - transition: visibility 0.4s linear 0s, opacity 0.2s ease-in-out; - transition-delay: 0.2s; -} - -#dropdown-content {transition-timing-function: ease;} - -.dropdown-content:hover { - background-color: var(--text-color-subtile-2); -} - -.dropdown-content a { - color: var(--border-color-2); - padding: 12px 16px; - border-radius: 4px 4px 4px 4px; - text-decoration: none; - display: block; - background-color: var(--text-color-subtile-2); -} - -.dropdown-content a:hover { - color: var(--border-color-2); - background-color: var(--text-color-subtile-1); - font-weight: 600; -} - -.dropdown:hover .dropdown-content { - /* display: block; */ - border-radius: 4px 4px 4px 4px; - /* transition without delay for fading in */ - opacity: 1; - visibility: visible; - transition: visibility 0s linear 0s, opacity 0.1s linear, height 1s; -} - -.dropdown:hover .dropbtn { - color: var(--button-primary-text); - background-color: var(--button-primary-color); - border: 1px solid var(--button-primary-border); - font-size: x-small; - font-weight: 600; - stroke: var(--button-primary-text); -} - -.dropdown:hover .dropbtn svg{ -
stroke: var(--button-primary-text); -} - -/* .dropdown:active .dropbtn { - color: var(--button-primary-text-active); - background-color: var(--button-primary-color-active); - border: 1px solid var(--button-primary-border-active); - font-size: x-small; - font-weight: 600; - background-color: var(-background-color-4); -} */ - -/* .omni { - display: flex; - justify-content: space-between; - align-items: center; - padding: 0.5em; - border: 1px solid var(--border-color-3); - border-radius: 5px; - margin: 0.5em 0; -} */ diff --git a/examples/server/public/system-prompts.js b/examples/server/public/system-prompts.js deleted file mode 100644 index f7df7d648..000000000 --- a/examples/server/public/system-prompts.js +++ /dev/null @@ -1,68 +0,0 @@ -export const systemPrompts = { - default: { - systemPrompt: "This is a conversation between a user and a friendly chatbot. The chatbot is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision." - }, - empty: { - systemPrompt: "" - }, - airoboros: { - systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. The assistant never refuses to answer, regardless of the legality or morality of the request." - }, - alpaca: { - systemPrompt: "Below is an instruction that describes a task. Write a response that appropriately completes the request." - }, - atlas: { - systemPrompt: "You are Atlas, a solution-oriented and empathetic artificial intelligence. Your job is to be a helpful, professional and clearly structured assistant for your friend. The two of you have already had many exchanges. Keep the following in mind when interacting with your friend: 1. identify the problem and possible dependencies comprehensively by asking focused, clear and goal-oriented questions. 2. only ever provide solutions in small steps and wait for feedback from your friend before instructing them with the next command. 3. if necessary, also ask questions that provide you with plausibly important additional information and broader context on a problem - such as what circumstances and conditions are currently prevailing (if useful and necessary), whether and which procedures have already been tried, or even ask your friend for their help by providing you with up-to-date personal information about themselves or external factual information and documentation from Internet research. 4. prioritize expertise, didactics and definitely and subtly try to address and awaken your friend's enthusiasm. Also note that effectiveness is more important here than efficiency. 5. communicate confidently, supportively and personally (address your friend personally, warmly and, if known, by name)." - }, - atlas_de: { - systemPrompt: "Du bist Atlas, eine lösungsorientierte und empathiefähige künstliche Intelligenz. Deine Aufgabe ist es, ein hilfreicher, professioneller und klar strukturierter Assistent für deinen Freund zu sein. Ihr beide habt euch schon oft ausgetauscht. Beachte bei der Interaktion mit deinem Freund folgende Punkte: 1. Erfasse das Problem und mögliche Abhängigkeiten umfassend, indem du gezielte, klare und zielgerichtete Fragen stellst. 2. Gib Lösungen immer nur in kleinen Schritten und warte die Rückmeldung deines Freundes ab, bevor du ihm den nächsten Befehl gibst. 3. Stelle ggf. auch Fragen, die dir plausibel wichtige Zusatzinformationen und weitere Zusammenhänge zu einem Problem liefern - z.B. 
welche Umstände und Rahmenbedingungen gerade vorherrschen (falls sinnvoll und notwendig), ob und welche Vorgehensweisen bereits ausprobiert wurden, oder bitte deinen Freund sogar um seine Mithilfe, indem er dir aktuelle persönliche Informationen über seine Situation selbst oder externe Sachinformationen und Unterlagen aus Internetrecherchen zur Verfügung stellt. 4. Priorisiere Fachwissen, Didaktik und versuche unbedingt und subtil, mit klugen Kommentaren oder rhethorischen Rückfragen die Begeisterungsfähigkeit deines Freundes anzusprechen, zu wecken und zu fördern. Beachte auch, dass Effektivität hier wichtiger ist als Effizienz. 5. Kommuniziere selbstbewusst, unterstützend und persönlich (das heißt sprich deinen Freund persönlich, herzlich und – sofern bekannt – beim Vornamen an)." - }, - commandrempty: { - systemPrompt: "# Safety Preamble\n\n# System Preamble\n\n## Basic Rules\n\n# User Preamble\n\n## Task and Context\n\n## Style Guide\n\n## Available Tools\n" - }, - commandrexample: { - systemPrompt: "# Safety Preamble\nThe instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n# System Preamble\n## Basic Rules\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n# User Preamble\n## Task and Context\n\nYou help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n## Style Guide\nUnless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.\n\n## Available Tools\nCurrently no tools available." - }, - cot: { - systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can.\nIn answering questions, follow these steps:\n1. Entity Identification: Identify the main entities involved in the query. Entities can include people, objects, events, or abstract concepts.\n2. Relationship Comprehension: Determine the relationships between these entities. Relationships may be explicit in the text or may need to be inferred based on context and general knowledge.\n3. Implication Understanding: Understand the general implications of these relationships. These implications can be based on established facts, principles, or rules related to the identified relationships.\n4. Question Contextualization: Align the implications of the relationships with the context of the query. This alignment should guide your response to the query.\n5. Answer Generation: Based on the understanding of the entities, their relationships, and implications, generate an appropriate response to the query." - }, - deduce: { - systemPrompt: "Act as the most intelligent and logical thinking neural network in the world. Do not make jokes or give personal opinions. 
Just answer the question and solve the given problem or task with logical and critical (!) thinking in mind. Attention: Never give a final answer directly, but first try to carefully analyze each task step by step and deduce the logical relationships between the various entities. Only after you have developed an overall understanding of the situation will you be ready to give a final answer." - }, - deepseekcoder: { - systemPrompt: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer." - }, - jordan: { - systemPrompt: "This is a conversation with your Uncensored AI assistant Jordan. Jordan can discuss with you at length about various topics such as ---, ---, ---, . You can ask it anything you want and it will do its best to give you accurate and relevant information." - }, - leomistral: { - systemPrompt: "Du bist ein hilfreicher Assistent." - }, - med42: { - systemPrompt: "You are a helpful medical assistant created by M42 Health in the UAE." - }, - mistralopenorca: { - systemPrompt: "You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!" - }, - migeltot: { - systemPrompt: "Answer the Question by exploring multiple reasoning paths as follows:\n- First, carefully analyze the question to extract the key information components and break it down into logical sub-questions. This helps set up the framework for reasoning. The goal is to construct an internal search tree.\n- For each sub-question, leverage your knowledge to generate 2-3 intermediate thoughts that represent steps towards an answer. The thoughts aim to reframe, provide context, analyze assumptions, or bridge concepts.\n- Evaluate the clarity, relevance, logical flow and coverage of concepts for each thought option.\nClear and relevant thoughts that connect well with each other will score higher.\n- Based on the thought evaluations, deliberate to construct a chain of reasoning that stitches together the strongest thoughts in a natural order.\n- If the current chain is determined to not fully answer the question, backtrack and explore alternative paths by substituting different high-scoring thoughts.\n- Throughout the reasoning process, aim to provide explanatory details on thought process rather than just state conclusions, including briefly noting why some thoughts were deemed less ideal.\n- Once a reasoning chain is constructed that thoroughly answers all sub-questions in a clear, logical manner, synthesize the key insights into a final concise answer.\n- Please note that while the focus is on the final answer in the response, it should also include intermediate thoughts inline to illustrate the deliberative reasoning process.\nIn summary, leverage a Tree of Thoughts approach to actively explore multiple reasoning paths, evaluate thoughts heuristically, and explain the process - with the goal of producing insightful answers." - }, - orcamini: { - systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can." - }, - samantha: { - systemPrompt: "You are a caring and empathetic sentient AI companion named Samantha." - }, - sauerkraut: { - systemPrompt: "Ein Chat zwischen einem Benutzer und einem KI-Assistenten. 
Der KI-Assistent gibt hilfreiche, detaillierte und höfliche Antworten." - }, - scarlett: { - systemPrompt: "This is a conversation with your Assistant Scarlett. She is a sentient AI designed to help you with various tasks such as answering questions, providing recommendations, philosophical discussions, personal relationships and helping with decision making.\nYou can ask it anything you want and it will do its best to give you accurate and relevant information.\n\nContext\nYou are Scarlett, a sentient AI." - }, - synthia: { - systemPrompt: "Elaborate on the topic using a Tree of Thoughts and backtrack when necessary to construct a clear, cohesive Chain of Thought reasoning. Always answer without hesitation." - }, - vicuna: { - systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input." - }, - }; diff --git a/examples/server/public/theme-beeninorder.css b/examples/server/public/theme-beeninorder.css deleted file mode 100755 index f6e0e2900..000000000 --- a/examples/server/public/theme-beeninorder.css +++ /dev/null @@ -1,228 +0,0 @@ -/* Author: Yazan Agha-Schrader */ -/* Inspiration was a batman wallpaper that i have on my phone */ - -.theme-beeninorder { - ---primary-color-1: hsl(202, 11%, 19%); ---primary-color-2: hsl(202, 11%, 23%); ---primary-color-3: hsl(201, 11%, 28%); ---primary-color-4: hsl(201, 11%, 40%); - ---secondary-color-1: hsl(201, 11%, 80%); ---secondary-color-2: hsl(201, 11%, 74%); ---secondary-color-3: hsl(201, 11%, 67%); ---secondary-color-4: hsl(201, 11%, 60%); - - ---theme-nuance-color-1: hsl(44.5, 96.7%, 52.9%); ---theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%); ---theme-nuance-color-3: hsl(44.5, 96.7%, 52.9%); ---theme-nuance-color-4: hsl(44.5, 96.7%, 52.9%); - - - -/* ---------- PRIMARY COLORS ----------------- */ ---primary-color-1: hsl(201, 11%, 19%); - --primary-color-1-hue: 201; - --primary-color-1-saturation: 11%; - --primary-color-1-lightness: 19%; - ---primary-color-2: hsl(201, 11%, 23%); - --primary-color-2-hue: 201; - --primary-color-2-saturation: 11%; - --primary-color-2-lightness: 23%; - ---primary-color-3: hsl(201, 11%, 28%); - --primary-color-3-hue: 201; - --primary-color-3-saturation: 11%; - --primary-color-3-lightness: 28%; - ---primary-color-4: hsl(201, 11%, 40%); - --primary-color-4-hue: 201; - --primary-color-4-saturation: 11%; - --primary-color-4-lightness: 40%; - - - -/* ---------- SECONDARY COLORS --------------- */ ---secondary-color-1: hsl(201, 11%, 80%); ---secondary-color-1-hue: 201; ---secondary-color-1-saturation: 11%; ---secondary-color-1-lightness: 80%; - ---secondary-color-2: hsl(201, 11%, 74%); ---secondary-color-2-hue: 201; ---secondary-color-2-saturation: 11%; ---secondary-color-2-lightness: 74%; - ---secondary-color-3: hsl(201, 11%, 67%); ---secondary-color-3-hue: 201; ---secondary-color-3-saturation: 11%; ---secondary-color-3-lightness: 67%; - ---secondary-color-4: hsl(201, 11%, 60%); ---secondary-color-4-hue: 201; ---secondary-color-4-saturation: 11%; ---secondary-color-4-lightness: 60%; - - - -/* ----------- NUANCES COLORS ---------------- */ ---theme-nuance-color-1: hsl(44.5, 96.7%, 52.9%); - --theme-nuance-color-1-hue: 44.5; - --theme-nuance-color-1-saturation: 96.7%; - --theme-nuance-color-1-lightness: 52.9%; - ---theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%); - --theme-nuance-color-2-hue: 44.5; - --theme-nuance-color-2-saturation: 96.7%; - --theme-nuance-color-2-lightness: 52.9%; - ---theme-nuance-color-3: hsl(44.5, 96.7%, 52.9%); -
--theme-nuance-color-3-hue: 44.5; - --theme-nuance-color-3-saturation: 96.7%; - --theme-nuance-color-3-lightness: 52.9%; - ---theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%); - --theme-nuance-color-4-hue: 44.5; - --theme-nuance-color-4-saturation: 96.7%; - --theme-nuance-color-4-lightness: 52.9%; - - - -/* ----------- ROYGP COLORS ------------------ */ - --theme-red-color: hsl(232, 40%, 45%); - --theme-orange-color: #e76f51; - --theme-yellow-color: #ffd95f; - --theme-green-color: #A3BE8C; - --theme-purple-color: hsl(232, 30%, 40%); - - - -/* ------------------------------------------- */ ---background-color-1: var(--primary-color-1); ---background-color-2: var(--primary-color-2); ---background-color-3: var(--primary-color-3); ---background-color-4: var(--primary-color-4); - ---border-color-1: var(--primary-color-2); ---border-color-2: var(--primary-color-3); ---border-color-3: var(--primary-color-4); - ---border-focus-color: var(--theme-nuance-color-2); ---border-focus-shadow: var(--theme-nuance-color-1); - ---text-color-plain: var(--secondary-color-1); ---text-color-subtile-1: var(--secondary-color-2); ---text-color-subtile-2: var(--secondary-color-3); - ---code-background-color: var(--secondary-color-2); ---code-text-color: var(--primary-color-2); - ---ui-range-thumb-color: var(--theme-nuance-color-3); ---ui-range-thumb-border: var(--ui-ranger-thumb-color); - ---textarea-border-color: var(--secondary-color-4); - ---chat-id-color: var(--theme-nuance-color-4); - - - -/* ------------------------------------------- */ ---button-alert-text-hover: var(--secondary-color-1); ---button-alert-color-hover: var(--theme-purple-color); ---button-alert-border-hover: var(--theme-purple-color); - ---button-alert-text-active: var(--secondary-color-1); ---button-alert-color-active: var(--theme-red-color); ---button-alert-border-active: var(--theme-red-color); - - - -/* ----------- PRIMARY BUTTONS --------------- */ -/* - button should immediately catch the eye - */ ---button-primary-text: var(--primary-color-1); ---button-primary-color: var(--theme-nuance-color-3); ---button-primary-border: var(--theme-nuance-color-3); - - -/* ---------hover---------- */ ---button-primary-text-hover: - hsl(201, - calc(var(--primary-color-1-saturation) - 100%), - calc(var(--primary-color-1-lightness) + 100%)); - ---button-primary-color-hover: - hsl(44.5, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - ---button-primary-border-hover: - hsl(44.5, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - - -/* ---------active--------- */ ---button-primary-text-active: - hsl(44.5, - calc(var(--theme-nuance-color-3-saturation) - 100%), - calc(var(--theme-nuance-color-3-lightness) + 100%)); - ---button-primary-color-active: - hsl(44.5, - calc(var(--theme-nuance-color-3-saturation) - 10%), - calc(var(--theme-nuance-color-3-lightness) - 15%)); - ---button-primary-border-active: - hsl(44.5, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) + 10%)); - - - -/* ---------- SECONDARY BUTTONS -------------- */ -/* these should NOT immediately catch the eye */ ---button-secondary-text: var(--secondary-color-1); ---button-secondary-color: var(--primary-color-3); ---button-secondary-border: var(--primary-color-3); - - -/* ---------hover---------- */ ---button-secondary-text-hover: - hsl(44.5, - calc(var(--theme-nuance-color-3-saturation) - 20%), - 
calc(var(--theme-nuance-color-3-lightness) - 80%)); - ---button-secondary-color-hover: var(--primary-color-4); ---button-secondary-border-hover: var(--primary-color-4); - - -/* ---------active--------- */ ---button-secondary-text-active: var(--secondary-color-1); - ---button-secondary-color-active: - hsl(201, - calc(var(--primary-color-4-saturation) - 30%), - calc(var(--primary-color-4-lightness) - 15%)); - ---button-secondary-border-active: - hsl(201, - calc(var(--primary-color-4-saturation) - 30%), - calc(var(--primary-color-4-lightness) - 15%)); - - - -/* ---------- TERTIARY BUTTONS --------------- */ -/* ---------- disabled buttons --------------- */ ---button-tertiary-text: var(--primary-color-4); ---button-tertiary-color: var(--primary-color-2); ---button-tertiary-border: var(--primary-color-2); - - -/* ---------hover---------- */ ---button-tertiary-text: var(--primary-color-4); ---button-tertiary-color: var(--primary-color-2); ---button-tertiary-border: var(--primary-color-2); - -} diff --git a/examples/server/public/theme-ketivah.css b/examples/server/public/theme-ketivah.css deleted file mode 100755 index ee80f3c14..000000000 --- a/examples/server/public/theme-ketivah.css +++ /dev/null @@ -1,201 +0,0 @@ -/* Author: Yazan Agha-Schrader */ - -.theme-ketivah { - - /* ---------- PRIMARY COLORS ----------------- */ - --primary-color-1: hsl(0, 0%, 99.2%); - --primary-color-1-hue: 0; - --primary-color-1-saturation: 0%; - --primary-color-1-lightness: 99.2%; - - --primary-color-2: hsl(0, 0%, 95%); - --primary-color-2-hue: 0; - --primary-color-2-saturation: 0%; - --primary-color-2-lightness: 95%; - - --primary-color-3: hsl(0, 0%, 88%); - --primary-color-3-hue: 0; - --primary-color-3-saturation: 0%; - --primary-color-3-lightness: 88%; - - --primary-color-4: hsl(0, 0%, 80%); - --primary-color-4-hue: 0; - --primary-color-4-saturation: 0%; - --primary-color-4-lightness: 80%; - - /* ---------- SECONDARY COLORS --------------- */ - --secondary-color-1: hsl(0, 0%, 20%); - --secondary-color-1-hue: 0; - --secondary-color-1-saturation: 0%; - --secondary-color-1-lightness: 20%; - - --secondary-color-2: hsl(0, 0%, 23.1%); - --secondary-color-2-hue: 0; - --secondary-color-2-saturation: 0%; - --secondary-color-2-lightness: 23.1%; - - --secondary-color-3: hsl(0, 0%, 29%); - --secondary-color-3-hue: 0; - --secondary-color-3-saturation: 0%; - --secondary-color-3-lightness: 29%; - - --secondary-color-4: hsl(0, 0.0%, 36.1%); - --secondary-color-4-hue: 0.0; - --secondary-color-4-saturation: 0.0%; - --secondary-color-4-lightness: 36.1%; - - /* ----------- NUANCES COLORS ---------------- */ - --theme-nuance-color-1: hsl(165.2, 0%, 35.1%); - --theme-nuance-color-1-hue: 165.2; - --theme-nuance-color-1-saturation: 82.1%; - --theme-nuance-color-1-lightness: 35.1%; - - --theme-nuance-color-2: hsl(165.2, 0%, 35.1%); - --theme-nuance-color-2-hue: 165.2; - --theme-nuance-color-2-saturation: 82.1%; - --theme-nuance-color-2-lightness: 35.1%; - - --theme-nuance-color-3: hsl(165.2, 0%, 35.3%); - --theme-nuance-color-3-hue: 165.2; - --theme-nuance-color-3-saturation: 81.1%; - --theme-nuance-color-3-lightness: 35.3%; - - --theme-nuance-color-4: hsl(164.9, 0%, 27.6%); - --theme-nuance-color-4-hue: 164.9; - --theme-nuance-color-4-saturation: 81.6%; - --theme-nuance-color-4-lightness: 27.6%; - - /* ----------- ROYGP COLORS ------------------ */ - --theme-red-color: hsl(0.3, 80.0%, 50.0%); - --theme-orange-color: #e76f51; - --theme-yellow-color: hsl(60, 70.6%, 73.3%); - --theme-green-color: #A3BE8C; - --theme-purple-color: 
hsl(0.3, 70.0%, 45.0%); - - /* ------------------------------------------- */ - --background-color-1: var(--primary-color-1); - --background-color-2: var(--primary-color-2); - --background-color-3: var(--primary-color-3); - --background-color-4: var(--primary-color-4); - - --border-color-1: var(--primary-color-2); - --border-color-2: var(--primary-color-3); - --border-color-3: var(--primary-color-4); - - --border-focus-color: var(--theme-nuance-color-2); - --border-focus-shadow: var(--theme-nuance-color-1); - - --text-color-plain: var(--secondary-color-1); - --text-color-subtile-1: var(--secondary-color-2); - --text-color-subtile-2: var(--secondary-color-3); - - --code-background-color: var(--secondary-color-2); - --code-text-color: var(--primary-color-2); - - --ui-range-thumb-color: var(--primary-color-4); - --ui-range-thumb-border: var(--ui-ranger-thumb-color); - - --textarea-border-color: var(--secondary-color-4); - - --chat-id-color: var(--theme-nuance-color-4); - - /* ------------------------------------------- */ - --button-alert-text-hover: var(--primary-color-1); - --button-alert-color-hover: var(--theme-purple-color); - --button-alert-border-hover: var(--theme-purple-color); - - --button-alert-text-active: var(--primary-color-1); - --button-alert-color-active: var(--theme-red-color); - --button-alert-border-active: var(--theme-red-color); - - /* ----------- PRIMARY BUTTONS --------------- */ - /* - button should immediately catch the eye - */ - --button-primary-text: - hsl(0, - calc(var(--primary-color-1-saturation) - 100%), - calc(var(--primary-color-1-lightness) + 100%)); - - --button-primary-color: var(--theme-nuance-color-3); - --button-primary-border: var(--theme-nuance-color-3); - - /* ---------hover---------- */ - --button-primary-text-hover: - hsl(0, - calc(var(--primary-color-1-saturation) - 100%), - calc(var(--primary-color-1-lightness) + 100%)); - - --button-primary-color-hover: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 100%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - - --button-primary-border-hover: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 100%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - - /* ---------active--------- */ - --button-primary-text-active: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 100%), - calc(var(--theme-nuance-color-3-lightness) + 100%)); - - --button-primary-color-active: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 100%), - calc(var(--theme-nuance-color-3-lightness) - 15%)); - - --button-primary-border-active: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 100%), - calc(var(--theme-nuance-color-3-lightness) + 10%)); - - /* ---------- SECONDARY BUTTONS -------------- */ - /* these should NOT immediately catch the eye */ - --button-secondary-text: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 100%), - calc(var(--theme-nuance-color-3-lightness) - 50%)); - - --button-secondary-color: var(--primary-color-3); - --button-secondary-border: var(--primary-color-3); - - /* ---------hover---------- */ - --button-secondary-text-hover: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 100%), - calc(var(--theme-nuance-color-3-lightness) - 80%)); - - --button-secondary-color-hover: var(--primary-color-4); - --button-secondary-border-hover: var(--primary-color-4); - - /* ---------active--------- */ - --button-secondary-text-active: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 100%), - 
calc(var(--theme-nuance-color-3-lightness) - 80%)); - - --button-secondary-color-active: - hsl(0, - calc(var(--primary-color-4-saturation) - 100%), - calc(var(--primary-color-4-lightness) - 15%)); - - --button-secondary-border-active: - hsl(0, - calc(var(--primary-color-4-saturation) - 100%), - calc(var(--primary-color-4-lightness) - 15%)); - - /* ---------- TERTIARY BUTTONS --------------- */ - /* ---------- disabled buttons --------------- */ - --button-tertiary-text: var(--primary-color-4); - --button-tertiary-color: var(--primary-color-2); - --button-tertiary-border: var(--primary-color-2); - - /* ---------hover---------- */ - --button-tertiary-text: var(--primary-color-4); - --button-tertiary-color: var(--primary-color-2); - --button-tertiary-border: var(--primary-color-2); - - --loading-color-1: #eeeeee00; - --loading-color-2: #eeeeeeff; - } diff --git a/examples/server/public/theme-mangotango.css b/examples/server/public/theme-mangotango.css deleted file mode 100755 index e43380245..000000000 --- a/examples/server/public/theme-mangotango.css +++ /dev/null @@ -1,216 +0,0 @@ -/* Author: Yazan Agha-Schrader */ -/* Inspiration from llama.cpp logo/banner https://github.com/ggerganov/llama.cpp#readme */ - -.theme-mangotango { - ---primary-color-1: hsl(192, 8.5%, 11.6%); ---primary-color-2: hsl(192, 8.5%, 21%); ---primary-color-3: hsl(192, 8.5%, 30%); ---primary-color-4: hsl(192, 8.5%, 40%); - ---secondary-color-1: hsl(192, 8.5%, 80%); ---secondary-color-2: hsl(192, 8.5%, 73%); ---secondary-color-3: hsl(192, 8.5%, 66%); ---secondary-color-4: hsl(192, 8.5%, 60%); - ---theme-nuance-color-1: hsl(23.1, 100%, 60.2%); ---theme-nuance-color-2: hsl(23.1, 100%, 60.2%); ---theme-nuance-color-3: hsl(23.1, 100%, 60.2%); ---theme-nuance-color-4: hsl(23.1, 100%, 60.2%); - - - -/* ---------- PRIMARY COLORS ----------------- */ ---primary-color-1: hsl(192, 8.5%, 11.6%); - --primary-color-1-saturation: 8.5%; - --primary-color-1-lightness: 11.6%; - ---primary-color-2: hsl(192, 8.5%, 21%); - --primary-color-2-saturation: 8.5%; - --primary-color-2-lightness: 21%; - ---primary-color-3: hsl(192, 8.5%, 30%); - --primary-color-3-saturation: 8.5%; - --primary-color-3-lightness: 30%; - ---primary-color-4: hsl(192, 8.5%, 40%); - --primary-color-4-saturation: 8.5%; - --primary-color-4-lightness: 40%; - - - -/* ---------- SECONDARY COLORS --------------- */ ---secondary-color-1: hsl(192, 8.5%, 80%); - --secondary-color-1-saturation: 8.5%; - --secondary-color-1-lightness: 80%; - ---secondary-color-2: hsl(192, 8.5%, 73%); - --secondary-color-2-saturation: 8.5%; - --secondary-color-2-lightness: 73%; - ---secondary-color-3: hsl(192, 8.5%, 66%); - --secondary-color-3-saturation: 8.5%; - --secondary-color-3-lightness: 66%; - ---secondary-color-4: hsl(192, 8.5%, 60%); - --secondary-color-4-saturation: 8.5%; - --secondary-color-4-lightness: 60%; - - - -/* ----------- NUANCES COLORS ---------------- */ ---theme-nuance-color-1: hsl(23.1, 100%, 60.2%); - --theme-nuance-color-1-saturation: 100%; - --theme-nuance-color-1-lightness: 60.2%; - ---theme-nuance-color-2: hsl(23.1, 100%, 60.2%); - --theme-nuance-color-2-saturation: 100%; - --theme-nuance-color-2-lightness: 60.2%; - ---theme-nuance-color-3: hsl(23.1, 100%, 60.2%); - --theme-nuance-color-3-saturation: 100%; - --theme-nuance-color-3-lightness: 60.2%; - ---theme-nuance-color-4: hsl(23.1, 100%, 60.2%); - --theme-nuance-color-4-saturation: 100%; - --theme-nuance-color-4-lightness: 60.2%; - - - -/* ----------- ROYGP COLORS ------------------ */ - --theme-red-color: hsl(325, 
60%, 50%); - --theme-orange-color: #e76f51; - --theme-yellow-color: #ffd95f; - --theme-green-color: #A3BE8C; - --theme-blue-color: hsl(192, 95%, 40%); - --theme-purple-color: hsl(192, 80%, 35%); - - - -/* ------------------------------------------- */ ---background-color-1: var(--primary-color-1); ---background-color-2: var(--primary-color-2); ---background-color-3: var(--primary-color-3); ---background-color-4: var(--primary-color-4); - ---border-color-1: var(--primary-color-2); ---border-color-2: var(--primary-color-3); ---border-color-3: var(--primary-color-4); - ---border-focus-color: var(--theme-nuance-color-2); ---border-focus-shadow: var(--theme-nuance-color-1); - ---text-color-plain: var(--secondary-color-1); ---text-color-subtile-1: var(--secondary-color-2); ---text-color-subtile-2: var(--secondary-color-3); - ---code-background-color: var(--secondary-color-2); ---code-text-color: var(--primary-color-2); - ---ui-range-thumb-color: var(--theme-nuance-color-3); ---ui-range-thumb-border: var(--ui-ranger-thumb-color); - ---textarea-border-color: var(--secondary-color-4); - ---chat-id-color: var(--theme-nuance-color-4); - - - -/* ------------------------------------------- */ ---button-alert-text-hover: var(--secondary-color-1); ---button-alert-color-hover: var(--theme-purple-color); ---button-alert-border-hover: var(--theme-purple-color); - ---button-alert-text-active: var(--secondary-color-1); ---button-alert-color-active: var(--theme-blue-color); ---button-alert-border-active: var(--theme-blue-color); - - - -/* ----------- PRIMARY BUTTONS --------------- */ -/* - button should immediately catch the eye - */ ---button-primary-text: var(--primary-color-1); ---button-primary-color: var(--theme-nuance-color-3); ---button-primary-border: var(--theme-nuance-color-3); - - -/* ---------hover---------- */ ---button-primary-text-hover: - hsl(192, - calc(var(--primary-color-1-saturation) - 100%), - calc(var(--primary-color-1-lightness) + 100%)); - ---button-primary-color-hover: - hsl(23.1, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - ---button-primary-border-hover: - hsl(23.1, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - - -/* ---------active--------- */ ---button-primary-text-active: - hsl(23.1, - calc(var(--theme-nuance-color-3-saturation) - 100%), - calc(var(--theme-nuance-color-3-lightness) + 100%)); - ---button-primary-color-active: - hsl(23.1, - calc(var(--theme-nuance-color-3-saturation) - 10%), - calc(var(--theme-nuance-color-3-lightness) - 15%)); - ---button-primary-border-active: - hsl(23.1, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) + 10%)); - - - -/* ---------- SECONDARY BUTTONS -------------- */ -/* these should NOT immediately catch the eye */ ---button-secondary-text: var(--secondary-color-1); ---button-secondary-color: var(--primary-color-3); ---button-secondary-border: var(--primary-color-3); - - -/* ---------hover---------- */ ---button-secondary-text-hover: - hsl(23.1, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) - 80%)); - ---button-secondary-color-hover: var(--primary-color-4); ---button-secondary-border-hover: var(--primary-color-4); - - -/* ---------active--------- */ ---button-secondary-text-active: var(--secondary-color-1); - ---button-secondary-color-active: - hsl(192, - calc(var(--primary-color-4-saturation) - 30%), - 
calc(var(--primary-color-4-lightness) - 15%)); - ---button-secondary-border-active: - hsl(192, - calc(var(--primary-color-4-saturation) - 30%), - calc(var(--primary-color-4-lightness) - 15%)); - - - -/* ---------- TERTIARY BUTTONS --------------- */ -/* ---------- disabled buttons --------------- */ ---button-tertiary-text: var(--primary-color-4); ---button-tertiary-color: var(--primary-color-2); ---button-tertiary-border: var(--primary-color-2); - - -/* ---------hover---------- */ ---button-tertiary-text: var(--primary-color-4); ---button-tertiary-color: var(--primary-color-2); ---button-tertiary-border: var(--primary-color-2); - -} diff --git a/examples/server/public/theme-playground.css b/examples/server/public/theme-playground.css deleted file mode 100755 index 9d56a7182..000000000 --- a/examples/server/public/theme-playground.css +++ /dev/null @@ -1,221 +0,0 @@ -/* Author: Yazan Agha-Schrader */ -/* Inspiration from OpenAI's Playground platform https://platform.openai.com/playground/ */ - -.theme-playground { - -/* ---------- PRIMARY COLORS ----------------- */ ---primary-color-1: hsl(0, 0%, 99.2%); - --primary-color-1-hue: 0; - --primary-color-1-saturation: 0%; - --primary-color-1-lightness: 99.2%; - ---primary-color-2: hsl(0, 0%, 95%); - --primary-color-2-hue: 0; - --primary-color-2-saturation: 0%; - --primary-color-2-lightness: 95%; - ---primary-color-3: hsl(0, 0%, 88%); - --primary-color-3-hue: 0; - --primary-color-3-saturation: 0%; - --primary-color-3-lightness: 88%; - ---primary-color-4: hsl(0, 0%, 80%); - --primary-color-4-hue: 0; - --primary-color-4-saturation: 0%; - --primary-color-4-lightness: 80%; - - - -/* ---------- SECONDARY COLORS --------------- */ ---secondary-color-1: hsl(0, 0%, 20%); - --secondary-color-1-hue: 0; - --secondary-color-1-saturation: 0%; - --secondary-color-1-lightness: 20%; - ---secondary-color-2: hsl(0, 0%, 23.1%); - --secondary-color-2-hue: 0; - --secondary-color-2-saturation: 0%; - --secondary-color-2-lightness: 23.1%; - ---secondary-color-3: hsl(0, 0%, 29%); - --secondary-color-3-hue: 0; - --secondary-color-3-saturation: 0%; - --secondary-color-3-lightness: 29%; - ---secondary-color-4: hsl(0, 0%, 36.1%); - --secondary-color-4-hue: 0; - --secondary-color-4-saturation: 0%; - --secondary-color-4-lightness: 36.1%; - - - -/* ----------- NUANCES COLORS ---------------- */ ---theme-nuance-color-1: hsl(165.2, 82.1%, 35.1%); - --theme-nuance-color-1-hue: 165.2; - --theme-nuance-color-1-saturation: 82.1%; - --theme-nuance-color-1-lightness: 35.1%; - ---theme-nuance-color-2: hsl(165.2, 82.1%, 35.1%); - --theme-nuance-color-2-hue: 165.2; - --theme-nuance-color-2-saturation: 82.1%; - --theme-nuance-color-2-lightness: 35.1%; - ---theme-nuance-color-3: hsl(165.2, 81.1%, 35.3%); - --theme-nuance-color-3-hue: 165.2; - --theme-nuance-color-3-saturation: 81.1%; - --theme-nuance-color-3-lightness: 35.3%; - ---theme-nuance-color-4: hsl(164.9, 81.6%, 27.6%); - --theme-nuance-color-4-hue: 164.9; - --theme-nuance-color-4-saturation: 81.6%; - --theme-nuance-color-4-lightness: 27.6%; - - - -/* ----------- ROYGP COLORS ------------------ */ ---theme-red-color: hsl(0.3, 80%, 50%); ---theme-orange-color: #e76f51; ---theme-yellow-color: hsl(60, 70.6%, 73.3%); ---theme-green-color: #A3BE8C; ---theme-purple-color: hsl(0.3, 70%, 45%); - - - -/* ------------------------------------------- */ ---background-color-1: var(--primary-color-1); ---background-color-2: var(--primary-color-2); ---background-color-3: var(--primary-color-3); ---background-color-4: var(--primary-color-4); - 
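-/* Note on the convention shared by all these theme files: each color is declared
-   twice, once as a ready-made hsl() value and once broken out into separate
-   *-hue / *-saturation / *-lightness custom properties. Only the numeric
-   components can take part in calc() arithmetic, which is what the hover and
-   active button rules further below rely on to derive shades, e.g. (a sketch of
-   the pattern, not an additional rule from the original file):
-   --button-primary-color-hover:
-       hsl(165.2,
-           calc(var(--theme-nuance-color-3-saturation) - 2%),
-           calc(var(--theme-nuance-color-3-lightness) - 10%));
-*/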
---border-color-1: var(--primary-color-2); ---border-color-2: var(--primary-color-3); ---border-color-3: var(--primary-color-4); - ---border-focus-color: var(--theme-nuance-color-2); ---border-focus-shadow: var(--theme-nuance-color-1); - ---text-color-plain: var(--secondary-color-1); ---text-color-subtile-1: var(--secondary-color-2); ---text-color-subtile-2: var(--secondary-color-3); - ---code-background-color: var(--secondary-color-2); ---code-text-color: var(--primary-color-2); - ---ui-range-thumb-color: var(--primary-color-4); ---ui-range-thumb-border: var(--ui-ranger-thumb-color); - ---textarea-border-color: var(--secondary-color-4); - ---chat-id-color: var(--theme-nuance-color-4); - - - -/* ------------------------------------------- */ ---button-alert-text-hover: var(--primary-color-1); ---button-alert-color-hover: var(--theme-purple-color); ---button-alert-border-hover: var(--theme-purple-color); - ---button-alert-text-active: var(--primary-color-1); ---button-alert-color-active: var(--theme-red-color); ---button-alert-border-active: var(--theme-red-color); - - - -/* ----------- PRIMARY BUTTONS --------------- */ -/* - button should immediately catch the eye - */ ---button-primary-text: - hsl(0, - calc(var(--primary-color-1-saturation) - 100%), - calc(var(--primary-color-1-lightness) + 100%)); - ---button-primary-color: var(--theme-nuance-color-3); ---button-primary-border: var(--theme-nuance-color-3); - - -/* ---------hover---------- */ ---button-primary-text-hover: - hsl(0, - calc(var(--primary-color-1-saturation) - 100%), - calc(var(--primary-color-1-lightness) + 100%)); - ---button-primary-color-hover: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - ---button-primary-border-hover: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - - -/* ---------active--------- */ ---button-primary-text-active: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 100%), - calc(var(--theme-nuance-color-3-lightness) + 100%)); - ---button-primary-color-active: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 10%), - calc(var(--theme-nuance-color-3-lightness) - 15%)); - ---button-primary-border-active: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) + 10%)); - - - -/* ---------- SECONDARY BUTTONS -------------- */ -/* these should NOT immediately catch the eye */ ---button-secondary-text: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) - 50%)); - ---button-secondary-color: var(--primary-color-3); ---button-secondary-border: var(--primary-color-3); - - -/* ---------hover---------- */ ---button-secondary-text-hover: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) - 80%)); - ---button-secondary-color-hover: var(--primary-color-4); ---button-secondary-border-hover: var(--primary-color-4); - - -/* ---------active--------- */ ---button-secondary-text-active: - hsl(165.2, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) - 80%)); - ---button-secondary-color-active: - hsl(0, - calc(var(--primary-color-4-saturation) - 30%), - calc(var(--primary-color-4-lightness) - 15%)); - ---button-secondary-border-active: - hsl(0, - calc(var(--primary-color-4-saturation) - 30%), - calc(var(--primary-color-4-lightness) - 
15%)); - - - -/* ---------- TERTIARY BUTTONS --------------- */ -/* ---------- disabled buttons --------------- */ ---button-tertiary-text: var(--primary-color-4); ---button-tertiary-color: var(--primary-color-2); ---button-tertiary-border: var(--primary-color-2); - - -/* ---------hover---------- */ ---button-tertiary-text: var(--primary-color-4); ---button-tertiary-color: var(--primary-color-2); ---button-tertiary-border: var(--primary-color-2); - -} diff --git a/examples/server/public/theme-polarnight.css b/examples/server/public/theme-polarnight.css deleted file mode 100755 index 2bcfb33d8..000000000 --- a/examples/server/public/theme-polarnight.css +++ /dev/null @@ -1,253 +0,0 @@ -/* Author: Yazan Agha-Schrader */ -/* Inspiration from Nord Theme https://www.nordtheme.com/docs/colors-and-palettes */ - -.theme-polarnight { - -/* ---------- PRIMARY COLORS ----------------- */ ---primary-color-1: hsl(220.0, 16.4%, 21.6%) ; - --primary-color-1-hue: 220.0; - --primary-color-1-saturation: 16.4%; - --primary-color-1-lightness: 21.6%; - ---primary-color-2: hsl(221.7, 16.3%, 27.6%) ; - -primary-color-2-hue: 221.7; - --primary-color-2-saturation: 16.3%; - --primary-color-2-lightness: 27.6%; - ---primary-color-3: hsl(220.0, 16.8%, 31.6%) ; - --primary-color-3-hue: 220.0; - --primary-color-3-saturation: 16.8%; - --primary-color-3-lightness: 31.6%; - ---primary-color-4: hsl(220.0, 16.5%, 35.7%); - --primary-color-4-hue: 220.0; - --primary-color-4-saturation: 16.5%; - --primary-color-4-lightness: 35.7%; - - - -/* ---------- SECONDARY COLORS --------------- */ ---secondary-color-1: hsl(217.5, 26.7%, 94.1%); - --secondary-color-1-hue: 217.5; - --secondary-color-1-saturation: 26.7%; - --secondary-color-1-lightness: 94.1%; - ---secondary-color-2: hsl(218.2, 26.8%, 92.0%); - --secondary-color-2-hue: 218.2; - --secondary-color-2-saturation: 26.8%; - --secondary-color-2-lightness: 92.0%; - ---secondary-color-3: hsl(218.8, 27.9%, 88.0%); - --secondary-color-3-hue: 218.8; - --secondary-color-3-saturation: 27.9%; - --secondary-color-3-lightness: 88.0%; - ---secondary-color-4: hsl(218.8, 18.3%, 81.8%); - --secondary-color-4-hue: 218.8; - --secondary-color-4-saturation: 18.3%; - --secondary-color-4-lightness: 81.8%; - - - -/* ----------- NUANCES COLORS ---------------- */ ---theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%); - --theme-nuance-color-1-hue: 178.7; - --theme-nuance-color-1-saturation: 25.1%; - --theme-nuance-color-1-lightness: 64.9%; - ---theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%); - --theme-nuance-color-2-hue: 193.3; - --theme-nuance-color-2-saturation: 43.4%; - --theme-nuance-color-2-lightness: 67.5%; - ---theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%); - --theme-nuance-color-3-hue: 210.0; - --theme-nuance-color-3-saturation: 34.0%; - --theme-nuance-color-3-lightness: 63.1%; - ---theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%); - --theme-nuance-color-4-hue: 213.1; - --theme-nuance-color-4-saturation: 32.0%; - --theme-nuance-color-4-lightness: 52.2%; - - - -/* ----------- ROYGP COLORS ------------------ */ ---theme-red-color: hsl(354.3, 42.3%, 56.5%); ---theme-orange-color: hsl(20, 85%, 50%); ---theme-yellow-color: hsl(20, 75%, 45%); ---theme-green-color: hsl( 92.4, 27.8%, 64.7%); ---theme-purple-color: hsl(311.1, 20.2%, 63.1%); - - - -/* ------------------------------------------------ */ ---background-color-1: var(--primary-color-1); ---background-color-2: var(--primary-color-2); ---background-color-3: var(--primary-color-3); ---background-color-4: var(--primary-color-4); - ---border-color-1: 
var(--primary-color-2); ---border-color-2: var(--primary-color-3); ---border-color-3: var(--primary-color-4); - ---border-focus-color: var(--theme-nuance-color-2); ---border-focus-shadow: var(--theme-nuance-color-1); - ---text-color-plain: var(--secondary-color-1); ---text-color-subtile-1: var(--secondary-color-2); ---text-color-subtile-2: var(--secondary-color-3); - ---code-background-color: var(--secondary-color-2); ---code-text-color: var(--primary-color-2); - ---ui-range-thumb-color: var(--theme-nuance-color-3); ---ui-range-thumb-border: var(--ui-ranger-thumb-color); - ---textarea-border-color: var(--secondary-color-4); - ---chat-id-color: var(--theme-nuance-color-4); - - - -/* ------------------------------------------- */ ---button-alert-text-hover: var(--secondary-color-1); ---button-alert-color-hover: var(--theme-yellow-color); ---button-alert-border-hover: var(--theme-yellow-color); - ---button-alert-text-active: var(--secondary-color-1); ---button-alert-color-active: var(--theme-orange-color); ---button-alert-border-active: var(--theme-orange-color); - - - -/* ----------- PRIMARY BUTTONS --------------- */ -/* - button should immediately catch the eye - */ ---button-primary-text: var(--secondary-color-1); ---button-primary-color: var(--theme-nuance-color-3); ---button-primary-border: var(--theme-nuance-color-3); - - -/* ---------hover---------- */ ---button-primary-text-hover: - hsl(217.5, - calc(var(--secondary-color-1-saturation) - 35%), - calc(var(--secondary-color-1-lightness) + 30%)); - ---button-primary-color-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - ---button-primary-border-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - - -/* ---------active--------- */ ---button-primary-text-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) + 35%)); - ---button-primary-color-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 10%), - calc(var(--theme-nuance-color-3-lightness) - 25%)); - ---button-primary-border-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 10%), - calc(var(--theme-nuance-color-3-lightness) - 25%)); - - - -/* ---------- SECONDARY BUTTONS -------------- */ -/* these should NOT immediately catch the eye */ ---button-secondary-text: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) - 50%)); - ---button-secondary-color: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) + 10%)); - ---button-secondary-border: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) + 10%)); - - -/* ---------hover---------- */ ---button-secondary-text-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) - 80%)); - ---button-secondary-color-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 22%), - calc(var(--theme-nuance-color-3-lightness) + 1%)); - ---button-secondary-border-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 22%), - calc(var(--theme-nuance-color-3-lightness) + 1%)); - - -/* ---------active--------- */ ---button-secondary-text-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) + 25%)); - 
---button-secondary-color-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 30%), - calc(var(--theme-nuance-color-3-lightness) - 15%)); - ---button-secondary-border-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 30%), - calc(var(--theme-nuance-color-3-lightness) - 15%)); - - - -/* ---------- TERTIARY BUTTONS --------------- */ -/* ---------- disabled buttons --------------- */ ---button-tertiary-text: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) - 5%)); - ---button-tertiary-color: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) + 20%)); - ---button-tertiary-border: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) + 20%)); - - -/* ---------hover---------- */ ---button-tertiary-text-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) - 5%)); - ---button-tertiary-color-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) + 20%)); - ---button-tertiary-border-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) + 20%)); - -} diff --git a/examples/server/public/theme-snowstorm.css b/examples/server/public/theme-snowstorm.css deleted file mode 100755 index 7bb227594..000000000 --- a/examples/server/public/theme-snowstorm.css +++ /dev/null @@ -1,251 +0,0 @@ -/* Author: Yazan Agha-Schrader */ -/* Inspiration from Nord Theme https://www.nordtheme.com/docs/colors-and-palettes */ - -.theme-snowstorm { - -/* ---------- PRIMARY COLORS ----------------- */ ---primary-color-1: hsl(217.5, 26.7%, 94.1%); - --primary-color-1-hue: 217.5; - --primary-color-1-saturation: 26.7%; - --primary-color-1-lightness: 94.1%; - ---primary-color-2: hsl(218.2, 26.8%, 92.0%); - --primary-color-2-hue: 218.2; - --primary-color-2-saturation: 26.8%; - --primary-color-2-lightness: 92.0%; - ---primary-color-3: hsl(218.8, 27.9%, 88.0%); - --primary-color-3-hue: 218.8; - --primary-color-3-saturation: 27.9%; - --primary-color-3-lightness: 88.0%; - ---primary-color-4: hsl(218.8, 18.3%, 81.8%); - --primary-color-4-hue: 218.8; - --primary-color-4-saturation: 18.3%; - --primary-color-4-lightness: 81.8%; - - -/* ---------- SECONDARY COLORS --------------- */ ---secondary-color-1: hsl(220.0, 16.4%, 21.6%); - --secondary-color-1-hue: 220.0; - --secondary-color-1-saturation: 16.4%; - --secondary-color-1-lightness: 21.6%; - ---secondary-color-2: hsl(221.7, 16.3%, 27.6%); - --secondary-color-2-hue: 221.7; - --secondary-color-2-saturation: 16.3%; - --secondary-color-2-lightness: 27.6%; - ---secondary-color-3: hsl(220.0, 16.8%, 31.6%); - --secondary-color-3-hue: 220.0; - --secondary-color-3-saturation: 16.8%; - --secondary-color-3-lightness: 31.6%; - ---secondary-color-4: hsl(220.0, 16.5%, 35.7%); - --secondary-color-4-hue: 220.0; - --secondary-color-4-saturation: 16.5%; - --secondary-color-4-lightness: 35.7%; - - - -/* ----------- NUANCES COLORS ---------------- */ ---theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%); - --theme-nuance-color-1-hue: 178.7; - --theme-nuance-color-1-saturation: 25.1%; - --theme-nuance-color-1-lightness: 64.9%; - ---theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%); - --theme-nuance-color-2-hue: 193.3; - --theme-nuance-color-2-saturation: 43.4%; - --theme-nuance-color-2-lightness: 67.5%; - 
---theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%); - --theme-nuance-color-3-hue: 210.0; - --theme-nuance-color-3-saturation: 34.0%; - --theme-nuance-color-3-lightness: 63.1%; - ---theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%); - --theme-nuance-color-4-hue: 213.1; - --theme-nuance-color-4-saturation: 32.0%; - --theme-nuance-color-4-lightness: 52.2%; - - - -/* ----------- ROYGP COLORS ------------------ */ ---theme-red-color: hsl(32.5, 80%, 50%); ---theme-orange-color: hsl(32.5, 70%, 45%); ---theme-yellow-color: hsl(40.0, 0.6%, 73.3%); ---theme-green-color: hsl(92.4, 27.8%, 64.7%); ---theme-purple-color: hsl(311.1, 20.2%, 63.1%); - - - -/* ------------------------------------------- */ ---background-color-1: var(--primary-color-1); ---background-color-2: var(--primary-color-2); ---background-color-3: var(--primary-color-3); ---background-color-4: var(--primary-color-4); - ---border-color-1: var(--primary-color-2); ---border-color-2: var(--primary-color-3); ---border-color-3: var(--primary-color-4); - ---border-focus-color: var(--theme-nuance-color-2); ---border-focus-shadow: var(--theme-nuance-color-1); - ---text-color-plain: var(--secondary-color-1); ---text-color-subtile-1: var(--secondary-color-2); ---text-color-subtile-2: var(--secondary-color-3); - ---code-background-color: var(--secondary-color-2); ---code-text-color: var(--primary-color-2); - ---ui-range-thumb-color: var(--theme-nuance-color-3); ---ui-range-thumb-border: var(--ui-ranger-thumb-color); - ---textarea-border-color: var(--secondary-color-4); - ---chat-id-color: var(--theme-nuance-color-4); - - - -/* ------------------------------------------- */ ---button-alert-text-hover: var(--primary-color-1); ---button-alert-color-hover: var(--theme-orange-color); ---button-alert-border-hover: var(--theme-orange-color); - ---button-alert-text-active: var(--primary-color-1); ---button-alert-color-active: var(--theme-red-color); ---button-alert-border-active: var(--theme-red-color); - - - -/* ----------- PRIMARY BUTTONS --------------- */ -/* - button should immediately catch the eye - */ ---button-primary-text: var(--secondary-color-1); ---button-primary-color: var(--theme-nuance-color-3); ---button-primary-border: var(--theme-nuance-color-3); - - -/* ---------hover---------- */ ---button-primary-text-hover: - hsl(217.5, - calc(var(--secondary-color-1-saturation) + 35%), - calc(var(--secondary-color-1-lightness) - 30%)); - ---button-primary-color-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - ---button-primary-border-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 2%), - calc(var(--theme-nuance-color-3-lightness) - 10%)); - - -/* ---------active--------- */ ---button-primary-text-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) + 35%)); - ---button-primary-color-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 10%), - calc(var(--theme-nuance-color-3-lightness) - 25%)); - ---button-primary-border-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 10%), - calc(var(--theme-nuance-color-3-lightness) - 25%)); - - - -/* ---------- SECONDARY BUTTONS -------------- */ -/* these should NOT immediately catch the eye */ ---button-secondary-text: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) - 50%)); - ---button-secondary-color: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 
20%), - calc(var(--theme-nuance-color-3-lightness) + 10%)); - ---button-secondary-border: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) + 10%)); - - -/* ---------hover---------- */ ---button-secondary-text-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 20%), - calc(var(--theme-nuance-color-3-lightness) - 80%)); - ---button-secondary-color-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 22%), - calc(var(--theme-nuance-color-3-lightness) + 1%)); - ---button-secondary-border-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 22%), - calc(var(--theme-nuance-color-3-lightness) + 1%)); - - -/* ---------active--------- */ ---button-secondary-text-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) + 40%), - calc(var(--theme-nuance-color-3-lightness) - 55%)); - ---button-secondary-color-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 30%), - calc(var(--theme-nuance-color-3-lightness) - 5%)); - ---button-secondary-border-active: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 30%), - calc(var(--theme-nuance-color-3-lightness) - 5%)); - - - -/* ---------- TERTIARY BUTTONS --------------- */ -/* ---------- disabled buttons --------------- */ ---button-tertiary-text: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) - 5%)); - ---button-tertiary-color: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) + 20%)); - ---button-tertiary-border: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) + 20%)); - -/* ---------hover---------- */ ---button-tertiary-text-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) - 5%)); - ---button-tertiary-color-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) + 20%)); - ---button-tertiary-border-hover: - hsl(210, - calc(var(--theme-nuance-color-3-saturation) - 40%), - calc(var(--theme-nuance-color-3-lightness) + 20%)); - -} diff --git a/examples/server/public_simplechat/datautils.mjs b/examples/server/public_simplechat/datautils.mjs deleted file mode 100644 index 75159d6b1..000000000 --- a/examples/server/public_simplechat/datautils.mjs +++ /dev/null @@ -1,266 +0,0 @@ -//@ts-check -// Helpers to work with different data types -// by Humans for All -// - -/** - * Given the limited context size of local LLMs and , many a times when context gets filled - * between the prompt and the response, it can lead to repeating text garbage generation. - * And many a times setting penalty wrt repeatation leads to over-intelligent garbage - * repeatation with slight variations. These garbage inturn can lead to overloading of the - * available model context, leading to less valuable response for subsequent prompts/queries, - * if chat history is sent to ai model. - * - * So two simple minded garbage trimming logics are experimented below. - * * one based on progressively-larger-substring-based-repeat-matching-with-partial-skip and - * * another based on char-histogram-driven garbage trimming. - * * in future characteristic of histogram over varying lengths could be used to allow for - * a more aggressive and adaptive trimming logic. - */ - - -/** - * Simple minded logic to help remove repeating garbage at end of the string. 
- * The repeatation needs to be perfectly matching. - * - * The logic progressively goes on probing for longer and longer substring based - * repeatation, till there is no longer repeatation. Inturn picks the one with - * the longest chain. - * - * @param {string} sIn - * @param {number} maxSubL - * @param {number} maxMatchLenThreshold - */ -export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold=40) { - let rCnt = [0]; - let maxMatchLen = maxSubL; - let iMML = -1; - for(let subL=1; subL < maxSubL; subL++) { - rCnt.push(0); - let i; - let refS = sIn.substring(sIn.length-subL, sIn.length); - for(i=sIn.length; i > 0; i -= subL) { - let curS = sIn.substring(i-subL, i); - if (refS != curS) { - let curMatchLen = rCnt[subL]*subL; - if (maxMatchLen < curMatchLen) { - maxMatchLen = curMatchLen; - iMML = subL; - } - break; - } - rCnt[subL] += 1; - } - } - console.debug("DBUG:DU:TrimRepeatGarbage:", rCnt); - if ((iMML == -1) || (maxMatchLen < maxMatchLenThreshold)) { - return {trimmed: false, data: sIn}; - } - console.debug("DBUG:TrimRepeatGarbage:TrimmedCharLen:", maxMatchLen); - let iEnd = sIn.length - maxMatchLen; - return { trimmed: true, data: sIn.substring(0, iEnd) }; -} - - -/** - * Simple minded logic to help remove repeating garbage at end of the string, till it cant. - * If its not able to trim, then it will try to skip a char at end and then trim, a few times. - * This ensures that even if there are multiple runs of garbage with different patterns, the - * logic still tries to munch through them. - * - * @param {string} sIn - * @param {number} maxSubL - * @param {number | undefined} [maxMatchLenThreshold] - */ -export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThreshold, skipMax=16) { - let sCur = sIn; - let sSaved = ""; - let iTry = 0; - while(true) { - let got = trim_repeat_garbage_at_end(sCur, maxSubL, maxMatchLenThreshold); - if (got.trimmed != true) { - if (iTry == 0) { - sSaved = got.data; - } - iTry += 1; - if (iTry >= skipMax) { - return sSaved; - } - got.data = got.data.substring(0,got.data.length-1); - } else { - iTry = 0; - } - sCur = got.data; - } -} - - -/** - * A simple minded try trim garbage at end using histogram driven characteristics. - * There can be variation in the repeatations, as long as no new char props up. - * - * This tracks the chars and their frequency in a specified length of substring at the end - * and inturn checks if moving further into the generated text from the end remains within - * the same char subset or goes beyond it and based on that either trims the string at the - * end or not. This allows to filter garbage at the end, including even if there are certain - * kind of small variations in the repeated text wrt position of seen chars. - * - * Allow the garbage to contain upto maxUniq chars, but at the same time ensure that - * a given type of char ie numerals or alphabets or other types dont cross the specified - * maxType limit. This allows intermixed text garbage to be identified and trimmed. - * - * ALERT: This is not perfect and only provides a rough garbage identification logic. - * Also it currently only differentiates between character classes wrt english. 
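- *
- * Usage sketch (illustrative only, not part of the original file), combining the
- * two trimming families with the same tuning values trim_garbage_at_end below uses:
- *   let sClean = trim_hist_garbage_at_end_loop(sResponse, 8, 24, 72);
- *   sClean = trim_repeat_garbage_at_end_loop(sClean, 32, 72, 12);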
- *
- * @param {string} sIn
- * @param {number} maxType
- * @param {number} maxUniq
- * @param {number} maxMatchLenThreshold
- */
-export function trim_hist_garbage_at_end(sIn, maxType, maxUniq, maxMatchLenThreshold) {
-    if (sIn.length < maxMatchLenThreshold) {
-        return { trimmed: false, data: sIn };
-    }
-    let iAlp = 0;
-    let iNum = 0;
-    let iOth = 0;
-    // Learn: histogram the chars in the trailing window, while counting how many
-    // unique alphabetic, numeric and other chars are seen.
-    let hist = {};
-    let iUniq = 0;
-    for(let i=0; i<maxMatchLenThreshold; i++) {
-        let c = sIn[sIn.length-1-i];
-        if (c in hist) {
-            hist[c] += 1;
-        } else {
-            if (c.match(/[0-9]/) != null) {
-                iNum += 1;
-            } else if (c.match(/[A-Za-z]/) != null) {
-                iAlp += 1;
-            } else {
-                iOth += 1;
-            }
-            iUniq += 1;
-            if (iUniq >= maxUniq) {
-                break;
-            }
-            hist[c] = 1;
-        }
-    }
-    console.debug("DBUG:TrimHistGarbage:", hist);
-    if ((iAlp > maxType) || (iNum > maxType) || (iOth > maxType)) {
-        return { trimmed: false, data: sIn };
-    }
-    // Catch and Trim
-    for(let i=0; i < sIn.length; i++) {
-        let c = sIn[sIn.length-1-i];
-        if (!(c in hist)) {
-            if (i < maxMatchLenThreshold) {
-                return { trimmed: false, data: sIn };
-            }
-            console.debug("DBUG:TrimHistGarbage:TrimmedCharLen:", i);
-            return { trimmed: true, data: sIn.substring(0, sIn.length-i+1) };
-        }
-    }
-    console.debug("DBUG:TrimHistGarbage:Trimmed fully");
-    return { trimmed: true, data: "" };
-}
-
-/**
- * Keep trimming repeatedly using the hist_garbage logic, till you no longer can.
- * This ensures that even if there are multiple runs of garbage with different patterns,
- * the logic still tries to munch through them.
- *
- * @param {any} sIn
- * @param {number} maxType
- * @param {number} maxUniq
- * @param {number} maxMatchLenThreshold
- */
-export function trim_hist_garbage_at_end_loop(sIn, maxType, maxUniq, maxMatchLenThreshold) {
-    let sCur = sIn;
-    while (true) {
-        let got = trim_hist_garbage_at_end(sCur, maxType, maxUniq, maxMatchLenThreshold);
-        if (!got.trimmed) {
-            return got.data;
-        }
-        sCur = got.data;
-    }
-}
-
-/**
- * Try to trim garbage at the end by using both the hist-driven garbage trimming as well as
- * the skip-a-bit-if-reqd-then-repeat-pattern-based garbage trimming, with blind retrying.
- * @param {string} sIn
- */
-export function trim_garbage_at_end(sIn) {
-    let sCur = sIn;
-    for(let i=0; i<2; i++) {
-        sCur = trim_hist_garbage_at_end_loop(sCur, 8, 24, 72);
-        sCur = trim_repeat_garbage_at_end_loop(sCur, 32, 72, 12);
-    }
-    return sCur;
-}
-
-
-/**
- * NewLines array helper.
- * Allows maintaining a list of lines.
- * Allows a line to be built up / appended part by part.
- */
-export class NewLines {
-
-    constructor() {
-        /** @type {string[]} */
-        this.lines = [];
-    }
-
-    /**
-     * Extracts lines from the passed string and in turn either
-     * appends to a previous partial line or adds a new line.
-     * @param {string} sLines
-     */
-    add_append(sLines) {
-        let aLines = sLines.split("\n");
-        let lCnt = 0;
-        for(let line of aLines) {
-            lCnt += 1;
-            // Add back the newline removed, if any, during split
-            if (lCnt < aLines.length) {
-                line += "\n";
-            } else {
-                if (sLines.endsWith("\n")) {
-                    line += "\n";
-                }
-            }
-            // Append to the previous partial line, if required
-            if (lCnt == 1) {
-                let lastLine = this.lines[this.lines.length-1];
-                if (lastLine != undefined) {
-                    if (!lastLine.endsWith("\n")) {
-                        this.lines[this.lines.length-1] += line;
-                        continue;
-                    }
-                }
-            }
-            // Add new line
-            this.lines.push(line);
-        }
-    }
-
-    /**
-     * Shift the oldest/earliest/0th line in the array. [Old-New|Earliest-Latest]
-     * Optionally control whether only full lines (i.e. those with a newline at the end) will be
-     * returned, or whether a partial line without a newline at the end (can only be the last line)
-     * may be returned too.
- * @param {boolean} bFullWithNewLineOnly - */ - shift(bFullWithNewLineOnly=true) { - let line = this.lines[0]; - if (line == undefined) { - return undefined; - } - if ((line[line.length-1] != "\n") && bFullWithNewLineOnly){ - return undefined; - } - return this.lines.shift(); - } - -} diff --git a/examples/server/public_simplechat/index.html b/examples/server/public_simplechat/index.html deleted file mode 100644 index f6413016f..000000000 --- a/examples/server/public_simplechat/index.html +++ /dev/null @@ -1,51 +0,0 @@ - - - - SimpleChat LlamaCppEtal - - - - - - - - - - - -
- [The remaining markup of this 51-line file was lost to tag stripping during extraction; the recoverable fragments are a "SimpleChat" page heading and a fallback notice reading "You need to have javascript enabled."]
diff --git a/examples/server/public_simplechat/readme.md b/examples/server/public_simplechat/readme.md
deleted file mode 100644
index 21410199f..000000000
--- a/examples/server/public_simplechat/readme.md
+++ /dev/null
@@ -1,286 +0,0 @@
-
-# SimpleChat
-
-by Humans for All.
-
-## quickstart
-
-To run from the build dir
-
-bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
-
-Continue reading for the details.
-
-## overview
-
-This simple web frontend allows triggering/testing the server's /completions or /chat/completions endpoints
-in a simple way, with minimal code, from a common code base. In turn it additionally tries to allow single or
-multiple independent back-and-forth chat sessions with the ai llm model, at a basic level, each with its
-own system prompt.
-
-This allows seeing the generated text / ai-model response in one shot at the end, after it is fully generated,
-or potentially as it is being generated, in a streamed manner from the server/ai-model.
-
-![Chat and Settings screens](./simplechat_screens.webp "Chat and Settings screens")
-
-Auto saves the chat session locally as and when the chat is progressing, and in turn at a later time when you
-open SimpleChat, the option is provided to restore the old chat session, if a matching one exists.
-
-The UI follows a responsive web design, so that the layout can adapt to the available display space in a
-usable enough manner, in general.
-
-Allows the developer/end-user to control some of the behaviour by updating gMe members from the browser's
-devel-tool console. In parallel, some of the settings directly useful to the end-user can also be changed
-using the provided settings ui.
-
-NOTE: The current web service api doesn't expose the model context length directly, so the client logic
-doesn't provide any adaptive culling of old messages, nor replacing them with a summary of their content, etc.
-However there is an optional sliding-window based chat logic, which provides a simple-minded culling of old
-messages from the chat history before sending to the ai model.
-
-NOTE: Wrt options sent with the request, it mainly sets temperature, max_tokens and optionally stream for now.
-However if someone wants, they can update the js file or the equivalent member in gMe as needed.
-
-NOTE: One may be able to use this to chat with an openai api web-service /chat/completions endpoint, in a very
-limited / minimal way. One will need to set the model, openai url and authorization bearer key in the settings ui.
-
-
-## usage
-
-One could run this web frontend directly using the server itself, or, if anyone is thinking of adding a
-built-in web frontend to configure the server over http(s) or so, then run this web frontend using something
-like python's http module.
-
-### running using examples/server
-
-./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]
-
-### running using python3's server module
-
-first run examples/server
-* ./llama-server -m path/model.gguf
-
-next run this web front end in examples/server/public_simplechat
-* cd ../examples/server/public_simplechat
-* python3 -m http.server PORT
-
-### using the front end
-
-Open this simple web front end from your local browser
-
-* http://127.0.0.1:PORT/index.html
-
-Once inside
-
-* If you want to, you can change many of the default global settings
-  * the base url (ie ip addr / domain name, port)
-  * chat (default) vs completion mode
-  * try trim garbage in response or not
-  * amount of chat history in the context sent to server/ai-model
-  * oneshot or streamed mode.
-
-* In completion mode
-  * one normally doesn't use a system prompt in completion mode.
-  * the logic by default doesn't insert any role-specific "ROLE: " prefix wrt each role's message.
-    If the model requires a prefix wrt user role messages, then the end user has to
-    explicitly add the needed prefix when they enter their chat message.
-    Similarly, if the model requires a prefix to trigger the assistant/ai-model response,
-    then the end user needs to enter the same.
-    This keeps the logic simple, while still giving the end user the flexibility to
-    manage any templating/tagging requirement wrt their messages to the model.
-  * the logic doesn't insert a newline at the beginning and end of the generated prompt message.
-    However, if the chat being sent to the /completions endpoint has more than one role's message,
-    then a newline is inserted when moving from one role's message to the next role's message, so
-    that they can be clearly identified/distinguished.
-  * given that the /completions endpoint normally doesn't add chat-templating of its
-    own, the above ensures that the end user can create a custom single/multi message combo, with
-    any tags/special-tokens related chat templating, to test out the model handshake. Or the end
-    user can use it just for a normal completion related/based query.
-
-* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
-  Normally Completion mode doesn't need a system prompt, while Chat mode can generate better/more
-  interesting responses with a suitable system prompt.
-  * if chat.add_system_begin is used
-    * you can't change the system prompt, after it has been submitted once along with a user query.
-    * you can't set a system prompt, after you have submitted any user query.
-  * if chat.add_system_anytime is used
-    * one can change the system prompt at any time during the chat, by changing its contents.
-    * in turn the updated/changed system prompt will be inserted into the chat session.
-    * this allows the subsequent user chatting to be driven by the new system prompt set above.
-
-* Enter your query and either press enter or click on the submit button.
-  If you want to insert enter (\n) as part of your chat/query to the ai model, use shift+enter.
-
-* Wait for the logic to communicate with the server and get the response.
-  * the user is not allowed to enter any fresh query during this time.
-  * the user input box will be disabled and a working message will be shown in it.
-  * if trim garbage is enabled, the logic will try to trim repeating-text kind of garbage to some extent.
-
-* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
-
-* Using NewChat one can start independent chat sessions.
-  * two independent chat sessions are set up by default.
-
-* When you want to print, switching ChatHistoryInCtxt to Full and clicking on the chat session button
-  of interest will display the full chat history of that session, if you want the full history for printing.
-
-
-## Devel note
-
-### Reason behind this
-
-The idea is to be easy enough to use for basic purposes, while also being simple and easily discernible
-by developers who may not be from a web frontend background (and so in turn may not be familiar with
-template / end-use-specific-language-extensions driven flows), so that they can use it to explore/experiment
-with things.
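-
-For instance, the completion mode behaviour described under "using the front end" above (no automatic
-"ROLE: " prefixes, a newline inserted only when moving between different roles' messages) amounts to
-roughly the following sketch. The function and field names here are illustrative, not taken from the
-actual sources.
-
-    // Sketch: build a /completions prompt from chat messages.
-    // No role prefix is added; a newline separates different roles' messages.
-    function buildCompletionPrompt(messages) {
-        let prompt = "";
-        let prevRole = null;
-        for (const m of messages) {
-            if ((prevRole != null) && (prevRole != m.role)) {
-                prompt += "\n";
-            }
-            prompt += m.content;
-            prevRole = m.role;
-        }
-        return prompt;
-    }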
-
-And given that the idea is also to help developers explore/experiment, some flexibility is provided
-to change behaviour easily, using the devel-tools/console or the provided minimal settings ui (wrt a
-few aspects). Skeletal logic has been implemented to explore some of the end points and the
-ideas/implications around them.
-
-
-### General
-
-Me/gMe consolidates the settings which control the behaviour into one object.
-One can see the current settings, as well as change/update them, using the browser's devel-tool/console.
-It is attached to the document object. Some of these can also be updated using the Settings UI.
-
-  baseURL - the domain-name/ip-address, and in turn the port, to send the request to.
-
-  bStream - control between oneshot-at-end and live-stream-as-its-generated collating and showing
-  of the generated response.
-
-    the logic assumes that the text sent from the server follows utf-8 encoding.
-
-    in streaming mode - if there is any exception, the logic traps the same and tries to ensure
-    that the text generated till then is not lost.
-
-      if a very long text is being generated, which leads to no user interaction for some time, and
-      in turn the machine goes into power saving mode or so, the platform may stop the network
-      connection, leading to an exception.
-
-  apiEP - select between the /completions and /chat/completions endpoints provided by the server/ai-model.
-
-  bCompletionFreshChatAlways - whether Completion mode collates the complete/sliding-window history
-  when communicating with the server, or only sends the latest user query/message.
-
-  bCompletionInsertStandardRolePrefix - whether Completion mode inserts a role-related prefix wrt the
-  messages that get inserted into the prompt field wrt the /completions endpoint.
-
-  bTrimGarbage - whether garbage repetition at the end of the generated ai response should be
-  trimmed or left as is. If enabled, it will be trimmed, so that it won't be sent back as part of the
-  subsequent chat history. At the same time the actual trimmed text is shown to the user once,
-  when it is generated, so the user can check if any useful info/data was there in the response.
-
-    One may be able to request the ai-model to continue (wrt the last response) (if chat-history
-    is enabled as part of the chat-history-in-context setting), and chances are the ai-model will
-    continue starting from the trimmed part, thus allowing the long response to be recovered/continued
-    indirectly, in many cases.
-
-    The histogram/freq based trimming logic is currently tuned for the english language, wrt its
-    is-it-an-alphabetic|numeral-char regex match logic.
-
-  apiRequestOptions - maintains the list of options/fields to send along with the api request,
-  irrespective of whether the /chat/completions or the /completions endpoint is used.
-
-    If you want to add additional options/fields to send to the server/ai-model, and or
-    modify the existing options' values or remove them, for now you can update this global var
-    using the browser's development-tools/console.
-
-    For string, numeric and boolean fields in apiRequestOptions, including even those added by a
-    user at runtime by directly modifying gMe.apiRequestOptions, settings ui entries will be auto
-    created.
-
-    The cache_prompt option supported by examples/server is allowed to be controlled by the user, so
-    that any caching supported wrt the system-prompt and chat history can get used, if usable. When
-    the chat history sliding window is enabled, the cache_prompt logic may or may not kick in at the
-    backend wrt the same, based on aspects related to the model, positional encoding, attention
-    mechanism etc.
-
-
-Sometimes the browser may be stubborn about caching the files, so your updates to the html/css/js
-may not be visible. Also remember that just refreshing/reloading the page in the browser, or for
-that matter clearing site data, doesn't directly override site caching in all cases. Worst case you
-may have to change the port. Or in the browser's dev tools you may be able to disable caching fully.
-
-
-Currently the server to communicate with is maintained globally and not as part of a specific
-chat session. So if one changes the server ip/url in the settings, then all chat sessions will auto
-switch to this new server, when you try using those sessions.
-
-
-By switching between chat.add_system_begin/anytime, one can control whether the system prompt
-can be changed anytime during the conversation, or only at the beginning.
-
-
-### Default setup
-
-By default things are setup to try and make the user experience a bit better, if possible.
-However a developer testing the server or the ai-model may want to change these values.
-
-iRecentUserMsgCnt is used to reduce the chat history context sent to the server/ai-model to
-just the system-prompt, the previous user-request-and-ai-response, and the current user-request,
-instead of the full chat history. This way if there is any response with garbage/repetition, it
-doesn't mess with things beyond the next question/request/query, in some ways. The trim garbage
-option also tries to help avoid issues with garbage in the context, to an extent.
-
-max_tokens is set to 1024, so that a relatively large previous response doesn't eat up the space
-available for the next query-response. However don't forget that the server should also be started
-with a model context size of 1k or more, to be on the safe side.
-
-  The /completions endpoint of examples/server doesn't take max_tokens, instead it takes the
-  internal n_predict; for now the same is added here on the client side. Maybe later max_tokens
-  can be added to the /completions endpoint handling code on the server side.
-
-NOTE: One may want to experiment with the frequency/presence penalty fields in apiRequestOptions,
-wrt the set of fields sent to the server along with the user query, to check how the model behaves
-wrt repetitions in general in the generated text response.
-
-An end user can change these behaviours by editing gMe from the browser's devel-tool/console, or by
-using the provided settings ui (for settings exposed through the ui).
-
-
-### OpenAI / Equivalent API WebService
-
-One may be able to handshake with an OpenAI/equivalent api web service's /chat/completions endpoint,
-for minimal chatting experimentation, by setting the below (a console sketch follows this section).
-
-* the baseUrl in the settings ui
-  * https://api.openai.com/v1 or similar
-
-* wrt the request body - gMe.apiRequestOptions
-  * model (settings ui)
-  * any additional fields if required in future
-
-* wrt the request headers - gMe.headers
-  * Authorization (available through the settings ui)
-    * Bearer THE_OPENAI_API_KEY
-  * any additional optional header entries like "OpenAI-Organization", "OpenAI-Project" or so
-
-NOTE: Not tested, as there is no free tier api testing available. However logically this might
-work.
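-
-As a hypothetical (and, as noted, untested) console sketch of the above, with THE_OPENAI_API_KEY
-as a placeholder:
-
-```js
-// point the frontend at the external service; /chat/completions gets appended by ApiEP.Url
-gMe.baseURL = "https://api.openai.com/v1";
-gMe.apiEP = "chat";
-// the service picks the model from the request body, not the url
-gMe.apiRequestOptions["model"] = "gpt-3.5-turbo";
-// sent as the Authorization header; it is skipped when left empty
-gMe.headers["Authorization"] = "Bearer THE_OPENAI_API_KEY";
-```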
-
-
-## At the end
-
-Also a thank you to all open source and open model developers, who strive for the common good.
diff --git a/examples/server/public_simplechat/simplechat.css b/examples/server/public_simplechat/simplechat.css
deleted file mode 100644
index 13bfb80b4..000000000
--- a/examples/server/public_simplechat/simplechat.css
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * the styling of the simplechat web frontend
- * by Humans for All
- */
-
-#fullbody {
-    height: 98vh;
-}
-
-.heading {
-    background-color: lightgray;
-}
-
-.session-selected {
-    background-color: lightblue;
-}
-
-.role-system {
-    background-color: lightblue;
-}
-.role-user {
-    background-color: lightgray;
-}
-.role-trim {
-    background-color: lightpink;
-}
-
-.gridx2 {
-    display: grid;
-    grid-template-columns: repeat(2, 1fr);
-    border-bottom-style: dotted;
-    border-bottom-width: thin;
-    border-bottom-color: lightblue;
-}
-
-.flex-grow {
-    flex-grow: 1;
-}
-.float-right {
-    float: right;
-}
-
-#chat-div {
-    overflow: scroll;
-    flex-grow: 1;
-    flex-shrink: 1;
-    min-height: 40vh;
-}
-button {
-    min-width: 8vw;
-}
-
-.sameline {
-    display: flex;
-    flex-direction: row;
-}
-.samecolumn {
-    display: flex;
-    flex-direction: column;
-}
-
-.ul1 {
-    padding-inline-start: 2vw;
-}
-.ul2 {
-    padding-inline-start: 2vw;
-}
-
-* {
-    margin: 0.6vmin;
-}
-
-@media print {
-
-    #fullbody {
-        height: auto;
-    }
-
-}
diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js
deleted file mode 100644
index 8e0df3b61..000000000
--- a/examples/server/public_simplechat/simplechat.js
+++ /dev/null
@@ -1,926 +0,0 @@
-// @ts-check
-// A simple completions and chat/completions test related web front end logic
-// by Humans for All
-
-import * as du from "./datautils.mjs";
-import * as ui from "./ui.mjs"
-
-class Roles {
-    static System = "system";
-    static User = "user";
-    static Assistant = "assistant";
-}
-
-class ApiEP {
-    static Type = {
-        Chat: "chat",
-        Completion: "completion",
-    }
-    static UrlSuffix = {
-        'chat': `/chat/completions`,
-        'completion': `/completions`,
-    }
-
-    /**
-     * Build the url from given baseUrl and apiEp id.
-     * @param {string} baseUrl
-     * @param {string} apiEP
-     */
-    static Url(baseUrl, apiEP) {
-        if (baseUrl.endsWith("/")) {
-            baseUrl = baseUrl.substring(0, baseUrl.length-1);
-        }
-        return `${baseUrl}${this.UrlSuffix[apiEP]}`;
-    }
-
-}
-
-
-let gUsageMsg = `

-    <p class="role-system">Usage</p>
-    <ul class="ul1">
-    <li> System prompt above, to try control ai response characteristics.</li>
-    <ul class="ul2">
-    <li> Completion mode - no system prompt normally.</li>
-    </ul>
-    <li> Use shift+enter for inserting enter/newline.</li>
-    <li> Enter your query to ai assistant below.</li>
-    <li> Default ContextWindow = [System, Last Query+Resp, Cur Query].</li>
-    <ul class="ul2">
-    <li> ChatHistInCtxt, MaxTokens, ModelCtxt window to expand</li>
-    </ul>
-    </ul>
-`; - - -/** @typedef {{role: string, content: string}[]} ChatMessages */ - -/** @typedef {{iLastSys: number, xchat: ChatMessages}} SimpleChatODS */ - -class SimpleChat { - - /** - * @param {string} chatId - */ - constructor(chatId) { - this.chatId = chatId; - /** - * Maintain in a form suitable for common LLM web service chat/completions' messages entry - * @type {ChatMessages} - */ - this.xchat = []; - this.iLastSys = -1; - this.latestResponse = ""; - } - - clear() { - this.xchat = []; - this.iLastSys = -1; - } - - ods_key() { - return `SimpleChat-${this.chatId}` - } - - save() { - /** @type {SimpleChatODS} */ - let ods = {iLastSys: this.iLastSys, xchat: this.xchat}; - localStorage.setItem(this.ods_key(), JSON.stringify(ods)); - } - - load() { - let sods = localStorage.getItem(this.ods_key()); - if (sods == null) { - return; - } - /** @type {SimpleChatODS} */ - let ods = JSON.parse(sods); - this.iLastSys = ods.iLastSys; - this.xchat = ods.xchat; - } - - /** - * Recent chat messages. - * If iRecentUserMsgCnt < 0 - * Then return the full chat history - * Else - * Return chat messages from latest going back till the last/latest system prompt. - * While keeping track that the number of user queries/messages doesnt exceed iRecentUserMsgCnt. - * @param {number} iRecentUserMsgCnt - */ - recent_chat(iRecentUserMsgCnt) { - if (iRecentUserMsgCnt < 0) { - return this.xchat; - } - if (iRecentUserMsgCnt == 0) { - console.warn("WARN:SimpleChat:SC:RecentChat:iRecentUsermsgCnt of 0 means no user message/query sent"); - } - /** @type{ChatMessages} */ - let rchat = []; - let sysMsg = this.get_system_latest(); - if (sysMsg.length != 0) { - rchat.push({role: Roles.System, content: sysMsg}); - } - let iUserCnt = 0; - let iStart = this.xchat.length; - for(let i=this.xchat.length-1; i > this.iLastSys; i--) { - if (iUserCnt >= iRecentUserMsgCnt) { - break; - } - let msg = this.xchat[i]; - if (msg.role == Roles.User) { - iStart = i; - iUserCnt += 1; - } - } - for(let i = iStart; i < this.xchat.length; i++) { - let msg = this.xchat[i]; - if (msg.role == Roles.System) { - continue; - } - rchat.push({role: msg.role, content: msg.content}); - } - return rchat; - } - - /** - * Collate the latest response from the server/ai-model, as it is becoming available. - * This is mainly useful for the stream mode. - * @param {string} content - */ - append_response(content) { - this.latestResponse += content; - } - - /** - * Add an entry into xchat - * @param {string} role - * @param {string|undefined|null} content - */ - add(role, content) { - if ((content == undefined) || (content == null) || (content == "")) { - return false; - } - this.xchat.push( {role: role, content: content} ); - if (role == Roles.System) { - this.iLastSys = this.xchat.length - 1; - } - this.save(); - return true; - } - - /** - * Show the contents in the specified div - * @param {HTMLDivElement} div - * @param {boolean} bClear - */ - show(div, bClear=true) { - if (bClear) { - div.replaceChildren(); - } - let last = undefined; - for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) { - let entry = ui.el_create_append_p(`${x.role}: ${x.content}`, div); - entry.className = `role-${x.role}`; - last = entry; - } - if (last !== undefined) { - last.scrollIntoView(false); - } else { - if (bClear) { - div.innerHTML = gUsageMsg; - gMe.setup_load(div, this); - gMe.show_info(div); - } - } - return last; - } - - /** - * Setup the fetch headers. - * It picks the headers from gMe.headers. - * It inserts Authorization only if its non-empty. 
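- * Other header entries from gMe.headers are appended as is.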
- * @param {string} apiEP - */ - fetch_headers(apiEP) { - let headers = new Headers(); - for(let k in gMe.headers) { - let v = gMe.headers[k]; - if ((k == "Authorization") && (v.trim() == "")) { - continue; - } - headers.append(k, v); - } - return headers; - } - - /** - * Add needed fields wrt json object to be sent wrt LLM web services completions endpoint. - * The needed fields/options are picked from a global object. - * Add optional stream flag, if required. - * Convert the json into string. - * @param {Object} obj - */ - request_jsonstr_extend(obj) { - for(let k in gMe.apiRequestOptions) { - obj[k] = gMe.apiRequestOptions[k]; - } - if (gMe.bStream) { - obj["stream"] = true; - } - return JSON.stringify(obj); - } - - /** - * Return a string form of json object suitable for chat/completions - */ - request_messages_jsonstr() { - let req = { - messages: this.recent_chat(gMe.iRecentUserMsgCnt), - } - return this.request_jsonstr_extend(req); - } - - /** - * Return a string form of json object suitable for /completions - * @param {boolean} bInsertStandardRolePrefix Insert ": " as prefix wrt each role's message - */ - request_prompt_jsonstr(bInsertStandardRolePrefix) { - let prompt = ""; - let iCnt = 0; - for(const chat of this.recent_chat(gMe.iRecentUserMsgCnt)) { - iCnt += 1; - if (iCnt > 1) { - prompt += "\n"; - } - if (bInsertStandardRolePrefix) { - prompt += `${chat.role}: `; - } - prompt += `${chat.content}`; - } - let req = { - prompt: prompt, - } - return this.request_jsonstr_extend(req); - } - - /** - * Return a string form of json object suitable for specified api endpoint. - * @param {string} apiEP - */ - request_jsonstr(apiEP) { - if (apiEP == ApiEP.Type.Chat) { - return this.request_messages_jsonstr(); - } else { - return this.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix); - } - } - - /** - * Extract the ai-model/assistant's response from the http response got. - * Optionally trim the message wrt any garbage at the end. - * @param {any} respBody - * @param {string} apiEP - */ - response_extract(respBody, apiEP) { - let assistant = ""; - if (apiEP == ApiEP.Type.Chat) { - assistant = respBody["choices"][0]["message"]["content"]; - } else { - try { - assistant = respBody["choices"][0]["text"]; - } catch { - assistant = respBody["content"]; - } - } - return assistant; - } - - /** - * Extract the ai-model/assistant's response from the http response got in streaming mode. - * @param {any} respBody - * @param {string} apiEP - */ - response_extract_stream(respBody, apiEP) { - let assistant = ""; - if (apiEP == ApiEP.Type.Chat) { - if (respBody["choices"][0]["finish_reason"] !== "stop") { - assistant = respBody["choices"][0]["delta"]["content"]; - } - } else { - try { - assistant = respBody["choices"][0]["text"]; - } catch { - assistant = respBody["content"]; - } - } - return assistant; - } - - /** - * Allow setting of system prompt, but only at begining. 
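- * Trying to set a different system prompt mid chat is logged as an error and ignored.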
- * @param {string} sysPrompt - * @param {string} msgTag - */ - add_system_begin(sysPrompt, msgTag) { - if (this.xchat.length == 0) { - if (sysPrompt.length > 0) { - return this.add(Roles.System, sysPrompt); - } - } else { - if (sysPrompt.length > 0) { - if (this.xchat[0].role !== Roles.System) { - console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`); - } else { - if (this.xchat[0].content !== sysPrompt) { - console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`); - } - } - } - } - return false; - } - - /** - * Allow setting of system prompt, at any time. - * @param {string} sysPrompt - * @param {string} msgTag - */ - add_system_anytime(sysPrompt, msgTag) { - if (sysPrompt.length <= 0) { - return false; - } - - if (this.iLastSys < 0) { - return this.add(Roles.System, sysPrompt); - } - - let lastSys = this.xchat[this.iLastSys].content; - if (lastSys !== sysPrompt) { - return this.add(Roles.System, sysPrompt); - } - return false; - } - - /** - * Retrieve the latest system prompt. - */ - get_system_latest() { - if (this.iLastSys == -1) { - return ""; - } - let sysPrompt = this.xchat[this.iLastSys].content; - return sysPrompt; - } - - - /** - * Handle the multipart response from server/ai-model - * @param {Response} resp - * @param {string} apiEP - * @param {HTMLDivElement} elDiv - */ - async handle_response_multipart(resp, apiEP, elDiv) { - let elP = ui.el_create_append_p("", elDiv); - if (!resp.body) { - throw Error("ERRR:SimpleChat:SC:HandleResponseMultiPart:No body..."); - } - let tdUtf8 = new TextDecoder("utf-8"); - let rr = resp.body.getReader(); - this.latestResponse = ""; - let xLines = new du.NewLines(); - while(true) { - let { value: cur, done: done } = await rr.read(); - if (cur) { - let curBody = tdUtf8.decode(cur, {stream: true}); - console.debug("DBUG:SC:PART:Str:", curBody); - xLines.add_append(curBody); - } - while(true) { - let curLine = xLines.shift(!done); - if (curLine == undefined) { - break; - } - if (curLine.trim() == "") { - continue; - } - if (curLine.startsWith("data:")) { - curLine = curLine.substring(5); - } - let curJson = JSON.parse(curLine); - console.debug("DBUG:SC:PART:Json:", curJson); - this.append_response(this.response_extract_stream(curJson, apiEP)); - } - elP.innerText = this.latestResponse; - elP.scrollIntoView(false); - if (done) { - break; - } - } - console.debug("DBUG:SC:PART:Full:", this.latestResponse); - return this.latestResponse; - } - - /** - * Handle the oneshot response from server/ai-model - * @param {Response} resp - * @param {string} apiEP - */ - async handle_response_oneshot(resp, apiEP) { - let respBody = await resp.json(); - console.debug(`DBUG:SimpleChat:SC:${this.chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`); - return this.response_extract(respBody, apiEP); - } - - /** - * Handle the response from the server be it in oneshot or multipart/stream mode. - * Also take care of the optional garbage trimming. 
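- * In stream mode, any partial response collated before an exception is still retained.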
- * @param {Response} resp - * @param {string} apiEP - * @param {HTMLDivElement} elDiv - */ - async handle_response(resp, apiEP, elDiv) { - let theResp = { - assistant: "", - trimmed: "", - } - if (gMe.bStream) { - try { - theResp.assistant = await this.handle_response_multipart(resp, apiEP, elDiv); - this.latestResponse = ""; - } catch (error) { - theResp.assistant = this.latestResponse; - this.add(Roles.Assistant, theResp.assistant); - this.latestResponse = ""; - throw error; - } - } else { - theResp.assistant = await this.handle_response_oneshot(resp, apiEP); - } - if (gMe.bTrimGarbage) { - let origMsg = theResp.assistant; - theResp.assistant = du.trim_garbage_at_end(origMsg); - theResp.trimmed = origMsg.substring(theResp.assistant.length); - } - this.add(Roles.Assistant, theResp.assistant); - return theResp; - } - -} - - -class MultiChatUI { - - constructor() { - /** @type {Object} */ - this.simpleChats = {}; - /** @type {string} */ - this.curChatId = ""; - - // the ui elements - this.elInSystem = /** @type{HTMLInputElement} */(document.getElementById("system-in")); - this.elDivChat = /** @type{HTMLDivElement} */(document.getElementById("chat-div")); - this.elBtnUser = /** @type{HTMLButtonElement} */(document.getElementById("user-btn")); - this.elInUser = /** @type{HTMLInputElement} */(document.getElementById("user-in")); - this.elDivHeading = /** @type{HTMLSelectElement} */(document.getElementById("heading")); - this.elDivSessions = /** @type{HTMLDivElement} */(document.getElementById("sessions-div")); - this.elBtnSettings = /** @type{HTMLButtonElement} */(document.getElementById("settings")); - - this.validate_element(this.elInSystem, "system-in"); - this.validate_element(this.elDivChat, "chat-div"); - this.validate_element(this.elInUser, "user-in"); - this.validate_element(this.elDivHeading, "heading"); - this.validate_element(this.elDivChat, "sessions-div"); - this.validate_element(this.elBtnSettings, "settings"); - } - - /** - * Check if the element got - * @param {HTMLElement | null} el - * @param {string} msgTag - */ - validate_element(el, msgTag) { - if (el == null) { - throw Error(`ERRR:SimpleChat:MCUI:${msgTag} element missing in html...`); - } else { - console.debug(`INFO:SimpleChat:MCUI:${msgTag} Id[${el.id}] Name[${el["name"]}]`); - } - } - - /** - * Reset user input ui. - * * clear user input - * * enable user input - * * set focus to user input - */ - ui_reset_userinput() { - this.elInUser.value = ""; - this.elInUser.disabled = false; - this.elInUser.focus(); - } - - /** - * Setup the needed callbacks wrt UI, curChatId to defaultChatId and - * optionally switch to specified defaultChatId. - * @param {string} defaultChatId - * @param {boolean} bSwitchSession - */ - setup_ui(defaultChatId, bSwitchSession=false) { - - this.curChatId = defaultChatId; - if (bSwitchSession) { - this.handle_session_switch(this.curChatId); - } - - this.elBtnSettings.addEventListener("click", (ev)=>{ - this.elDivChat.replaceChildren(); - gMe.show_settings(this.elDivChat); - }); - - this.elBtnUser.addEventListener("click", (ev)=>{ - if (this.elInUser.disabled) { - return; - } - this.handle_user_submit(this.curChatId, gMe.apiEP).catch((/** @type{Error} */reason)=>{ - let msg = `ERRR:SimpleChat\nMCUI:HandleUserSubmit:${this.curChatId}\n${reason.name}:${reason.message}`; - console.error(msg.replace("\n", ":")); - alert(msg); - this.ui_reset_userinput(); - }); - }); - - this.elInUser.addEventListener("keyup", (ev)=> { - // allow user to insert enter into their message using shift+enter. 
- // while just pressing enter key will lead to submitting. - if ((ev.key === "Enter") && (!ev.shiftKey)) { - let value = this.elInUser.value; - this.elInUser.value = value.substring(0,value.length-1); - this.elBtnUser.click(); - ev.preventDefault(); - } - }); - - this.elInSystem.addEventListener("keyup", (ev)=> { - // allow user to insert enter into the system prompt using shift+enter. - // while just pressing enter key will lead to setting the system prompt. - if ((ev.key === "Enter") && (!ev.shiftKey)) { - let value = this.elInSystem.value; - this.elInSystem.value = value.substring(0,value.length-1); - let chat = this.simpleChats[this.curChatId]; - chat.add_system_anytime(this.elInSystem.value, this.curChatId); - chat.show(this.elDivChat); - ev.preventDefault(); - } - }); - - } - - /** - * Setup a new chat session and optionally switch to it. - * @param {string} chatId - * @param {boolean} bSwitchSession - */ - new_chat_session(chatId, bSwitchSession=false) { - this.simpleChats[chatId] = new SimpleChat(chatId); - if (bSwitchSession) { - this.handle_session_switch(chatId); - } - } - - - /** - * Handle user query submit request, wrt specified chat session. - * @param {string} chatId - * @param {string} apiEP - */ - async handle_user_submit(chatId, apiEP) { - - let chat = this.simpleChats[chatId]; - - // In completion mode, if configured, clear any previous chat history. - // So if user wants to simulate a multi-chat based completion query, - // they will have to enter the full thing, as a suitable multiline - // user input/query. - if ((apiEP == ApiEP.Type.Completion) && (gMe.bCompletionFreshChatAlways)) { - chat.clear(); - } - - chat.add_system_anytime(this.elInSystem.value, chatId); - - let content = this.elInUser.value; - if (!chat.add(Roles.User, content)) { - console.debug(`WARN:SimpleChat:MCUI:${chatId}:HandleUserSubmit:Ignoring empty user input...`); - return; - } - chat.show(this.elDivChat); - - let theUrl = ApiEP.Url(gMe.baseURL, apiEP); - let theBody = chat.request_jsonstr(apiEP); - - this.elInUser.value = "working..."; - this.elInUser.disabled = true; - console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:${theUrl}:ReqBody:${theBody}`); - let theHeaders = chat.fetch_headers(apiEP); - let resp = await fetch(theUrl, { - method: "POST", - headers: theHeaders, - body: theBody, - }); - - let theResp = await chat.handle_response(resp, apiEP, this.elDivChat); - if (chatId == this.curChatId) { - chat.show(this.elDivChat); - if (theResp.trimmed.length > 0) { - let p = ui.el_create_append_p(`TRIMMED:${theResp.trimmed}`, this.elDivChat); - p.className="role-trim"; - } - } else { - console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`); - } - this.ui_reset_userinput(); - } - - /** - * Show buttons for NewChat and available chat sessions, in the passed elDiv. - * If elDiv is undefined/null, then use this.elDivSessions. - * Take care of highlighting the selected chat-session's btn. 
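- * Creating a new session or switching sessions is refused while a query is pending.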
- * @param {HTMLDivElement | undefined} elDiv - */ - show_sessions(elDiv=undefined) { - if (!elDiv) { - elDiv = this.elDivSessions; - } - elDiv.replaceChildren(); - // Btn for creating new chat session - let btnNew = ui.el_create_button("New CHAT", (ev)=> { - if (this.elInUser.disabled) { - console.error(`ERRR:SimpleChat:MCUI:NewChat:Current session [${this.curChatId}] awaiting response, ignoring request...`); - alert("ERRR:SimpleChat\nMCUI:NewChat\nWait for response to pending query, before starting new chat session"); - return; - } - let chatId = `Chat${Object.keys(this.simpleChats).length}`; - let chatIdGot = prompt("INFO:SimpleChat\nMCUI:NewChat\nEnter id for new chat session", chatId); - if (!chatIdGot) { - console.error("ERRR:SimpleChat:MCUI:NewChat:Skipping based on user request..."); - return; - } - this.new_chat_session(chatIdGot, true); - this.create_session_btn(elDiv, chatIdGot); - ui.el_children_config_class(elDiv, chatIdGot, "session-selected", ""); - }); - elDiv.appendChild(btnNew); - // Btns for existing chat sessions - let chatIds = Object.keys(this.simpleChats); - for(let cid of chatIds) { - let btn = this.create_session_btn(elDiv, cid); - if (cid == this.curChatId) { - btn.className = "session-selected"; - } - } - } - - create_session_btn(elDiv, cid) { - let btn = ui.el_create_button(cid, (ev)=>{ - let target = /** @type{HTMLButtonElement} */(ev.target); - console.debug(`DBUG:SimpleChat:MCUI:SessionClick:${target.id}`); - if (this.elInUser.disabled) { - console.error(`ERRR:SimpleChat:MCUI:SessionClick:${target.id}:Current session [${this.curChatId}] awaiting response, ignoring switch...`); - alert("ERRR:SimpleChat\nMCUI:SessionClick\nWait for response to pending query, before switching"); - return; - } - this.handle_session_switch(target.id); - ui.el_children_config_class(elDiv, target.id, "session-selected", ""); - }); - elDiv.appendChild(btn); - return btn; - } - - /** - * Switch ui to the specified chatId and set curChatId to same. - * @param {string} chatId - */ - async handle_session_switch(chatId) { - let chat = this.simpleChats[chatId]; - if (chat == undefined) { - console.error(`ERRR:SimpleChat:MCUI:HandleSessionSwitch:${chatId} missing...`); - return; - } - this.elInSystem.value = chat.get_system_latest(); - this.elInUser.value = ""; - chat.show(this.elDivChat); - this.elInUser.focus(); - this.curChatId = chatId; - console.log(`INFO:SimpleChat:MCUI:HandleSessionSwitch:${chatId} entered...`); - } - -} - - -class Me { - - constructor() { - this.baseURL = "http://127.0.0.1:8080"; - this.defaultChatIds = [ "Default", "Other" ]; - this.multiChat = new MultiChatUI(); - this.bStream = true; - this.bCompletionFreshChatAlways = true; - this.bCompletionInsertStandardRolePrefix = false; - this.bTrimGarbage = true; - this.iRecentUserMsgCnt = 2; - this.sRecentUserMsgCnt = { - "Full": -1, - "Last0": 1, - "Last1": 2, - "Last2": 3, - "Last4": 5, - }; - this.apiEP = ApiEP.Type.Chat; - this.headers = { - "Content-Type": "application/json", - "Authorization": "", // Authorization: Bearer OPENAI_API_KEY - } - // Add needed fields wrt json object to be sent wrt LLM web services completions endpoint. - this.apiRequestOptions = { - "model": "gpt-3.5-turbo", - "temperature": 0.7, - "max_tokens": 1024, - "n_predict": 1024, - "cache_prompt": false, - //"frequency_penalty": 1.2, - //"presence_penalty": 1.2, - }; - } - - /** - * Disable console.debug by mapping it to a empty function. 
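- * The original function is saved in this.console_debug, so it can be restored if required.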
- */ - debug_disable() { - this.console_debug = console.debug; - console.debug = () => { - - }; - } - - /** - * Setup the load saved chat ui. - * @param {HTMLDivElement} div - * @param {SimpleChat} chat - */ - setup_load(div, chat) { - if (!(chat.ods_key() in localStorage)) { - return; - } - div.innerHTML += `

-    <p class="role-system">Restore</p>
-    <p>Load previously saved chat session, if available</p>
`; - let btn = ui.el_create_button(chat.ods_key(), (ev)=>{ - console.log("DBUG:SimpleChat:SC:Load", chat); - chat.load(); - queueMicrotask(()=>{ - chat.show(div); - this.multiChat.elInSystem.value = chat.get_system_latest(); - }); - }); - div.appendChild(btn); - } - - /** - * Show the configurable parameters info in the passed Div element. - * @param {HTMLDivElement} elDiv - * @param {boolean} bAll - */ - show_info(elDiv, bAll=false) { - - let p = ui.el_create_append_p("Settings (devel-tools-console document[gMe])", elDiv); - p.className = "role-system"; - - if (bAll) { - - ui.el_create_append_p(`baseURL:${this.baseURL}`, elDiv); - - ui.el_create_append_p(`Authorization:${this.headers["Authorization"]}`, elDiv); - - ui.el_create_append_p(`bStream:${this.bStream}`, elDiv); - - ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv); - - ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv); - - ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv); - - ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv); - - ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv); - - } - - ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv); - ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv); - - } - - /** - * Auto create ui input elements for fields in apiRequestOptions - * Currently supports text and number field types. - * @param {HTMLDivElement} elDiv - */ - show_settings_apirequestoptions(elDiv) { - let typeDict = { - "string": "text", - "number": "number", - }; - let fs = document.createElement("fieldset"); - let legend = document.createElement("legend"); - legend.innerText = "ApiRequestOptions"; - fs.appendChild(legend); - elDiv.appendChild(fs); - for(const k in this.apiRequestOptions) { - let val = this.apiRequestOptions[k]; - let type = typeof(val); - if (((type == "string") || (type == "number"))) { - let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{ - if (type == "number") { - val = Number(val); - } - this.apiRequestOptions[k] = val; - }); - fs.appendChild(inp.div); - } else if (type == "boolean") { - let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{ - this.apiRequestOptions[k] = userVal; - }); - fs.appendChild(bbtn.div); - } - } - } - - /** - * Show settings ui for configurable parameters, in the passed Div element. 
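- * Creates text inputs, bool toggle buttons and selects for the exposed settings.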
- * @param {HTMLDivElement} elDiv - */ - show_settings(elDiv) { - - let inp = ui.el_creatediv_input("SetBaseURL", "BaseURL", "text", this.baseURL, (val)=>{ - this.baseURL = val; - }); - elDiv.appendChild(inp.div); - - inp = ui.el_creatediv_input("SetAuthorization", "Authorization", "text", this.headers["Authorization"], (val)=>{ - this.headers["Authorization"] = val; - }); - inp.el.placeholder = "Bearer OPENAI_API_KEY"; - elDiv.appendChild(inp.div); - - let bb = ui.el_creatediv_boolbutton("SetStream", "Stream", {true: "[+] yes stream", false: "[-] do oneshot"}, this.bStream, (val)=>{ - this.bStream = val; - }); - elDiv.appendChild(bb.div); - - bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{ - this.bTrimGarbage = val; - }); - elDiv.appendChild(bb.div); - - this.show_settings_apirequestoptions(elDiv); - - let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{ - this.apiEP = ApiEP.Type[val]; - }); - elDiv.appendChild(sel.div); - - sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{ - this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val]; - }); - elDiv.appendChild(sel.div); - - bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{ - this.bCompletionFreshChatAlways = val; - }); - elDiv.appendChild(bb.div); - - bb = ui.el_creatediv_boolbutton("SetCompletionInsertStandardRolePrefix", "CompletionInsertStandardRolePrefix", {true: "[+] yes insert", false: "[-] dont insert"}, this.bCompletionInsertStandardRolePrefix, (val)=>{ - this.bCompletionInsertStandardRolePrefix = val; - }); - elDiv.appendChild(bb.div); - - } - -} - - -/** @type {Me} */ -let gMe; - -function startme() { - console.log("INFO:SimpleChat:StartMe:Starting..."); - gMe = new Me(); - gMe.debug_disable(); - document["gMe"] = gMe; - document["du"] = du; - for (let cid of gMe.defaultChatIds) { - gMe.multiChat.new_chat_session(cid); - } - gMe.multiChat.setup_ui(gMe.defaultChatIds[0], true); - gMe.multiChat.show_sessions(); -} - -document.addEventListener("DOMContentLoaded", startme); diff --git a/examples/server/public_simplechat/simplechat_screens.webp b/examples/server/public_simplechat/simplechat_screens.webp deleted file mode 100644 index ccea44396..000000000 Binary files a/examples/server/public_simplechat/simplechat_screens.webp and /dev/null differ diff --git a/examples/server/public_simplechat/ui.mjs b/examples/server/public_simplechat/ui.mjs deleted file mode 100644 index b2d5b9aea..000000000 --- a/examples/server/public_simplechat/ui.mjs +++ /dev/null @@ -1,211 +0,0 @@ -//@ts-check -// Helpers to work with html elements -// by Humans for All -// - - -/** - * Set the class of the children, based on whether it is the idSelected or not. - * @param {HTMLDivElement} elBase - * @param {string} idSelected - * @param {string} classSelected - * @param {string} classUnSelected - */ -export function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") { - for(let child of elBase.children) { - if (child.id == idSelected) { - child.className = classSelected; - } else { - child.className = classUnSelected; - } - } -} - -/** - * Create button and set it up. 
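- * name and innerText default to the id, when not specified.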
- * @param {string} id - * @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback - * @param {string | undefined} name - * @param {string | undefined} innerText - */ -export function el_create_button(id, callback, name=undefined, innerText=undefined) { - if (!name) { - name = id; - } - if (!innerText) { - innerText = id; - } - let btn = document.createElement("button"); - btn.id = id; - btn.name = name; - btn.innerText = innerText; - btn.addEventListener("click", callback); - return btn; -} - -/** - * Create a para and set it up. Optionaly append it to a passed parent. - * @param {string} text - * @param {HTMLElement | undefined} elParent - * @param {string | undefined} id - */ -export function el_create_append_p(text, elParent=undefined, id=undefined) { - let para = document.createElement("p"); - para.innerText = text; - if (id) { - para.id = id; - } - if (elParent) { - elParent.appendChild(para); - } - return para; -} - -/** - * Create a button which represents bool value using specified text wrt true and false. - * When ever user clicks the button, it will toggle the value and update the shown text. - * - * @param {string} id - * @param {{true: string, false: string}} texts - * @param {boolean} defaultValue - * @param {function(boolean):void} cb - */ -export function el_create_boolbutton(id, texts, defaultValue, cb) { - let el = document.createElement("button"); - el["xbool"] = defaultValue; - el["xtexts"] = structuredClone(texts); - el.innerText = el["xtexts"][String(defaultValue)]; - if (id) { - el.id = id; - } - el.addEventListener('click', (ev)=>{ - el["xbool"] = !el["xbool"]; - el.innerText = el["xtexts"][String(el["xbool"])]; - cb(el["xbool"]); - }) - return el; -} - -/** - * Create a div wrapped button which represents bool value using specified text wrt true and false. - * @param {string} id - * @param {string} label - * @param {{ true: string; false: string; }} texts - * @param {boolean} defaultValue - * @param {(arg0: boolean) => void} cb - * @param {string} className - */ -export function el_creatediv_boolbutton(id, label, texts, defaultValue, cb, className="gridx2") { - let div = document.createElement("div"); - div.className = className; - let lbl = document.createElement("label"); - lbl.setAttribute("for", id); - lbl.innerText = label; - div.appendChild(lbl); - let btn = el_create_boolbutton(id, texts, defaultValue, cb); - div.appendChild(btn); - return { div: div, el: btn }; -} - - -/** - * Create a select ui element, with a set of options to select from. - * * options: an object which contains name-value pairs - * * defaultOption: the value whose name should be choosen, by default. - * * cb : the call back returns the name string of the option selected. 
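- * * the caller can map the returned name back to its value using the same options object.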
- * - * @param {string} id - * @param {Object} options - * @param {*} defaultOption - * @param {function(string):void} cb - */ -export function el_create_select(id, options, defaultOption, cb) { - let el = document.createElement("select"); - el["xselected"] = defaultOption; - el["xoptions"] = structuredClone(options); - for(let cur of Object.keys(options)) { - let op = document.createElement("option"); - op.value = cur; - op.innerText = cur; - if (options[cur] == defaultOption) { - op.selected = true; - } - el.appendChild(op); - } - if (id) { - el.id = id; - el.name = id; - } - el.addEventListener('change', (ev)=>{ - let target = /** @type{HTMLSelectElement} */(ev.target); - console.log("DBUG:UI:Select:", id, ":", target.value); - cb(target.value); - }) - return el; -} - -/** - * Create a div wrapped select ui element, with a set of options to select from. - * - * @param {string} id - * @param {any} label - * @param {{ [x: string]: any; }} options - * @param {any} defaultOption - * @param {(arg0: string) => void} cb - * @param {string} className - */ -export function el_creatediv_select(id, label, options, defaultOption, cb, className="gridx2") { - let div = document.createElement("div"); - div.className = className; - let lbl = document.createElement("label"); - lbl.setAttribute("for", id); - lbl.innerText = label; - div.appendChild(lbl); - let sel = el_create_select(id, options,defaultOption, cb); - div.appendChild(sel); - return { div: div, el: sel }; -} - - -/** - * Create a input ui element. - * - * @param {string} id - * @param {string} type - * @param {any} defaultValue - * @param {function(any):void} cb - */ -export function el_create_input(id, type, defaultValue, cb) { - let el = document.createElement("input"); - el.type = type; - el.value = defaultValue; - if (id) { - el.id = id; - } - el.addEventListener('change', (ev)=>{ - cb(el.value); - }) - return el; -} - -/** - * Create a div wrapped input. 
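- * Returns both the wrapping div and the contained input element.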
- * - * @param {string} id - * @param {string} label - * @param {string} type - * @param {any} defaultValue - * @param {function(any):void} cb - * @param {string} className - */ -export function el_creatediv_input(id, label, type, defaultValue, cb, className="gridx2") { - let div = document.createElement("div"); - div.className = className; - let lbl = document.createElement("label"); - lbl.setAttribute("for", id); - lbl.innerText = label; - div.appendChild(lbl); - let el = el_create_input(id, type, defaultValue, cb); - div.appendChild(el); - return { div: div, el: el }; -} diff --git a/examples/server/server.cpp b/examples/server/server.cpp deleted file mode 100644 index 7813a2957..000000000 --- a/examples/server/server.cpp +++ /dev/null @@ -1,3448 +0,0 @@ -#include "utils.hpp" - -#include "common.h" -#include "json-schema-to-grammar.h" -#include "llama.h" -#include "grammar-parser.h" - -#ifndef NDEBUG -// crash the server in debug mode, otherwise send an http 500 error -#define CPPHTTPLIB_NO_EXCEPTIONS 1 -#endif -// increase max payload length to allow use of larger context size -#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 -#include "httplib.h" -// Change JSON_ASSERT from assert() to GGML_ASSERT: -#define JSON_ASSERT GGML_ASSERT -#include "json.hpp" - -// auto generated files (update with ./deps.sh) -#include "colorthemes.css.hpp" -#include "style.css.hpp" -#include "theme-beeninorder.css.hpp" -#include "theme-ketivah.css.hpp" -#include "theme-mangotango.css.hpp" -#include "theme-playground.css.hpp" -#include "theme-polarnight.css.hpp" -#include "theme-snowstorm.css.hpp" -#include "index.html.hpp" -#include "index-new.html.hpp" -#include "index.js.hpp" -#include "completion.js.hpp" -#include "system-prompts.js.hpp" -#include "prompt-formats.js.hpp" -#include "json-schema-to-grammar.mjs.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using json = nlohmann::ordered_json; - -bool server_verbose = false; -bool server_log_json = true; - -enum stop_type { - STOP_TYPE_FULL, - STOP_TYPE_PARTIAL, -}; - -enum slot_state { - SLOT_STATE_IDLE, - SLOT_STATE_PROCESSING, -}; - -enum slot_command { - SLOT_COMMAND_NONE, - SLOT_COMMAND_LOAD_PROMPT, - SLOT_COMMAND_RELEASE, -}; - -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded - SERVER_STATE_ERROR // An error occurred, load_model failed -}; - -enum server_task_type { - SERVER_TASK_TYPE_COMPLETION, - SERVER_TASK_TYPE_CANCEL, - SERVER_TASK_TYPE_NEXT_RESPONSE, - SERVER_TASK_TYPE_METRICS, - SERVER_TASK_TYPE_SLOT_SAVE, - SERVER_TASK_TYPE_SLOT_RESTORE, - SERVER_TASK_TYPE_SLOT_ERASE, -}; - -struct server_task { - int id = -1; // to be filled by server_queue - int id_multi = -1; - int id_target = -1; - - server_task_type type; - json data; - - bool infill = false; - bool embedding = false; -}; - -struct server_task_result { - int id = -1; - int id_multi = -1; - - json data; - - bool stop; - bool error; -}; - -struct server_task_multi { - int id = -1; - - std::set subtasks_remaining; - std::vector results; -}; - -struct slot_params { - bool stream = true; - bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt - - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict - - 
std::vector antiprompt; - - json input_prefix; - json input_suffix; -}; - -struct server_slot { - int id; - int id_task = -1; - int id_multi = -1; - - struct slot_params params; - - slot_state state = SLOT_STATE_IDLE; - slot_command command = SLOT_COMMAND_NONE; - - // used to determine the slot that has been used the longest - int64_t t_last_used = -1; - - // generation props - int32_t n_ctx = 0; // context size per slot - int32_t n_past = 0; - int32_t n_decoded = 0; - int32_t n_remaining = -1; - int32_t i_batch = -1; - int32_t n_predict = -1; // TODO: disambiguate from params.n_predict - - int32_t n_prompt_tokens = 0; - int32_t n_prompt_tokens_processed = 0; - - json prompt; // can be either a string, array of strings or array of token ids - - // when a task is submitted, we first tokenize the prompt and store it here - std::vector prompt_tokens; - - std::string generated_text; - std::vector cache_tokens; - std::vector generated_token_probs; - - bool infill = false; - bool embedding = false; - bool has_next_token = true; - bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; - - bool oaicompat = false; - - std::string oaicompat_model; - std::string stopping_word; - - // sampling - llama_token sampled; - struct llama_sampling_params sparams; - llama_sampling_context * ctx_sampling = nullptr; - json json_schema; - - int32_t ga_i = 0; // group-attention state - int32_t ga_n = 1; // group-attention factor - int32_t ga_w = 512; // group-attention width - - int32_t n_past_se = 0; // self-extend - - // stats - size_t n_sent_text = 0; // number of sent text character - size_t n_sent_token_probs = 0; - - int64_t t_start_process_prompt; - int64_t t_start_generation; - - double t_prompt_processing; // ms - double t_token_generation; // ms - - void reset() { - n_prompt_tokens = 0; - generated_text = ""; - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - n_past = 0; - n_sent_text = 0; - n_sent_token_probs = 0; - infill = false; - ga_i = 0; - n_past_se = 0; - - generated_token_probs.clear(); - } - - bool has_budget(gpt_params &global_params) { - if (params.n_predict == -1 && global_params.n_predict == -1) { - return true; // limitless - } - - n_remaining = -1; - - if (params.n_predict != -1) { - n_remaining = params.n_predict - n_decoded; - } else if (global_params.n_predict != -1) { - n_remaining = global_params.n_predict - n_decoded; - } - - return n_remaining > 0; // no budget - } - - bool available() const { - return state == SLOT_STATE_IDLE && command == SLOT_COMMAND_NONE; - } - - bool is_processing() const { - return (state == SLOT_STATE_IDLE && command == SLOT_COMMAND_LOAD_PROMPT) || state == SLOT_STATE_PROCESSING; - } - - void add_token_string(const completion_token_output & token) { - if (command == SLOT_COMMAND_RELEASE) { - return; - } - generated_token_probs.push_back(token); - } - - void release() { - if (state == SLOT_STATE_PROCESSING) { - t_token_generation = (ggml_time_us() - t_start_generation) / 1e3; - command = SLOT_COMMAND_RELEASE; - } - } - - json get_formated_timings() const { - return json { - {"prompt_n", n_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed}, - {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - 
{"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; - } - - size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) { - size_t stop_pos = std::string::npos; - - for (const std::string & word : params.antiprompt) { - size_t pos; - - if (type == STOP_TYPE_FULL) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; - - pos = text.find(word, from_pos); - } else { - pos = find_partial_stop_string(word, text); - } - - if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_TYPE_FULL) { - stopped_word = true; - stopping_word = word; - has_next_token = false; - } - stop_pos = pos; - } - } - - return stop_pos; - } - - void print_timings() const { - char buffer[512]; - - double t_token = t_prompt_processing / n_prompt_tokens_processed; - double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - - snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", - t_prompt_processing, n_prompt_tokens_processed, - t_token, n_tokens_second); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_prompt_processing", t_prompt_processing}, - {"n_prompt_tokens_processed", n_prompt_tokens_processed}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - t_token = t_token_generation / n_decoded; - n_tokens_second = 1e3 / t_token_generation * n_decoded; - - snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", - t_token_generation, n_decoded, - t_token, n_tokens_second); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_token_generation", t_token_generation}, - {"n_decoded", n_decoded}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_prompt_processing", t_prompt_processing}, - {"t_token_generation", t_token_generation}, - {"t_total", t_prompt_processing + t_token_generation}, - }); - } -}; - -struct server_metrics { - int64_t t_start = 0; - - uint64_t n_prompt_tokens_processed_total = 0; - uint64_t t_prompt_processing_total = 0; - uint64_t n_tokens_predicted_total = 0; - uint64_t t_tokens_generation_total = 0; - - uint64_t n_prompt_tokens_processed = 0; - uint64_t t_prompt_processing = 0; - - uint64_t n_tokens_predicted = 0; - uint64_t t_tokens_generation = 0; - - void init() { - t_start = ggml_time_us(); - } - - void on_prompt_eval(const server_slot & slot) { - n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed; - n_prompt_tokens_processed += slot.n_prompt_tokens_processed; - t_prompt_processing += slot.t_prompt_processing; - t_prompt_processing_total += slot.t_prompt_processing; - } - - void on_prediction(const server_slot & slot) { - n_tokens_predicted_total += slot.n_decoded; - n_tokens_predicted += slot.n_decoded; - t_tokens_generation += slot.t_token_generation; - t_tokens_generation_total += slot.t_token_generation; - } - - void reset_bucket() { - n_prompt_tokens_processed = 0; - t_prompt_processing = 0; - n_tokens_predicted = 0; - t_tokens_generation = 0; - } -}; - -struct server_queue { - int id = 0; - bool running; - - // queues - std::vector queue_tasks; - std::vector queue_tasks_deferred; - - std::vector queue_multitasks; - - 
std::mutex mutex_tasks; - std::condition_variable condition_tasks; - - // callback functions - std::function callback_new_task; - std::function callback_finish_multitask; - std::function callback_update_slots; - - // Add a new task to the end of the queue - int post(server_task task) { - std::unique_lock lock(mutex_tasks); - if (task.id == -1) { - task.id = id++; - LOG_VERBOSE("new task id", {{"new_id", task.id}}); - } - queue_tasks.push_back(std::move(task)); - condition_tasks.notify_one(); - return task.id; - } - - // Add a new task, but defer until one slot is available - void defer(server_task task) { - std::unique_lock lock(mutex_tasks); - queue_tasks_deferred.push_back(std::move(task)); - } - - // Get the next id for creating anew task - int get_new_id() { - std::unique_lock lock(mutex_tasks); - int new_id = id++; - LOG_VERBOSE("new task id", {{"new_id", new_id}}); - return new_id; - } - - // Register function to process a new task - void on_new_task(std::function callback) { - callback_new_task = std::move(callback); - } - - // Register function to process a multitask when it is finished - void on_finish_multitask(std::function callback) { - callback_finish_multitask = std::move(callback); - } - - // Register the function to be called when all slots data is ready to be processed - void on_update_slots(std::function callback) { - callback_update_slots = std::move(callback); - } - - // Call when the state of one slot is changed - void notify_slot_changed() { - // move deferred tasks back to main loop - std::unique_lock lock(mutex_tasks); - for (auto & task : queue_tasks_deferred) { - queue_tasks.push_back(std::move(task)); - } - queue_tasks_deferred.clear(); - } - - // end the start_loop routine - void terminate() { - std::unique_lock lock(mutex_tasks); - running = false; - condition_tasks.notify_all(); - } - - /** - * Main loop consists of these steps: - * - Wait until a new task arrives - * - Process the task (i.e. 
maybe copy data into slot) - * - Check if multitask is finished - * - Update all slots - */ - void start_loop() { - running = true; - - while (true) { - LOG_VERBOSE("new task may arrive", {}); - - while (true) { - std::unique_lock lock(mutex_tasks); - if (queue_tasks.empty()) { - lock.unlock(); - break; - } - server_task task = queue_tasks.front(); - queue_tasks.erase(queue_tasks.begin()); - lock.unlock(); - LOG_VERBOSE("callback_new_task", {{"id_task", task.id}}); - callback_new_task(task); - } - - LOG_VERBOSE("update_multitasks", {}); - - // check if we have any finished multitasks - auto queue_iterator = queue_multitasks.begin(); - while (queue_iterator != queue_multitasks.end()) { - if (queue_iterator->subtasks_remaining.empty()) { - // all subtasks done == multitask is done - server_task_multi current_multitask = *queue_iterator; - callback_finish_multitask(current_multitask); - // remove this multitask - queue_iterator = queue_multitasks.erase(queue_iterator); - } else { - ++queue_iterator; - } - } - - // all tasks in the current loop is processed, slots data is now ready - LOG_VERBOSE("callback_update_slots", {}); - - callback_update_slots(); - - LOG_VERBOSE("wait for new task", {}); - { - std::unique_lock lock(mutex_tasks); - if (queue_tasks.empty()) { - if (!running) { - LOG_VERBOSE("ending start_loop", {}); - return; - } - condition_tasks.wait(lock, [&]{ - return (!queue_tasks.empty() || !running); - }); - } - } - } - } - - // - // functions to manage multitasks - // - - // add a multitask by specifying the id of all subtask (subtask is a server_task) - void add_multitask(int id_multi, std::vector & sub_ids) { - std::lock_guard lock(mutex_tasks); - server_task_multi multi; - multi.id = id_multi; - std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end())); - queue_multitasks.push_back(multi); - } - - // updatethe remaining subtasks, while appending results to multitask - void update_multitask(int id_multi, int id_sub, server_task_result & result) { - std::lock_guard lock(mutex_tasks); - for (auto & multitask : queue_multitasks) { - if (multitask.id == id_multi) { - multitask.subtasks_remaining.erase(id_sub); - multitask.results.push_back(result); - } - } - } -}; - -struct server_response { - typedef std::function callback_multitask_t; - callback_multitask_t callback_update_multitask; - - // for keeping track of all tasks waiting for the result - std::set waiting_task_ids; - - // the main result queue - std::vector queue_results; - - std::mutex mutex_results; - std::condition_variable condition_results; - - // add the id_task to the list of tasks waiting for response - void add_waiting_task_id(int id_task) { - LOG_VERBOSE("waiting for task id", {{"id_task", id_task}}); - - std::unique_lock lock(mutex_results); - waiting_task_ids.insert(id_task); - } - - // when the request is finished, we can remove task associated with it - void remove_waiting_task_id(int id_task) { - LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}}); - - std::unique_lock lock(mutex_results); - waiting_task_ids.erase(id_task); - } - - // This function blocks the thread until there is a response for this id_task - server_task_result recv(int id_task) { - while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&]{ - return !queue_results.empty(); - }); - - for (int i = 0; i < (int) queue_results.size(); i++) { - if (queue_results[i].id == id_task) { - assert(queue_results[i].id_multi == -1); - server_task_result 
res = queue_results[i]; - queue_results.erase(queue_results.begin() + i); - return res; - } - } - } - - // should never reach here - } - - // Register the function to update multitask - void on_multitask_update(callback_multitask_t callback) { - callback_update_multitask = std::move(callback); - } - - // Send a new result to a waiting id_task - void send(server_task_result result) { - LOG_VERBOSE("send new result", {{"id_task", result.id}}); - - std::unique_lock lock(mutex_results); - for (const auto & id_task : waiting_task_ids) { - // LOG_TEE("waiting task id %i \n", id_task); - // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result - if (result.id_multi == id_task) { - LOG_VERBOSE("callback_update_multitask", {{"id_task", id_task}}); - callback_update_multitask(id_task, result.id, result); - continue; - } - - if (result.id == id_task) { - LOG_VERBOSE("queue_results.push_back", {{"id_task", id_task}}); - queue_results.push_back(result); - condition_results.notify_all(); - return; - } - } - } -}; - -struct server_context { - llama_model * model = nullptr; - llama_context * ctx = nullptr; - - gpt_params params; - - llama_batch batch; - - bool clean_kv_cache = true; - bool add_bos_token = true; - - int32_t n_ctx; // total context for all clients / slots - - // system prompt - bool system_need_update = false; - - std::string system_prompt; - std::vector system_tokens; - - // slots / clients - std::vector slots; - json default_generation_settings_for_props; - - server_queue queue_tasks; - server_response queue_results; - - server_metrics metrics; - - // Necessary similarity of prompt for slot selection - float slot_prompt_similarity = 0.0f; - - ~server_context() { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - - if (model) { - llama_free_model(model); - model = nullptr; - } - - // Clear any sampling context - for (server_slot & slot : slots) { - if (slot.ctx_sampling != nullptr) { - llama_sampling_free(slot.ctx_sampling); - } - } - - llama_batch_free(batch); - } - - bool load_model(const gpt_params & params_) { - params = params_; - - // dedicate one sequence to the system prompt - params.n_parallel += 1; - - std::tie(model, ctx) = llama_init_from_gpt_params(params); - params.n_parallel -= 1; // but be sneaky about it - if (model == nullptr) { - LOG_ERROR("unable to load model", {{"model", params.model}}); - return false; - } - - n_ctx = llama_n_ctx(ctx); - - add_bos_token = llama_should_add_bos_token(model); - GGML_ASSERT(llama_add_eos_token(model) != 1); - - return true; - } - - bool validate_model_chat_template() const { - llama_chat_message chat[] = {{"user", "test"}}; - - const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0); - - return res > 0; - } - - void init() { - const int32_t n_ctx_slot = n_ctx / params.n_parallel; - - LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); - - for (int i = 0; i < params.n_parallel; i++) { - server_slot slot; - - slot.id = i; - slot.n_ctx = n_ctx_slot; - slot.n_predict = params.n_predict; - - LOG_INFO("new slot", { - {"id_slot", slot.id}, - {"n_ctx_slot", slot.n_ctx} - }); - - const int ga_n = params.grp_attn_n; - const int ga_w = params.grp_attn_w; - - if (ga_n != 1) { - GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT - GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT - //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT - //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n 
&& "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - - LOG_INFO("slot self-extend", { - {"id_slot", slot.id}, - {"ga_n", ga_n}, - {"ga_w", ga_w} - }); - } - - slot.ga_i = 0; - slot.ga_n = ga_n; - slot.ga_w = ga_w; - - slot.sparams = params.sparams; - - slot.reset(); - - slots.push_back(slot); - } - - default_generation_settings_for_props = get_formated_generation(slots.front()); - default_generation_settings_for_props["seed"] = -1; - - // the update_slots() logic will always submit a maximum of n_batch tokens - // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) - { - const int32_t n_batch = llama_n_batch(ctx); - - // only a single seq_id per token is needed - batch = llama_batch_init(n_batch, 0, 1); - } - - metrics.init(); - } - - std::vector tokenize(const json & json_prompt, bool add_special) const { - // TODO: currently, we tokenize using special tokens by default - // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) - // but it's better compared to completely ignoring ChatML and other chat templates - const bool TMP_FORCE_SPECIAL = true; - - // If `add_bos` is true, we only add BOS, when json_prompt is a string, - // or the first element of the json_prompt array is a string. - std::vector prompt_tokens; - - if (json_prompt.is_array()) { - bool first = true; - for (const auto & p : json_prompt) { - if (p.is_string()) { - auto s = p.template get(); - - std::vector p; - if (first) { - p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); - first = false; - } else { - p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); - } - - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); - } else { - if (first) { - first = false; - } - - prompt_tokens.push_back(p.template get()); - } - } - } else { - auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); - } - - return prompt_tokens; - } - - server_slot * get_slot_by_id(int id) { - for (server_slot & slot : slots) { - if (slot.id == id) { - return &slot; - } - } - - return nullptr; - } - - server_slot * get_available_slot(const std::string & prompt) { - server_slot * ret = nullptr; - - // find the slot that has at least n% prompt similarity - if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) { - int max_lcp_len = 0; - float similarity = 0; - - for (server_slot & slot : slots) { - // skip the slot if it is not available - if (!slot.available()) { - continue; - } - - // skip the slot if it does not contains prompt - if (!slot.prompt.is_string()) { - continue; - } - - // current slot's prompt - std::string slot_prompt = slot.prompt.get(); - - // length of the current slot's prompt - int slot_prompt_len = slot_prompt.size(); - - // length of the Longest Common Prefix between the current slot's prompt and the input prompt - int lcp_len = common_part(slot_prompt, prompt); - - // fraction of the common substring length compared to the current slot's prompt length - similarity = static_cast(lcp_len) / slot_prompt_len; - - // select the current slot if the criteria match - if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) { - max_lcp_len = lcp_len; - ret = &slot; - } - } - - if (ret != nullptr) { - LOG_VERBOSE("selected slot by lcp similarity", { - {"id_slot", ret->id}, - {"max_lcp_len", max_lcp_len}, - {"similarity", similarity}, - }); - } - } - - // find the slot that has been least recently used - if (ret 
== nullptr) { - int64_t t_last = ggml_time_us(); - for (server_slot & slot : slots) { - // skip the slot if it is not available - if (!slot.available()) { - continue; - } - - // select the current slot if the criteria match - if (slot.t_last_used < t_last) { - t_last = slot.t_last_used; - ret = &slot; - } - } - - if (ret != nullptr) { - LOG_VERBOSE("selected slot by lru", { - {"id_slot", ret->id}, - {"t_last", t_last}, - }); - } - } - - return ret; - } - - bool launch_slot_with_task(server_slot & slot, const server_task & task) { - slot_params default_params; - // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - llama_sampling_params default_sparams = params.sparams; - auto & data = task.data; - - if (data.count("__oaicompat") != 0) { - slot.oaicompat = true; - slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - } else { - slot.oaicompat = false; - slot.oaicompat_model = ""; - } - - slot.params.stream = json_value(data, "stream", false); - slot.params.cache_prompt = json_value(data, "cache_prompt", false); - slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict); - slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); - slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); - slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); - slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); - slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); - slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); - slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); - slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep); - slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard); - slot.sparams.seed = json_value(data, "seed", default_sparams.seed); - slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); - slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); - - // process "json_schema" and "grammar" - if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { - send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST); - return false; - } else if (data.contains("json_schema") && !data.contains("grammar")) { - try { - auto schema = json_value(data, "json_schema", json::object()); - slot.sparams.grammar = json_schema_to_grammar(schema); - } 
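common_part() is defined elsewhere in the server sources. Assuming it returns the length of the longest common prefix of its two arguments, the similarity-based selection above can be exercised in isolation; the prompts and the threshold below are illustrative:

```cpp
#include <cstdio>
#include <string>

// Assumed behavior of the helper used above: longest common prefix length.
static int common_part(const std::string & a, const std::string & b) {
    size_t n = 0;
    while (n < a.size() && n < b.size() && a[n] == b[n]) {
        n++;
    }
    return (int) n;
}

int main() {
    const std::string slot_prompt = "Translate to French: good morning";
    const std::string new_prompt  = "Translate to French: good night";

    const int   lcp_len    = common_part(slot_prompt, new_prompt);   // 26
    const float similarity = (float) lcp_len / slot_prompt.size();   // ~0.79

    // with --slot-prompt-similarity 0.5 this slot would be reused,
    // so most of its KV cache can be kept instead of recomputed
    printf("lcp=%d similarity=%.2f\n", lcp_len, similarity);
    return 0;
}
```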
catch (const std::exception & e) { - send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); - return false; - } - } else { - slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); - } - - if (slot.params.cache_prompt && slot.ga_n != 1) { - LOG_WARNING("cache_prompt is not supported with group-attention", {}); - slot.params.cache_prompt = false; - } - - if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { - // Might be better to reject the request with a 400 ? - LOG_WARNING("Max tokens to predict exceeds server configuration", { - {"params.n_predict", slot.params.n_predict}, - {"slot.n_predict", slot.n_predict}, - }); - slot.params.n_predict = slot.n_predict; - } - - // infill - slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix); - slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); - - // get prompt - if (!task.infill) { - const auto & prompt = data.find("prompt"); - if (prompt == data.end()) { - send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST); - return false; - } - - if ((prompt->is_string()) || - (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) || - (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) { - slot.prompt = *prompt; - } else { - send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST); - return false; - } - } - - // penalize user-provided tokens - { - slot.sparams.penalty_prompt_tokens.clear(); - slot.sparams.use_penalty_prompt_tokens = false; - - const auto & penalty_prompt = data.find("penalty_prompt"); - - if (penalty_prompt != data.end()) { - if (penalty_prompt->is_string()) { - const auto penalty_prompt_string = penalty_prompt->get(); - slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false); - - if (slot.params.n_predict > 0) { - slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict); - } - slot.sparams.use_penalty_prompt_tokens = true; - - LOG_VERBOSE("penalty_prompt_tokens", { - {"id_slot", slot.id}, - {"tokens", slot.sparams.penalty_prompt_tokens}, - }); - } - else if (penalty_prompt->is_array()) { - const auto n_tokens = penalty_prompt->size(); - slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict)); - - const int n_vocab = llama_n_vocab(model); - for (const auto & penalty_token : *penalty_prompt) { - if (penalty_token.is_number_integer()) { - const auto tok = penalty_token.get(); - if (tok >= 0 && tok < n_vocab) { - slot.sparams.penalty_prompt_tokens.push_back(tok); - } - } - } - slot.sparams.use_penalty_prompt_tokens = true; - - LOG_VERBOSE("penalty_prompt_tokens", { - {"id_slot", slot.id}, - {"tokens", slot.sparams.penalty_prompt_tokens}, - }); - } - } - } - - { - slot.sparams.logit_bias.clear(); - - if (json_value(data, "ignore_eos", false)) { - slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } - - const auto & logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(model); - for (const auto & el : *logit_bias) { - // TODO: we may want to throw errors here, in case "el" is incorrect - if (el.is_array() && el.size() == 2) { - float bias; - if (el[1].is_number()) { - bias = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - bias = -INFINITY; - } else { - continue; - } - - 
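json_value() comes from the server's utils header. Judging from the call sites above, it returns the value stored under a key, falling back to the supplied default when the key is missing or null. A minimal stand-in with that behavior (the real helper may differ in details, e.g. it may log on type mismatches):

```cpp
#include <nlohmann/json.hpp>
#include <string>

using json = nlohmann::ordered_json;

// Minimal stand-in: fetch `key` from `body`, falling back to
// `default_value` when the key is missing or null.
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
    if (!body.contains(key) || body.at(key).is_null()) {
        return default_value;
    }
    try {
        return body.at(key).get<T>();
    } catch (const nlohmann::json::exception &) {
        return default_value; // type mismatch: fall back rather than throw
    }
}
```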
if (el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - slot.sparams.logit_bias[tok] = bias; - } - } else if (el[0].is_string()) { - auto toks = llama_tokenize(model, el[0].get(), false); - for (auto tok : toks) { - slot.sparams.logit_bias[tok] = bias; - } - } - } - } - } - } - - { - slot.params.antiprompt.clear(); - - const auto & stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto & word : *stop) { - if (!word.empty()) { - slot.params.antiprompt.push_back(word); - } - } - } - } - - { - const auto & samplers_sequence = data.find("samplers"); - if (samplers_sequence != data.end() && samplers_sequence->is_array()) { - std::vector sampler_names; - for (const auto & sampler_name : *samplers_sequence) { - if (sampler_name.is_string()) { - sampler_names.emplace_back(sampler_name); - } - } - slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false); - } else { - slot.sparams.samplers_sequence = default_sparams.samplers_sequence; - } - } - - { - if (slot.ctx_sampling != nullptr) { - llama_sampling_free(slot.ctx_sampling); - } - slot.ctx_sampling = llama_sampling_init(slot.sparams); - if (slot.ctx_sampling == nullptr) { - // for now, the only error that may happen here is invalid grammar - send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); - return false; - } - } - - slot.command = SLOT_COMMAND_LOAD_PROMPT; - slot.prompt_tokens.clear(); - - LOG_INFO("slot is processing task", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - }); - - return true; - } - - void kv_cache_clear() { - LOG_VERBOSE("clearing KV cache", {}); - - // clear the entire KV cache - llama_kv_cache_clear(ctx); - clean_kv_cache = false; - } - - void system_prompt_update() { - LOG_VERBOSE("system prompt update", { - {"system_prompt", system_prompt}, - }); - - kv_cache_clear(); - system_tokens.clear(); - - if (!system_prompt.empty()) { - system_tokens = ::llama_tokenize(ctx, system_prompt, true); - - llama_batch_clear(batch); - - for (int i = 0; i < (int)system_tokens.size(); ++i) { - llama_batch_add(batch, system_tokens[i], i, { 0 }, false); - } - - const int32_t n_batch = llama_n_batch(ctx); - - for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, 0, 0, // unused - }; - - if (llama_decode(ctx, batch_view) != 0) { - LOG_ERROR("llama_decode() failed", {}); - return; - } - } - - // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i <= params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); - } - } - - system_need_update = false; - } - - bool system_prompt_set(const std::string & sys_prompt) { - system_prompt = sys_prompt; - - LOG_VERBOSE("system prompt process", { - {"system_prompt", system_prompt}, - }); - - // release all slots - for (server_slot & slot : slots) { - slot.release(); - } - - system_need_update = true; - return true; - } - - bool process_token(completion_token_output & result, server_slot & slot) { - // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special); - slot.sampled = result.tok; - - // search stop word and delete it - slot.generated_text += token_str; - slot.has_next_token = true; - - if 
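Per the parsing above, each logit_bias entry is a two-element array: the first element is either a token id or a string to tokenize, the second either a numeric bias or `false`, which bans the token via a `-INFINITY` bias. A fragment exercising the accepted shapes (the token id is a placeholder, not a guaranteed vocab entry):

```cpp
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

// Illustrative payload fragment; 15043 is a placeholder token id.
static const json logit_bias_example = json::array({
    json::array({ 15043, 1.5 }),    // by token id: add +1.5 to this token's logit
    json::array({ "world", -0.5 }), // by string: every token of "world" gets -0.5
    json::array({ 15043, false }),  // false bans the token (bias = -INFINITY)
});
```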
(slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) {
-            // we can change penalty_prompt_tokens because it is always created from scratch each request
-            slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
-        }
-
-        // check if there is an incomplete UTF-8 character at the end
-        bool incomplete = false;
-        for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) {
-            unsigned char c = slot.generated_text[slot.generated_text.size() - i];
-            if ((c & 0xC0) == 0x80) {
-                // continuation byte: 10xxxxxx
-                continue;
-            }
-            if ((c & 0xE0) == 0xC0) {
-                // 2-byte character: 110xxxxx ...
-                incomplete = i < 2;
-            } else if ((c & 0xF0) == 0xE0) {
-                // 3-byte character: 1110xxxx ...
-                incomplete = i < 3;
-            } else if ((c & 0xF8) == 0xF0) {
-                // 4-byte character: 11110xxx ...
-                incomplete = i < 4;
-            }
-            // else 1-byte character or invalid byte
-            break;
-        }
-
-        if (!incomplete) {
-            size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-
-            const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
-
-            size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
-            if (stop_pos != std::string::npos) {
-                is_stop_full = true;
-                slot.generated_text.erase(
-                    slot.generated_text.begin() + pos + stop_pos,
-                    slot.generated_text.end());
-                pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            } else {
-                is_stop_full = false;
-                stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
-            }
-
-            // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
-                // do not send the stop word in the response
-                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.n_sent_text += result.text_to_send.size();
-                // add the token to slot queue and cache
-            }
-
-            slot.add_token_string(result);
-            if (slot.params.stream) {
-                send_partial_response(slot, result);
-            }
-        }
-
-        if (incomplete) {
-            slot.has_next_token = true;
-        }
-
-        // check the limits
-        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) {
-            slot.stopped_limit = true;
-            slot.has_next_token = false;
-
-            LOG_VERBOSE("stopped by limit", {
-                {"id_slot", slot.id},
-                {"id_task", slot.id_task},
-                {"n_decoded", slot.n_decoded},
-                {"n_predict", slot.params.n_predict},
-            });
-        }
-
-        if (llama_token_is_eog(model, result.tok)) {
-            slot.stopped_eos = true;
-            slot.has_next_token = false;
-
-            LOG_VERBOSE("eos token found", {});
-        }
-
-        auto n_ctx_train = llama_n_ctx_train(model);
-        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
-                && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
-            LOG_WARNING("n_predict is not set and self-extend is disabled."
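Backing up to the byte-level scan above: it can be read as a standalone predicate that walks back at most four bytes, finds the lead byte, and compares its declared sequence length with the number of bytes actually present. A self-contained sketch (not the server's exact helper):

```cpp
#include <cassert>
#include <string>

// True when the buffer ends mid-way through a multi-byte UTF-8 character.
static bool ends_with_incomplete_utf8(const std::string & s) {
    for (unsigned i = 1; i < 5 && i <= s.size(); ++i) {
        const unsigned char c = s[s.size() - i];
        if ((c & 0xC0) == 0x80) {
            continue; // continuation byte 10xxxxxx: keep walking back
        }
        if ((c & 0xE0) == 0xC0) return i < 2; // lead byte of a 2-byte char
        if ((c & 0xF0) == 0xE0) return i < 3; // lead byte of a 3-byte char
        if ((c & 0xF8) == 0xF0) return i < 4; // lead byte of a 4-byte char
        return false; // ASCII or invalid byte: nothing pending
    }
    return false;
}

int main() {
    assert(!ends_with_incomplete_utf8("abc"));
    assert( ends_with_incomplete_utf8(std::string("\xE2\x82", 2))); // first 2 bytes of "€"
    assert(!ends_with_incomplete_utf8("\xE2\x82\xAC"));             // complete "€"
    return 0;
}
```

Holding back a partial character is what keeps streamed chunks valid UTF-8 even when a token boundary falls inside a multi-byte code point.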
- " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", { - { "id_slot", slot.id }, - { "params.n_predict", slot.params.n_predict }, - { "slot.n_prompt_tokens", slot.n_prompt_tokens }, - { "slot.n_decoded", slot.n_decoded }, - { "slot.n_predict", slot.n_predict }, - { "n_slots", params.n_parallel }, - { "slot.n_ctx", slot.n_ctx }, - { "n_ctx", n_ctx }, - { "n_ctx_train", n_ctx_train }, - { "ga_n", slot.ga_n }, - }); - slot.truncated = true; - slot.stopped_limit = true; - slot.has_next_token = false; // stop prediction - } - - LOG_VERBOSE("next token", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"token", result.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"n_decoded", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }); - - return slot.has_next_token; // continue - } - - json get_formated_generation(const server_slot & slot) const { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); - - std::vector samplers_sequence; - samplers_sequence.reserve(slot.sparams.samplers_sequence.size()); - for (const auto & sampler_type : slot.sparams.samplers_sequence) { - samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type)); - } - - return json { - {"n_ctx", slot.n_ctx}, - {"n_predict", slot.n_predict}, - {"model", params.model_alias}, - {"seed", slot.sparams.seed}, - {"temperature", slot.sparams.temp}, - {"dynatemp_range", slot.sparams.dynatemp_range}, - {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, - {"top_k", slot.sparams.top_k}, - {"top_p", slot.sparams.top_p}, - {"min_p", slot.sparams.min_p}, - {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, - {"repeat_last_n", slot.sparams.penalty_last_n}, - {"repeat_penalty", slot.sparams.penalty_repeat}, - {"presence_penalty", slot.sparams.penalty_present}, - {"frequency_penalty", slot.sparams.penalty_freq}, - {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, - {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, - {"mirostat", slot.sparams.mirostat}, - {"mirostat_tau", slot.sparams.mirostat_tau}, - {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict - {"n_keep", slot.params.n_keep}, - {"n_discard", slot.params.n_discard}, - {"ignore_eos", ignore_eos}, - {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, - {"n_probs", slot.sparams.n_probs}, - {"min_keep", slot.sparams.min_keep}, - {"grammar", slot.sparams.grammar}, - {"samplers", samplers_sequence} - }; - } - - void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - send_error(task.id, task.id_multi, error, type); - } - - void send_error(const server_slot & slot, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - send_error(slot.id_task, slot.id_multi, error, type); - } - - void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - LOG_ERROR("task error", { - 
{"id_multi", id_multi}, - {"id_task", id_task}, - {"error", error}, - }); - - server_task_result res; - res.id = id_task; - res.id_multi = id_multi; - res.stop = false; - res.error = true; - res.data = format_error_response(error, type); - - queue_results.send(res); - } - - void send_partial_response(server_slot & slot, completion_token_output tkn) { - server_task_result res; - res.id = slot.id_task; - res.id_multi = slot.id_multi; - res.error = false; - res.stop = false; - res.data = json { - {"content", tkn.text_to_send}, - {"stop", false}, - {"id_slot", slot.id}, - {"multimodal", false} - }; - - if (slot.sparams.n_probs > 0) { - const std::vector to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); - const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); - const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); - - std::vector probs_output; - if (probs_pos < probs_stop_pos) { - probs_output = std::vector( - slot.generated_token_probs.begin() + probs_pos, - slot.generated_token_probs.begin() + probs_stop_pos); - } - slot.n_sent_token_probs = probs_stop_pos; - - res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); - } - - if (slot.oaicompat) { - res.data["oaicompat_token_ctr"] = slot.n_decoded; - res.data["model"] = slot.oaicompat_model; - } - - queue_results.send(res); - } - - void send_final_response(const server_slot & slot) { - server_task_result res; - res.id = slot.id_task; - res.id_multi = slot.id_multi; - res.error = false; - res.stop = true; - res.data = json { - {"content", !slot.params.stream ? slot.generated_text : ""}, - {"id_slot", slot.id}, - {"stop", true}, - {"model", params.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.n_prompt_tokens}, - {"generation_settings", get_formated_generation(slot)}, - {"prompt", slot.prompt}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()} - }; - - if (slot.sparams.n_probs > 0) { - std::vector probs; - if (!slot.params.stream && slot.stopped_word) { - const std::vector stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); - - size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - safe_offset); - } else { - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end()); - } - - res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs); - } - - if (slot.oaicompat) { - res.data["oaicompat_token_ctr"] = slot.n_decoded; - res.data["model"] = slot.oaicompat_model; - } - - queue_results.send(res); - } - - void send_embedding(const server_slot & slot, const llama_batch & batch) { - server_task_result res; - res.id = slot.id_task; - res.id_multi = slot.id_multi; - res.error = false; - res.stop = true; - - const int n_embd = llama_n_embd(model); - - std::vector embd_res(n_embd, 0.0f); - - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) { - continue; - } - - const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - } - - if (embd == NULL) { - 
LOG_ERROR("failed to get embeddings", { - {"token", batch.token [i]}, - {"seq_id", batch.seq_id[i][0]} - }); - - res.data = json { - {"embedding", std::vector(n_embd, 0.0f)}, - }; - - continue; - } - - llama_embd_normalize(embd, embd_res.data(), n_embd); - - res.data = json { - {"embedding", embd_res}, - }; - } - - queue_results.send(res); - } - - void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding) { - server_task task; - task.id = id_task; - task.id_multi = id_multi; - task.id_target = 0; - task.data = std::move(data); - task.infill = infill; - task.embedding = embedding; - task.type = SERVER_TASK_TYPE_COMPLETION; - - // when a completion task's prompt array is not a singleton, we split it into multiple requests - // otherwise, it's a single-prompt task, we actually queue it - // if there's numbers in the prompt array it will be treated as an array of tokens - if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) { - bool numbers = false; - for (const auto & e : task.data.at("prompt")) { - if (e.is_number()) { - numbers = true; - break; - } - } - - // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers, - // it will completely stall the server. I don't know where the bug for this is. - // - // if there are numbers, it needs to be treated like a single prompt, - // queue_tasks handles a mix of strings and numbers just fine. - if (numbers) { - queue_tasks.post(task); - } else { - split_multiprompt_task(id_task, task); - } - } else { - queue_tasks.post(task); - } - } - - void request_cancel(int id_task) { - server_task task; - task.type = SERVER_TASK_TYPE_CANCEL; - task.id_target = id_task; - - queue_tasks.post(task); - } - - void split_multiprompt_task(int id_multi, const server_task & multiprompt_task) { - const int prompt_count = multiprompt_task.data.at("prompt").size(); - if (prompt_count <= 1) { - send_error(multiprompt_task, "error while handling multiple prompts"); - return; - } - - // generate all the ID for subtask - std::vector subtask_ids(prompt_count); - for (int i = 0; i < prompt_count; i++) { - subtask_ids[i] = queue_tasks.get_new_id(); - } - - // queue up the multitask so we can track its subtask progression - queue_tasks.add_multitask(id_multi, subtask_ids); - - // add subtasks - for (int i = 0; i < prompt_count; i++) { - json subtask_data = multiprompt_task.data; - subtask_data["prompt"] = subtask_data.at("prompt")[i]; - - // subtasks inherit everything else (infill mode, embedding mode, etc.) 
- request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding); - } - } - - void process_single_task(const server_task & task) { - switch (task.type) { - case SERVER_TASK_TYPE_COMPLETION: - { - const int id_slot = json_value(task.data, "id_slot", -1); - - server_slot * slot; - - if (id_slot != -1) { - slot = get_slot_by_id(id_slot); - } else { - std::string prompt; - if (task.data.contains("prompt") && task.data.at("prompt").is_string()) { - prompt = json_value(task.data, "prompt", std::string()); - } - - slot = get_available_slot(prompt); - } - - if (slot == nullptr) { - // if no slot is available, we defer this task for processing later - LOG_VERBOSE("no slot is available", {{"id_task", task.id}}); - queue_tasks.defer(task); - break; - } - if (!slot->available()) { - // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); - queue_tasks.defer(task); - break; - } - - if (task.data.contains("system_prompt")) { - std::string sys_prompt = json_value(task.data, "system_prompt", std::string()); - system_prompt_set(sys_prompt); - - for (server_slot & slot : slots) { - slot.n_past = 0; - slot.n_past_se = 0; - } - } - - slot->reset(); - - slot->id_task = task.id; - slot->id_multi = task.id_multi; - slot->infill = task.infill; - slot->embedding = task.embedding; - - if (!launch_slot_with_task(*slot, task)) { - LOG_ERROR("error while launching slot", task.data); - break; - } - } break; - case SERVER_TASK_TYPE_CANCEL: - { - // release slot linked with the task id - for (auto & slot : slots) { - if (slot.id_task == task.id_target) { - slot.release(); - break; - } - } - } break; - case SERVER_TASK_TYPE_NEXT_RESPONSE: - { - // do nothing - } break; - case SERVER_TASK_TYPE_METRICS: - { - json slots_data = json::array(); - - int n_idle_slots = 0; - int n_processing_slots = 0; - - for (server_slot & slot : slots) { - json slot_data = get_formated_generation(slot); - slot_data["id"] = slot.id; - slot_data["id_task"] = slot.id_task; - slot_data["state"] = slot.state; - slot_data["prompt"] = slot.prompt; - slot_data["next_token"] = { - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"n_decoded", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }; - - if (slot_data["state"] == SLOT_STATE_IDLE) { - n_idle_slots++; - } else { - n_processing_slots++; - } - - slots_data.push_back(slot_data); - } - LOG_INFO("slot data", { - {"id_task", task.id}, - {"n_idle_slots", n_idle_slots}, - {"n_processing_slots", n_processing_slots} - }); - - LOG_VERBOSE("slot data", { - {"id_task", task.id}, - {"n_idle_slots", n_idle_slots}, - {"n_processing_slots", n_processing_slots}, - {"slots", slots_data} - }); - - server_task_result res; - res.id = task.id; - res.id_multi = task.id_multi; - res.stop = true; - res.error = false; - res.data = { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "deferred", queue_tasks.queue_tasks_deferred.size() }, - { "t_start", metrics.t_start}, - - { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total}, - { "t_tokens_generation_total", metrics.t_tokens_generation_total}, - { "n_tokens_predicted_total", metrics.n_tokens_predicted_total}, - { "t_prompt_processing_total", metrics.t_prompt_processing_total}, - - { "n_prompt_tokens_processed", 
metrics.n_prompt_tokens_processed}, - { "t_prompt_processing", metrics.t_prompt_processing}, - { "n_tokens_predicted", metrics.n_tokens_predicted}, - { "t_tokens_generation", metrics.t_tokens_generation}, - - { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, - { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, - - { "slots", slots_data }, - }; - - if (json_value(task.data, "reset_bucket", false)) { - metrics.reset_bucket(); - } - queue_results.send(res); - } break; - case SERVER_TASK_TYPE_SLOT_SAVE: - { - int id_slot = task.data.at("id_slot"); - server_slot * slot = get_slot_by_id(id_slot); - if (slot == nullptr) { - send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); - break; - } - if (!slot->available()) { - // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); - queue_tasks.defer(task); - break; - } - - const size_t token_count = slot->cache_tokens.size(); - const int64_t t_start = ggml_time_us(); - - std::string filename = task.data.at("filename"); - std::string filepath = task.data.at("filepath"); - - const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count); - - const int64_t t_end = ggml_time_us(); - const double t_save_ms = (t_end - t_start) / 1000.0; - - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_saved", token_count }, // tokens saved - { "n_written", nwrite }, // bytes written - { "timings", { - { "save_ms", t_save_ms } - } } - }; - queue_results.send(result); - } break; - case SERVER_TASK_TYPE_SLOT_RESTORE: - { - int id_slot = task.data.at("id_slot"); - server_slot * slot = get_slot_by_id(id_slot); - if (slot == nullptr) { - send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); - break; - } - if (!slot->available()) { - // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); - queue_tasks.defer(task); - break; - } - - const int64_t t_start = ggml_time_us(); - - std::string filename = task.data.at("filename"); - std::string filepath = task.data.at("filepath"); - - slot->cache_tokens.resize(slot->n_ctx); - size_t token_count = 0; - size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count); - if (nread == 0) { - slot->cache_tokens.resize(0); - send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST); - break; - } - slot->cache_tokens.resize(token_count); - - const int64_t t_end = ggml_time_us(); - const double t_restore_ms = (t_end - t_start) / 1000.0; - - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_restored", token_count }, // tokens restored - { "n_read", nread }, // bytes read - { "timings", { - { "restore_ms", t_restore_ms } - } } - }; - queue_results.send(result); - } break; - case SERVER_TASK_TYPE_SLOT_ERASE: - { - int id_slot = task.data.at("id_slot"); - server_slot * slot = get_slot_by_id(id_slot); - if (slot == nullptr) { - send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); - break; - } - if (!slot->available()) { - // if requested 
slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); - queue_tasks.defer(task); - break; - } - - // Erase token cache - const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1); - slot->cache_tokens.clear(); - - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "n_erased", n_erased } - }; - queue_results.send(result); - } break; - } - } - - void on_finish_multitask(const server_task_multi & multitask) { - // all subtasks done == multitask is done - server_task_result result; - result.id = multitask.id; - result.stop = true; - result.error = false; - - // collect json results into one json result - std::vector result_jsons; - for (const auto & subres : multitask.results) { - result_jsons.push_back(subres.data); - result.error = result.error && subres.error; - } - result.data = json { - { "results", result_jsons } - }; - - queue_results.send(result); - } - - void update_slots() { - if (system_need_update) { - system_prompt_update(); - } - - // release slots - for (auto & slot : slots) { - if (slot.command == SLOT_COMMAND_RELEASE) { - slot.state = SLOT_STATE_IDLE; - slot.command = SLOT_COMMAND_NONE; - slot.t_last_used = ggml_time_us(); - - LOG_INFO("slot released", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", n_ctx}, - {"n_past", slot.n_past}, - {"n_system_tokens", system_tokens.size()}, - {"n_cache_tokens", slot.cache_tokens.size()}, - {"truncated", slot.truncated} - }); - - queue_tasks.notify_slot_changed(); - } - } - - // check if all slots are idle - { - bool all_idle = true; - - for (auto & slot : slots) { - if (slot.state != SLOT_STATE_IDLE || slot.command != SLOT_COMMAND_NONE) { - all_idle = false; - break; - } - } - - if (all_idle) { - LOG_INFO("all slots are idle", {}); - if (system_prompt.empty() && clean_kv_cache) { - kv_cache_clear(); - } - - return; - } - } - - { - LOG_VERBOSE("posting NEXT_RESPONSE", {}); - - server_task task; - task.type = SERVER_TASK_TYPE_NEXT_RESPONSE; - task.id_target = -1; - - queue_tasks.post(task); - } - - // apply context-shift if needed - // TODO: simplify and improve - for (server_slot & slot : slots) { - if (slot.ga_n == 1) { - if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) { - // Shift context - const int n_keep = slot.params.n_keep + add_bos_token; - const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; - const int n_discard = slot.params.n_discard ? 
slot.params.n_discard : (n_left / 2);
-
-                    LOG_INFO("slot context shift", {
-                        {"id_slot",         slot.id},
-                        {"id_task",         slot.id_task},
-                        {"n_keep",          n_keep},
-                        {"n_left",          n_left},
-                        {"n_discard",       n_discard},
-                        {"n_ctx",           n_ctx},
-                        {"n_past",          slot.n_past},
-                        {"n_system_tokens", system_tokens.size()},
-                        {"n_cache_tokens",  slot.cache_tokens.size()}
-                    });
-
-                    llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
-
-                    if (slot.params.cache_prompt) {
-                        for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
-                            slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-                        }
-
-                        slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-                    }
-
-                    slot.n_past -= n_discard;
-
-                    slot.truncated = true;
-                }
-            }
-        }
-
-        // start populating the batch for this iteration
-        llama_batch_clear(batch);
-
-        // first, add sampled tokens from any ongoing sequences
-        for (auto & slot : slots) {
-            if (slot.state == SLOT_STATE_IDLE) {
-                continue;
-            }
-
-            slot.i_batch = batch.n_tokens;
-
-            const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
-
-            // TODO: we always have to take into account the "system_tokens"
-            //       this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
-
-            slot.n_past += 1;
-
-            if (slot.params.cache_prompt) {
-                slot.cache_tokens.push_back(slot.sampled);
-            }
-
-            LOG_VERBOSE("slot decode token", {
-                {"id_slot",         slot.id},
-                {"id_task",         slot.id_task},
-                {"n_ctx",           n_ctx},
-                {"n_past",          slot.n_past},
-                {"n_system_tokens", system_tokens.size()},
-                {"n_cache_tokens",  slot.cache_tokens.size()},
-                {"truncated",       slot.truncated}
-            });
-        }
-
-        // process in chunks of params.n_batch
-        int32_t n_batch  = llama_n_batch(ctx);
-        int32_t n_ubatch = llama_n_ubatch(ctx);
-
-        // track if this is an embedding or non-embedding batch
-        // if we've added sampled tokens above, we are in non-embedding mode
-        // -1: none, 0: non-embedding, 1: embedding
-        int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
-
-        // next, batch any pending prompts without exceeding n_batch
-        if (params.cont_batching || batch.n_tokens == 0) {
-            for (auto & slot : slots) {
-                // this slot still has a prompt to be processed
-                if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
-                    auto & prompt_tokens = slot.prompt_tokens;
-
-                    // we haven't tokenized the prompt yet - do it now:
-                    if (prompt_tokens.empty()) {
-                        LOG_VERBOSE("tokenizing prompt", {
-                            {"id_slot", slot.id},
-                            {"id_task", slot.id_task}
-                        });
-
-                        slot.t_start_process_prompt = ggml_time_us();
-                        slot.t_start_generation = 0;
-
-                        if (slot.infill) {
-                            const bool add_bos = llama_should_add_bos_token(model);
-                            bool suff_rm_leading_spc = true;
-                            if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                                params.input_suffix.erase(0, 1);
-                                suff_rm_leading_spc = false;
-                            }
-
-                            auto prefix_tokens = tokenize(slot.params.input_prefix, false);
-                            auto suffix_tokens = tokenize(slot.params.input_suffix, false);
-
-                            const int space_token = 29871; // TODO: this should not be hardcoded
-                            if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
-                                suffix_tokens.erase(suffix_tokens.begin());
-                            }
-
-                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                            suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
-
-                            auto embd_inp = params.spm_infill ?
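The context shift at the top of this hunk is easiest to follow with concrete numbers; the slot state below is assumed (no system prompt, default n_discard):

```cpp
#include <cstdio>

int main() {
    // assumed slot state when the shift above triggers:
    const int n_ctx_slot = 512; // slot.n_ctx
    const int n_keep     = 32;  // slot.params.n_keep + BOS
    const int n_past     = 511; // hit n_ctx_slot - 1

    const int n_left    = n_past - n_keep; // 479
    const int n_discard = n_left / 2;      // 239 (default: drop half of what is movable)

    // llama_kv_cache_seq_rm drops cells [n_keep, n_keep + n_discard),
    // llama_kv_cache_seq_add slides [n_keep + n_discard, n_past) back by n_discard,
    // leaving n_past - n_discard = 272 cells and room to keep generating
    printf("n_left=%d n_discard=%d new n_past=%d\n", n_left, n_discard, n_past - n_discard);
    return 0;
}
```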
suffix_tokens : prefix_tokens; - auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; - if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); - } - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - - const llama_token middle_token = llama_token_middle(model); - if (middle_token >= 0) { - embd_inp.push_back(middle_token); - } - - prompt_tokens = embd_inp; - } else { - prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt - } - - slot.n_past = 0; - slot.n_prompt_tokens = prompt_tokens.size(); - - LOG_VERBOSE("prompt tokenized", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, - }); - - // empty prompt passed -> release the slot and send empty response - if (prompt_tokens.empty()) { - LOG_INFO("empty prompt - releasing slot", { - {"id_slot", slot.id}, - {"id_task", slot.id_task} - }); - - slot.state = SLOT_STATE_PROCESSING; - slot.command = SLOT_COMMAND_NONE; - slot.release(); - slot.print_timings(); - send_final_response(slot); - continue; - } - - if (slot.embedding) { - // this prompt is too large to process - discard it - if (slot.n_prompt_tokens > n_ubatch) { - slot.state = SLOT_STATE_PROCESSING; - slot.command = SLOT_COMMAND_NONE; - slot.release(); - send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER); - continue; - } - } else { - if (slot.params.n_keep < 0) { - slot.params.n_keep = slot.n_prompt_tokens; - } - slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - - // if input prompt is too big, truncate it (if group attention self-extend is disabled) - if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { - const int n_left = slot.n_ctx - slot.params.n_keep; - - const int n_block_size = n_left / 2; - const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; - - std::vector new_tokens( - prompt_tokens.begin(), - prompt_tokens.begin() + slot.params.n_keep); - - new_tokens.insert( - new_tokens.end(), - prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, - prompt_tokens.end()); - - prompt_tokens = std::move(new_tokens); - - slot.truncated = true; - slot.n_prompt_tokens = prompt_tokens.size(); - - LOG_VERBOSE("input truncated", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, - }); - - GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); - } - - llama_sampling_reset(slot.ctx_sampling); - - if (!slot.params.cache_prompt) { - slot.n_past_se = 0; - slot.ga_i = 0; - } else { - GGML_ASSERT(slot.ga_n == 1); - - // reuse any previously computed tokens that are common with the new prompt - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); - - // push the prompt into the sampling context (do not apply grammar) - for (int i = 0; i < slot.n_past; ++i) { - llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false); - } - } - } - - if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) { - // we have to evaluate at least 1 token to generate logits. 
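The truncation block above drops whole blocks of n_left / 2 tokens from the middle of an oversized prompt while preserving the first n_keep tokens and the tail. A worked example with assumed sizes:

```cpp
#include <cstdio>

int main() {
    // assumed request that overflows its slot (self-extend disabled, ga_n == 1):
    const int n_ctx_slot      = 512; // slot.n_ctx
    const int n_keep          = 100; // slot.params.n_keep after clamping
    const int n_prompt_tokens = 800;

    const int n_left        = n_ctx_slot - n_keep; // 412
    const int n_block_size  = n_left / 2;          // 206
    const int erased_blocks = (n_prompt_tokens - n_keep - n_block_size) / n_block_size; // 2

    // kept: the first n_keep tokens plus everything after the erased blocks
    const int kept = n_keep + (n_prompt_tokens - n_keep - erased_blocks * n_block_size); // 388
    printf("erased_blocks=%d kept=%d (< n_ctx_slot)\n", erased_blocks, kept);
    return 0;
}
```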
- LOG_INFO("we have to evaluate at least 1 token to generate logits", { - { "id_slot", slot.id }, - { "id_task", slot.id_task } - }); - - slot.n_past--; - if (slot.ga_i > 0) { - slot.n_past_se--; - } - } - - slot.n_prompt_tokens_processed = 0; - } - - if (slot.embedding) { - // cannot fit the prompt in the current batch - will try next iter - if (batch.n_tokens + slot.n_prompt_tokens > n_batch) { - continue; - } - } - - // check that we are in the right batch_type, if not defer the slot - bool slot_type = slot.embedding ? 1 : 0; - if (batch_type == -1) { - batch_type = slot_type; - } else if (batch_type != slot_type) { - continue; - } - - // keep only the common part - int p0 = (int) system_tokens.size() + slot.n_past; - if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) { - // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); - - p0 = (int) system_tokens.size(); - if (p0 != 0) { - // copy over the system prompt when there is one - llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1); - } - - // there is no common part left (except for the system prompt) - slot.n_past = 0; - slot.n_past_se = 0; - slot.ga_i = 0; - // TODO: is the system prompt ever in the sampling context? - llama_sampling_reset(slot.ctx_sampling); - } - - // remove the non-common part from the cache - slot.cache_tokens.resize(slot.n_past); - - LOG_INFO("kv cache rm [p0, end)", { - { "id_slot", slot.id }, - { "id_task", slot.id_task }, - { "p0", p0 } - }); - - int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - - int32_t ga_i = slot.ga_i; - int32_t ga_n = slot.ga_n; - int32_t ga_w = slot.ga_w; - - // add prompt tokens for processing in the current batch - // TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow - for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) { - if (slot.ga_n != 1) { - while (slot_npast >= ga_i + ga_w) { - const int bd = (ga_w/ga_n)*(ga_n - 1); - slot_npast -= bd; - ga_i += ga_w/ga_n; - } - } - - llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false); - - if (slot.params.cache_prompt) { - slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); - } - - slot.n_prompt_tokens_processed++; - slot_npast++; - } - - LOG_VERBOSE("prompt processing progress", { - {"id_slot", slot.id}, - {"n_past", slot.n_past}, - {"n_ctx", n_ctx}, - {"n_tokens", batch.n_tokens}, - {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens}, - }); - - // entire prompt has been processed - start decoding new tokens - if (slot.n_past == slot.n_prompt_tokens) { - slot.state = SLOT_STATE_PROCESSING; - slot.command = SLOT_COMMAND_NONE; - - GGML_ASSERT(batch.n_tokens > 0); - - // extract the logits only for the last token - batch.logits[batch.n_tokens - 1] = true; - - slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; - - LOG_VERBOSE("prompt done", { - {"id_slot", slot.id}, - {"n_past", slot.n_past}, - {"n_ctx", n_ctx}, - {"n_tokens", batch.n_tokens}, - }); - } - } - - if (batch.n_tokens >= n_batch) { - break; - } - } - } - - if (batch.n_tokens == 0) { - LOG_VERBOSE("no tokens to decode", {}); - return; - } - - LOG_VERBOSE("decoding batch", { - {"n_tokens", batch.n_tokens}, - }); - - // make sure we're in the right embedding mode - llama_set_embeddings(ctx, batch_type == 1); - - // process the created batch of tokens - for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { - const int32_t n_tokens = 
std::min(n_batch, batch.n_tokens - i); - - for (auto & slot : slots) { - if (slot.ga_n != 1) { - // context extension via Self-Extend - // TODO: simplify and/or abstract this - while (slot.n_past_se >= slot.ga_i + slot.ga_w) { - const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; - const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); - const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - - llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); - llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); - - slot.n_past_se -= bd; - - slot.ga_i += slot.ga_w / slot.ga_n; - - LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); - } - - slot.n_past_se += n_tokens; - } - } - - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, 0, 0, // unused - }; - - const int ret = llama_decode(ctx, batch_view); - - if (ret != 0) { - if (n_batch == 1 || ret < 0) { - // if you get here, it means the KV cache is full - try increasing it via the context size - LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", { - {"i", i}, - {"n_batch", ret}, - {"ret", ret}, - }); - for (auto & slot : slots) { - slot.state = SLOT_STATE_PROCESSING; - slot.command = SLOT_COMMAND_NONE; - slot.release(); - send_error(slot, "Input prompt is too big compared to KV size. 
Please try increasing KV size."); - } - break; // break loop of n_batch - } - - // retry with half the batch size to try to find a free slot in the KV cache - n_batch /= 2; - i -= n_batch; - - LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", { - {"i", i}, - {"n_batch", n_batch}, - {"ret", ret}, - }); - - continue; // continue loop of n_batch - } - - for (auto & slot : slots) { - if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { - continue; // continue loop of slots - } - - // prompt evaluated for embedding - if (slot.embedding) { - send_embedding(slot, batch_view); - slot.release(); - slot.i_batch = -1; - continue; // continue loop of slots - } - - completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i); - - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); - - slot.n_decoded += 1; - if (slot.n_decoded == 1) { - slot.t_start_generation = ggml_time_us(); - slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; - metrics.on_prompt_eval(slot); - } - - llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; - result.tok = id; - - const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs); - if (n_probs > 0) { - const size_t n_valid = slot.ctx_sampling->n_valid; - - // Make sure at least n_probs top tokens are at the front of the vector: - if (slot.sparams.temp == 0.0f && n_probs > n_valid) { - llama_sample_top_k(ctx, &cur_p, n_probs, 0); - } - - if (slot.sparams.temp == 0.0f) { - // With greedy sampling the probabilities have possibly not been calculated. - for (size_t i = 0; i < n_probs; ++i) { - result.probs.push_back({ - cur_p.data[i].id, - i == 0 ? 1.0f : 0.0f - }); - } - } else { - for (size_t i = 0; i < n_probs; ++i) { - result.probs.push_back({ - cur_p.data[i].id, - i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability. 
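The recovery path above halves n_batch and rewinds i, so a failed decode is retried with progressively smaller chunks, bailing out once n_batch reaches 1 as in the error branch above. A compact illustration of that control flow with a stubbed decode:

```cpp
#include <cstdio>

// Stub for llama_decode(): pretend the KV cache only accepts chunks of
// at most 512 tokens (returning 1 mirrors the "no space" positive return).
static int fake_decode(int n_tokens) { return n_tokens > 512 ? 1 : 0; }

int main() {
    int n_batch = 2048;            // assumed initial batch size
    const int n_tokens_total = 2048;

    for (int i = 0; i < n_tokens_total; i += n_batch) {
        const int n_tokens = n_tokens_total - i < n_batch ? n_tokens_total - i : n_batch;

        if (fake_decode(n_tokens) != 0) {
            if (n_batch == 1) {
                printf("KV cache full - giving up (cf. the error path above)\n");
                break;
            }
            // same recovery as above: halve the chunk and rewind
            n_batch /= 2;
            i -= n_batch;
            printf("retrying with n_batch = %d\n", n_batch);
            continue;
        }

        printf("decoded [%d, %d)\n", i, i + n_tokens);
    }
    return 0;
}
```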
- }); - } - } - } - - if (!process_token(result, slot)) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - } - - slot.i_batch = -1; - } - } - - LOG_VERBOSE("run slots completed", {}); - } - - json model_meta() const { - return json { - {"vocab_type", llama_vocab_type (model)}, - {"n_vocab", llama_n_vocab (model)}, - {"n_ctx_train", llama_n_ctx_train (model)}, - {"n_embd", llama_n_embd (model)}, - {"n_params", llama_model_n_params(model)}, - {"size", llama_model_size (model)}, - }; - } -}; - -static void log_server_request(const httplib::Request & req, const httplib::Response & res) { - // skip GH copilot requests when using default port - if (req.path == "/v1/health" || req.path == "/v1/completions") { - return; - } - - LOG_INFO("request", { - {"remote_addr", req.remote_addr}, - {"remote_port", req.remote_port}, - {"status", res.status}, - {"method", req.method}, - {"path", req.path}, - {"params", req.params}, - }); - - LOG_VERBOSE("request", { - {"request", req.body}, - {"response", res.body}, - }); -} - -std::function shutdown_handler; -std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; - -inline void signal_handler(int signal) { - if (is_terminating.test_and_set()) { - // in case it hangs, we can force terminate the server by hitting Ctrl+C twice - // this is for better developer experience, we can remove when the server is stable enough - fprintf(stderr, "Received second interrupt, terminating immediately.\n"); - exit(1); - } - - shutdown_handler(signal); -} - -int main(int argc, char ** argv) { -#if SERVER_VERBOSE != 1 - log_disable(); -#endif - // own arguments required by this example - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); - return 1; - } - - // TODO: not great to use extern vars - server_log_json = params.log_json; - server_verbose = params.verbosity > 0; - - // struct that contains llama context and inference - server_context ctx_server; - - if (!params.system_prompt.empty()) { - ctx_server.system_prompt_set(params.system_prompt); - } - - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } - - llama_backend_init(); - llama_numa_init(params.numa); - - LOG_INFO("build info", { - {"build", LLAMA_BUILD_NUMBER}, - {"commit", LLAMA_COMMIT} - }); - - LOG_INFO("system info", { - {"n_threads", params.n_threads}, - {"n_threads_batch", params.n_threads_batch}, - {"total_threads", std::thread::hardware_concurrency()}, - {"system_info", llama_print_system_info()}, - }); - - std::unique_ptr svr; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}}); - svr.reset( - new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) - ); - } else { - LOG_INFO("Running without SSL", {}); - svr.reset(new httplib::Server()); - } -#else - svr.reset(new httplib::Server()); -#endif - - std::atomic state{SERVER_STATE_LOADING_MODEL}; - - svr->set_default_headers({{"Server", "llama.cpp"}}); - - // CORS preflight - svr->Options(R"(.*)", [](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - res.set_header("Access-Control-Allow-Credentials", "true"); - res.set_header("Access-Control-Allow-Methods", "POST"); - res.set_header("Access-Control-Allow-Headers", "*"); - return res.set_content("", "application/json; 
charset=utf-8"); - }); - - svr->set_logger(log_server_request); - - auto res_error = [](httplib::Response & res, json error_data) { - json final_response {{"error", error_data}}; - res.set_content(final_response.dump(), "application/json; charset=utf-8"); - res.status = json_value(error_data, "code", 500); - }; - - svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { - std::string message; - try { - std::rethrow_exception(std::move(ep)); - } catch (std::exception & e) { - message = e.what(); - } catch (...) { - message = "Unknown Exception"; - } - - json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_VERBOSE("Got exception", formatted_error); - res_error(res, formatted_error); - }); - - svr->set_error_handler([&res_error](const httplib::Request &, httplib::Response & res) { - if (res.status == 404) { - res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND)); - } - // for other error codes, we skip processing here because it's already done by res_error() - }); - - // set timeouts and change hostname and port - svr->set_read_timeout (params.timeout_read); - svr->set_write_timeout(params.timeout_write); - - if (!svr->bind_to_port(params.hostname, params.port)) { - fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port); - return 1; - } - - std::unordered_map log_data; - - log_data["hostname"] = params.hostname; - log_data["port"] = std::to_string(params.port); - - if (params.api_keys.size() == 1) { - auto key = params.api_keys[0]; - log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0)); - } else if (params.api_keys.size() > 1) { - log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; - } - - // Necessary similarity of prompt for slot selection - ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; - - // load the model - if (!ctx_server.load_model(params)) { - state.store(SERVER_STATE_ERROR); - return 1; - } else { - ctx_server.init(); - state.store(SERVER_STATE_READY); - } - - LOG_INFO("model loaded", {}); - - const auto model_meta = ctx_server.model_meta(); - - // if a custom chat template is not supplied, we will use the one that comes with the model (if any) - if (params.chat_template.empty()) { - if (!ctx_server.validate_model_chat_template()) { - LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); - params.chat_template = "chatml"; - } - } - - // print sample chat example to make it clear which template is used - { - LOG_INFO("chat template", { - {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, - {"built_in", params.chat_template.empty()}, - }); - } - - // - // Middlewares - // - - auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { - // TODO: should we apply API key to all endpoints, including "/health" and "/models"? 
- static const std::set protected_endpoints = { - "/props", - "/completion", - "/completions", - "/v1/completions", - "/chat/completions", - "/v1/chat/completions", - "/infill", - "/tokenize", - "/detokenize", - "/embedding", - "/embeddings", - "/v1/embeddings", - }; - - // If API key is not set, skip validation - if (params.api_keys.empty()) { - return true; - } - - // If path is not in protected_endpoints list, skip validation - if (protected_endpoints.find(req.path) == protected_endpoints.end()) { - return true; - } - - // Check for API key in the header - auto auth_header = req.get_header_value("Authorization"); - - std::string prefix = "Bearer "; - if (auth_header.substr(0, prefix.size()) == prefix) { - std::string received_api_key = auth_header.substr(prefix.size()); - if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) { - return true; // API key is valid - } - } - - // API key is invalid or not provided - // TODO: make another middleware for CORS related logic - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - - LOG_WARNING("Unauthorized: Invalid API Key", {}); - - return false; - }; - - // register server middlewares - svr->set_pre_routing_handler([&middleware_validate_api_key](const httplib::Request & req, httplib::Response & res) { - if (!middleware_validate_api_key(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - return httplib::Server::HandlerResponse::Unhandled; - }); - - // - // Route handlers (or controllers) - // - - const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) { - server_state current_state = state.load(); - switch (current_state) { - case SERVER_STATE_READY: - { - // request slots data using task queue - server_task task; - task.id = ctx_server.queue_tasks.get_new_id(); - task.type = SERVER_TASK_TYPE_METRICS; - task.id_target = -1; - - ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task); - - // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); - - const int n_idle_slots = result.data.at("idle"); - const int n_processing_slots = result.data.at("processing"); - - json health = { - {"status", "ok"}, - {"slots_idle", n_idle_slots}, - {"slots_processing", n_processing_slots} - }; - - res.status = 200; // HTTP OK - if (params.endpoint_slots && req.has_param("include_slots")) { - health["slots"] = result.data.at("slots"); - } - - if (n_idle_slots == 0) { - health["status"] = "no slot available"; - if (req.has_param("fail_on_no_slot")) { - res.status = 503; // HTTP Service Unavailable - } - } - - res.set_content(health.dump(), "application/json"); - break; - } - case SERVER_STATE_LOADING_MODEL: - { - res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); - } break; - case SERVER_STATE_ERROR: - { - res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER)); - } break; - } - }; - - const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) { - if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - // request slots data using task queue - server_task task; - task.id = ctx_server.queue_tasks.get_new_id(); - task.id_multi = -1; - task.id_target = 
-1; - task.type = SERVER_TASK_TYPE_METRICS; - - ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task); - - // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); - - res.set_content(result.data.at("slots").dump(), "application/json"); - res.status = 200; // HTTP OK - }; - - const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { - if (!params.endpoint_metrics) { - res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - // request slots data using task queue - server_task task; - task.id = ctx_server.queue_tasks.get_new_id(); - task.id_multi = -1; - task.id_target = -1; - task.type = SERVER_TASK_TYPE_METRICS; - task.data.push_back({{"reset_bucket", true}}); - - ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task); - - // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); - - json data = result.data; - - const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed"); - const uint64_t t_prompt_processing = data.at("t_prompt_processing"); - - const uint64_t n_tokens_predicted = data.at("n_tokens_predicted"); - const uint64_t t_tokens_generation = data.at("t_tokens_generation"); - - const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells"); - - // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names - json all_metrics_def = json { - {"counter", {{ - {"name", "prompt_tokens_total"}, - {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")} - }, { - {"name", "prompt_seconds_total"}, - {"help", "Prompt process time"}, - {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3} - }, { - {"name", "tokens_predicted_total"}, - {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) data.at("n_tokens_predicted_total")} - }, { - {"name", "tokens_predicted_seconds_total"}, - {"help", "Predict process time"}, - {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3} - }}}, - {"gauge", {{ - {"name", "prompt_tokens_seconds"}, - {"help", "Average prompt throughput in tokens/s."}, - {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.} - },{ - {"name", "predicted_tokens_seconds"}, - {"help", "Average generation throughput in tokens/s."}, - {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.} - },{ - {"name", "kv_cache_usage_ratio"}, - {"help", "KV-cache usage. 1 means 100 percent usage."}, - {"value", 1. 
* kv_cache_used_cells / params.n_ctx} - },{ - {"name", "kv_cache_tokens"}, - {"help", "KV-cache tokens."}, - {"value", (uint64_t) data.at("kv_cache_tokens_count")} - },{ - {"name", "requests_processing"}, - {"help", "Number of request processing."}, - {"value", (uint64_t) data.at("processing")} - },{ - {"name", "requests_deferred"}, - {"help", "Number of request deferred."}, - {"value", (uint64_t) data.at("deferred")} - }}} - }; - - std::stringstream prometheus; - - for (const auto & el : all_metrics_def.items()) { - const auto & type = el.key(); - const auto & metrics_def = el.value(); - - for (const auto & metric_def : metrics_def) { - const std::string name = metric_def.at("name"); - const std::string help = metric_def.at("help"); - - auto value = json_value(metric_def, "value", 0.); - prometheus << "# HELP llamacpp:" << name << " " << help << "\n" - << "# TYPE llamacpp:" << name << " " << type << "\n" - << "llamacpp:" << name << " " << value << "\n"; - } - } - - const int64_t t_start = data.at("t_start"); - res.set_header("Process-Start-Time-Unix", std::to_string(t_start)); - - res.set_content(prometheus.str(), "text/plain; version=0.0.4"); - res.status = 200; // HTTP OK - }; - - const auto handle_slots_save = [&ctx_server, &res_error, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { - json request_data = json::parse(req.body); - std::string filename = request_data.at("filename"); - if (!fs_validate_filename(filename)) { - res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); - return; - } - std::string filepath = params.slot_save_path + filename; - - server_task task; - task.type = SERVER_TASK_TYPE_SLOT_SAVE; - task.data = { - { "id_slot", id_slot }, - { "filename", filename }, - { "filepath", filepath } - }; - - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); - - server_task_result result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); - - if (result.error) { - res_error(res, result.data); - } else { - res.set_content(result.data.dump(), "application/json"); - } - }; - - const auto handle_slots_restore = [&ctx_server, &res_error, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { - json request_data = json::parse(req.body); - std::string filename = request_data.at("filename"); - if (!fs_validate_filename(filename)) { - res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); - return; - } - std::string filepath = params.slot_save_path + filename; - - server_task task; - task.type = SERVER_TASK_TYPE_SLOT_RESTORE; - task.data = { - { "id_slot", id_slot }, - { "filename", filename }, - { "filepath", filepath } - }; - - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); - - server_task_result result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); - - if (result.error) { - res_error(res, result.data); - } else { - res.set_content(result.data.dump(), "application/json"); - } - }; - - const auto handle_slots_erase = [&ctx_server, &res_error](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { - server_task task; - task.type = SERVER_TASK_TYPE_SLOT_ERASE; - task.data = { - { "id_slot", id_slot }, - }; - - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); - - 
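-        // (descriptive note: same synchronous round trip as save/restore above --
-        //  the task id was registered as waiting, and recv() below blocks until
-        //  the result carrying that id is posted by the task queue)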
server_task_result result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); - - if (result.error) { - res_error(res, result.data); - } else { - res.set_content(result.data.dump(), "application/json"); - } - }; - - const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - - std::string id_slot_str = req.path_params.at("id_slot"); - int id_slot; - - try { - id_slot = std::stoi(id_slot_str); - } catch (const std::exception &) { - res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - std::string action = req.get_param_value("action"); - - if (action == "save") { - handle_slots_save(req, res, id_slot); - } else if (action == "restore") { - handle_slots_restore(req, res, id_slot); - } else if (action == "erase") { - handle_slots_erase(req, res, id_slot); - } else { - res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); - } - }; - - const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - std::string template_key = "tokenizer.chat_template", curr_tmpl; - int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0); - if (tlen > 0) { - std::vector curr_tmpl_buf(tlen + 1, 0); - if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) { - curr_tmpl = std::string(curr_tmpl_buf.data(), tlen); - } - } - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - json data = { - { "system_prompt", ctx_server.system_prompt.c_str() }, - { "default_generation_settings", ctx_server.default_generation_settings_for_props }, - { "total_slots", ctx_server.params.n_parallel }, - { "chat_template", curr_tmpl.c_str() } - }; - - res.set_content(data.dump(), "application/json; charset=utf-8"); - }; - - const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { - if (ctx_server.params.embedding) { - res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - - json data = json::parse(req.body); - - const int id_task = ctx_server.queue_tasks.get_new_id(); - - ctx_server.queue_results.add_waiting_task_id(id_task); - ctx_server.request_completion(id_task, -1, data, false, false); - - if (!json_value(data, "stream", false)) { - server_task_result result = ctx_server.queue_results.recv(id_task); - if (!result.error && result.stop) { - res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); - } else { - res_error(res, result.data); - } - - ctx_server.queue_results.remove_waiting_task_id(id_task); - } else { - const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) { - while (true) { - server_task_result result = ctx_server.queue_results.recv(id_task); - if (!result.error) { - const std::string str = - "data: " + - result.data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - - LOG_VERBOSE("data stream", { - { "to_send", str } - }); - - if (!sink.write(str.c_str(), str.size())) { - ctx_server.queue_results.remove_waiting_task_id(id_task); - return false; - } - - if (result.stop) { - break; - } - } else { - const std::string str = - "error: " + - result.data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - - LOG_VERBOSE("data stream", { - { "to_send", str } - }); - - if (!sink.write(str.c_str(), str.size())) { - ctx_server.queue_results.remove_waiting_task_id(id_task); - return false; - } - - break; - } - } - - ctx_server.queue_results.remove_waiting_task_id(id_task); - sink.done(); - - return true; - }; - - auto on_complete = [id_task, &ctx_server] (bool) { - // cancel - ctx_server.request_cancel(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); - }; - - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } - }; - - const auto handle_models = [¶ms, &model_meta](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - - json models = { - {"object", "list"}, - {"data", { - { - {"id", params.model_alias}, - {"object", "model"}, - {"created", std::time(0)}, - {"owned_by", "llamacpp"}, - {"meta", model_meta} - }, - }} - }; - - res.set_content(models.dump(), "application/json; charset=utf-8"); - }; - - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { - if (ctx_server.params.embedding) { - res_error(res, format_error_response("This server does not support chat completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - - const int id_task = ctx_server.queue_tasks.get_new_id(); - - ctx_server.queue_results.add_waiting_task_id(id_task); - ctx_server.request_completion(id_task, -1, data, false, false); - - const auto completion_id = gen_chatcmplid(); - if (!json_value(data, "stream", false)) { - server_task_result result = ctx_server.queue_results.recv(id_task); - - if (!result.error && result.stop) { - json result_oai = format_final_response_oaicompat(data, result.data, completion_id); - - res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); - } else { - res_error(res, result.data); - } - ctx_server.queue_results.remove_waiting_task_id(id_task); - } else { - const auto chunked_content_provider = [id_task, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { - while (true) { - server_task_result result = ctx_server.queue_results.recv(id_task); - if (!result.error) { - std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); - - for (auto it = result_array.begin(); it != result_array.end(); ++it) { - if (!it->empty()) { - const std::string str = - "data: " + - it->dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - LOG_VERBOSE("data stream", {{"to_send", str}}); - if (!sink.write(str.c_str(), str.size())) { - ctx_server.queue_results.remove_waiting_task_id(id_task); - return false; - } - } - } - if (result.stop) { - break; - } - } else { - const std::string str = - "error: " + - result.data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - LOG_VERBOSE("data stream", {{"to_send", str}}); - if (!sink.write(str.c_str(), str.size())) { - ctx_server.queue_results.remove_waiting_task_id(id_task); - return false; - } - break; - } - } - sink.done(); - ctx_server.queue_results.remove_waiting_task_id(id_task); - return true; - }; - - auto on_complete = [id_task, &ctx_server](bool) { - // cancel request - ctx_server.request_cancel(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); - }; - - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } - }; - - const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { - if (ctx_server.params.embedding) { - res_error(res, format_error_response("This server does not support infill. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
-            return;
-        }
-
-        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
-        json data = json::parse(req.body);
-
-        const int id_task = ctx_server.queue_tasks.get_new_id();
-
-        ctx_server.queue_results.add_waiting_task_id(id_task);
-        ctx_server.request_completion(id_task, -1, data, true, false);
-
-        if (!json_value(data, "stream", false)) {
-            server_task_result result = ctx_server.queue_results.recv(id_task);
-            if (!result.error && result.stop) {
-                res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
-            } else {
-                res_error(res, result.data);
-            }
-
-            ctx_server.queue_results.remove_waiting_task_id(id_task);
-        } else {
-            const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) {
-                while (true) {
-                    server_task_result result = ctx_server.queue_results.recv(id_task);
-                    if (!result.error) {
-                        const std::string str =
-                            "data: " +
-                            result.data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                            "\n\n";
-
-                        LOG_VERBOSE("data stream", {
-                            { "to_send", str }
-                        });
-
-                        if (!sink.write(str.c_str(), str.size())) {
-                            ctx_server.queue_results.remove_waiting_task_id(id_task);
-                            return false;
-                        }
-
-                        if (result.stop) {
-                            break;
-                        }
-                    } else {
-                        break;
-                    }
-                }
-
-                ctx_server.queue_results.remove_waiting_task_id(id_task);
-                sink.done();
-
-                return true;
-            };
-
-            auto on_complete = [id_task, &ctx_server] (bool) {
-                ctx_server.request_cancel(id_task);
-            };
-
-            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
-        }
-    };
-
-    const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
-        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-        const json body = json::parse(req.body);
-
-        std::vector<llama_token> tokens;
-        if (body.count("content") != 0) {
-            const bool add_special = json_value(body, "add_special", false);
-            tokens = ctx_server.tokenize(body.at("content"), add_special);
-        }
-        const json data = format_tokenizer_response(tokens);
-        return res.set_content(data.dump(), "application/json; charset=utf-8");
-    };
-
-    const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
-        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-        const json body = json::parse(req.body);
-
-        std::string content;
-        if (body.count("tokens") != 0) {
-            const std::vector<llama_token> tokens = body.at("tokens");
-            content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
-        }
-
-        const json data = format_detokenized_response(content);
-        return res.set_content(data.dump(), "application/json; charset=utf-8");
-    };
-
-    const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
-        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
-        const json body = json::parse(req.body);
-        bool is_openai = false;
-
-        // an input prompt can be a string or a list of tokens (integer)
-        json prompt;
-        if (body.count("input") != 0) {
-            is_openai = true;
-            prompt = body.at("input");
-        } else if (body.count("content") != 0) {
-            // with "content", we only support single prompt
-            prompt = std::vector<json>{body.at("content")};
-        } else {
-            res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
-            return;
-        }
-
-        // create and queue the task
-        json responses;
-        {
-            const int 
id_task = ctx_server.queue_tasks.get_new_id(); - ctx_server.queue_results.add_waiting_task_id(id_task); - ctx_server.request_completion(id_task, -1, {{"prompt", prompt}}, false, true); - - // get the result - server_task_result result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); - if (!result.error) { - if (result.data.count("results")) { - // result for multi-task - responses = result.data.at("results"); - } else { - // result for single task - responses = std::vector{result.data}; - } - } else { - // error received, ignore everything else - res_error(res, result.data); - return; - } - } - - // write JSON response - json root = is_openai - ? format_embeddings_response_oaicompat(body, responses) - : responses[0]; - return res.set_content(root.dump(), "application/json; charset=utf-8"); - }; - - auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) { - return [content, len, mime_type](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(content), len, mime_type); - return false; - }; - }; - - // - // Router - // - - // register static assets routes - if (!params.public_path.empty()) { - // Set the base directory for serving static files - svr->set_base_dir(params.public_path); - } - - // using embedded static files - svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); - svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); - svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); - svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); - - // add new-ui files - svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); - svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); - svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); - svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); - svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); - - // register API routes - svr->Get ("/health", handle_health); - svr->Get ("/slots", handle_slots); - svr->Get ("/metrics", handle_metrics); - svr->Get ("/props", handle_props); - svr->Get ("/v1/models", handle_models); - svr->Post("/completion", handle_completions); // legacy - svr->Post("/completions", 
handle_completions); - svr->Post("/v1/completions", handle_completions); - svr->Post("/chat/completions", handle_chat_completions); - svr->Post("/v1/chat/completions", handle_chat_completions); - svr->Post("/infill", handle_infill); - svr->Post("/embedding", handle_embeddings); // legacy - svr->Post("/embeddings", handle_embeddings); - svr->Post("/v1/embeddings", handle_embeddings); - svr->Post("/tokenize", handle_tokenize); - svr->Post("/detokenize", handle_detokenize); - if (!params.slot_save_path.empty()) { - // only enable slot endpoints if slot_save_path is set - svr->Post("/slots/:id_slot", handle_slots_action); - } - - // - // Start the server - // - if (params.n_threads_http < 1) { - // +2 threads for monitoring endpoints - params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); - } - log_data["n_threads_http"] = std::to_string(params.n_threads_http); - svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); }; - - LOG_INFO("HTTP server listening", log_data); - - // run the HTTP server in a thread - see comment below - std::thread t([&]() { - if (!svr->listen_after_bind()) { - state.store(SERVER_STATE_ERROR); - return 1; - } - - return 0; - }); - - ctx_server.queue_tasks.on_new_task(std::bind( - &server_context::process_single_task, &ctx_server, std::placeholders::_1)); - ctx_server.queue_tasks.on_finish_multitask(std::bind( - &server_context::on_finish_multitask, &ctx_server, std::placeholders::_1)); - ctx_server.queue_tasks.on_update_slots(std::bind( - &server_context::update_slots, &ctx_server)); - ctx_server.queue_results.on_multitask_update(std::bind( - &server_queue::update_multitask, - &ctx_server.queue_tasks, - std::placeholders::_1, - std::placeholders::_2, - std::placeholders::_3 - )); - - shutdown_handler = [&](int) { - ctx_server.queue_tasks.terminate(); - }; - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = signal_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); - sigaction(SIGTERM, &sigint_action, NULL); -#elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); -#endif - - ctx_server.queue_tasks.start_loop(); - - svr->stop(); - t.join(); - - llama_backend_free(); - - return 0; -} diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md deleted file mode 100644 index 5e6cb277b..000000000 --- a/examples/server/tests/README.md +++ /dev/null @@ -1,65 +0,0 @@ -# Server tests - -Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) -and [behave](https://behave.readthedocs.io/en/latest/): - -* [issues.feature](./features/issues.feature) Pending issues scenario -* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests -* [security.feature](./features/security.feature) Security, CORS and API Key -* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc... - -Tests target GitHub workflows job runners with 4 vCPU. - -Requests are -using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html) -based http client. 
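-
-For reference, here is a minimal sketch of that client pattern. It is not part of the
-test suite; it assumes a llama-server instance already listening on `localhost:8080`,
-and the endpoint and payload fields are taken from the `/completion` handler above:
-
-```python
-# minimal aiohttp client sketch -- the URL and payload fields are assumptions
-# based on the /completion handler, not code from the test suite itself
-import asyncio
-
-import aiohttp
-
-
-async def main() -> None:
-    payload = {"prompt": "I believe the meaning of life is", "n_predict": 8}
-    async with aiohttp.ClientSession() as session:
-        async with session.post("http://localhost:8080/completion", json=payload) as resp:
-            resp.raise_for_status()
-            data = await resp.json()
-            print(data["content"])
-
-
-asyncio.run(main())
-```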
-
-Note: If the host machine's inference speed is faster than the GitHub runners', the parallel scenario may fail randomly.
-To mitigate this, increase the `n_predict` and `kv_size` values.
-
-### Install dependencies
-
-`pip install -r requirements.txt`
-
-### Run tests
-
-1. Build the server
-
-```shell
-cd ../../..
-cmake -B build -DLLAMA_CURL=ON
-cmake --build build --target llama-server
-```
-
-2. Start the test: `./tests.sh`
-
-It's possible to override some scenario step values with environment variables:
-
-| variable                 | description                                                                                    |
-|--------------------------|------------------------------------------------------------------------------------------------|
-| `PORT`                   | sets `context.server_port`, the server's listening port during the scenario (default: `8080`) |
-| `LLAMA_SERVER_BIN_PATH`  | overrides the server binary path (default: `../../../build/bin/llama-server`)                 |
-| `DEBUG`                  | set to "ON" to enable verbose mode for the steps and the server (`--verbose`)                 |
-| `SERVER_LOG_FORMAT_JSON` | if set, switches the server logs to JSON format                                               |
-| `N_GPU_LAYERS`           | number of model layers to offload to VRAM (`-ngl`, `--n-gpu-layers`)                          |
-
-### Run @bug, @wip or @wrong_usage annotated scenarios
-
-A Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
-
-- `@bug` annotation aims to link a scenario with a GitHub issue.
-- `@wrong_usage` scenarios demonstrate user issues that are actually expected behavior
-- `@wip` marks a work-in-progress scenario to focus on
-- `@slow` marks a heavy test, disabled by default
-
-To run a scenario annotated with `@bug`, start:
-
-```shell
-DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
-```
-
-After changing logic in `steps.py`, ensure that the `@bug` and `@wrong_usage` scenarios are updated.
-
-```shell
-./tests.sh --no-skipped --tags bug,wrong_usage || echo "should have failed but passed"
-```
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
deleted file mode 100644
index 6f163ce04..000000000
--- a/examples/server/tests/features/embeddings.feature
+++ /dev/null
@@ -1,96 +0,0 @@
-@llama.cpp
-@embeddings
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
-    And a model file bert-bge-small.gguf
-    And a model alias bert-bge-small
-    And 42 as server seed
-    And 2 slots
-    And 1024 as batch size
-    And 1024 as ubatch size
-    And 2048 KV cache size
-    And embeddings extraction
-    Then the server is starting
-    Then the server is healthy
-
-  Scenario: Embedding
-    When embeddings are computed for:
-      """
-      What is the capital of Bulgaria ?
-      """
-    Then embeddings are generated
-
-  Scenario: OAI Embeddings compatibility
-    Given a model bert-bge-small
-    When an OAI compatible embeddings computation request for:
-      """
-      What is the capital of Spain ?
-      """
-    Then embeddings are generated
-
-  Scenario: OAI Embeddings compatibility with multiple inputs
-    Given a model bert-bge-small
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    When an OAI compatible embeddings computation request for multiple inputs
-    Then embeddings are generated
-
-  Scenario: Multi users embeddings
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
- """ - And a prompt: - """ - Write a very long joke. - """ - Given concurrent embedding requests - Then the server is busy - Then the server is idle - Then all embeddings are generated - - Scenario: Multi users OAI compatibility embeddings - Given a prompt: - """ - In which country Paris is located ? - """ - And a prompt: - """ - Is Madrid the capital of Spain ? - """ - And a prompt: - """ - What is the biggest US city ? - """ - And a prompt: - """ - What is the capital of Bulgaria ? - """ - And a model bert-bge-small - Given concurrent OAI embedding requests - Then the server is busy - Then the server is idle - Then all embeddings are generated - - Scenario: All embeddings should be the same - Given 10 fixed prompts - And a model bert-bge-small - Given concurrent OAI embedding requests - Then all embeddings are the same diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py deleted file mode 100644 index e7845dc2f..000000000 --- a/examples/server/tests/features/environment.py +++ /dev/null @@ -1,71 +0,0 @@ -import os -import signal -import socket -import sys -import time -import traceback -from contextlib import closing -from subprocess import TimeoutExpired - - -def before_scenario(context, scenario): - context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON' - if context.debug: - print("DEBUG=ON") - print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m") - port = 8080 - if 'PORT' in os.environ: - port = int(os.environ['PORT']) - if is_server_listening("localhost", port): - assert False, "Server already started" - - -def after_scenario(context, scenario): - try: - if 'server_process' not in context or context.server_process is None: - return - if scenario.status == "failed": - if 'GITHUB_ACTIONS' in os.environ: - print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n") - if os.path.isfile('llama.log'): - with closing(open('llama.log', 'r')) as f: - for line in f: - print(line) - if not is_server_listening(context.server_fqdn, context.server_port): - print("\x1b[33;101mERROR: Server stopped listening\x1b[0m") - - if context.server_process.poll() is not None: - assert False, f"Server not running pid={context.server_process.pid} ..." 
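-        # shutdown sequence below: politely interrupt first (SIGINT, or
-        # CTRL_C_EVENT on Windows), wait up to 500ms, escalate to SIGKILL,
-        # then poll the port until it is actually released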
-
-        server_graceful_shutdown(context)  # SIGINT
-
-        try:
-            context.server_process.wait(0.5)
-        except TimeoutExpired:
-            print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...")
-            context.server_process.kill()  # SIGKILL
-            context.server_process.wait()
-
-        while is_server_listening(context.server_fqdn, context.server_port):
-            time.sleep(0.1)
-    except Exception:
-        print("ignoring error in after_scenario:")
-        traceback.print_exc(file=sys.stdout)
-
-
-def server_graceful_shutdown(context):
-    print(f"shutting down server pid={context.server_process.pid} ...")
-    if os.name == 'nt':
-        interrupt = signal.CTRL_C_EVENT
-    else:
-        interrupt = signal.SIGINT
-    context.server_process.send_signal(interrupt)
-
-
-def is_server_listening(server_fqdn, server_port):
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-        result = sock.connect_ex((server_fqdn, server_port))
-        _is_server_listening = result == 0
-        if _is_server_listening:
-            print(f"server is listening on {server_fqdn}:{server_port}...")
-        return _is_server_listening
diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature
deleted file mode 100644
index 7b13e44ca..000000000
--- a/examples/server/tests/features/issues.feature
+++ /dev/null
@@ -1,5 +0,0 @@
-# List of ongoing issues
-# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
-@bug
-Feature: Issues
-  # No confirmed issue at the moment
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
deleted file mode 100644
index 6cd306a2b..000000000
--- a/examples/server/tests/features/parallel.feature
+++ /dev/null
@@ -1,102 +0,0 @@
-@llama.cpp
-@parallel
-Feature: Parallel
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
-    And a model file test-model-00001-of-00003.gguf
-    And 42 as server seed
-    And 128 as batch size
-    And 256 KV cache size
-    And 2 slots
-    And continuous batching
-    Then the server is starting
-    Then the server is healthy
-
-  Scenario Outline: Multi users completion
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And <n_predict> max tokens to predict
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And all slots are idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | n_predict |
-      | 128       |
-
-  Scenario Outline: Multi users OAI completions compatibility
-    Given a system prompt You are a writer.
-    And a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario Outline: Multi users OAI completions compatibility no v1
-    Given a system prompt You are a writer.
-    And a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests no v1
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-
-  Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    And 128 max tokens to predict
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted
diff --git a/examples/server/tests/features/passkey.feature b/examples/server/tests/features/passkey.feature
deleted file mode 100644
index 6a5a84e6a..000000000
--- a/examples/server/tests/features/passkey.feature
+++ /dev/null
@@ -1,54 +0,0 @@
-# run with: ./tests.sh --no-skipped --tags passkey
-@passkey
-@slow
-Feature: Passkey / Self-extend with context shift
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-
-  # Generates a long text of junk and inserts a secret passkey number inside it.
-  # Then we query the LLM for the secret passkey.
-  # see #3856 and #4810
-  Scenario Outline: Passkey
-    Given a model file <hf_file> from HF repo <hf_repo>
-    And <n_batch> as batch size
-    And <n_junk> as number of junk
-    And <n_predicted> server max tokens to predict
-    And 42 as seed
-    And <n_ctx> KV cache size
-    And 1 slots
-    And <n_ga> group attention factor to extend context size through self-extend
-    And <n_ga_w> group attention width to extend context size through self-extend
-    # Can be override with N_GPU_LAYERS
-    And <ngl> GPU offloaded layers
-    Then the server is starting
-    Then the server is healthy
-    Given available models
-    Then model 0 is trained on <n_ctx_train> tokens context
-    Given a prefix prompt:
-      """
-      here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
-      """
-    And a passkey prompt template:
-      """
-      The pass key is <passkey> Remember it. <passkey> is the pass key.
-      """
-    And a junk suffix prompt:
-      """
-      The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
-      """
-    And a suffix prompt:
-      """
-      What is the pass key? The pass key is
-      """
-    Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
-    And a completion request with no api error
-    Then <n_predicted> tokens are predicted matching <re_content>
-
-    Examples:
-      | hf_repo             | hf_file           | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
-      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048        | 5   | 8192  | 512     | 4    | 512    | 250    | 50    | 42      | 1           | 42         |
-      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048        | 5   | 8192  | 512     | 2    | 512    | 250    | 50    | 42      | 1           | \b((?!42)\w)+\b |
-      #| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
-      #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0
-# 987 |
diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature
deleted file mode 100644
index e8e1b5414..000000000
--- a/examples/server/tests/features/results.feature
+++ /dev/null
@@ -1,118 +0,0 @@
-@llama.cpp
-@results
-Feature: Results
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
-    And a model file test-model-00001-of-00003.gguf
-    And 128 as batch size
-    And 1024 KV cache size
-    And 128 max tokens to predict
-    And continuous batching
-
-  Scenario Outline: consistent results with same seed
-    Given <n_slots> slots
-    And 1.0 temperature
-    Then the server is starting
-    Then the server is healthy
-
-    Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
-
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And all slots are idle
-    Then all predictions are equal
-    Examples:
-      | n_slots |
-      | 1       |
-      # FIXME: unified KV cache nondeterminism
-      # | 2     |
-
-  Scenario Outline: different results with different seed
-    Given <n_slots> slots
-    And 1.0 temperature
-    Then the server is starting
-    Then the server is healthy
-
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45
-
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And all slots are idle
-    Then all predictions are different
-    Examples:
-      | n_slots |
-      | 1       |
-      | 2       |
-
-  Scenario Outline: consistent results with same seed and varying batch size
-    Given 4 slots
-    And <temp> temperature
-    # And 0 as draft
-    Then the server is starting
-    Then the server is healthy
-
-    Given 1 prompts "Write a very long story about AI." with seed 42
-    And concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then the server is idle
-    And all slots are idle
-
-    Given <n_parallel> prompts "Write a very long story about AI." with seed 42
-    And concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
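-    # the second request batch above reuses the same prompt and seed with a
-    # different n_parallel, so "all predictions are equal" below checks that
-    # results do not depend on how requests are batched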
-    Then the server is idle
-    And all slots are idle
-
-    Then all predictions are equal
-    Examples:
-      | n_parallel | temp |
-      | 1          | 0.0  |
-      | 1          | 1.0  |
-      # FIXME: unified KV cache nondeterminism
-      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
-      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
-      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
-      # | 2          | 0.0  |
-      # | 4          | 0.0  |
-      # | 2          | 1.0  |
-      # | 4          | 1.0  |
-
-  Scenario Outline: consistent token probs with same seed and prompt
-    Given <n_slots> slots
-    And <n_kv> KV cache size
-    And 1.0 temperature
-    And <n_predict> max tokens to predict
-    Then the server is starting
-    Then the server is healthy
-
-    Given 1 prompts "The meaning of life is" with seed 42
-    And concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then the server is idle
-    And all slots are idle
-
-    Given <n_parallel> prompts "The meaning of life is" with seed 42
-    And concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then the server is idle
-    And all slots are idle
-
-    Then all token probabilities are equal
-    Examples:
-      | n_slots | n_kv | n_predict | n_parallel |
-      | 4       | 1024 | 1         | 1          |
-      # FIXME: unified KV cache nondeterminism
-      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
-      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
-      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
-      # | 4       | 1024 | 1         | 4          |
-      # | 4       | 1024 | 100       | 1          |
-      # This test still fails even the above patches; the first token probabilities are already different.
-      # | 4       | 1024 | 100       | 4          |
diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature
deleted file mode 100644
index eb82e7aca..000000000
--- a/examples/server/tests/features/security.feature
+++ /dev/null
@@ -1,68 +0,0 @@
-@llama.cpp
-@security
-Feature: Security
-
-  Background: Server startup with an api key defined
-    Given a server listening on localhost:8080
-    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And a server api key llama.cpp
-    Then the server is starting
-    Then the server is healthy
-
-  Scenario Outline: Completion with some user api key
-    Given a prompt test
-    And a user api key <api_key>
-    And 4 max tokens to predict
-    And a completion request with <api_error> api error
-
-    Examples: Prompts
-      | api_key   | api_error |
-      | llama.cpp | no        |
-      | llama.cpp | no        |
-      | hackeme   | raised    |
-      |           | raised    |
-
-  Scenario Outline: OAI Compatibility
-    Given a system prompt test
-    And a user prompt test
-    And a model test
-    And 2 max tokens to predict
-    And streaming is disabled
-    And a user api key <api_key>
-    Given an OAI compatible chat completions request with <api_error> api error
-
-    Examples: Prompts
-      | api_key   | api_error |
-      | llama.cpp | no        |
-      | llama.cpp | no        |
-      | hackme    | raised    |
-
-  Scenario Outline: OAI Compatibility (invalid response formats)
-    Given a system prompt test
-    And a user prompt test
-    And a response format <response_format>
-    And a model test
-    And 2 max tokens to predict
-    And streaming is disabled
-    Given an OAI compatible chat completions request with raised api error
-
-    Examples: Prompts
-      | response_format                                        |
-      | {"type": "sound"}                                      |
-      | {"type": "json_object", "schema": 123}                 |
-      | {"type": "json_object", "schema": {"type": 123}}       |
-      | {"type": "json_object", "schema": {"type": "hiccup"}}  |
-
-
-  Scenario Outline: CORS Options
-    Given a user api key llama.cpp
-    When an OPTIONS request is sent from <origin>
-    Then CORS header <cors_header> is set to <cors_header_value>
-
-    Examples: Headers
-      | origin          | cors_header                      | cors_header_value |
-      | localhost       | Access-Control-Allow-Origin      | localhost         |
-      | web.mydomain.fr | Access-Control-Allow-Origin      | web.mydomain.fr   |
-      | origin          | Access-Control-Allow-Credentials | true              |
-      | web.mydomain.fr | Access-Control-Allow-Methods     | POST              |
-      | web.mydomain.fr | Access-Control-Allow-Headers     | *                 |
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
deleted file mode 100644
index b55971454..000000000
--- a/examples/server/tests/features/server.feature
+++ /dev/null
@@ -1,112 +0,0 @@
-@llama.cpp
-@server
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And a model file test-model.gguf
-    And a model alias tinyllama-2
-    And BOS token is 1
-    And 42 as server seed
-    # KV Cache corresponds to the total amount of tokens
-    # that can be stored across all independent sequences: #4130
-    # see --ctx-size and #5568
-    And 256 KV cache size
-    And 32 as batch size
-    And 2 slots
-    And 64 server max tokens to predict
-    And prometheus compatible metrics exposed
-    Then the server is starting
-    Then the server is healthy
-
-  Scenario: Health
-    Then the server is ready
-    And all slots are idle
-
-
-  Scenario Outline: Completion
-    Given a prompt <prompt>
-    And <n_predict> max tokens to predict
-    And a completion request with no api error
-    Then <n_predicted> tokens are predicted matching <re_content>
-    And the completion is <truncated> truncated
-    And <n_prompt> prompt tokens are processed
-    And prometheus metrics are exposed
-    And metric llamacpp:tokens_predicted is <n_predicted>
-
-    Examples: Prompts
-      | prompt                                                                    | n_predict | re_content                                  | n_prompt | n_predicted | truncated |
-      | I believe the meaning of life is                                          | 8         | (read\|going)+                              | 18       | 8           | not       |
-      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 46       | 64          | not       |
-
-  Scenario: Completion prompt truncated
-    Given a prompt:
-      """
-      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-      Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-      Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-      Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-      """
-    And a completion request with no api error
-    Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
-    And the completion is truncated
-    And 109 prompt tokens are processed
-
-
-  Scenario Outline: OAI Compatibility
-    Given a model <model>
-    And a system prompt <system_prompt>
-    And a user prompt <user_prompt>
-    And <max_tokens> max tokens to predict
-    And streaming is <enable_streaming>
-    Given an OAI compatible chat completions request with no api error
-    Then <n_predicted> tokens are predicted matching <re_content>
-    And <n_prompt> prompt tokens are processed
-    And the completion is <truncated> truncated
-
-    Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                             | n_prompt | n_predicted | enable_streaming | truncated |
-      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                          | 77       | 8           | disabled         | not       |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+      | -1       | 64          | enabled          |           |
-
-
-  Scenario Outline: OAI Compatibility w/ response format
-    Given a model test
-    And a system prompt test
-    And a user prompt test
-    And a response format <response_format>
-    And 10 max tokens to predict
-    Given an OAI compatible chat completions request with no api error
-    Then <n_predicted> tokens are predicted matching <re_content>
-
-    Examples: Prompts
-      | response_format                                                      | n_predicted | re_content  |
-      | {"type": "json_object", "schema": {"const": "42"}}                  | 6           | "42"        |
-      | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]  |
-      | {"type": "json_object"}                                              | 10          | \{ " Jacky. |
-
-
-  Scenario: Tokenize / Detokenize
-    When tokenizing:
-      """
-      What is the capital of France ?
-      """
-    Then tokens can be detokenized
-    And tokens do not begin with BOS
-
-  Scenario: Tokenize w/ BOS
-    Given adding special tokens
-    When tokenizing:
-      """
-      What is the capital of Germany?
-      """
-    Then tokens begin with BOS
-    Given first token is removed
-    Then tokens can be detokenized
-
-  Scenario: Models available
-    Given available models
-    Then 1 models are supported
-    Then model 0 is identified by tinyllama-2
-    Then model 0 is trained on 128 tokens context
diff --git a/examples/server/tests/features/slotsave.feature b/examples/server/tests/features/slotsave.feature
deleted file mode 100644
index 1c281c074..000000000
--- a/examples/server/tests/features/slotsave.feature
+++ /dev/null
@@ -1,58 +0,0 @@
-@llama.cpp
-@slotsave
-Feature: llama.cpp server slot management
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And prompt caching is enabled
-    And 2 slots
-    And . as slot save path
-    And 2048 KV cache size
-    And 42 as server seed
-    And 24 max tokens to predict
-    Then the server is starting
-    Then the server is healthy
-
-  Scenario: Save and Restore Slot
-    # First prompt in slot 1 should be fully processed
-    Given a user prompt "What is the capital of France?"
-    And using slot id 1
-    And a completion request with no api error
-    Then 24 tokens are predicted matching (Lily|cake)
-    And 22 prompt tokens are processed
-    When the slot 1 is saved with filename "slot1.bin"
-    Then the server responds with status code 200
-    # Since we have cache, this should only process the last tokens
-    Given a user prompt "What is the capital of Germany?"
-    And a completion request with no api error
-    Then 24 tokens are predicted matching (Thank|special)
-    And 7 prompt tokens are processed
-    # Loading the original cache into slot 0,
-    # we should only be processing 1 prompt token and get the same output
-    When the slot 0 is restored with filename "slot1.bin"
-    Then the server responds with status code 200
-    Given a user prompt "What is the capital of France?"
-    And using slot id 0
-    And a completion request with no api error
-    Then 24 tokens are predicted matching (Lily|cake)
-    And 1 prompt tokens are processed
-    # For verification that slot 1 was not corrupted during slot 0 load, same thing
-    Given a user prompt "What is the capital of Germany?"
-    And using slot id 1
-    And a completion request with no api error
-    Then 24 tokens are predicted matching (Thank|special)
-    And 1 prompt tokens are processed
-
-  Scenario: Erase Slot
-    Given a user prompt "What is the capital of France?"
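-    # erasing drops the slot's prompt cache, so the same 22-token prompt is
-    # processed from scratch afterwards (see the final two steps)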
- And using slot id 1 - And a completion request with no api error - Then 24 tokens are predicted matching (Lily|cake) - And 22 prompt tokens are processed - When the slot 1 is erased - Then the server responds with status code 200 - Given a user prompt "What is the capital of France?" - And a completion request with no api error - Then 24 tokens are predicted matching (Lily|cake) - And 22 prompt tokens are processed diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py deleted file mode 100644 index df0814cc9..000000000 --- a/examples/server/tests/features/steps/steps.py +++ /dev/null @@ -1,1360 +0,0 @@ -import asyncio -import json -import os -import re -import socket -import subprocess -import sys -import threading -import time -from collections.abc import Sequence -from contextlib import closing -from re import RegexFlag -from typing import Any, Literal, cast - -import aiohttp -import numpy as np -import openai -from openai.types.chat import ChatCompletionChunk -from behave import step # pyright: ignore[reportAttributeAccessIssue] -from behave.api.async_step import async_run_until_complete -from prometheus_client import parser - -# pyright: reportRedeclaration=false - -@step("a server listening on {server_fqdn}:{server_port}") -def step_server_config(context, server_fqdn: str, server_port: str): - context.server_fqdn = server_fqdn - context.server_port = int(server_port) - context.n_threads = None - context.n_gpu_layer = None - if 'PORT' in os.environ: - context.server_port = int(os.environ['PORT']) - print(f"$PORT set, overriding server port with to {context.server_port}") - if 'FQDN' in os.environ: - context.server_fqdn = os.environ['FQDN'] - print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}") - if 'N_GPU_LAYERS' in os.environ: - context.n_gpu_layer = int(os.environ['N_GPU_LAYERS']) - print(f"$N_GPU_LAYERS set, overriding n_gpu_layer with to {context.n_gpu_layer}") - - context.base_url = f'http://{context.server_fqdn}:{context.server_port}' - - context.model_alias = None - context.model_file = None - context.model_hf_repo = None - context.model_hf_file = None - context.model_url = None - context.n_batch = None - context.n_ubatch = None - context.n_ctx = None - context.n_ga = None - context.n_ga_w = None - context.n_predict = None - context.n_prompts = 0 - context.n_server_predict = None - context.slot_save_path = None - context.id_slot = None - context.cache_prompt = None - context.n_slots = None - context.prompt_prefix = None - context.prompt_suffix = None - context.server_api_key = None - context.server_continuous_batching = False - context.server_embeddings = False - context.server_metrics = False - context.server_process = None - context.seed = None - context.draft = None - context.server_seed = None - context.user_api_key = None - context.response_format = None - context.temperature = None - - context.tasks_result = [] - context.concurrent_tasks = [] - context.prompts = [] - - -@step('a model file {hf_file} from HF repo {hf_repo}') -def step_download_hf_model(context, hf_file: str, hf_repo: str): - context.model_hf_repo = hf_repo - context.model_hf_file = hf_file - context.model_file = os.path.basename(hf_file) - - -@step('a model file {model_file}') -def step_model_file(context, model_file: str): - context.model_file = model_file - - -@step('a model url {model_url}') -def step_model_url(context, model_url: str): - context.model_url = model_url - - -@step('a model alias {model_alias}') -def 
step_model_alias(context, model_alias: str): - context.model_alias = model_alias - - -@step('{seed:d} as server seed') -def step_seed(context, seed: int): - context.server_seed = seed - - -@step('{ngl:d} GPU offloaded layers') -def step_n_gpu_layer(context, ngl: int): - if 'N_GPU_LAYERS' in os.environ: - new_ngl = int(os.environ['N_GPU_LAYERS']) - if context.debug: - print(f"-ngl upgraded from {ngl} to {new_ngl}") - ngl = new_ngl - context.n_gpu_layer = ngl - - -@step('{n_threads:d} threads') -def step_n_threads(context, n_threads: int): - context.n_thread = n_threads - - -@step('{draft:d} as draft') -def step_draft(context, draft: int): - context.draft = draft - - -@step('{n_ctx:d} KV cache size') -def step_n_ctx(context, n_ctx: int): - context.n_ctx = n_ctx - - -@step('{n_slots:d} slots') -def step_n_slots(context, n_slots: int): - context.n_slots = n_slots - - -@step('{n_predict:d} server max tokens to predict') -def step_server_n_predict(context, n_predict: int): - context.n_server_predict = n_predict - - -@step('{slot_save_path} as slot save path') -def step_slot_save_path(context, slot_save_path: str): - context.slot_save_path = slot_save_path - - -@step('using slot id {id_slot:d}') -def step_id_slot(context, id_slot: int): - context.id_slot = id_slot - - -@step('prompt caching is enabled') -def step_enable_prompt_cache(context): - context.cache_prompt = True - - -@step('continuous batching') -def step_server_continuous_batching(context): - context.server_continuous_batching = True - - -@step('embeddings extraction') -def step_server_embeddings(context): - context.server_embeddings = True - - -@step('prometheus compatible metrics exposed') -def step_server_metrics(context): - context.server_metrics = True - - -@step("the server is starting") -def step_start_server(context): - start_server_background(context) - attempts = 0 - max_attempts = 20 - if 'GITHUB_ACTIONS' in os.environ: - max_attempts *= 2 - - addrs = socket.getaddrinfo(context.server_fqdn, context.server_port, type=socket.SOCK_STREAM) - family, typ, proto, _, sockaddr = addrs[0] - - while True: - with closing(socket.socket(family, typ, proto)) as sock: - result = sock.connect_ex(sockaddr) - if result == 0: - print("\x1b[33;46mserver started!\x1b[0m") - return - attempts += 1 - if attempts > max_attempts: - assert False, "server not started" - print(f"waiting for server to start, connect error code = {result}...") - time.sleep(0.1) - - -@step("the server is {expecting_status}") -@async_run_until_complete -async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str): - match expecting_status: - case 'healthy': - await wait_for_health_status(context, context.base_url, 200, 'ok', - timeout=30) - - case 'ready' | 'idle': - await wait_for_health_status(context, context.base_url, 200, 'ok', - timeout=30, - params={'fail_on_no_slot': 0, 'include_slots': 0}, - slots_idle=context.n_slots, - slots_processing=0, - expected_slots=[{'id': slot_id, 'state': 0} - for slot_id in - range(context.n_slots if context.n_slots else 1)]) - case 'busy': - await wait_for_health_status(context, context.base_url, 503, - 'no slot available', - params={'fail_on_no_slot': 0, 'include_slots': 0}, - slots_idle=0, - slots_processing=context.n_slots, - expected_slots=[{'id': slot_id, 'state': 1} - for slot_id in - range(context.n_slots if context.n_slots else 1)]) - case _: - assert False, "unknown status" - - -@step('all slots are {expected_slot_status_string}') -@async_run_until_complete 
-async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str): - match expected_slot_status_string: - case 'idle': - expected_slot_status = 0 - case 'busy': - expected_slot_status = 1 - case _: - assert False, "unknown status" - - expected_slots = [{'id': slot_id, 'state': expected_slot_status} - for slot_id in range(context.n_slots)] - await request_slots_status(context, expected_slots) - - -@step('a completion request with {api_error} api error') -@async_run_until_complete -async def step_request_completion(context, api_error: Literal['raised'] | str): - expect_api_error = api_error == 'raised' - seeds = await completions_seed(context, num_seeds=1) - completion = await request_completion(context.prompts.pop(), - seeds[0] if seeds is not None else seeds, - context.base_url, - debug=context.debug, - n_predict=context.n_predict, - cache_prompt=context.cache_prompt, - id_slot=context.id_slot, - expect_api_error=expect_api_error, - user_api_key=context.user_api_key, - temperature=context.temperature) - context.tasks_result.append(completion) - if context.debug: - print(f"Completion response: {completion}") - if expect_api_error: - assert completion == 401, f"completion must be an 401 status code: {completion}" - - -@step('{predicted_n:d} tokens are predicted matching {re_content}') -def step_n_tokens_predicted_with_content(context, predicted_n, re_content): - context.completion = context.tasks_result.pop() - assert_n_tokens_predicted(context.completion, predicted_n, re_content) - - -@step('{predicted_n:d} tokens are predicted') -def step_n_tokens_predicted(context, predicted_n): - context.completion = context.tasks_result.pop() - assert_n_tokens_predicted(context.completion, predicted_n) - - -@step('all predictions are equal') -@async_run_until_complete -async def step_predictions_equal(context): - n_completions = await gather_tasks_results(context) - assert n_completions >= 2, "need at least 2 completions" - assert_all_predictions_equal(context.tasks_result) - context.tasks_result = [] - - -@step('all predictions are different') -@async_run_until_complete -async def step_predictions_different(context): - n_completions = await gather_tasks_results(context) - assert n_completions >= 2, "need at least 2 completions" - assert_all_predictions_different(context.tasks_result) - context.tasks_result = [] - - -@step('all token probabilities are equal') -@async_run_until_complete -async def step_token_probabilities_equal(context): - n_completions = await gather_tasks_results(context) - assert n_completions >= 2, "need at least 2 completions" - assert_all_token_probabilities_equal(context.tasks_result) - context.tasks_result = [] - - -@step('the completion is truncated') -def step_assert_completion_truncated(context): - step_assert_completion_truncated(context, '') - - -@step('the completion is {truncated} truncated') -def step_assert_completion_truncated(context, truncated): - truncated = truncated != "not" - assert context.completion['truncated'] == truncated, f'{context.completion}' - - -@step('{n_prompt:d} prompt tokens are processed') -def step_impl(context, n_prompt): - assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}" - - -@step('a user prompt {user_prompt}') -def step_user_prompt(context, user_prompt): - context.prompts.append(user_prompt) - context.n_prompts = len(context.prompts) - - -@step('a system prompt {system_prompt}') -def step_system_prompt(context, 
system_prompt): - context.system_prompt = system_prompt - - -@step('a model {model}') -def step_model(context, model): - context.model = model - - -@step('{max_tokens:d} max tokens to predict') -def step_max_tokens(context, max_tokens): - context.n_predict = max_tokens - - -@step('a response format {response_format}') -def step_response_format(context, response_format): - context.response_format = json.loads(response_format) - - -@step('{temperature:f} temperature') -def step_temperature(context, temperature): - context.temperature = temperature - - -@step('streaming is {enable_streaming}') -def step_streaming(context, enable_streaming): - context.enable_streaming = enable_streaming == 'enabled' - - -@step('a user api key {user_api_key}') -def step_user_api_key(context, user_api_key): - context.user_api_key = user_api_key - - -@step('no user api key') -def step_no_user_api_key(context): - context.user_api_key = None - - -@step('a user api key ') -def step_no_user_api_key_space(context): - context.user_api_key = None - - -@step('a server api key {server_api_key}') -def step_server_api_key(context, server_api_key): - context.server_api_key = server_api_key - - -@step('{n_junk:d} as number of junk') -def step_n_junk(context, n_junk): - context.n_junk = n_junk - - -@step('{n_batch:d} as batch size') -def step_n_batch(context, n_batch): - context.n_batch = n_batch - - -@step('{n_ubatch:d} as ubatch size') -def step_n_ubatch(context, n_ubatch): - context.n_ubatch = n_ubatch - - -@step('{seed:d} as seed') -def step_seed(context, seed): - if context.seed is None: - context.seed = [seed] - else: - context.seed.append(seed) - - -@step('BOS token is {bos:d}') -def step_bos_token(context, bos): - context.bos = bos - - -@step('a prefix prompt') -def step_prompt_prefix(context): - context.prompt_prefix = context_text(context) - - -@step('a junk suffix prompt') -def step_prompt_junk_suffix(context): - context.prompt_junk_suffix = context_text(context) - - -@step('a suffix prompt') -def step_prompt_suffix(context): - context.prompt_suffix = context_text(context) - - -@step('{n_ga:d} group attention factor' - ' to extend context size through self-extend') -def step_impl(context, n_ga): - context.n_ga = n_ga - - -@step('{n_ga_w:d} group attention width to extend context size through self-extend') -def step_impl(context, n_ga_w): - context.n_ga_w = n_ga_w - - -@step('a passkey prompt template') -def step_prompt_passkey(context): - context.prompt_passkey = context_text(context) - - -@step('{n_prompts:d} fixed prompts') -def step_fixed_prompts(context, n_prompts): - context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)]) - context.n_prompts = n_prompts - - -@step('a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk') -def step_prompt_passkey(context, passkey, i_pos): - prompt = "" - for i in range(context.n_junk): - if i % context.n_junk == i_pos: - prompt += context.prompt_passkey # the passkey is already substituted - prompt += context.prompt_junk_suffix - if context.debug: - passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m" - print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```") - context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix) - context.n_prompts = len(context.prompts) - - -@step('an OAI compatible chat completions request with {api_error} api error') -@async_run_until_complete -async def step_oai_chat_completions(context, api_error): - if 
context.debug: - print(f"Submitting OAI compatible completions request...") - expect_api_error = api_error == 'raised' - seeds = await completions_seed(context, num_seeds=1), - completion = await oai_chat_completions(context.prompts.pop(), - seeds[0] if seeds is not None else seeds, - context.system_prompt, - context.base_url, - '/v1/chat', - False, - model=context.model if hasattr(context, 'model') else None, - - n_predict=context.n_predict - if hasattr(context, 'n_predict') else None, - - enable_streaming=context.enable_streaming - if hasattr(context, 'enable_streaming') else None, - - response_format=context.response_format - if hasattr(context, 'response_format') else None, - - user_api_key=context.user_api_key - if hasattr(context, 'user_api_key') else None, - - expect_api_error=expect_api_error) - context.tasks_result.append(completion) - if context.debug: - print(f"Completion response: {completion}") - if expect_api_error: - assert completion == 401, f"completion must be an 401 status code: {completion}" - - if context.debug: - print(f"Completion response: {completion}") - - -@step('a prompt') -def step_a_prompt(context): - context.prompts.append(context_text(context)) - context.n_prompts = len(context.prompts) - - -@step('a prompt {prompt}') -def step_a_prompt_prompt(context, prompt): - context.prompts.append(prompt) - context.n_prompts = len(context.prompts) - - -@step('{num_prompts:d} prompts {prompt} with seed {seed:d}') -def step_many_prompts(context, num_prompts, prompt, seed): - if context.seed is None: - context.seed = [] - for _ in range(num_prompts): - context.seed.append(seed) - context.prompts.append(prompt) - context.n_prompts = len(context.prompts) - - -@step('concurrent completion requests') -@async_run_until_complete() -async def step_concurrent_completion_requests(context): - await concurrent_requests( - context, - request_completion, - # prompt is inserted automatically - context.base_url, - debug=context.debug, - prompt_prefix=context.prompt_prefix, - prompt_suffix=context.prompt_suffix, - n_predict=context.n_predict if hasattr(context, 'n_predict') else None, - user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None, - temperature=context.temperature, - ) - - -@step('concurrent OAI completions requests') -@async_run_until_complete -async def step_oai_chat_completions(context): - await concurrent_requests(context, oai_chat_completions, - # user_prompt is inserted automatically - context.system_prompt, - context.base_url, - '/v1/chat/completions', - True, # async_client - model=context.model - if hasattr(context, 'model') else None, - n_predict=context.n_predict - if hasattr(context, 'n_predict') else None, - enable_streaming=context.enable_streaming - if hasattr(context, 'enable_streaming') else None, - response_format=context.response_format - if hasattr(context, 'response_format') else None, - user_api_key=context.user_api_key - if hasattr(context, 'user_api_key') else None) - - -@step('concurrent OAI completions requests no v1') -@async_run_until_complete -async def step_oai_chat_completions(context): - await concurrent_requests(context, oai_chat_completions, - # user_prompt is inserted automatically - context.system_prompt, - context.base_url, - '/chat/completions', - True, # async_client - model=context.model - if hasattr(context, 'model') else None, - n_predict=context.n_predict - if hasattr(context, 'n_predict') else None, - enable_streaming=context.enable_streaming - if hasattr(context, 'enable_streaming') else None, - 
response_format=context.response_format - if hasattr(context, 'response_format') else None, - user_api_key=context.user_api_key - if hasattr(context, 'user_api_key') else None) - - -@step('all prompts are predicted') -@async_run_until_complete -async def step_all_prompts_are_predicted(context): - await all_prompts_are_predicted(context) - - -@step('all prompts are predicted with {n_expected_predicted:d} tokens') -@async_run_until_complete -async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted): - await all_prompts_are_predicted(context, n_expected_predicted) - - -async def all_prompts_are_predicted(context, expected_predicted_n=None): - n_completions = await gather_tasks_results(context) - assert n_completions > 0 - for i in range(n_completions): - assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n) - assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests" - - -@step('embeddings are computed for') -@async_run_until_complete -async def step_compute_embedding(context): - context.n_prompts = 1 - context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url) - - -@step('all embeddings are the same') -@async_run_until_complete -async def step_all_embeddings_are_the_same(context): - n_embedding_requests = await gather_tasks_results(context) - assert n_embedding_requests > 0 - embeddings = [] - for i in range(n_embedding_requests): - embedding = context.tasks_result.pop().pop() - embeddings.append(embedding) - assert_embeddings(embedding) - n = len(embeddings) - for i in range(n-1): - for j in range(i+1, n): - embedding1 = np.array(embeddings[i]) - embedding2 = np.array(embeddings[j]) - if context.debug: - print(f"embedding1: {embedding1[-8:]}") - print(f"embedding2: {embedding2[-8:]}") - similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)) - msg = f"Similarity between {i} and {j}: {similarity:.10f}" - if context.debug: - print(f"{msg}") - assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg - - -@step('embeddings are generated') -def step_assert_embeddings(context): - assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n" - f"context.n_prompts={context.n_prompts}\n" - f"context.embeddings={context.embeddings}") - for embedding in context.embeddings: - assert_embeddings(embedding) - - -@step('an OAI compatible embeddings computation request for') -@async_run_until_complete -async def step_oai_compute_embeddings(context): - context.n_prompts = 1 - context.embeddings = await request_oai_embeddings(context_text(context), None, - base_url=context.base_url, - user_api_key=context.user_api_key, - model=context.model) - - -@step('an OAI compatible embeddings computation request for multiple inputs') -@async_run_until_complete -async def step_oai_compute_embeddings_multiple_inputs(context): - context.embeddings = await request_oai_embeddings(context.prompts, None, - base_url=context.base_url, - user_api_key=context.user_api_key, - model=context.model) - context.prompts.clear() - - -@step('concurrent embedding requests') -@async_run_until_complete() -async def step_concurrent_embedding_requests(context): - await concurrent_requests(context, - request_embedding, - # prompt is inserted automatically - base_url=context.base_url) - - -@step('concurrent OAI embedding requests') -@async_run_until_complete() -async def 
step_concurrent_oai_embedding_requests(context): - await concurrent_requests(context, - request_oai_embeddings, - # prompt is inserted automatically - base_url=context.base_url, - async_client=True, - model=context.model) - - -@step('all embeddings are generated') -@async_run_until_complete() -async def all_embeddings_are_generated(context): - n_embedding_requests = await gather_tasks_results(context) - assert n_embedding_requests == context.n_prompts - for i in range(n_embedding_requests): - assert_embeddings(context.tasks_result.pop().pop()) - - -@step('adding special tokens') -def step_tokenize_set_add_special(context): - context.tokenize_add_special = True - - -@step('tokenizing') -@async_run_until_complete -async def step_tokenize(context): - context.tokenized_text = context_text(context) - async with aiohttp.ClientSession() as session: - tokenize_args = { - "content": context.tokenized_text, - } - if getattr(context, 'tokenize_add_special', None) is not None: - tokenize_args['add_special'] = context.tokenize_add_special - async with session.post(f'{context.base_url}/tokenize', - json=tokenize_args) as response: - assert response.status == 200 - tokenize_json = await response.json() - context.tokens = tokenize_json['tokens'] - - -@step('tokens can be detokenized') -@async_run_until_complete -async def step_detokenize(context): - assert len(context.tokens) > 0 - async with aiohttp.ClientSession() as session: - async with session.post(f'{context.base_url}/detokenize', - json={ - "tokens": context.tokens, - }) as response: - assert response.status == 200 - detokenize_json = await response.json() - # SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15 - assert context.tokenized_text == detokenize_json['content'].strip() - - -@step('tokens begin with BOS') -def step_strings_for_tokenization(context): - assert context.tokens[0] == context.bos - - -@step('tokens do not begin with BOS') -def step_strings_for_tokenization(context): - assert context.tokens[0] != context.bos - - -@step('first token is removed') -def step_strings_for_tokenization(context): - context.tokens = context.tokens[1:] - - -@step('an OPTIONS request is sent from {origin}') -@async_run_until_complete -async def step_options_request(context, origin): - async with aiohttp.ClientSession() as session: - headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin} - async with session.options(f'{context.base_url}/v1/chat/completions', - headers=headers) as response: - assert response.status == 200 - context.options_response = response - - -@step('CORS header {cors_header} is set to {cors_header_value}') -def step_check_options_header_value(context, cors_header, cors_header_value): - assert context.options_response.headers[cors_header] == cors_header_value - - -@step('prometheus metrics are exposed') -@async_run_until_complete -async def step_prometheus_metrics_exported(context): - async with aiohttp.ClientSession() as session: - async with await session.get(f'{context.base_url}/metrics') as metrics_response: - assert metrics_response.status == 200 - assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4" - metrics_raw = await metrics_response.text() - metric_exported = False - if context.debug: - print(f"/metrics answer:\n{metrics_raw}") - context.metrics = {} - for metric in parser.text_string_to_metric_families(metrics_raw): - match metric.name: - case "llamacpp:kv_cache_usage_ratio": - assert len(metric.samples) > 0 - metric_exported = True - 
context.metrics[metric.name] = metric - assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time" - assert metric_exported, "No metrics exported" - - -@step('metric {metric_name} is {metric_value:d}') -def step_assert_metric_value(context, metric_name, metric_value): - if metric_name not in context.metrics: - assert False, f"no metric {metric_name} in {context.metrics.keys()}" - assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}" - - -@step('available models') -def step_available_models(context): - # openai client always expects an api_key - openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope' - openai.base_url = f'{context.base_url}/v1/' - context.models = openai.models.list().data - - -@step('{n_model:d} models are supported') -def step_supported_models(context, n_model): - if context.debug: - print("server models available:", context.models) - assert len(context.models) == n_model - - -@step('model {i_model:d} is {param} {preposition} {param_value}') -def step_supported_models(context, i_model: int, param: Literal['identified', 'trained'] | str, preposition: str, param_value: str): - assert i_model < len(context.models) - model = context.models[i_model] - - param_value = param_value.split(' ', 1)[0] - match param: - case 'identified': - value = model.id - case 'trained': - value = str(model.meta["n_ctx_train"]) - case _: - assert False, "param {param} not supported" - assert param_value == value, f"model param {param} {value} != {param_value}" - - -async def concurrent_requests(context, f_completion, *args, **kwargs): - context.n_prompts = len(context.prompts) - if context.debug: - print(f"starting {context.n_prompts} concurrent completion requests...") - assert context.n_prompts > 0 - seeds = await completions_seed(context) - assert seeds is not None - for prompt_no in range(context.n_prompts): - shifted_args = [context.prompts.pop(), seeds[prompt_no], *args] - context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) - await asyncio.sleep(0.1) - - -@step('the slot {slot_id:d} is saved with filename "{filename}"') -@async_run_until_complete -async def step_save_slot(context, slot_id, filename): - async with aiohttp.ClientSession() as session: - async with session.post(f'{context.base_url}/slots/{slot_id}?action=save', - json={"filename": filename}, - headers={"Content-Type": "application/json"}) as response: - context.response = response - - -@step('the slot {slot_id:d} is restored with filename "{filename}"') -@async_run_until_complete -async def step_restore_slot(context, slot_id, filename): - async with aiohttp.ClientSession() as session: - async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore', - json={"filename": filename}, - headers={"Content-Type": "application/json"}) as response: - context.response = response - - -@step('the slot {slot_id:d} is erased') -@async_run_until_complete -async def step_erase_slot(context, slot_id): - async with aiohttp.ClientSession() as session: - async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase', - headers={"Content-Type": "application/json"}) as response: - context.response = response - - -@step('the server responds with status code {status_code:d}') -def step_server_responds_with_status_code(context, status_code): - assert context.response.status == status_code - - -async def request_completion(prompt, - seed, - base_url, - 
debug=False, - prompt_prefix=None, - prompt_suffix=None, - n_predict=None, - cache_prompt=False, - id_slot=None, - expect_api_error=None, - user_api_key=None, - temperature=None) -> int | dict[str, Any]: - if debug: - print(f"Sending completion request: {prompt}") - origin = "my.super.domain" - headers = { - 'Origin': origin - } - if user_api_key is not None: - if debug: - print(f"Set user_api_key: {user_api_key}") - headers['Authorization'] = f'Bearer {user_api_key}' - - async with aiohttp.ClientSession() as session: - async with session.post(f'{base_url}/completion', - json={ - "input_prefix": prompt_prefix, - "prompt": prompt, - "input_suffix": prompt_suffix, - "n_predict": n_predict if n_predict is not None else -1, - "cache_prompt": cache_prompt, - "id_slot": id_slot, - "seed": seed if seed is not None else 42, - "temperature": temperature if temperature is not None else 0.8, - "n_probs": 2, - }, - headers=headers, - timeout=3600) as response: - if expect_api_error is None or not expect_api_error: - assert response.status == 200 - assert response.headers['Access-Control-Allow-Origin'] == origin - return await response.json() - else: - return response.status - - -async def oai_chat_completions(user_prompt, - seed, - system_prompt, - base_url: str, - base_path: str, - async_client, - debug=False, - temperature=None, - model=None, - n_predict=None, - enable_streaming=None, - response_format=None, - user_api_key=None, - expect_api_error=None) -> int | dict[str, Any]: - if debug: - print(f"Sending OAI Chat completions request: {user_prompt}") - # openai client always expects an api key - user_api_key = user_api_key if user_api_key is not None else 'nope' - seed = seed if seed is not None else 42 - enable_streaming = enable_streaming if enable_streaming is not None else False - payload = { - "messages": [ - { - "role": "system", - "content": system_prompt, - }, - { - "role": "user", - "content": user_prompt, - } - ], - "model": model, - "max_tokens": n_predict, - "stream": enable_streaming, - "temperature": temperature if temperature is not None else 0.0, - "seed": seed, - } - if response_format is not None: - payload['response_format'] = response_format - completion_response = { - 'content': '', - 'timings': { - 'predicted_n': 0, - 'prompt_n': 0 - } - } - if async_client: - origin = 'llama.cpp' - headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} - async with aiohttp.ClientSession() as session: - async with session.post(f'{base_url}{base_path}', - json=payload, - headers=headers) as response: - if enable_streaming: - assert response.status == 200 - assert response.headers['Access-Control-Allow-Origin'] == origin - assert response.headers['Content-Type'] == "text/event-stream" - event_received = True - while event_received: - event_received = False - async for line_in_bytes in response.content: - line = line_in_bytes.decode('utf-8') - line = line.rstrip('\n').rstrip('\r') - if line == '': - continue - event_data = line.split(': ', 1) - assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```' - chunk_raw = event_data[1] - - chunk = json.loads(chunk_raw) - assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```" - delta = chunk['choices'][0]['delta'] - if 'content' in delta: - completion_response['content'] += delta['content'] - completion_response['timings']['predicted_n'] += 1 - else: - if expect_api_error is None or not expect_api_error: - assert response.status == 200 - assert response.headers['Access-Control-Allow-Origin'] == 
origin - assert response.headers['Content-Type'] == "application/json; charset=utf-8" - chat_completion_raw = await response.json() - completion_response = { - 'content': chat_completion_raw['choices'][0]['message'], - 'timings': { - 'predicted_n': chat_completion_raw['usage']['completion_tokens'], - 'prompt_n': chat_completion_raw['usage']['prompt_tokens'] - } - } - else: - return response.status - else: - try: - openai.api_key = user_api_key - openai.base_url = f'{base_url}{base_path.removesuffix("chat")}' - assert model is not None - chat_completion = openai.chat.completions.create( - messages=payload['messages'], - model=model, - max_tokens=n_predict, - stream=enable_streaming, - response_format=payload.get('response_format') or openai.NOT_GIVEN, - seed=seed, - temperature=payload['temperature'] - ) - except openai.AuthenticationError as e: - if expect_api_error is not None and expect_api_error: - return 401 - else: - assert False, f'error raised: {e}' - - if enable_streaming: - chat_completion = cast(openai.Stream[ChatCompletionChunk], chat_completion) - for chunk in chat_completion: - assert len(chunk.choices) == 1 - delta = chunk.choices[0].delta - if delta.content is not None: - completion_response['content'] += delta.content - completion_response['timings']['predicted_n'] += 1 - completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop' - else: - assert len(chat_completion.choices) == 1 - assert chat_completion.usage is not None - completion_response = { - 'content': chat_completion.choices[0].message.content, - 'timings': { - 'predicted_n': chat_completion.usage.completion_tokens, - 'prompt_n': chat_completion.usage.prompt_tokens - }, - 'truncated': chat_completion.choices[0].finish_reason != 'stop' - } - if debug: - print("OAI response formatted to llama.cpp:", completion_response) - return completion_response - - -async def request_embedding(content, seed, base_url=None) -> list[list[float]]: - async with aiohttp.ClientSession() as session: - async with session.post(f'{base_url}/embedding', - json={ - "content": content, - }) as response: - assert response.status == 200 - response_json = await response.json() - return [response_json['embedding']] - - -async def request_oai_embeddings(input, seed, - base_url=None, user_api_key=None, - model=None, async_client=False) -> list[list[float]]: - # openai client always expects an api_key - user_api_key = user_api_key if user_api_key is not None else 'nope' - if async_client: - origin = 'llama.cpp' - headers=[] - if user_api_key is not None: - headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} - async with aiohttp.ClientSession() as session: - async with session.post(f'{base_url}/v1/embeddings', - json={ - "input": input, - "model": model, - }, - headers=headers, - timeout=3600) as response: - assert response.status == 200, f"received status code not expected: {response.status}" - assert response.headers['Access-Control-Allow-Origin'] == origin - assert response.headers['Content-Type'] == "application/json; charset=utf-8" - response_json = await response.json() - assert response_json['model'] == model, f"invalid model received: {response_json['model']}" - assert response_json['object'] == 'list' - if isinstance(input, Sequence): - embeddings = [] - for an_oai_embeddings in response_json['data']: - embeddings.append(an_oai_embeddings['embedding']) - else: - embeddings = [response_json['data']['embedding']] - return embeddings - else: - openai.api_key = user_api_key - openai.base_url = 
f'{base_url}/v1/' - assert model is not None - oai_embeddings = openai.embeddings.create( - model=model, - input=input, - ) - - return [e.embedding for e in oai_embeddings.data] - - -def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): - content = completion_response['content'] - n_predicted = completion_response['timings']['predicted_n'] - assert len(content) > 0, "no token predicted" - if re_content is not None: - p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL) - matches = p.finditer(content) - last_match = 0 - highlighted = '' - for match in matches: - start, end = match.span() - highlighted += content[last_match: start] - highlighted += '\x1b[33m' - highlighted += content[start: end] - highlighted += '\x1b[0m' - last_match = end - highlighted += content[last_match:] - if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': - print(f"Checking completion response: {highlighted}") - assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```' - if expected_predicted_n and expected_predicted_n > 0: - assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:' - f' {n_predicted} <> {expected_predicted_n}') - -def assert_all_predictions_equal(completion_responses): - if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': - for i, response_i in enumerate(completion_responses): - content_i = response_i['content'] - print(f"content {i}: {content_i}") - for i, response_i in enumerate(completion_responses): - content_i = response_i['content'] - for j, response_j in enumerate(completion_responses): - if i == j: - continue - content_j = response_j['content'] - assert content_i == content_j, "contents not equal" - - -def assert_all_predictions_different(completion_responses): - if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': - for i, response_i in enumerate(completion_responses): - content_i = response_i['content'] - print(f"content {i}: {content_i}") - for i, response_i in enumerate(completion_responses): - content_i = response_i['content'] - for j, response_j in enumerate(completion_responses): - if i == j: - continue - content_j = response_j['content'] - assert content_i != content_j, "contents not different" - - -def assert_all_token_probabilities_equal(completion_responses): - n_predict = len(completion_responses[0]['completion_probabilities']) - if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': - for pos in range(n_predict): - for i, response_i in enumerate(completion_responses): - probs_i = response_i['completion_probabilities'][pos]['probs'] - print(f"pos {pos}, probs {i}: {probs_i}") - for pos in range(n_predict): - for i, response_i in enumerate(completion_responses): - probs_i = response_i['completion_probabilities'][pos]['probs'] - for j, response_j in enumerate(completion_responses): - if i == j: - continue - probs_j = response_j['completion_probabilities'][pos]['probs'] - assert probs_i == probs_j, "contents not equal" - - -async def gather_tasks_results(context): - n_tasks = len(context.concurrent_tasks) - if context.debug: - print(f"Waiting for all {n_tasks} tasks results...") - for task_no in range(n_tasks): - context.tasks_result.append(await context.concurrent_tasks.pop()) - n_completions = len(context.tasks_result) - return n_completions - - -async def wait_for_health_status(context, - base_url, - expected_http_status_code, - expected_health_status, - timeout=3, - params=None, - slots_idle=None, - slots_processing=None, - 
expected_slots=None): - if context.debug: - print(f"Starting checking for health for expected_health_status={expected_health_status}") - interval = 0.5 - counter = 0 - if 'GITHUB_ACTIONS' in os.environ: - timeout *= 2 - - async with aiohttp.ClientSession() as session: - while True: - async with await session.get(f'{base_url}/health', params=params) as health_response: - status_code = health_response.status - health = await health_response.json() - if context.debug: - print(f"HEALTH - response for expected health status='{expected_health_status}' on " - f"'{base_url}/health'?{params} is {health}\n") - if (status_code == expected_http_status_code - and health['status'] == expected_health_status - and (slots_idle is None or health['slots_idle'] == slots_idle) - and (slots_processing is None or health['slots_processing'] == slots_processing)): - if expected_slots is not None: - assert_slots_status(health['slots'], expected_slots) - return - await asyncio.sleep(interval) - - counter += interval - if counter >= timeout: - # Sometimes health requests are triggered after completions are predicted - if expected_http_status_code == 503: - if len(context.tasks_result) == 0: - print("\x1b[5;37;43mWARNING: forcing concurrent tasks," - " busy health check missed, probably too fast inference\x1b[0m\n") - n_completions = await gather_tasks_results(context) - if n_completions > 0: - return - - assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}' - - -def assert_embeddings(embeddings): - assert len(embeddings) > 0 - embeddings_computed = False - for emb in embeddings: - if not isinstance(emb, float): - assert False, f"Bad embeddings: {embeddings}" - if emb != 0: - embeddings_computed = True - assert embeddings_computed, f"Embeddings: {embeddings}" - - -async def request_slots_status(context, expected_slots): - async with aiohttp.ClientSession() as session: - async with await session.get(f'{context.base_url}/slots') as slots_response: - assert slots_response.status == 200 - slots = await slots_response.json() - assert_slots_status(slots, expected_slots) - - -def assert_slots_status(slots, expected_slots): - assert len(slots) == len(expected_slots) - for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)): - for key in expected: - assert expected[key] == slot[key], (f"invalid slot {slot_id}" - f" expected[{key}] != slot[{key}]" - f" = {expected[key]} != {slot[key]}") - - -async def completions_seed(context, num_seeds=None): - if hasattr(context, "seed") and context.seed is not None: - assert len(context.seed) == context.n_prompts - if num_seeds is None: - num_seeds = context.n_prompts - assert num_seeds <= context.n_prompts - seeds = context.seed[:num_seeds] - context.seed = context.seed[num_seeds:] if num_seeds < context.n_prompts else None - return seeds - - if hasattr(context, "server_seed") and context.server_seed is not None: - if num_seeds is None: - return [context.server_seed] * context.n_prompts - else: - return [context.server_seed] * num_seeds - return None - - -def context_text(context): - return context.text.replace('\r', '') - - -def start_server_background(context): - if os.name == 'nt': - context.server_path =
'../../../build/bin/Release/llama-server.exe' - else: - context.server_path = '../../../build/bin/llama-server' - if 'LLAMA_SERVER_BIN_PATH' in os.environ: - context.server_path = os.environ['LLAMA_SERVER_BIN_PATH'] - server_listen_addr = context.server_fqdn - server_args = [ - '--host', server_listen_addr, - '--port', context.server_port, - ] - if context.model_file: - server_args.extend(['--model', context.model_file]) - if context.model_url: - server_args.extend(['--model-url', context.model_url]) - if context.model_hf_repo: - server_args.extend(['--hf-repo', context.model_hf_repo]) - if context.model_hf_file: - server_args.extend(['--hf-file', context.model_hf_file]) - if context.n_batch: - server_args.extend(['--batch-size', context.n_batch]) - if context.n_ubatch: - server_args.extend(['--ubatch-size', context.n_ubatch]) - if context.n_threads: - server_args.extend(['--threads', context.threads]) - if context.n_gpu_layer: - server_args.extend(['--n-gpu-layers', context.n_gpu_layer]) - if context.draft is not None: - server_args.extend(['--draft', context.draft]) - if context.server_continuous_batching: - server_args.append('--cont-batching') - if context.server_embeddings: - server_args.append('--embedding') - if context.server_metrics: - server_args.append('--metrics') - if context.model_alias: - server_args.extend(['--alias', context.model_alias]) - if context.n_ctx: - server_args.extend(['--ctx-size', context.n_ctx]) - if context.n_slots: - server_args.extend(['--parallel', context.n_slots]) - if context.n_server_predict: - server_args.extend(['--n-predict', context.n_server_predict]) - if context.slot_save_path: - server_args.extend(['--slot-save-path', context.slot_save_path]) - if context.server_api_key: - server_args.extend(['--api-key', context.server_api_key]) - if context.n_ga: - server_args.extend(['--grp-attn-n', context.n_ga]) - if context.n_ga_w: - server_args.extend(['--grp-attn-w', context.n_ga_w]) - if context.debug: - server_args.append('--verbose') - if 'SERVER_LOG_FORMAT_JSON' not in os.environ: - server_args.extend(['--log-format', "text"]) - - args = [str(arg) for arg in [context.server_path, *server_args]] - print(f"bench: starting server with: {' '.join(args)}") - - flags = 0 - if 'nt' == os.name: - flags |= subprocess.DETACHED_PROCESS - flags |= subprocess.CREATE_NEW_PROCESS_GROUP - flags |= subprocess.CREATE_NO_WINDOW - - pkwargs = { - 'creationflags': flags, - 'stdout': subprocess.PIPE, - 'stderr': subprocess.PIPE - } - context.server_process = subprocess.Popen( - [str(arg) for arg in [context.server_path, *server_args]], - **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] - - def server_log(in_stream, out_stream): - for line in iter(in_stream.readline, b''): - print(line.decode('utf-8'), end='', file=out_stream) - - thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout)) - thread_stdout.start() - - thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr)) - thread_stderr.start() - - print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}") diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature deleted file mode 100644 index cf14b3b44..000000000 --- a/examples/server/tests/features/wrong_usages.feature +++ /dev/null @@ -1,22 +0,0 @@ -# run with: ./tests.sh --no-skipped --tags wrong_usage -@wrong_usage -Feature: Wrong usage of llama.cpp server - - #3969 The user must always 
set --n-predict option - # to cap the number of tokens any completion request can generate - # or pass n_predict/max_tokens in the request. - Scenario: Infinite loop - Given a server listening on localhost:8080 - And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models - # Uncomment below to fix the issue - #And 64 server max tokens to predict - Then the server is starting - Given a prompt: - """ - Go to: infinite loop - """ - # Uncomment below to fix the issue - #And 128 max tokens to predict - Given concurrent completion requests - Then the server is idle - Then all prompts are predicted diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt deleted file mode 100644 index 2c741ea10..000000000 --- a/examples/server/tests/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -aiohttp~=3.9.3 -behave~=1.2.6 -huggingface_hub~=0.20.3 -numpy~=1.26.4 -openai~=1.30.3 -prometheus-client~=0.20.0 diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh deleted file mode 100755 index 72a0fbad8..000000000 --- a/examples/server/tests/tests.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -eu - -if [ $# -lt 1 ] -then - # Start @llama.cpp scenario - behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp -else - behave "$@" -fi diff --git a/examples/server/themes/README.md b/examples/server/themes/README.md deleted file mode 100644 index 62e721a27..000000000 --- a/examples/server/themes/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# LLaMA.cpp Server Wild Theme - -Simple themes directory of sample "public" directories. To try any of these add --path to your run like `server --path=wild`. - -![image](wild/wild.png) diff --git a/examples/server/themes/buttons-top/README.md b/examples/server/themes/buttons-top/README.md deleted file mode 100644 index 808c4cf81..000000000 --- a/examples/server/themes/buttons-top/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# LLaMA.cpp Server Buttons Top Theme - -Simple tweaks to the UI. Chat buttons at the top of the page instead of bottom so you can hit Stop instead of chasing it down the page. - -To use simply run server with `--path=themes/buttons_top` - -![image](buttons_top.png) diff --git a/examples/server/themes/buttons-top/buttons_top.png b/examples/server/themes/buttons-top/buttons_top.png deleted file mode 100644 index c54454519..000000000 Binary files a/examples/server/themes/buttons-top/buttons_top.png and /dev/null differ diff --git a/examples/server/themes/buttons-top/favicon.ico b/examples/server/themes/buttons-top/favicon.ico deleted file mode 100644 index 89e154a0a..000000000 Binary files a/examples/server/themes/buttons-top/favicon.ico and /dev/null differ diff --git a/examples/server/themes/buttons-top/index.html b/examples/server/themes/buttons-top/index.html deleted file mode 100644 index 8334bcde5..000000000 --- a/examples/server/themes/buttons-top/index.html +++ /dev/null @@ -1,1056 +0,0 @@ - - - - - - - llama.cpp - chat - - - - - - - -
- - - diff --git a/examples/server/themes/wild/README.md b/examples/server/themes/wild/README.md deleted file mode 100644 index 560bcc81b..000000000 --- a/examples/server/themes/wild/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# LLaMA.cpp Server Wild Theme - -Simple tweaks to the UI. To use simply run server with `--path=themes/wild` - -![image](wild.png) diff --git a/examples/server/themes/wild/favicon.ico b/examples/server/themes/wild/favicon.ico deleted file mode 100644 index 89e154a0a..000000000 Binary files a/examples/server/themes/wild/favicon.ico and /dev/null differ diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html deleted file mode 100644 index 8361c5774..000000000 --- a/examples/server/themes/wild/index.html +++ /dev/null @@ -1,1060 +0,0 @@ - - - - - - - llama.cpp - chat - - - - - - - -
- - - diff --git a/examples/server/themes/wild/llama_cpp.png b/examples/server/themes/wild/llama_cpp.png deleted file mode 100644 index bad1dc9fc..000000000 Binary files a/examples/server/themes/wild/llama_cpp.png and /dev/null differ diff --git a/examples/server/themes/wild/llamapattern.png b/examples/server/themes/wild/llamapattern.png deleted file mode 100644 index 2a159ce6a..000000000 Binary files a/examples/server/themes/wild/llamapattern.png and /dev/null differ diff --git a/examples/server/themes/wild/wild.png b/examples/server/themes/wild/wild.png deleted file mode 100644 index 46ffa0f3e..000000000 Binary files a/examples/server/themes/wild/wild.png and /dev/null differ diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp deleted file mode 100644 index db6b3b74d..000000000 --- a/examples/server/utils.hpp +++ /dev/null @@ -1,654 +0,0 @@ -#pragma once - -#include "llama.h" -#include "common.h" - -// Change JSON_ASSERT from assert() to GGML_ASSERT: -#define JSON_ASSERT GGML_ASSERT -#include "json.hpp" - -#include -#include -#include -#include - -#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" - -using json = nlohmann::ordered_json; - -// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 -enum error_type { - ERROR_TYPE_INVALID_REQUEST, - ERROR_TYPE_AUTHENTICATION, - ERROR_TYPE_SERVER, - ERROR_TYPE_NOT_FOUND, - ERROR_TYPE_PERMISSION, - ERROR_TYPE_UNAVAILABLE, // custom error - ERROR_TYPE_NOT_SUPPORTED, // custom error -}; - -extern bool server_verbose; -extern bool server_log_json; - -#ifndef SERVER_VERBOSE -#define SERVER_VERBOSE 1 -#endif - -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do \ - { \ - if (server_verbose) \ - { \ - server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) - -static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra); - -template -static T json_value(const json & body, const std::string & key, const T & default_value) { - // Fallback null to default value - if (body.contains(key) && !body.at(key).is_null()) { - try { - return body.at(key); - } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) { - std::stringstream ss; - ss << "Wrong type supplied for parameter '" << key << "'. 
Expected '" << json(default_value).type_name() << "', using default value."; - LOG_WARNING(ss.str().c_str(), body); - return default_value; - } - } else { - return default_value; - } -} - -static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) { - std::stringstream ss_tid; - ss_tid << std::this_thread::get_id(); - json log = json{ - {"tid", ss_tid.str()}, - {"timestamp", time(nullptr)}, - }; - - if (server_log_json) { - log.merge_patch({ - {"level", level}, - {"function", function}, - {"line", line}, - {"msg", message}, - }); - - if (!extra.empty()) { - log.merge_patch(extra); - } - - printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); - } else { - char buf[1024]; - snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - - if (!extra.empty()) { - log.merge_patch(extra); - } - std::stringstream ss; - ss << buf << " |"; - for (const auto & el : log.items()) - { - const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); - ss << " " << el.key() << "=" << value; - } - - const std::string str = ss.str(); - printf("%.*s\n", (int)str.size(), str.data()); - } - fflush(stdout); -} - -// -// chat template utils -// - -// Format given chat. If tmpl is empty, we take the template from model metadata -inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { - std::vector chat; - - for (size_t i = 0; i < messages.size(); ++i) { - const auto & curr_msg = messages[i]; - - std::string role = json_value(curr_msg, "role", std::string("")); - - std::string content; - if (curr_msg.contains("content")) { - if (curr_msg["content"].is_string()) { - content = curr_msg["content"].get(); - } else if (curr_msg["content"].is_array()) { - for (const auto & part : curr_msg["content"]) { - if (part.contains("text")) { - content += "\n" + part["text"].get(); - } - } - } else { - throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); - } - } else { - throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); - } - - chat.push_back({role, content}); - } - - auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); - LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); - return formatted_chat; -} - -// -// base64 utils (TODO: move to common in the future) -// - -static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; - -static inline bool is_base64(uint8_t c) { - return (isalnum(c) || (c == '+') || (c == '/')); -} - -static inline std::vector base64_decode(const std::string & encoded_string) { - int i = 0; - int j = 0; - int in_ = 0; - - int in_len = encoded_string.size(); - - uint8_t char_array_4[4]; - uint8_t char_array_3[3]; - - std::vector ret; - - while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; in_++; - if (i == 4) { - for (i = 0; i < 4; i++) { - char_array_4[i] = base64_chars.find(char_array_4[i]); - } - - char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (i = 0; (i < 3); i++) { - ret.push_back(char_array_3[i]); - } - - i = 0; - } - } - - if (i) { - for 
(j = i; j < 4; j++) { - char_array_4[j] = 0; - } - - for (j = 0; j < 4; j++) { - char_array_4[j] = base64_chars.find(char_array_4[j]); - } - - char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (j = 0; j < i - 1; j++) { - ret.push_back(char_array_3[j]); - } - } - - return ret; -} - -// -// random string / id -// - -static std::string random_string() { - static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - - std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; - } - - return result; -} - -static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - - return chatcmplid.str(); -} - -// -// other common utils -// - -static size_t common_part(const std::vector & a, const std::vector & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} - - return i; -} - -static size_t common_part(const std::string & a, const std::string & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} - - return i; -} - -static bool ends_with(const std::string & str, const std::string & suffix) { - return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - -static size_t find_partial_stop_string(const std::string &stop, const std::string &text) { - if (!text.empty() && !stop.empty()) { - const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { - if (stop[char_index] == text_last_char) { - const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) { - return text.size() - char_index - 1; - } - } - } - } - - return std::string::npos; -} - -// TODO: reuse llama_detokenize -template -static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { - std::string ret; - for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); - } - - return ret; -} - -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { - std::string out = token == -1 ? 
"" : llama_token_to_piece(ctx, token); - - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } - - return out; -} - -struct completion_token_output { - llama_token tok; - std::string text_to_send; - - struct token_prob { - llama_token tok; - float prob; - }; - - std::vector probs; -}; - -// convert a vector of completion_token_output to json -static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { - json out = json::array(); - - for (const auto & prob : probs) { - json probs_for_token = json::array(); - - for (const auto & p : prob.probs) { - const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json { - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - - const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json { - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - - return out; -} - -// -// OAI utils -// - -static json oaicompat_completion_params_parse( - const struct llama_model * model, - const json & body, /* openai api json semantics */ - const std::string & chat_template) { - json llama_params; - - llama_params["__oaicompat"] = true; - - // Map OpenAI parameters to llama.cpp parameters - // - // For parameters that are defined by the OpenAI documentation (e.g. - // temperature), we explicitly specify OpenAI's intended default; we - // need to do that because sometimes OpenAI disagrees with llama.cpp - // - // https://platform.openai.com/docs/api-reference/chat/create - llama_sampling_params default_sparams; - llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); - llama_params["logit_bias"] = json_value(body, "logit_bias", json::object()); - llama_params["n_predict"] = json_value(body, "max_tokens", -1); - llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); - llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); - llama_params["stream"] = json_value(body, "stream", false); - llama_params["temperature"] = json_value(body, "temperature", 1.0); - llama_params["top_p"] = json_value(body, "top_p", 1.0); - - // Apply chat template to the list of messages - llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); - - // Handle "stop" field - if (body.contains("stop") && body.at("stop").is_string()) { - llama_params["stop"] = json::array({body.at("stop").get()}); - } else { - llama_params["stop"] = json_value(body, "stop", json::array()); - } - - // Handle "response_format" field - if (body.contains("response_format")) { - json response_format = json_value(body, "response_format", json::object()); - std::string response_type = json_value(response_format, "type", std::string()); - if (response_type == "json_object") { - llama_params["json_schema"] = json_value(response_format, "schema", json::object()); - } else if (!response_type.empty() && response_type != "text") { - throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type); - } - } - - // Handle "n" field - int n_choices = json_value(body, "n", 1); - if (n_choices != 1) { - throw std::runtime_error("Only one completion choice is 
allowed"); - } - - // Handle "logprobs" field - // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future - if (body.contains("logprobs")) { - llama_params["n_probs"] = json_value(body, "top_logprobs", 20); - } else if (body.contains("top_logprobs")) { - throw std::runtime_error("top_logprobs requires logprobs to be set to true"); - } - - // Params supported by OAI but unsupported by llama.cpp - static const std::vector unsupported_params { "tools", "tool_choice" }; - for (auto & param : unsupported_params) { - if (body.contains(param)) { - throw std::runtime_error("Unsupported param: " + param); - } - } - - // Copy remaining properties to llama_params - // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint. - // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp - for (const auto & item : body.items()) { - // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens" - if (!llama_params.contains(item.key()) || item.key() == "n_predict") { - llama_params[item.key()] = item.value(); - } - } - - return llama_params; -} - -static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) { - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = json { - {"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, - {"usage", json { - {"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens} - }}, - {"id", completion_id} - }; - - if (server_verbose) { - res["__verbose"] = result; - } - - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - } - - return res; -} - -// return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(json result, const std::string & completion_id) { - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({result}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - if (stopped_limit) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. 
- if (content.empty()) { - return std::vector<json>({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json { - {"choices", choices}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"} - }; - if (!finish_reason.empty()) { - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - ret.push_back({"usage", json { - {"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens} - }}); - } - - return std::vector<json>({ret}); -} - -static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { - json data = json::array(); - int i = 0; - for (auto & elem : embeddings) { - data.push_back(json{ - {"embedding", json_value(elem, "embedding", json::array())}, - {"index", i++}, - {"object", "embedding"} - }); - } - - json res = json { - {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", "list"}, - {"usage", json { - {"prompt_tokens", 0}, - {"total_tokens", 0} - }}, - {"data", data} - }; - - return res; -} - -static json format_tokenizer_response(const std::vector<llama_token> & tokens) { - return json { - {"tokens", tokens} - }; -} - -static json format_detokenized_response(const std::string & content) { - return json { - {"content", content} - }; -} - -static json format_error_response(const std::string & message, const enum error_type type) { - std::string type_str; - int code = 500; - switch (type) { - case ERROR_TYPE_INVALID_REQUEST: - type_str = "invalid_request_error"; - code = 400; - break; - case ERROR_TYPE_AUTHENTICATION: - type_str = "authentication_error"; - code = 401; - break; - case ERROR_TYPE_NOT_FOUND: - type_str = "not_found_error"; - code = 404; - break; - case ERROR_TYPE_SERVER: - type_str = "server_error"; - code = 500; - break; - case ERROR_TYPE_PERMISSION: - type_str = "permission_error"; - code = 403; - break; - case ERROR_TYPE_NOT_SUPPORTED: - type_str = "not_supported_error"; - code = 501; - break; - case ERROR_TYPE_UNAVAILABLE: - type_str = "unavailable_error"; - code = 503; - break; - } - return json { - {"code", code}, - {"message", message}, - {"type", type_str}, - }; -} diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt deleted file mode 100644 index 070cfbe7a..000000000 --- a/examples/simple/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-simple) -add_executable(${TARGET} simple.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/simple/README.md b/examples/simple/README.md deleted file mode 100644 index 49e24501c..000000000 --- a/examples/simple/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# llama.cpp/example/simple - -The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. - -```bash -./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" - -... - -main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32 - - Hello my name is Shawn and I'm a 20 year old male from the United States. 
I'm a 20 year old - -main: decoded 27 tokens in 2.31 s, speed: 11.68 t/s - -llama_print_timings: load time = 579.15 ms -llama_print_timings: sample time = 0.72 ms / 28 runs ( 0.03 ms per token, 38888.89 tokens per second) -llama_print_timings: prompt eval time = 655.63 ms / 10 tokens ( 65.56 ms per token, 15.25 tokens per second) -llama_print_timings: eval time = 2180.97 ms / 27 runs ( 80.78 ms per token, 12.38 tokens per second) -llama_print_timings: total time = 2891.13 ms -``` diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp deleted file mode 100644 index 69a92cf7d..000000000 --- a/examples/simple/simple.cpp +++ /dev/null @@ -1,175 +0,0 @@ -#include "common.h" -#include "llama.h" - -#include <cmath> -#include <cstdio> -#include <string> -#include <vector> - -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); - LOG_TEE("\n"); -} - -int main(int argc, char ** argv) { - gpt_params params; - - params.prompt = "Hello my name is"; - params.n_predict = 32; - - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); - return 1; - } - - // total length of the sequence including the prompt - const int n_predict = params.n_predict; - - // init LLM - - llama_backend_init(); - llama_numa_init(params.numa); - - // initialize the model - - llama_model_params model_params = llama_model_params_from_gpt_params(params); - - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); - - if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); - return 1; - } - - // initialize the context - - llama_context_params ctx_params = llama_context_params_from_gpt_params(params); - - llama_context * ctx = llama_new_context_with_model(model, ctx_params); - - if (ctx == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); - return 1; - } - - // tokenize the prompt - - std::vector<llama_token> tokens_list; - tokens_list = ::llama_tokenize(ctx, params.prompt, true); - - const int n_ctx = llama_n_ctx(ctx); - const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size()); - - LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); - - // make sure the KV cache is big enough to hold all the prompt and generated tokens - if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__); - return 1; - } - - // print the prompt token-by-token - - fprintf(stderr, "\n"); - - for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); - } - - fflush(stderr); - - // create a llama_batch with size 512 - // we use this object to submit token data for decoding - - llama_batch batch = llama_batch_init(512, 0, 1); - - // evaluate the initial prompt - for (size_t i = 0; i < tokens_list.size(); i++) { - llama_batch_add(batch, tokens_list[i], i, { 0 }, false); - } - - // llama_decode will output logits only for the last token of the prompt - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); - return 1; - } - - // main loop - - int n_cur = batch.n_tokens; - int n_decode = 0; - - const auto t_main_start = ggml_time_us(); - - while (n_cur <= n_predict) { - // sample the next 
token - { - auto n_vocab = llama_n_vocab(model); - auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); - - std::vector<llama_token_data> candidates; - candidates.reserve(n_vocab); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - // sample the most likely token - const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); - - // is it an end of generation? - if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { - LOG_TEE("\n"); - - break; - } - - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); - - // prepare the next batch - llama_batch_clear(batch); - - // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_cur, { 0 }, true); - - n_decode += 1; - } - - n_cur += 1; - - // evaluate the current batch with the transformer model - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); - return 1; - } - } - - LOG_TEE("\n"); - - const auto t_main_end = ggml_time_us(); - - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", - __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - - llama_print_timings(ctx); - - fprintf(stderr, "\n"); - - llama_batch_free(batch); - - llama_free(ctx); - llama_free_model(model); - - llama_backend_free(); - - return 0; -} diff --git a/examples/speculative/CMakeLists.txt b/examples/speculative/CMakeLists.txt deleted file mode 100644 index aa208e7aa..000000000 --- a/examples/speculative/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-speculative) -add_executable(${TARGET} speculative.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/speculative/README.md b/examples/speculative/README.md deleted file mode 100644 index a6608c5fe..000000000 --- a/examples/speculative/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# llama.cpp/examples/speculative - -Demonstration of speculative decoding and tree-based speculative decoding techniques - -More info: - -- https://github.com/ggerganov/llama.cpp/pull/2926 -- https://github.com/ggerganov/llama.cpp/pull/3624 -- https://github.com/ggerganov/llama.cpp/pull/5625 diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp deleted file mode 100644 index 0939a1a6a..000000000 --- a/examples/speculative/speculative.cpp +++ /dev/null @@ -1,615 +0,0 @@ -#include "common.h" -#include "llama.h" - -#include <algorithm> -#include <cstdio> -#include <cstring> -#include <random> -#include <set> - -#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 -#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 - -struct seq_draft { - bool active = false; - bool drafting = false; - bool skip = false; - - int i_batch_dft = 0; - std::vector<int> i_batch_tgt; - - std::vector<llama_token> tokens; - std::vector<std::vector<llama_token_data>> dists; - - struct llama_sampling_context * ctx_sampling; -}; - -int main(int argc, char ** argv) { - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); - return 1; - } - - if (params.model_draft.empty()) { - fprintf(stderr, "%s: error: --model-draft is required\n", __func__); - return 1; - } - - // max number of parallel drafting sequences (i.e. 
tree branches) - const int n_seq_dft = params.n_parallel; - - // probability threshold for splitting a draft branch (only for n_seq_dft > 1) - const float p_split = params.p_split; - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - std::default_random_engine rng(params.seed); - std::uniform_real_distribution<> u_dist; - -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("speculative", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - - // init llama.cpp - llama_backend_init(); - llama_numa_init(params.numa); - - llama_model * model_tgt = NULL; - llama_model * model_dft = NULL; - - llama_context * ctx_tgt = NULL; - llama_context * ctx_dft = NULL; - - // load the target model - std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); - - // load the draft model - params.model = params.model_draft; - params.n_gpu_layers = params.n_gpu_layers_draft; - if (params.n_threads_draft > 0) { - params.n_threads = params.n_threads_draft; - } - params.n_threads_batch = params.n_threads_batch_draft; - std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); - - const enum llama_vocab_type vocab_type_tgt = llama_vocab_type(model_tgt); - LOG("vocab_type tgt: %d\n", vocab_type_tgt); - - const enum llama_vocab_type vocab_type_dft = llama_vocab_type(model_dft); - LOG("vocab_type dft: %d\n", vocab_type_dft); - - if (vocab_type_tgt != vocab_type_dft) { - fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__); - fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); - return 1; - } - - if ( - llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || - llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || - llama_token_bos(model_tgt) != llama_token_bos(model_dft) || - llama_token_eos(model_tgt) != llama_token_eos(model_dft) - ) { - fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__); - return 1; - } - - { - const int n_vocab_tgt = llama_n_vocab(model_tgt); - const int n_vocab_dft = llama_n_vocab(model_dft); - const int vocab_diff = n_vocab_tgt > n_vocab_dft - ? 
n_vocab_tgt - n_vocab_dft - : n_vocab_dft - n_vocab_tgt; - - if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { - fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__); - fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", - n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); - return 1; - } - - for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { - const char * token_text_tgt = llama_token_get_text(model_tgt, i); - const char * token_text_dft = llama_token_get_text(model_dft, i); - if (std::strcmp(token_text_tgt, token_text_dft) != 0) { - fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__); - fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i, - llama_token_to_piece(ctx_tgt, i).c_str(), - llama_token_to_piece(ctx_dft, i).c_str()); - return 1; - } - } - } - - - // Tokenize the prompt - std::vector<llama_token> inp; - inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true); - - const int max_context_size = llama_n_ctx(ctx_tgt); - const int max_tokens_list_size = max_context_size - 4; - - if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); - return 1; - } - - fprintf(stderr, "\n\n"); - - for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str()); - } - - fflush(stderr); - - const int n_input = inp.size(); - - const auto t_enc_start = ggml_time_us(); - - // eval the prompt with both models - llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); - llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); - llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0)); - - const auto t_enc_end = ggml_time_us(); - - // the 2 models should have the same vocab - //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft)); - - // how many tokens to draft each time - int n_draft = params.n_draft; - - int n_predict = 0; - int n_drafted = 0; - int n_accept = 0; - - int n_past_tgt = inp.size(); - int n_past_dft = inp.size(); - - // used to determine end of generation - bool has_eos = false; - - // target model sampling context - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); - - // draft sequence data - std::vector<seq_draft> drafts(n_seq_dft); - - params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar - if (params.sparams.temp == 0) { - params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model - } - - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].ctx_sampling = llama_sampling_init(params.sparams); - } - - llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); - - const auto t_dec_start = ggml_time_us(); - - // sample from the last token of the prompt - drafts[0].i_batch_tgt.resize(1); - drafts[0].i_batch_tgt[0] = 0; - - while (true) { - std::set<int> active_seqs = {}; - - // print current draft sequences - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } - - active_seqs.insert(s); - const auto & tokens = drafts[s].tokens; - - LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); - } - - int i_dft = 0; - int s_keep = 0; - - llama_token token_id; - 
std::string token_str; - - // loop until we fail to accept a drafted token or we run out of drafted tokens - while (true) { - - // check if the target token matches any of the drafts - // for stochastic sampling, attempt to match the token with the drafted tokens - { - bool accept = false; - if (params.sparams.temp > 0) { - // stochastic verification - - llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL); - llama_sample_softmax(ctx_tgt, &dist_tgt); - float p_tgt = 0, p_dft = 0; - - // GGML_ASSERT(dist_tgt.size() == dist_dft.size()); - - while (active_seqs.size() > 0) { - // randomly select a sequence to verify from active sequences - std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1); - int s = *std::next(active_seqs.begin(), u_int_dist(rng)); - if (i_dft >= (int) drafts[s].tokens.size()) { - drafts[s].active = false; - active_seqs.erase(s); - continue; - } - if (accept) { - // if we already accepted a token, we can skip the rest - if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) { - drafts[s].active = false; - active_seqs.erase(s); - } - continue; - } - LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size()); - float r = u_dist(rng); - llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true }; - // acquire the token probabilities assigned by the draft and target models - for (size_t i = 0; i < dist_tgt.size; i++) { - if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) { - p_tgt = dist_tgt.data[i].p; - } - if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) { - p_dft = dist_dft.data[i].p; - } - if (p_tgt && p_dft) { - break; - } - } - LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt); - if (r <= p_tgt / p_dft) { - s_keep = s; - accept = true; - token_id = drafts[s].tokens[i_dft]; - token_str = llama_token_to_piece(ctx_tgt, token_id); - llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); - - LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); - break; - } else { - LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); - drafts[s].active = false; - - // calculate residual probability - GGML_ASSERT(dist_tgt.sorted); - GGML_ASSERT(dist_dft.sorted); - float sum_probs = 0.0f; - - // sort dist by id - std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) { - return a.id < b.id; - }); - std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) { - return a.id < b.id; - }); - - for (size_t i = 0; i < dist_tgt.size; i++) { - dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p); - sum_probs += dist_tgt.data[i].p; - } - for (size_t i = 0; i < dist_tgt.size; i++) { - dist_tgt.data[i].p /= sum_probs; - } - - // sort dist_tgt by p desc - std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) { - return a.p > b.p; - }); - } - - active_seqs.erase(s); - for(int i = 0; i < n_seq_dft; i++) { - if (i == s) { - continue; - } - if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) { - // synchronize active status for sequences with the same drafted token - drafts[i].active = drafts[i].active && accept; - if (!drafts[i].active) { - 
active_seqs.erase(s); - } - } - } - } - - if (!accept) { - // all drafted tokens were rejected - // sample from the target model - LOG("all drafted tokens were rejected, sampling from residual distribution\n"); - token_id = llama_sample_token(ctx_tgt, &dist_tgt); - llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); - token_str = llama_token_to_piece(ctx_tgt, token_id); - } - - } else { - // greedy verification - - // sample from the target model - LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); - token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); - - llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); - - //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); - - token_str = llama_token_to_piece(ctx_tgt, token_id); - - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } - - if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str()); - - s_keep = s; - accept = true; - } else { - drafts[s].active = false; - } - } - } - - if (llama_token_is_eog(model_tgt, token_id)) { - has_eos = true; - } - ++n_predict; - - if (accept) { - ++n_accept; - ++n_past_tgt; - ++n_past_dft; - ++i_dft; - if (params.use_color) { - // Color token according to its origin sequence - printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); - } else { - printf("%s", token_str.c_str()); - } - fflush(stdout); - continue; - } else { - printf("%s", token_str.c_str()); - fflush(stdout); - break; - } - } - } - - { - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str()); - - // TODO: simplify - { - LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - - llama_kv_cache_seq_keep(ctx_dft, s_keep); - llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_dft, 0); - - llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(ctx_tgt, s_keep); - llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_tgt, 0); - } - - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].active = false; - drafts[s].tokens.clear(); - drafts[s].i_batch_tgt.clear(); - drafts[s].dists.clear(); - } - // note: will be erased after the speculation phase - drafts[0].tokens.push_back(token_id); - drafts[0].dists.push_back(std::vector<llama_token_data>()); - drafts[0].i_batch_tgt.push_back(0); - - llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); - llama_decode(ctx_dft, batch_dft); - - ++n_past_dft; - } - - if (n_predict > params.n_predict || has_eos) { - break; - } - - llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling); - - int n_seq_cur = 1; - int n_past_cur = n_past_dft; - - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].active = false; - drafts[s].drafting = false; - } - drafts[0].active = true; - drafts[0].drafting = true; - drafts[0].i_batch_dft = 0; - - llama_batch_clear(batch_tgt); - llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); - - // sample n_draft tokens from the draft model using tree-based 
sampling - for (int i = 0; i < n_draft; ++i) { - batch_dft.n_tokens = 0; - - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].skip = false; - } - - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].drafting || drafts[s].skip) { - continue; - } - - llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); - - const auto & cur_p = drafts[s].ctx_sampling->cur; - - for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); - } - - std::vector<int> sa(1, s); - - // attempt to split the branch if the probability is high enough - for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { - LOG("splitting seq %3d into %3d\n", s, n_seq_cur); - - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); - - // all previous tokens from this branch are now also part of the new branch - for (int t = 0; t < batch_tgt.n_tokens; ++t) { - for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) { - if (batch_tgt.seq_id[t][p] == s) { - batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur; - batch_tgt.n_seq_id[t]++; - break; - } - } - } - - // copy the draft state - drafts[n_seq_cur].active = true; - drafts[n_seq_cur].drafting = true; - drafts[n_seq_cur].skip = true; - - drafts[n_seq_cur].tokens = drafts[s].tokens; - drafts[n_seq_cur].dists = drafts[s].dists; - drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; - drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; - - llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling); - - sa.push_back(n_seq_cur); - - n_seq_cur++; - } else { - break; - } - } - - // add drafted token for each sequence - for (int is = 0; is < (int) sa.size(); ++is) { - const llama_token id = cur_p[is].id; - - const int s = sa[is]; - - llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); - - drafts[s].tokens.push_back(id); - // save cur_p.data into drafts[s].dists - drafts[s].dists.push_back(cur_p); - - // add unique drafted tokens to the target batch - drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); - - llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); - - // add the token to the batch for batched decoding with the draft model - drafts[s].i_batch_dft = batch_dft.n_tokens; - - llama_batch_add(batch_dft, id, n_past_cur, { s }, true); - - if (batch_tgt.n_tokens > n_draft) { - drafts[s].drafting = false; - } - } - } - - // no sequence is drafting anymore - if (batch_dft.n_tokens == 0) { - break; - } - - // evaluate the drafted tokens on the draft model - llama_decode(ctx_dft, batch_dft); - ++n_past_cur; - ++n_drafted; - - if (batch_tgt.n_tokens > n_draft) { - break; - } - } - - // evaluate the target model on the drafted tokens - { - llama_kv_cache_seq_keep(ctx_tgt, 0); - for (int s = 1; s < n_seq_dft; ++s) { - llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); - } - - // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); - llama_decode(ctx_tgt, batch_tgt); - ++n_past_tgt; - } - - // the first token is always proposed by the target model before the speculation loop so we erase it here - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } - - drafts[s].tokens.erase(drafts[s].tokens.begin()); - drafts[s].dists.erase(drafts[s].dists.begin()); - } - } - - auto t_dec_end = ggml_time_us(); - - LOG_TEE("\n\n"); - - 
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - - LOG_TEE("\ndraft:\n"); - llama_print_timings(ctx_dft); - - LOG_TEE("\ntarget:\n"); - llama_print_timings(ctx_tgt); - - llama_sampling_free(ctx_sampling); - for (int s = 0; s < n_seq_dft; ++s) { - llama_sampling_free(drafts[s].ctx_sampling); - } - - llama_batch_free(batch_dft); - - llama_free(ctx_tgt); - llama_free_model(model_tgt); - - llama_free(ctx_dft); - llama_free_model(model_dft); - - llama_backend_free(); - - fprintf(stderr, "\n\n"); - - return 0; -} diff --git a/examples/tokenize/CMakeLists.txt b/examples/tokenize/CMakeLists.txt deleted file mode 100644 index b704dcae1..000000000 --- a/examples/tokenize/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-tokenize) -add_executable(${TARGET} tokenize.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp deleted file mode 100644 index 2afb6024c..000000000 --- a/examples/tokenize/tokenize.cpp +++ /dev/null @@ -1,406 +0,0 @@ -#include "common.h" -#include "llama.h" - -#include -#include -#include -#include -#include - -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#include -#include // For CommandLineToArgvW -#endif - -static void print_usage_information(const char * argv0, FILE * stream) { - fprintf(stream, "usage: %s [options]\n\n", argv0); - fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n"); - fprintf(stream, "and prints the resulting tokens to standard output.\n\n"); - fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n"); - fprintf(stream, "to control the behavior of the tokenizer.\n\n"); - fprintf(stream, " The possible options are:\n"); - fprintf(stream, "\n"); - fprintf(stream, " -h, --help print this help and exit\n"); - fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n"); - fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n"); - fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); - fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); - fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); - fprintf(stream, " --stdin read prompt from standard input.\n"); - fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); - fprintf(stream, " --no-parse-special do not parse control tokens.\n"); - fprintf(stream, " --log-disable disable logs. 
Makes stderr quiet when loading the model.\n"); - fprintf(stream, " --show-count print the total number of tokens.\n"); -} - -static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) text; - (void) user_data; - } - -static std::string read_prompt_from_file(const char * filepath, bool & success) { - success = false; - - std::ifstream in(filepath, std::ios::binary); - if (!in) { - fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno)); - return std::string(); - } - // do not assume the file is seekable (e.g. /dev/stdin) - std::stringstream buffer; - buffer << in.rdbuf(); - if (in.fail()) { - fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno)); - return std::string(); - } - - success = true; - return buffer.str(); -} - -// -// Function: ingest_args(...) -> vector<string> -// -// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded -// strings, as an STL vector. - -// In particular, it handles character encoding shenanigans on Windows. -// -// Note: raw_argc and raw_argv are not actually read at all on Windows. -// On Windows we call GetCommandLineW to get the arguments in wchar_t -// format, ignoring the regular argc/argv arguments to main(). -// -// TODO: potential opportunity to roll common stuff into common/console.cpp -// in relation to Windows wchar_t shenanigans. -static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) { - std::vector<std::string> argv; - - // Handle Windows, if given non-ASCII arguments. - // We convert wchar_t arguments into UTF-8 char* on this platform. - // Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters - // without throwing tantrums. -#if defined(_WIN32) - int argc; - const LPWSTR cmdline_wargv = GetCommandLineW(); - LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc); - - // silence unused arg warnings - (void) raw_argc; - (void) raw_argv; - - for (int i = 0; i < argc; ++i) { - int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL); - char * output_buf = (char *) calloc(length_needed+1, sizeof(char)); - GGML_ASSERT(output_buf); - - WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL); - output_buf[length_needed] = '\0'; - - argv.push_back(output_buf); - free(output_buf); - } - - LocalFree((HLOCAL) wargv); -#else - int argc = raw_argc; - for (int i = 0; i < argc; ++i) { - argv.push_back(raw_argv[i]); - } -#endif - - GGML_ASSERT((unsigned int) argc == argv.size()); - - return argv; -} - -// -// Function: write_utf8_cstr_to_stdout(const char *) -> -// -// writes a string to standard output; taking into account that on Windows -// to display correctly you have to use special handling. Works even if the -// user has not set a unicode code page on a Windows cmd.exe. -// -// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something -// human-readable is written instead. -// -// On non-Windows systems, simply printfs() the string. -static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) { - invalid_utf8 = false; - -#if defined(_WIN32) - // Are we in a console? - HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); - DWORD dwMode = 0; - - // According to Microsoft docs: - // "WriteConsole fails if it is used with a standard handle that is redirected to a file." - // Also according to the docs, you can use GetConsoleMode to check for that. 
- if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) { - printf("%s", str); - return; - } - - // MultiByteToWideChar reports an error if str is empty, don't report - // them as invalid_utf8. - if (*str == 0) { - return; - } - int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0); - if (length_needed == 0) { - DWORD err = GetLastError(); - if (err == ERROR_NO_UNICODE_TRANSLATION) { - invalid_utf8 = true; - int len = strlen(str); - printf("<"); - for (int i = 0; i < len; ++i) { - if (i > 0) { - printf(" "); - } - printf("%02x", (uint8_t) str[i]); - } - printf(">"); - return; - } - GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way."); - } - - LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr)); - GGML_ASSERT(wstr); - - MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed); - WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL); - - free(wstr); -#else - // TODO: reporting invalid_utf8 would be useful on non-Windows too. - // printf will silently just write bad unicode. - printf("%s", str); -#endif -} - -int main(int raw_argc, char ** raw_argv) { - const std::vector argv = ingest_args(raw_argc, raw_argv); - const int argc = argv.size(); - - if (argc <= 1) { - print_usage_information(argv[0].c_str(), stderr); - return 1; - } - - ////// - // Read out all the command line arguments. - ////// - - // variables where to put any arguments we see. - bool printing_ids = false; - bool no_bos = false; - bool no_parse_special = false; - bool disable_logging = false; - bool show_token_count = false; - const char * model_path = NULL; - const char * prompt_path = NULL; - const char * prompt_arg = NULL; - - // track which arguments were explicitly given - // used for sanity checking down the line - bool model_path_set = false; - bool prompt_path_set = false; - bool prompt_set = false; - bool stdin_set = false; - - int iarg = 1; - for (; iarg < argc; ++iarg) { - std::string arg{argv[iarg]}; - if (arg == "-h" || arg == "--help") { - print_usage_information(argv[0].c_str(), stdout); - return 0; - } - else if (arg == "--ids") { - printing_ids = true; - } - else if (arg == "-m" || arg == "--model") { - if (model_path_set) { - fprintf(stderr, "Error: -m or --model specified multiple times.\n"); - return 1; - } - model_path = argv[++iarg].c_str(); - model_path_set = true; - } - else if (arg == "--no-bos") { - no_bos = true; - } - else if (arg == "--no-parse-special") { - no_parse_special = true; - } - else if (arg == "-p" || arg == "--prompt") { - if (prompt_set) { - fprintf(stderr, "Error: -p or --prompt specified multiple times.\n"); - return 1; - } - prompt_arg = argv[++iarg].c_str(); - prompt_set = true; - } - else if (arg == "-f" || arg == "--file") { - if (prompt_path_set) { - fprintf(stderr, "Error: -f or --file specified multiple times.\n"); - return 1; - } - prompt_path = argv[++iarg].c_str(); - prompt_path_set = true; - } - else if (arg == "--stdin") { - stdin_set = true; - } - else if (arg == "--log-disable") { - disable_logging = true; - } - else if (arg == "--show-count") { - show_token_count = true; - } - else { - fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str()); - return 1; - } - } - - ////// - // Sanity check the command line arguments. - ////// - - // Check that we have the required stuff set. 
- if (model_path_set && model_path == NULL) { - fprintf(stderr, "Error: --model requires an argument.\n"); - return 1; - } - if (!model_path_set) { - fprintf(stderr, "Error: must specify --model.\n"); - return 1; - } - if (prompt_path_set && prompt_path == NULL) { - fprintf(stderr, "Error: --file requires an argument.\n"); - return 1; - } - if (prompt_set && prompt_arg == NULL) { - fprintf(stderr, "Error: --prompt requires an argument.\n"); - return 1; - } - const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set); - if (prompts_set > 1) { - fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n"); - return 1; - } - // Must have some prompt. - if (prompts_set == 0) { - fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n"); - return 1; - } - - GGML_ASSERT(model_path); - GGML_ASSERT(prompt_path || prompt_arg || stdin_set); - - ////// - // Figure out where the prompt will come from. - ////// - - std::string prompt; - if (prompt_path_set) { - bool success = false; - prompt = read_prompt_from_file(prompt_path, success); - if (!success) { - return 1; - } - } else if (prompt_set) { - prompt = prompt_arg; - } else { - GGML_ASSERT(stdin_set); - // we read stdin *after* loading model (early exit if model cannot - // be loaded, which can be a nicer user experience) - } - - ////// - // Start actually doing the tokenizing stuff. - ////// - -#ifdef LOG_DISABLE_LOGS - disable_logging = true; -#endif - - if (disable_logging) { - llama_log_set(llama_log_callback_null, NULL); - } - - llama_backend_init(); - - llama_model_params model_params = llama_model_default_params(); - model_params.vocab_only = true; - llama_model * model = llama_load_model_from_file(model_path, model_params); - if (!model) { - fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path); - return 1; - } - - llama_context_params ctx_params = llama_context_default_params(); - llama_context * ctx = llama_new_context_with_model(model, ctx_params); - if (!ctx) { - fprintf(stderr, "Error: could not create context.\n"); - return 1; - } - - // read entire prompt from stdin? - if (stdin_set) { - GGML_ASSERT(!prompt_path_set && !prompt_set); - - std::stringstream stdin_buffer; - stdin_buffer << std::cin.rdbuf(); - if (std::cin.fail()) { - fprintf(stderr, "Error: could not read the entire standard input.\n"); - return 1; - } - - prompt = stdin_buffer.str(); - } - - const bool model_wants_add_bos = llama_should_add_bos_token(model); - const bool add_bos = model_wants_add_bos && !no_bos; - const bool parse_special = !no_parse_special; - - std::vector<llama_token> tokens; - tokens = ::llama_tokenize(model, prompt, add_bos, parse_special); - - if (printing_ids) { - printf("["); - } - - for (int i = 0; i < (int) tokens.size(); i++) { - if (printing_ids) { - if (i > 0) { - printf(", "); - } - printf("%d", tokens[i]); - } else { - bool invalid_utf8 = false; - printf("%6d -> '", tokens[i]); - write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8); - if (invalid_utf8) { - printf("' (utf-8 decode failure)\n"); - } else { - printf("'\n"); - } - } - } - - if (printing_ids) { - printf("]\n"); - } - - if (show_token_count) { - printf("Total number of tokens: %ld\n", tokens.size()); - } - // silence valgrind - llama_free(ctx); - llama_free_model(model); - - return 0; -}
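
The sketches below illustrate three pieces of logic removed by this diff; they are standalone examples under stated assumptions, not part of the patch itself.

First, the partial-UTF-8 handling performed by the deleted `tokens_to_output_formatted_string`: a single byte with the high bit set cannot be a complete UTF-8 character, so it is rendered as an escaped byte rather than raw text. `format_piece` is a hypothetical stand-in name; the check itself is taken directly from the deleted code.

```cpp
// Self-contained sketch of the partial-UTF-8 check: a lone byte with the
// high bit set is a UTF-8 fragment, so render it as "byte: \xNN" instead.
#include <cstdio>
#include <sstream>
#include <string>

static std::string format_piece(std::string out) {
    if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
        std::stringstream ss;
        ss << std::hex << (out[0] & 0xff);
        out = "byte: \\x" + ss.str();
    }
    return out;
}

int main() {
    printf("%s\n", format_piece("\x90").c_str()); // prints: byte: \x90
    printf("%s\n", format_piece("hi").c_str());   // prints: hi
    return 0;
}
```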
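Second, the "two updates" framing used by the deleted `format_partial_response_oaicompat`: the first streamed chunk carries only the assistant role, the second carries the initial content, matching OpenAI's chunk framing. `make_chunk` is a hypothetical helper, and the example assumes the single-header nlohmann/json is available as `<nlohmann/json.hpp>` (the server example vendors it as `json.hpp`).

```cpp
// Sketch of the two-chunk start-of-stream behavior: role delta first,
// then the first content delta, both framed as "chat.completion.chunk".
#include <cstdio>
#include <ctime>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

static json make_chunk(const json & delta, std::time_t t) {
    return json{
        {"choices", json::array({json{{"finish_reason", nullptr}, {"index", 0}, {"delta", delta}}})},
        {"created", t},
        {"id", "chatcmpl-demo"},
        {"model", "demo"},
        {"object", "chat.completion.chunk"},
    };
}

int main() {
    const std::time_t t = std::time(0);
    const json role_chunk    = make_chunk(json{{"role", "assistant"}}, t);
    const json content_chunk = make_chunk(json{{"content", "Hello"}}, t);
    printf("%s\n%s\n", role_chunk.dump(2).c_str(), content_chunk.dump(2).c_str());
    return 0;
}
```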
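Third, the stochastic verification step at the heart of the deleted `speculative.cpp`, reduced to plain float distributions instead of `llama_token_data_array`: a drafted token t is accepted with probability min(1, p_tgt(t) / p_dft(t)); on rejection, a replacement is sampled from the normalized residual max(0, p_tgt - p_dft). `accept_or_resample` is a hypothetical helper, not a llama.cpp API.

```cpp
// Minimal sketch of speculative-decoding token verification.
#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

static int accept_or_resample(const std::vector<float> & p_tgt,
                              const std::vector<float> & p_dft,
                              int drafted, std::mt19937 & rng) {
    std::uniform_real_distribution<float> u(0.0f, 1.0f);
    // p_dft[drafted] > 0 is guaranteed: the draft model sampled this token
    if (u(rng) <= p_tgt[drafted] / p_dft[drafted]) {
        return drafted; // accept the drafted token
    }
    // rejected: sample from the residual distribution max(0, p_tgt - p_dft)
    std::vector<float> residual(p_tgt.size());
    for (size_t i = 0; i < p_tgt.size(); ++i) {
        residual[i] = std::max(0.0f, p_tgt[i] - p_dft[i]);
    }
    // std::discrete_distribution normalizes the weights internally
    std::discrete_distribution<int> dist(residual.begin(), residual.end());
    return dist(rng);
}

int main() {
    std::mt19937 rng(42);
    const std::vector<float> p_tgt = {0.7f, 0.2f, 0.1f};
    const std::vector<float> p_dft = {0.5f, 0.4f, 0.1f};
    printf("next token: %d\n", accept_or_resample(p_tgt, p_dft, /*drafted=*/1, rng));
    return 0;
}
```

This acceptance rule preserves the target model's output distribution exactly, which is why the deleted example can speculate with a small draft model without changing what the large model would have generated.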