diff --git a/Makefile b/Makefile index 7a69ad1b3..8f3e17da4 100644 --- a/Makefile +++ b/Makefile @@ -688,7 +688,7 @@ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml $(CXX) $(CXXFLAGS) -c $< -o $@ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h -COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o +COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o common.o: common/common.cpp $(COMMON_H_DEPS) $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -756,7 +756,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS) +batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -788,7 +788,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) diff --git a/build.zig b/build.zig index 7f36e5968..e05ca2120 100644 --- a/build.zig +++ b/build.zig @@ -128,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void { const clip = make.obj("clip", "examples/llava/clip.cpp"); const llava = make.obj("llava", "examples/llava/llava.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser }); - _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 1d840e5f7..0ec8d6d8d 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -47,9 +47,6 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() -set(TARGET json-schema-to-grammar) -add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h) - set(TARGET common) add_library(${TARGET} STATIC @@ -63,6 +60,7 @@ add_library(${TARGET} STATIC grammar-parser.h grammar-parser.cpp json.hpp + json-schema-to-grammar.cpp train.h train.cpp ngram-cache.h diff --git a/common/common.cpp b/common/common.cpp index dda514785..52576cba3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1,4 +1,6 @@ #include "common.h" +#include "json.hpp" +#include "json-schema-to-grammar.h" #include "llama.h" #include @@ -68,6 +70,8 @@ #define LLAMA_CURL_MAX_HEADER_LENGTH 256 #endif // LLAMA_USE_CURL +using json = nlohmann::ordered_json; + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -1148,6 +1152,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa ); return true; } + if (arg == "-j" || arg == "--json-schema") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); + return true; + } if (arg == "--override-kv") { if (++i >= argc) { invalid_param = true; @@ -1353,6 +1365,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); printf(" --grammar-file FNAME file to read grammar from\n"); + printf(" -j SCHEMA, --json-schema SCHEMA\n"); + printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n"); + printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n"); printf(" --cfg-negative-prompt PROMPT\n"); printf(" negative prompt to use for guidance. (default: empty)\n"); printf(" --cfg-negative-prompt-file FNAME\n"); diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 2e518e71d..0b4efc3e7 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1221,6 +1221,14 @@ class LlamaModel(Model): except FileNotFoundError: self._set_vocab_llama_hf() + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot']) + special_vocab._set_special_token("prefix", 32007) + special_vocab._set_special_token("suffix", 32008) + special_vocab._set_special_token("middle", 32009) + special_vocab._set_special_token("eot", 32010) + special_vocab.add_to_gguf(self.gguf_writer) + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -2240,6 +2248,13 @@ class GemmaModel(Model): def set_vocab(self): self._set_vocab_sentencepiece() + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot']) + special_vocab._set_special_token("prefix", 67) + special_vocab._set_special_token("suffix", 69) + special_vocab._set_special_token("middle", 68) + special_vocab._set_special_token("eot", 70) + special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): hparams = self.hparams diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md index 64cc19204..a3a3c1389 100644 --- a/examples/gritlm/README.md +++ b/examples/gritlm/README.md @@ -21,12 +21,12 @@ not have to be performed at all. ### Running the example Download a Grit model: ```console -$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf +$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models ``` Run the example using the downloaded model: ```console -$ ./gritlm -m gritlm-7b_q4_1.gguf +$ ./gritlm -m models/gritlm-7b_q4_1.gguf Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 diff --git a/examples/main/README.md b/examples/main/README.md index 10a589ceb..649f4e0f3 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -304,10 +304,12 @@ These options help improve the performance and memory usage of the LLaMA models. - `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation. -### Grammars +### Grammars & JSON schemas - `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax. +- `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead. + ### Quantization For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize). diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index bab79aaea..9a3374fdc 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1852,12 +1852,20 @@ int main(int argc, char ** argv) { const int32_t n_ctx = params.n_ctx; + if (n_ctx <= 0) { + fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__); + return 1; + } + const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence; + if (ppl) { - int n_seq = std::max(1, params.n_batch / n_ctx); - int32_t n_kv = n_seq * n_ctx; + const int32_t n_seq = std::max(1, params.n_batch / n_ctx); + const int32_t n_kv = n_seq * n_ctx; + params.n_parallel = n_seq; - params.n_ctx = n_kv; + params.n_ctx = n_kv; + params.n_batch = std::min(params.n_batch, n_kv); } else { params.n_batch = std::min(params.n_batch, params.n_ctx); diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index d2ee47d01..61f584174 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -11,7 +11,7 @@ install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) -target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) find_package(OpenSSL REQUIRED) target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2566b2fb8..1358206a3 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -90,6 +90,11 @@ class Keys: HF_JSON = "tokenizer.huggingface.json" RWKV = "tokenizer.rwkv.world" CHAT_TEMPLATE = "tokenizer.chat_template" + # FIM/Infill special tokens constants + PREFIX_ID = "tokenizer.ggml.prefix_token_id" + SUFFIX_ID = "tokenizer.ggml.suffix_token_id" + MIDDLE_ID = "tokenizer.ggml.middle_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" # @@ -885,3 +890,7 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV +KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID +KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID +KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID +KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index f4c440766..ff9326d59 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -469,6 +469,18 @@ class GGUFWriter: def add_chat_template(self, value: str) -> None: self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) + def add_prefix_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.PREFIX_ID, id) + + def add_suffix_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id) + + def add_middle_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id) + + def add_eot_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.EOT_ID, id) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/llama.cpp b/llama.cpp index a5ef2fd8f..38e593625 100644 --- a/llama.cpp +++ b/llama.cpp @@ -327,6 +327,10 @@ enum llm_kv { LLM_KV_TOKENIZER_ADD_PREFIX, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, + LLM_KV_TOKENIZER_PREFIX_ID, + LLM_KV_TOKENIZER_SUFFIX_ID, + LLM_KV_TOKENIZER_MIDDLE_ID, + LLM_KV_TOKENIZER_EOT_ID, }; static const std::map LLM_KV_NAMES = { @@ -399,6 +403,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, + { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, + { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, + { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, + { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, }; struct LLM_KV { @@ -2055,10 +2063,10 @@ struct llama_vocab { int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. id linefeed_id = 13; - id special_prefix_id = 32007; - id special_middle_id = 32009; - id special_suffix_id = 32008; - id special_eot_id = 32010; + id special_prefix_id = -1; + id special_suffix_id = -1; + id special_middle_id = -1; + id special_eot_id = -1; bool add_space_prefix = true; @@ -4072,6 +4080,30 @@ static void llm_load_vocab( vocab.special_cls_id = -1; vocab.special_mask_id = -1; + // For Fill-In-the-Middle (FIM)/infill models which where converted + // prior to support of FIM special tokens in GGUF, the following + // will allow those models to continue to work. The general names + // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and + // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once + // new versions of these models have been published. + std::string gen_name; + ml.get_key(LLM_KV_GENERAL_NAME, gen_name); + std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(), + [](unsigned char c){ return std::tolower(c); }); + if (gen_name.find("code") != std::string::npos) { + if (model.arch == LLM_ARCH_LLAMA) { + vocab.special_prefix_id = 32007; + vocab.special_suffix_id = 32008; + vocab.special_middle_id = 32009; + vocab.special_eot_id = 32010; + } else if (model.arch == LLM_ARCH_GEMMA) { + vocab.special_prefix_id = 67; + vocab.special_suffix_id = 69; + vocab.special_middle_id = 68; + vocab.special_eot_id = 70; + } + } + const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); if (add_space_prefix_keyidx != -1) { vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); @@ -4185,13 +4217,17 @@ static void llm_load_vocab( // special tokens { const std::vector> special_token_types = { - { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, - { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, - { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, - { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, - { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, - { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id }, - { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id }, + { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, + { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, + { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, + { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, + { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, + { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id }, + { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id }, + { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id }, + { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id }, + { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id }, + { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id }, }; for (const auto & it : special_token_types) { const std::string & key = kv(std::get<0>(it)); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b5d7bb59c..89f23ca2d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -25,7 +25,7 @@ function(llama_test source) add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE common json-schema-to-grammar) + target_link_libraries(${TEST_TARGET} PRIVATE common) add_test( NAME ${TEST_TARGET} WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}