diff --git a/.flake8 b/.flake8 index 18fba2c15..d64c2564a 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,17 @@ [flake8] max-line-length = 125 -ignore = W503 +ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503 +exclude = + # Do not traverse examples + examples, + # Do not include package initializers + __init__.py, + # No need to traverse our git directory + .git, + # There's no value in checking cache directories + __pycache__, + # No need to include the build path + build, + # This contains builds that we don't want to check + dist # This is generated with `python build .` for package releases +# max-complexity = 10 diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 3e968d179..de0d994c8 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -52,7 +52,19 @@ jobs: ftype: q4_0 pr_comment_enabled: "true" - if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }} + if: | + inputs.gpu-series == 'Standard_NC4as_T4_v3' + || ( + github.event_name == 'schedule' + && github.ref_name == 'master' + && github.repository_owner == 'ggerganov' + ) + || github.event_name == 'pull_request_target' + || ( + github.event_name == 'push' + && github.event.ref == 'refs/heads/master' + && github.repository_owner == 'ggerganov' + ) steps: - name: Clone id: checkout diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index 5be17f157..a8d46f31d 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -20,5 +20,4 @@ jobs: - name: flake8 Lint uses: py-actions/flake8@v2 with: - ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503" - exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py" + plugins: "flake8-no-print" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 65796fe2e..91d791628 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,13 +3,14 @@ exclude: prompts/.*.txt repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.2.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - id: check-added-large-files - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.0.0 hooks: - id: flake8 + additional_dependencies: [flake8-no-print] diff --git a/CMakeLists.txt b/CMakeLists.txt index 21c7c0774..382432159 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access") option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF) +option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF) + option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) @@ -410,6 +412,9 @@ if (LLAMA_CUDA) if (LLAMA_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif() + if (LLAMA_CUDA_NO_VMM) + add_compile_definitions(GGML_CUDA_NO_VMM) + endif() add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) if (DEFINED LLAMA_CUDA_DMMV_Y) @@ -435,7 +440,11 @@ if (LLAMA_CUDA) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) + if (LLAMA_CUDA_NO_VMM) + # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) + else() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + endif() if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) # 52 == lowest CUDA 12 standard diff --git a/Makefile b/Makefile index 0a73f2a58..c568dd008 100644 --- a/Makefile +++ b/Makefile @@ -77,11 +77,10 @@ test: $(TEST_TARGETS) ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \ elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \ continue; \ elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \ diff --git a/README.md b/README.md index 514ef3af1..885322e68 100644 --- a/README.md +++ b/README.md @@ -712,6 +712,8 @@ Building the program with BLAS support may lead to some performance improvements To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. +Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face. + ```bash # obtain the official LLaMA model weights and place them in ./models ls ./models @@ -977,48 +979,20 @@ Here is a demo of an interactive session running on Pixel 5 phone: https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 -#### Building the Project using Termux (F-Droid) -Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card. - -Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU. - -If you opt to utilize OpenBLAS, you'll need to install the corresponding package. +#### Build on Android using Termux +[Termux](https://github.com/termux/termux-app#installation) is an alternative to execute `llama.cpp` on an Android device (no root required). ``` -apt install libopenblas +apt update && apt upgrade -y +apt install git ``` -Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages: +It's recommended to move your model inside the `~/` directory for best performance: ``` -apt install ocl-icd opencl-headers opencl-clhpp clinfo +cd storage/downloads +mv model.gguf ~/ ``` -In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below: -``` -cmake . -make -cp libclblast.so* $PREFIX/lib -cp ./include/clblast.h ../llama.cpp -``` - -Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below: -``` -cp /data/data/com.termux/files/usr/include/openblas/cblas.h . -cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h . -make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice) -``` - -Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below: -``` -GGML_OPENCL_PLATFORM=0 -GGML_OPENCL_DEVICE=0 -export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH -``` - -(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ ) - -For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. - -Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script. +[Follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`. ### Docker diff --git a/ci/run.sh b/ci/run.sh index bf21b6b31..e67c1a5ff 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -160,9 +160,8 @@ function gg_run_test_scripts_debug { set -e - # TODO: too slow, run on dedicated node - #(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log - #(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log + (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log + (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log set +e } @@ -695,8 +694,10 @@ test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run embd_bge_small - test $ret -eq 0 && gg_run test_scripts_debug - test $ret -eq 0 && gg_run test_scripts_release + if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then + test $ret -eq 0 && gg_run test_scripts_debug + test $ret -eq 0 && gg_run test_scripts_release + fi if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_CUDA} ]; then diff --git a/common/common.cpp b/common/common.cpp index 243b88abf..467fb014e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -76,7 +76,7 @@ int32_t get_num_physical_cores() { // enumerate the set of thread siblings, num entries is num cores std::unordered_set siblings; for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { - std::ifstream thread_siblings("/sys/devices/system/cpu" + std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings"); if (!thread_siblings.is_open()) { break; // no more cpus diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py old mode 100644 new mode 100755 index b019c1e3d..46a225462 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # This script downloads the tokenizer models of the specified models from Huggingface and # generates the get_vocab_base_pre() function for convert-hf-to-gguf.py # @@ -21,6 +23,7 @@ # TODO: automate the update of convert-hf-to-gguf.py # +import logging import os import requests import sys @@ -28,12 +31,18 @@ import json from hashlib import sha256 from enum import IntEnum, auto +from transformers import AutoTokenizer + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger("convert-hf-to-gguf-update") + class TOKENIZER_TYPE(IntEnum): SPM = auto() BPE = auto() WPM = auto() + # TODO: this string has to exercise as much pre-tokenizer functionality as possible # will be updated with time - contributions welcome chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' @@ -41,36 +50,40 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍ if len(sys.argv) == 2: token = sys.argv[1] else: - print("Usage: python convert-hf-to-gguf-update.py ") + logger.info("Usage: python convert-hf-to-gguf-update.py ") sys.exit(1) # TODO: add models here, base models preferred models = [ - { "name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, - { "name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, - { "name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, - { "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, - { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, - { "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, - { "name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, - { "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, - { "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, - { "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, - ] + {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, + {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, + {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, + {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, + {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, + {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, + {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, + {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, + {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, + {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, + {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, + {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, +] # make directory "models/tokenizers" if it doesn't exist if not os.path.exists("models/tokenizers"): os.makedirs("models/tokenizers") + def download_file_with_auth(url, token, save_path): headers = {"Authorization": f"Bearer {token}"} response = requests.get(url, headers=headers) if response.status_code == 200: with open(save_path, 'wb') as f: f.write(response.content) - print(f"File {save_path} downloaded successfully") + logger.info(f"File {save_path} downloaded successfully") else: - print(f"Failed to download file. Status code: {response.status_code}") + logger.info(f"Failed to download file. Status code: {response.status_code}") + # download the tokenizer models for model in models: @@ -81,10 +94,10 @@ for model in models: if not os.path.exists(f"models/tokenizers/{name}"): os.makedirs(f"models/tokenizers/{name}") else: - print(f"Directory models/tokenizers/{name} already exists - skipping") + logger.info(f"Directory models/tokenizers/{name} already exists - skipping") continue - print(f"Downloading {name} to models/tokenizers/{name}") + logger.info(f"Downloading {name} to models/tokenizers/{name}") url = f"{repo}/raw/main/config.json" save_path = f"models/tokenizers/{name}/config.json" @@ -94,6 +107,14 @@ for model in models: save_path = f"models/tokenizers/{name}/tokenizer.json" download_file_with_auth(url, token, save_path) + # if downloaded file is less than 1KB, we likely need to download an LFS instead + if os.path.getsize(save_path) < 1024: + # remove the file + os.remove(save_path) + url = f"{repo}/resolve/main/tokenizer.json" + save_path = f"models/tokenizers/{name}/tokenizer.json" + download_file_with_auth(url, token, save_path) + if tokt == TOKENIZER_TYPE.SPM: url = f"{repo}/resolve/main/tokenizer.model" save_path = f"models/tokenizers/{name}/tokenizer.model" @@ -115,80 +136,82 @@ for model in models: continue # create the tokenizer - from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() - print(f"model: {name}") - print(f"tokt: {tokt}") - print(f"repo: {model['repo']}") - print(f"chktok: {chktok}") - print(f"chkhsh: {chkhsh}") + logger.info(f"model: {name}") + logger.info(f"tokt: {tokt}") + logger.info(f"repo: {model['repo']}") + logger.info(f"chktok: {chktok}") + logger.info(f"chkhsh: {chkhsh}") # print the "pre_tokenizer" content from the tokenizer.json with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f: cfg = json.load(f) pre_tokenizer = cfg["pre_tokenizer"] - print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) + logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) - print(f"\n") + logger.info("") src_ifs += f" if chkhsh == \"{chkhsh}\":\n" src_ifs += f" # ref: {model['repo']}\n" src_ifs += f" res = \"{name}\"\n" -src_func = "" -src_func += " def get_vocab_base_pre(self, tokenizer) -> str:\n" -src_func += " # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n" -src_func += " # is specific for the BPE pre-tokenizer used by the model\n" -src_func += " # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n" -src_func += " # use in llama.cpp to implement the same pre-tokenizer\n" -src_func += "\n" -src_func += f" chktxt = {repr(chktxt)}\n" -src_func += "\n" -src_func += " chktok = tokenizer.encode(chktxt)\n" -src_func += " chkhsh = sha256(str(chktok).encode()).hexdigest()\n" -src_func += "\n" -src_func += " print(f\"chktok: {chktok}\")\n" -src_func += " print(f\"chkhsh: {chkhsh}\")\n" -src_func += "\n" -src_func += " res = None\n" -src_func += "\n" -src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n" -src_func += " # or pull the latest version of the model from Huggingface\n" -src_func += " # don't edit the hashes manually!\n" -src_func += f"{src_ifs}\n" -src_func += " if res is None:\n" -src_func += " print(\"\\n\")\n" -src_func += " print(\"**************************************************************************************\")\n" -src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n" -src_func += " print(\"** There are 2 possible reasons for this:\")\n" -src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n" -src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n" -src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n" -src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n" -src_func += " print(\"**\")\n" -src_func += " print(f\"** chkhsh: {chkhsh}\")\n" -src_func += " print(\"**************************************************************************************\")\n" -src_func += " print(\"\\n\")\n" -src_func += " raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n" -src_func += "\n" -src_func += " print(f\"tokenizer.ggml.pre: {res}\")\n" -src_func += " print(f\"chkhsh: {chkhsh}\")\n" -src_func += "\n" -src_func += " return res\n" +src_func = f""" + def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer -print(src_func) + chktxt = {repr(chktxt)} -print("\n") -print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!") -print("\n") + chktok = tokenizer.encode(chktxt) + chkhsh = sha256(str(chktok).encode()).hexdigest() + + logger.debug(f"chktok: {{chktok}}") + logger.debug(f"chkhsh: {{chkhsh}}") + + res = None + + # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! +{src_ifs} + if res is None: + logger.warning("\\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("**") + logger.warning(f"** chkhsh: {{chkhsh}}") + logger.warning("**************************************************************************************") + logger.warning("\\n") + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + + logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}") + logger.debug(f"chkhsh: {{chkhsh}}") + + return res +""" + +print(src_func) # noqa: NP100 + +logger.info("\n") +logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!") +logger.info("\n") # generate tests for each tokenizer model tests = [ + "ied 4 ½ months", + "Führer", "", " ", " ", @@ -250,7 +273,6 @@ for model in models: tokt = model["tokt"] # create the tokenizer - from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f: @@ -265,15 +287,15 @@ for model in models: f.write(f" {r}") f.write("\n") - print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*") + logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*") # generate commands for creating vocab files -print("\nRun the following commands to generate the vocab files for testing:\n") +logger.info("\nRun the following commands to generate the vocab files for testing:\n") for model in models: name = model["name"] - print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") + print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100 -print("\n") +logger.info("\n") diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 2f146d730..f7441e6b8 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2,6 +2,7 @@ from __future__ import annotations +import logging import argparse import contextlib import json @@ -26,6 +27,8 @@ import gguf from convert import LlamaHfVocab, permute +logger = logging.getLogger("hf-to-gguf") + ###### MODEL DEFINITIONS ###### @@ -76,7 +79,7 @@ class Model(ABC): def get_tensors(self) -> Iterator[tuple[str, Tensor]]: for part_name in self.part_names: - print(f"gguf: loading model part '{part_name}'") + logger.info(f"gguf: loading model part '{part_name}'") ctx: ContextManager[Any] if self.is_safetensors: from safetensors import safe_open @@ -95,42 +98,42 @@ class Model(ABC): if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) - print(f"gguf: context length = {n_ctx}") + logger.info(f"gguf: context length = {n_ctx}") n_embd = self.find_hparam(["hidden_size", "n_embd"]) self.gguf_writer.add_embedding_length(n_embd) - print(f"gguf: embedding length = {n_embd}") + logger.info(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) - print(f"gguf: feed forward length = {n_ff}") + logger.info(f"gguf: feed forward length = {n_ff}") n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_head_count(n_head) - print(f"gguf: head count = {n_head}") + logger.info(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) - print(f"gguf: key-value head count = {n_head_kv}") + logger.info(f"gguf: key-value head count = {n_head_kv}") if (rope_theta := self.hparams.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) - print(f"gguf: rope theta = {rope_theta}") + logger.info(f"gguf: rope theta = {rope_theta}") if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - print(f"gguf: rms norm epsilon = {f_rms_eps}") + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) - print(f"gguf: layer norm epsilon = {f_norm_eps}") + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") if (n_experts := self.hparams.get("num_local_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) - print(f"gguf: expert count = {n_experts}") + logger.info(f"gguf: expert count = {n_experts}") if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) - print(f"gguf: experts used count = {n_experts_used}") + logger.info(f"gguf: experts used count = {n_experts_used}") self.gguf_writer.add_file_type(self.ftype) - print(f"gguf: file type = {self.ftype}") + logger.info(f"gguf: file type = {self.ftype}") def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) @@ -151,8 +154,7 @@ class Model(ABC): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -169,7 +171,7 @@ class Model(ABC): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -274,8 +276,8 @@ class Model(ABC): chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() - print(f"chktok: {chktok}") - print(f"chkhsh: {chkhsh}") + logger.debug(f"chktok: {chktok}") + logger.debug(f"chkhsh: {chkhsh}") res = None @@ -306,24 +308,30 @@ class Model(ABC): if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": # ref: https://huggingface.co/openai-community/gpt2 res = "gpt-2" + if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": + # ref: https://huggingface.co/smallcloudai/Refact-1_6-base + res = "refact" + if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 + res = "command-r" if res is None: - print("\n") - print("**************************************************************************************") - print("** WARNING: The BPE pre-tokenizer was not recognized!") - print("** There are 2 possible reasons for this:") - print("** - the model has not been added to convert-hf-to-gguf-update.py yet") - print("** - the pre-tokenization config has changed upstream") - print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") - print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") - print("**") - print(f"** chkhsh: {chkhsh}") - print("**************************************************************************************") - print("\n") + logger.warning("\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("**") + logger.warning(f"** chkhsh: {chkhsh}") + logger.warning("**************************************************************************************") + logger.warning("\n") raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") - print(f"tokenizer.ggml.pre: {res}") - print(f"chkhsh: {chkhsh}") + logger.debug(f"tokenizer.ggml.pre: {repr(res)}") + logger.debug(f"chkhsh: {chkhsh}") return res @@ -439,9 +447,7 @@ class Model(ABC): if vocab_size > len(tokens): pad_count = vocab_size - len(tokens) - print( - f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]" - ) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") for i in range(1, pad_count + 1): tokens.append(f"[PAD{i}]") scores.append(-1000.0) @@ -553,7 +559,7 @@ class BloomModel(Model): ), axis=0, ) - print("re-format attention.linear_qkv.weight") + logger.info("re-format attention.linear_qkv.weight") elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) data = np.concatenate( @@ -564,13 +570,12 @@ class BloomModel(Model): ), axis=0, ) - print("re-format attention.linear_qkv.bias") + logger.info("re-format attention.linear_qkv.bias") # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -587,13 +592,13 @@ class BloomModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") + logger.info(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) if not has_lm_head and name == "word_embeddings.weight": self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") @Model.register("MPTForCausalLM") @@ -653,8 +658,7 @@ class MPTModel(Model): else: new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -671,7 +675,7 @@ class MPTModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -697,8 +701,7 @@ class OrionModel(Model): elif "model_max_length" in self.hparams: ctx_length = self.hparams["model_max_length"] else: - print("gguf: can not find ctx length parameter.") - sys.exit() + raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_name(self.dir_model.name) @@ -736,8 +739,7 @@ class OrionModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -754,7 +756,7 @@ class OrionModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -779,8 +781,7 @@ class BaichuanModel(Model): elif "model_max_length" in self.hparams: ctx_length = self.hparams["model_max_length"] else: - print("gguf: can not find ctx length parameter.") - sys.exit() + raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_source_hf_repo(hf_repo) @@ -809,7 +810,7 @@ class BaichuanModel(Model): for i in range(block_count): if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: - print(f"Unpacking and permuting layer {i}") + logger.info(f"Unpacking and permuting layer {i}") model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ self._reverse_hf_permute_part(w, 0, head_count, head_count) model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ @@ -834,8 +835,7 @@ class BaichuanModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -852,7 +852,7 @@ class BaichuanModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: @@ -937,8 +937,7 @@ class XverseModel(Model): elif "model_max_length" in self.hparams: ctx_length = self.hparams["model_max_length"] else: - print("gguf: can not find ctx length parameter.") - sys.exit() + raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_source_hf_repo(hf_repo) @@ -987,8 +986,7 @@ class XverseModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1005,7 +1003,7 @@ class XverseModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: @@ -1092,8 +1090,7 @@ class FalconModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1110,7 +1107,7 @@ class FalconModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1197,8 +1194,7 @@ class RefactModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1215,7 +1211,7 @@ class RefactModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1264,10 +1260,9 @@ class PersimmonModel(Model): data = data_torch.to(torch.float32).squeeze().numpy() new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1332,8 +1327,7 @@ class StableLMModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1350,7 +1344,7 @@ class StableLMModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1366,8 +1360,7 @@ class StableLMModel(Model): merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): data = data.astype(np.float32) @@ -1375,7 +1368,7 @@ class StableLMModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1427,7 +1420,7 @@ class LlamaModel(Model): experts = dict() for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): continue old_dtype = data_torch.dtype @@ -1480,10 +1473,9 @@ class LlamaModel(Model): new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) continue @@ -1491,8 +1483,7 @@ class LlamaModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1509,7 +1500,7 @@ class LlamaModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1584,10 +1575,9 @@ class GrokModel(Model): new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) continue @@ -1595,8 +1585,7 @@ class GrokModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1613,7 +1602,7 @@ class GrokModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1646,7 +1635,7 @@ class DbrxModel(Model): self.gguf_writer.add_layer_norm_eps(1e-5) self.gguf_writer.add_file_type(self.ftype) - print(f"gguf: file type = {self.ftype}") + logger.info(f"gguf: file type = {self.ftype}") def write_tensors(self): block_count = self.hparams.get("n_layers") @@ -1689,8 +1678,7 @@ class DbrxModel(Model): # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1698,8 +1686,7 @@ class DbrxModel(Model): # Most of the codebase that takes in 1D tensors only handles F32 tensors # and most of the outputs tensors are F32. if data_dtype != np.float32 and n_dims == 1: - print(f"Can not map tensor {name!r}: all 1D tensors must be F32") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}: all 1D tensors must be F32") # if f32 desired, convert any float16 to float32 if self.ftype == 0 and data_dtype == np.float16: @@ -1709,7 +1696,7 @@ class DbrxModel(Model): if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") + logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1771,8 +1758,7 @@ class MiniCPMModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1789,7 +1775,7 @@ class MiniCPMModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1855,8 +1841,7 @@ class QwenModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1873,7 +1858,7 @@ class QwenModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1950,10 +1935,9 @@ class Qwen2MoeModel(Model): new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) continue @@ -1961,8 +1945,7 @@ class Qwen2MoeModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1979,7 +1962,7 @@ class Qwen2MoeModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") + logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -2024,8 +2007,7 @@ class GPT2Model(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -2042,13 +2024,13 @@ class GPT2Model(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) # note: GPT2 output is tied to (same as) wte in original model if new_name == "token_embd.weight": - print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor("output.weight", data) @@ -2087,8 +2069,7 @@ class Phi3MiniModel(Model): tokenizer_path = self.dir_model / 'tokenizer.model' if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) - sys.exit(1) + raise ValueError(f'Error: Missing {tokenizer_path}') tokenizer = SentencePieceProcessor(str(tokenizer_path)) @@ -2126,7 +2107,7 @@ class Phi3MiniModel(Model): for key in added_tokens_json: token_id = added_tokens_json[key] if (token_id >= vocab_size): - print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue tokens[token_id] = key.encode("utf-8") @@ -2208,8 +2189,7 @@ class PlamoModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") # shuffle for broadcasting of gqa in ggml_mul_mat if new_name.endswith("attn_q.weight"): @@ -2240,7 +2220,7 @@ class PlamoModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -2286,8 +2266,7 @@ class CodeShellModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -2304,13 +2283,13 @@ class CodeShellModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) if not has_lm_head and name == "transformer.wte.weight": self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") @Model.register("InternLM2ForCausalLM") @@ -2332,7 +2311,7 @@ class InternLM2Model(Model): toktypes: list[int] = [] if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + logger.error(f'Error: Missing {tokenizer_path}') sys.exit(1) sentencepiece_model = model.ModelProto() @@ -2349,7 +2328,7 @@ class InternLM2Model(Model): if text == b"\x00": # (TODO): fixme # Hack here and replace the \x00 characters. - print(f"InternLM2 convert token '{text}' to '🐉'!") + logger.debug(f"InternLM2 convert token '{text}' to '🐉'!") text = "🐉" toktype = SentencePieceTokenTypes.NORMAL @@ -2390,7 +2369,7 @@ class InternLM2Model(Model): # TODO: this is a hack, should be fixed # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) - print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ + logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ in chat mode so that the conversation can end normally.") special_vocab.add_to_gguf(self.gguf_writer) @@ -2435,8 +2414,7 @@ in chat mode so that the conversation can end normally.") # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -2453,7 +2431,7 @@ in chat mode so that the conversation can end normally.") if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) def write_tensors(self): @@ -2564,8 +2542,7 @@ class BertModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") # convert any unsupported data types to float32 if data_torch.dtype not in (torch.float16, torch.float32): @@ -2585,7 +2562,7 @@ class BertModel(Model): # if f32 desired, convert any float16 to float32 new_dtype = np.float32 - print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}") if data.dtype != new_dtype: data = data.astype(new_dtype) @@ -2664,7 +2641,7 @@ class GemmaModel(Model): # lm_head is not used in llama.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": - print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") continue old_dtype = data_torch.dtype @@ -2681,8 +2658,7 @@ class GemmaModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -2693,7 +2669,7 @@ class GemmaModel(Model): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -2721,7 +2697,7 @@ class MambaModel(Model): else: # Use the GPT-NeoX tokenizer when no tokenizer files are present tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" - print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") neox_reader = gguf.GGUFReader(tokenizer_path, "r") field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) @@ -2793,17 +2769,16 @@ class MambaModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") if name.endswith(".A_log"): - print("A_log --> A ==> " + new_name) + logger.debug("A_log --> A ==> " + new_name) data_torch = -torch.exp(data_torch) # assuming token_embd.weight is seen before output.weight if tok_embd is not None and new_name == output_name: if torch.equal(tok_embd, data_torch): - print(f"{output_name} is equivalent to {tok_embd_name}, omitting") + logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") continue if new_name == tok_embd_name: tok_embd = data_torch @@ -2826,7 +2801,7 @@ class MambaModel(Model): if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -2885,8 +2860,7 @@ class OlmoModel(Model): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -2903,7 +2877,7 @@ class OlmoModel(Model): if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -2936,6 +2910,7 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)") parser.add_argument("--model-name", type=str, default=None, help="name of the model") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") return parser.parse_args() @@ -2943,6 +2918,8 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + dir_model = args.model if args.awq_path: @@ -2951,15 +2928,15 @@ def main() -> None: tmp_model_path = args.model / "weighted_model" dir_model = tmp_model_path if tmp_model_path.is_dir(): - print(f"{tmp_model_path} exists as a weighted model.") + logger.info(f"{tmp_model_path} exists as a weighted model.") else: tmp_model_path.mkdir(parents=True, exist_ok=True) - print("Saving new weighted model ...") + logger.info("Saving new weighted model ...") add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - print(f"Saved weighted model at {tmp_model_path}.") + logger.info(f"Saved weighted model at {tmp_model_path}.") if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file=sys.stderr) + logger.error(f'Error: {args.model} is not a directory') sys.exit(1) ftype_map = { @@ -2973,7 +2950,7 @@ def main() -> None: # output in the same directory as the model by default fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' - print(f"Loading model: {dir_model.name}") + logger.info(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) @@ -2981,20 +2958,20 @@ def main() -> None: model_class = Model.from_model_architecture(hparams["architectures"][0]) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file) - print("Set model parameters") + logger.info("Set model parameters") model_instance.set_gguf_parameters() - print("Set model tokenizer") + logger.info("Set model tokenizer") model_instance.set_vocab() if args.vocab_only: - print(f"Exporting model vocab to '{fname_out}'") + logger.info(f"Exporting model vocab to '{fname_out}'") model_instance.write_vocab() else: - print(f"Exporting model to '{fname_out}'") + logger.info(f"Exporting model to '{fname_out}'") model_instance.write() - print(f"Model successfully exported to '{fname_out}'") + logger.info(f"Model successfully exported to '{fname_out}'") if __name__ == '__main__': diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index 5354b748b..9349de3b3 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import struct @@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf +logger = logging.getLogger("ggml-to-gguf") + class GGMLFormat(IntEnum): GGML = 0 @@ -125,7 +128,6 @@ class Tensor: self.start_offset = offset self.len_bytes = n_bytes offset += n_bytes - # print(n_dims, name_len, dtype, self.dims, self.name, pad) return offset - orig_offset @@ -175,7 +177,7 @@ class GGMLModel: offset += self.validate_header(data, offset) hp = Hyperparameters() offset += hp.load(data, offset) - print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}') + logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}') self.validate_conversion(hp.ftype) vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML) offset += vocab.load(data, offset, hp.n_vocab) @@ -215,12 +217,12 @@ class GGMLToGGUF: if float(hp.n_head) / float(x) == gqa: n_kv_head = x assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param" - print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') + logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') self.n_kv_head = n_kv_head self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer) def save(self): - print('* Preparing to save GGUF file') + logger.info('* Preparing to save GGUF file') gguf_writer = gguf.GGUFWriter( self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], @@ -230,11 +232,11 @@ class GGMLToGGUF: if self.special_vocab is not None: self.special_vocab.add_to_gguf(gguf_writer) self.add_tensors(gguf_writer) - print(" gguf: write header") + logger.info(" gguf: write header") gguf_writer.write_header_to_file() - print(" gguf: write metadata") + logger.info(" gguf: write metadata") gguf_writer.write_kv_data_to_file() - print(" gguf: write tensors") + logger.info(" gguf: write tensors") gguf_writer.write_tensors_to_file() gguf_writer.close() @@ -250,7 +252,7 @@ class GGMLToGGUF: name = cfg.name if cfg.name is not None else cfg.input.name except UnicodeDecodeError: name = None - print('* Adding model parameters and KV items') + logger.info('* Adding model parameters and KV items') if name is not None: gguf_writer.add_name(name) gguf_writer.add_description(desc) @@ -287,7 +289,7 @@ class GGMLToGGUF: toktypes = [] if self.vocab_override is not None: vo = self.vocab_override - print('* Adding vocab item(s)') + logger.info('* Adding vocab item(s)') for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): tokens.append(vbytes) scores.append(score) @@ -299,7 +301,7 @@ class GGMLToGGUF: if len(toktypes) > 0: gguf_writer.add_token_types(toktypes) return - print(f'* Adding {hp.n_vocab} vocab item(s)') + logger.info(f'* Adding {hp.n_vocab} vocab item(s)') assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab' for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): tt = 1 # Normal @@ -334,7 +336,7 @@ class GGMLToGGUF: def add_tensors(self, gguf_writer): tensor_map = self.name_map data = self.data - print(f'* Adding {len(self.model.tensors)} tensor(s)') + logger.info(f'* Adding {len(self.model.tensors)} tensor(s)') for tensor in self.model.tensors: name = str(tensor.name, 'UTF-8') mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) @@ -344,7 +346,6 @@ class GGMLToGGUF: temp = tempdims[1] tempdims[1] = tempdims[0] tempdims[0] = temp - # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}') gguf_writer.add_tensor( mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], @@ -401,33 +402,35 @@ def handle_args(): help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") parser.add_argument("--vocabtype", default="spm,hfft", help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") return parser.parse_args() def main(): cfg = handle_args() - print(f'* Using config: {cfg}') - print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n') + logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO) + logger.info(f'* Using config: {cfg}') + logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===') if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'): - print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') + logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') data = np.memmap(cfg.input, mode = 'r') model = GGMLModel() - print('* Scanning GGML input file') + logger.info('* Scanning GGML input file') offset = model.load(data, 0) # noqa - print(f'* GGML model hyperparameters: {model.hyperparameters}') + logger.info(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None special_vocab = None if cfg.model_metadata_dir is not None: (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters) - print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') - print(f'* Overriding params: {params_override}') - print(f'* Overriding vocab: {vocab_override}') - print(f'* Special vocab: {special_vocab}') + logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') + logger.info(f'* Overriding params: {params_override}') + logger.info(f'* Overriding vocab: {vocab_override}') + logger.info(f'* Special vocab: {special_vocab}') else: - print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') + logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') if model.file_format == GGMLFormat.GGML: - print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') + logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') converter = GGMLToGGUF( model, data, cfg, params_override = params_override, @@ -435,7 +438,7 @@ def main(): special_vocab = special_vocab ) converter.save() - print(f'* Successful completion. Output saved to: {cfg.output}') + logger.info(f'* Successful completion. Output saved to: {cfg.output}') if __name__ == '__main__': diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index 9a9936dec..f09fa85fe 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import json import os import struct @@ -15,6 +16,9 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) import gguf +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger("lora-to-gguf") + NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} @@ -48,11 +52,9 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty if __name__ == '__main__': if len(sys.argv) < 2: - print(f"Usage: python {sys.argv[0]} [arch]") - print( - "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" - ) - print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") + logger.info(f"Usage: python {sys.argv[0]} [arch]") + logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'") + logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") sys.exit(1) input_json = os.path.join(sys.argv[1], "adapter_config.json") @@ -70,7 +72,7 @@ if __name__ == '__main__': arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" if arch_name not in gguf.MODEL_ARCH_NAMES.values(): - print(f"Error: unsupported architecture {arch_name}") + logger.error(f"Error: unsupported architecture {arch_name}") sys.exit(1) arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] @@ -80,21 +82,21 @@ if __name__ == '__main__': params = json.load(f) if params["peft_type"] != "LORA": - print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") + logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") sys.exit(1) if params["fan_in_fan_out"] is True: - print("Error: param fan_in_fan_out is not supported") + logger.error("Error: param fan_in_fan_out is not supported") sys.exit(1) if params["bias"] is not None and params["bias"] != "none": - print("Error: param bias is not supported") + logger.error("Error: param bias is not supported") sys.exit(1) # TODO: these seem to be layers that have been trained but without lora. # doesn't seem widely used but eventually should be supported if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: - print("Error: param modules_to_save is not supported") + logger.error("Error: param modules_to_save is not supported") sys.exit(1) with open(output_path, "wb") as fout: @@ -125,13 +127,13 @@ if __name__ == '__main__': suffix = k[-len(lora_suffixes[0]):] k = k[: -len(lora_suffixes[0])] else: - print(f"Error: unrecognized tensor name {orig_k}") + logger.error(f"Error: unrecognized tensor name {orig_k}") sys.exit(1) tname = name_map.get_name(k) if tname is None: - print(f"Error: could not map tensor name {orig_k}") - print(" Note: the arch parameter must be specified if the model is not llama") + logger.error(f"Error: could not map tensor name {orig_k}") + logger.error(" Note: the arch parameter must be specified if the model is not llama") sys.exit(1) if suffix == ".lora_A.weight": @@ -141,8 +143,8 @@ if __name__ == '__main__': else: assert False - print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") + logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") write_tensor_header(fout, tname, t.shape, t.dtype) t.tofile(fout) - print(f"Converted {input_json} and {input_model} to {output_path}") + logger.info(f"Converted {input_json} and {input_model} to {output_path}") diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index aba575426..07dcade74 100755 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import sys @@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf +logger = logging.getLogger("persimmon-to-gguf") + def _flatten_dict(dct, tensors, prefix=None): assert isinstance(dct, dict) @@ -30,9 +33,9 @@ def _flatten_dict(dct, tensors, prefix=None): def _get_sentencepiece_tokenizer_info(dir_model: Path): tokenizer_path = dir_model / 'adept_vocab.model' - print('gguf: getting sentencepiece tokenizer from', tokenizer_path) + logger.info('getting sentencepiece tokenizer from', tokenizer_path) tokenizer = SentencePieceProcessor(str(tokenizer_path)) - print('gguf: adding tokens') + logger.info('adding tokens') tokens: list[bytes] = [] scores: list[float] = [] toktypes: list[int] = [] @@ -67,8 +70,10 @@ def main(): parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file") parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release") - parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory") + parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) sys.path.append(str(args.adept_inference_dir)) persimmon_model = torch.load(args.ckpt_path) hparams = persimmon_model['args'] @@ -107,7 +112,7 @@ def main(): gguf_writer.add_eos_token_id(71013) tensor_map = gguf.get_tensor_name_map(arch, block_count) - print(tensor_map) + logger.info(tensor_map) for name in tensors.keys(): data_torch = tensors[name] if name.endswith(".self_attention.rotary_emb.inv_freq"): @@ -117,22 +122,21 @@ def main(): data = data_torch.to(torch.float32).squeeze().numpy() new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() + raise ValueError(f"Can not map tensor '{name}'") + n_dims = len(data.shape) - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}") gguf_writer.add_tensor(new_name, data) - print("gguf: write header") + logger.info("gguf: write header") gguf_writer.write_header_to_file() - print("gguf: write metadata") + logger.info("gguf: write metadata") gguf_writer.write_kv_data_to_file() - print("gguf: write tensors") + logger.info("gguf: write tensors") gguf_writer.write_tensors_to_file() gguf_writer.close() - print(f"gguf: model successfully exported to '{args.outfile}'") - print("") + logger.info(f"gguf: model successfully exported to '{args.outfile}'") if __name__ == '__main__': diff --git a/convert.py b/convert.py index 1c700cf6a..7f0b6b749 100755 --- a/convert.py +++ b/convert.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import concurrent.futures import enum @@ -35,6 +36,8 @@ import gguf if TYPE_CHECKING: from typing_extensions import Self, TypeAlias +logger = logging.getLogger("convert") + if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): faulthandler.register(signal.SIGUSR1) @@ -643,7 +646,6 @@ class LlamaHfVocab(Vocab): def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: - # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) @@ -1033,12 +1035,12 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) # Check for a vocab size mismatch if params.n_vocab == vocab.vocab_size: - print("Ignoring added_tokens.json since model matches vocab size without it.") + logger.warning("Ignoring added_tokens.json since model matches vocab size without it.") return if pad_vocab and params.n_vocab > vocab.vocab_size: pad_count = params.n_vocab - vocab.vocab_size - print( + logger.debug( f"Padding vocab with {pad_count} token(s) - through " ) for i in range(1, pad_count + 1): @@ -1166,7 +1168,7 @@ class OutputFile: elapsed = time.time() - start size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) - print( + logger.info( f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" ) self.gguf.write_tensor_data(ndarray) @@ -1281,12 +1283,12 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> # HF models permut or pack some of the tensors, so we need to undo that for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" in model: - print(f"Permuting layer {i}") + logger.debug(f"Permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: - print(f"Unpacking and permuting layer {i}") + logger.debug(f"Unpacking and permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) @@ -1299,15 +1301,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) if name_new is None: if skip_unknown: - print(f"Unexpected tensor name: {name} - skipping") + logger.warning(f"Unexpected tensor name: {name} - skipping") continue raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") if tensor_type in should_skip: - print(f"skipping tensor {name_new}") + logger.debug(f"skipping tensor {name_new}") continue - print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") + logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") out[name_new] = lazy_tensor return out @@ -1372,7 +1374,7 @@ def load_some_model(path: Path) -> ModelPlus: paths = find_multifile_paths(path) models_plus: list[ModelPlus] = [] for path in paths: - print(f"Loading model file {path}") + logger.info(f"Loading model file {path}") models_plus.append(lazy_load_file(path)) model_plus = merge_multifile_models(models_plus) @@ -1413,7 +1415,7 @@ class VocabFactory: else: raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}") - print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}") + logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}") return vocab def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]: @@ -1438,19 +1440,19 @@ def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" if ret in model_paths: - sys.stderr.write( + logger.error( f"Error: Default output path ({ret}) would overwrite the input. " - "Please explicitly specify a path using --outfile.\n") + "Please explicitly specify a path using --outfile.") sys.exit(1) return ret def do_dump_model(model_plus: ModelPlus) -> None: - print(f"model_plus.paths = {model_plus.paths!r}") - print(f"model_plus.format = {model_plus.format!r}") - print(f"model_plus.vocab = {model_plus.vocab!r}") + print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100 + print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100 + print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100 for name, lazy_tensor in model_plus.model.items(): - print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100 def main(args_in: list[str] | None = None) -> None: @@ -1473,8 +1475,18 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args(args_in) + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + elif args.dump_single or args.dump: + # Avoid printing anything besides the dump output + logging.basicConfig(level=logging.WARNING) + else: + logging.basicConfig(level=logging.INFO) + if args.no_vocab and args.vocab_only: raise ValueError("--vocab-only does not make sense with --no-vocab") @@ -1491,6 +1503,7 @@ def main(args_in: list[str] | None = None) -> None: if args.dump: do_dump_model(model_plus) return + endianess = gguf.GGUFEndian.LITTLE if args.big_endian: endianess = gguf.GGUFEndian.BIG @@ -1513,7 +1526,7 @@ def main(args_in: list[str] | None = None) -> None: "q8_0": GGMLFileType.MostlyQ8_0, }[args.outtype] - print(f"params = {params}") + logger.info(f"params = {params}") model_parent_path = model_plus.paths[0].parent vocab_path = Path(args.vocab_dir or args.model or model_parent_path) @@ -1528,15 +1541,14 @@ def main(args_in: list[str] | None = None) -> None: outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") + logger.info(f"Wrote {outfile}") return if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab: vocab = model_plus.vocab - print(f"Vocab info: {vocab}") - print(f"Special vocab info: {special_vocab}") - + logger.info(f"Vocab info: {vocab}") + logger.info(f"Special vocab info: {special_vocab}") model = model_plus.model model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) @@ -1544,11 +1556,11 @@ def main(args_in: list[str] | None = None) -> None: outfile = args.outfile or default_outfile(model_plus.paths, ftype) params.ftype = ftype - print(f"Writing {outfile}, format {ftype}") + logger.info(f"Writing {outfile}, format {ftype}") OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") + logger.info(f"Wrote {outfile}") if __name__ == '__main__': diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 39c75e0a7..e04feeae3 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -32,6 +32,7 @@ struct split_params { int n_split_tensors = 128; std::string input; std::string output; + bool no_tensor_first_split = false; bool dry_run = false; }; @@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) { printf(" --merge merge multiple GGUF to a single GGUF\n"); printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); printf(" --split-max-size N(M|G) max size per split\n"); + printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n"); printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); printf("\n"); } @@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p arg_found = true; params.dry_run = true; } + if (arg == "--no-tensor-first-split") { + arg_found = true; + params.no_tensor_first_split = true; + } if (is_op_set) { throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); @@ -200,10 +206,10 @@ struct split_strategy { // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits int i_split = -1; struct gguf_context * ctx_out = NULL; - auto new_ctx_out = [&]() { + auto new_ctx_out = [&](bool allow_no_tensors) { i_split++; if (ctx_out != NULL) { - if (gguf_get_n_tensors(ctx_out) == 0) { + if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) { fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n"); exit(EXIT_FAILURE); } @@ -220,7 +226,12 @@ struct split_strategy { }; // initialize ctx_out for the first split - new_ctx_out(); + new_ctx_out(false); + + // skip first split if no_tensor_first_split is set + if (params.no_tensor_first_split) { + new_ctx_out(true); + } // process tensors one by one size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata) @@ -230,7 +241,7 @@ struct split_strategy { size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT); size_t next_tensors_size = curr_tensors_size + n_bytes; if (should_split(i, next_tensors_size)) { - new_ctx_out(); + new_ctx_out(false); curr_tensors_size = n_bytes; } else { curr_tensors_size = next_tensors_size; diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh index 57588204d..7ca6fa7f2 100755 --- a/examples/gguf-split/tests.sh +++ b/examples/gguf-split/tests.sh @@ -55,15 +55,15 @@ $MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32 echo PASS echo -# 4. Split with no tensor in metadata -#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors -#echo PASS -#echo +# 4. Split with no tensors in the first split +$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors +echo PASS +echo # 4b. Test the sharded model is loading properly -#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32 -#echo PASS -#echo +$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32 +echo PASS +echo # 5. Merge #$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 95c3095dd..40128ec44 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -178,6 +178,7 @@ struct cmd_params { std::vector> tensor_split; std::vector use_mmap; std::vector embeddings; + ggml_numa_strategy numa; int reps; bool verbose; output_formats output_format; @@ -200,6 +201,7 @@ static const cmd_params cmd_params_defaults = { /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, /* use_mmap */ {true}, /* embeddings */ {false}, + /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* verbose */ false, /* output_format */ MARKDOWN @@ -224,6 +226,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" --numa (default: disabled)\n"); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); @@ -396,6 +399,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); + } else if (arg == "--numa") { + if (++i >= argc) { + invalid_param = true; + break; + } else { + std::string value(argv[i]); + /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { invalid_param = true; break; } + } } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { invalid_param = true; @@ -1215,6 +1229,7 @@ int main(int argc, char ** argv) { llama_log_set(llama_null_log_callback, NULL); } llama_backend_init(); + llama_numa_init(params.numa); // initialize printer std::unique_ptr p; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f60530cf3..ff0814b2f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1383,9 +1383,10 @@ struct server_context { if (!slot.params.stream && slot.stopped_word) { const std::vector stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); + size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); probs = std::vector( slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - stop_word_toks.size()); + slot.generated_token_probs.end() - safe_offset); } else { probs = std::vector( slot.generated_token_probs.begin(), diff --git a/flake.lock b/flake.lock index b738da7c6..c9ead0bf7 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1712014858, - "narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=", + "lastModified": 1714641030, + "narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "9126214d0a59633752a136528f5f3b9aa8565b7d", + "rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1714076141, - "narHash": "sha256-Drmja/f5MRHZCskS6mvzFqxEaZMeciScCTFxWVLqWEY=", + "lastModified": 1714635257, + "narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "7bb2ccd8cdc44c91edba16c48d2c8f331fb3d856", + "rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f", "type": "github" }, "original": { @@ -36,20 +36,14 @@ }, "nixpkgs-lib": { "locked": { - "dir": "lib", - "lastModified": 1711703276, - "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=", - "owner": "NixOS", - "repo": "nixpkgs", - "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089", - "type": "github" + "lastModified": 1714640452, + "narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=", + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz" }, "original": { - "dir": "lib", - "owner": "NixOS", - "ref": "nixos-unstable", - "repo": "nixpkgs", - "type": "github" + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz" } }, "root": { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 348e641a5..e42a81627 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -122,7 +122,7 @@ static ggml_cuda_device_info ggml_cuda_init() { for (int id = 0; id < info.device_count; ++id) { int device_vmm = 0; -#if !defined(GGML_USE_HIPBLAS) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) CUdevice device; CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); @@ -268,7 +268,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { }; // pool with virtual memory -#if !defined(GGML_USE_HIPBLAS) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) struct ggml_cuda_pool_vmm : public ggml_cuda_pool { static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB @@ -365,7 +365,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { #endif // !defined(GGML_USE_HIPBLAS) std::unique_ptr ggml_backend_cuda_context::new_pool_for_device(int device) { -#if !defined(GGML_USE_HIPBLAS) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) if (ggml_cuda_info().devices[device].vmm) { return std::unique_ptr(new ggml_cuda_pool_vmm(device)); } diff --git a/ggml.c b/ggml.c index 74ecd5927..82179a125 100644 --- a/ggml.c +++ b/ggml.c @@ -21139,7 +21139,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } // read the tensor infos - { + if (ctx->header.n_tensors > 0) { ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info)); for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py index 5dd700963..1d9a0cc82 100644 --- a/ggml_vk_generate_shaders.py +++ b/ggml_vk_generate_shaders.py @@ -1,11 +1,14 @@ #!/usr/bin/env python +import logging import argparse import asyncio import os import sys from tempfile import gettempdir, NamedTemporaryFile +logger = logging.getLogger("ggml-vk-generate-shaders") + shader_f32 = """ #define FLOAT_TYPE float """ @@ -2498,7 +2501,7 @@ async def string_to_spv(name, code, defines, fp16=True): stdout, stderr = await proc.communicate() - print(" ".join(cmd)) + logger.info(" ".join(cmd)) if proc.returncode: raise RuntimeError(f"{name=} {f.name=} {stdout=} {stderr=}") @@ -2507,7 +2510,7 @@ async def string_to_spv(name, code, defines, fp16=True): cmd.extend([f"-D{key}={value}" for key, value in defines.items()]) code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())]) - print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error}") + logger.error(f"cannot compile {name}\n\n{code_with_lines}\n\n{error}") f.close() os.remove(f.name) sys.exit(proc.returncode) @@ -2520,7 +2523,7 @@ async def string_to_spv(name, code, defines, fp16=True): async def main(): - print("ggml_vulkan: Generating and compiling shaders to SPIR-V") + logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V") tasks = [] @@ -2768,9 +2771,12 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator") parser.add_argument("--glslc", help="Path to glslc") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + if args.glslc: GLSLC = args.glslc diff --git a/gguf-py/examples/reader.py b/gguf-py/examples/reader.py index 62e0769da..d841048c6 100644 --- a/gguf-py/examples/reader.py +++ b/gguf-py/examples/reader.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 +import logging import sys from pathlib import Path from gguf.gguf_reader import GGUFReader +logger = logging.getLogger("reader") sys.path.insert(0, str(Path(__file__).parent.parent)) @@ -18,28 +20,28 @@ def read_gguf_file(gguf_file_path): reader = GGUFReader(gguf_file_path) # List all key-value pairs in a columnized format - print("Key-Value Pairs:") + print("Key-Value Pairs:") # noqa: NP100 max_key_length = max(len(key) for key in reader.fields.keys()) for key, field in reader.fields.items(): value = field.parts[field.data[0]] - print(f"{key:{max_key_length}} : {value}") - print("----") + print(f"{key:{max_key_length}} : {value}") # noqa: NP100 + print("----") # noqa: NP100 # List all tensors - print("Tensors:") + print("Tensors:") # noqa: NP100 tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}" - print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) - print("-" * 80) + print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100 + print("-" * 80) # noqa: NP100 for tensor in reader.tensors: shape_str = "x".join(map(str, tensor.shape)) size_str = str(tensor.n_elements) quantization_str = tensor.tensor_type.name - print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) + print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100 if __name__ == '__main__': if len(sys.argv) < 2: - print("Usage: reader.py ") + logger.info("Usage: reader.py ") sys.exit(1) gguf_file_path = sys.argv[1] read_gguf_file(gguf_file_path) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6d597bfd9..4f232e18d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1,6 +1,5 @@ from __future__ import annotations -import sys from enum import Enum, IntEnum, auto from typing import Any @@ -854,8 +853,7 @@ class GGUFValueType(IntEnum): return GGUFValueType.INT32 # TODO: need help with 64-bit types in Python else: - print("Unknown type:", type(val)) - sys.exit() + raise ValueError(f"Unknown type: {type(val)}") # Note: Does not support GGML_QKK_64 diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 2bdb15525..db8525d85 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -4,6 +4,7 @@ # from __future__ import annotations +import logging import os from collections import OrderedDict from typing import Any, Literal, NamedTuple, TypeVar, Union @@ -27,6 +28,7 @@ from gguf.constants import ( GGUFValueType, ) +logger = logging.getLogger(__name__) READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION] @@ -142,7 +144,7 @@ class GGUFReader: # TODO: add option to generate error on duplicate keys # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}') - print(f'Warning: Duplicate key {field.name} at offset {field.offset}') + logger.warning(f'Duplicate key {field.name} at offset {field.offset}') self.fields[field.name + '_{}'.format(field.offset)] = field else: self.fields[field.name] = field diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 089aece87..d9cfbf711 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os import shutil import struct @@ -24,6 +25,8 @@ from .constants import ( TokenType, ) +logger = logging.getLogger(__name__) + class WriterState(Enum): EMPTY = auto() @@ -67,7 +70,7 @@ class GGUFWriter: self.use_temp_file = use_temp_file self.temp_file = None self.tensors = [] - print("gguf: This GGUF file is for {0} Endian only".format( + logger.info("gguf: This GGUF file is for {0} Endian only".format( "Big" if self.endianess == GGUFEndian.BIG else "Little", )) self.state = WriterState.EMPTY diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 378eaecad..c97a78f39 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -1,13 +1,15 @@ from __future__ import annotations +import logging import json import os -import sys from pathlib import Path from typing import Any, Callable from .gguf_writer import GGUFWriter +logger = logging.getLogger(__name__) + class SpecialVocab: merges: list[str] @@ -40,38 +42,29 @@ class SpecialVocab: def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if self.merges: if not quiet: - print(f'gguf: Adding {len(self.merges)} merge(s).') + logger.info(f'Adding {len(self.merges)} merge(s).') gw.add_token_merges(self.merges) elif self.load_merges: - print( - 'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.', - file = sys.stderr, - ) + logger.warning('Adding merges requested but no merges found, output may be non-functional.') for typ, tokid in self.special_token_ids.items(): id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) if id_handler is None: - print( - f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', - file = sys.stderr, - ) + logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping') continue if not quiet: - print(f'gguf: Setting special token type {typ} to {tokid}') + logger.info(f'Setting special token type {typ} to {tokid}') id_handler(tokid) for typ, value in self.add_special_token.items(): add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None) if add_handler is None: - print( - f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping', - file = sys.stderr, - ) + logger.warning(f'No handler for add_{typ}_token with value {value} - skipping') continue if not quiet: - print(f'gguf: Setting add_{typ}_token to {value}') + logger.info(f'Setting add_{typ}_token to {value}') add_handler(value) if self.chat_template is not None: if not quiet: - print(f'gguf: Setting chat_template to {self.chat_template}') + logger.info(f'Setting chat_template to {self.chat_template}') gw.add_chat_template(self.chat_template) def _load(self, path: Path) -> None: @@ -99,10 +92,7 @@ class SpecialVocab: continue parts = line.split(None, 3) if len(parts) != 2: - print( - f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring', - file = sys.stderr, - ) + logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring') continue merges.append(f'{parts[0]} {parts[1]}') self.merges = merges @@ -118,10 +108,7 @@ class SpecialVocab: return self.special_token_ids[typ] = tid return - print( - f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping', - file = sys.stderr, - ) + logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping') def _try_load_from_tokenizer_json(self, path: Path) -> bool: tokenizer_file = path / 'tokenizer.json' @@ -144,10 +131,7 @@ class SpecialVocab: if chat_template is None or isinstance(chat_template, (str, list)): self.chat_template = chat_template else: - print( - f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring', - file = sys.stderr - ) + logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring') for typ in self.special_token_types: add_entry = tokenizer_config.get(f'add_{typ}_token') if isinstance(add_entry, bool): diff --git a/gguf-py/scripts/gguf-convert-endian.py b/gguf-py/scripts/gguf-convert-endian.py index 10a16ad06..b698af0fe 100755 --- a/gguf-py/scripts/gguf-convert-endian.py +++ b/gguf-py/scripts/gguf-convert-endian.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import sys +from tqdm import tqdm from pathlib import Path import numpy as np @@ -14,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / import gguf +logger = logging.getLogger("gguf-convert-endian") + def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: if np.uint32(1) == np.uint32(1).newbyteorder("<"): @@ -29,11 +33,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None else: file_endian = host_endian order = host_endian if args.order == "native" else args.order - print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") + logger.info(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") if file_endian == order: - print(f"* File is already {order.upper()} endian. Nothing to do.") + logger.info(f"* File is already {order.upper()} endian. Nothing to do.") sys.exit(0) - print("* Checking tensors for conversion compatibility") + logger.info("* Checking tensors for conversion compatibility") for tensor in reader.tensors: if tensor.tensor_type not in ( gguf.GGMLQuantizationType.F32, @@ -41,51 +45,64 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None gguf.GGMLQuantizationType.Q8_0, ): raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") - print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") + logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") if args.dry_run: return - print("\n*** Warning *** Warning *** Warning **") - print("* This conversion process may damage the file. Ensure you have a backup.") + logger.warning("*** Warning *** Warning *** Warning **") + logger.warning("* This conversion process may damage the file. Ensure you have a backup.") if order != host_endian: - print("* Requested endian differs from host, you will not be able to load the model on this machine.") - print("* The file will be modified immediately, so if conversion fails or is interrupted") - print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") + logger.warning("* Requested endian differs from host, you will not be able to load the model on this machine.") + logger.warning("* The file will be modified immediately, so if conversion fails or is interrupted") + logger.warning("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") response = input("YES, I am sure> ") if response != "YES": - print("You didn't enter YES. Okay then, see ya!") + logger.warning("You didn't enter YES. Okay then, see ya!") sys.exit(0) - print(f"\n* Converting fields ({len(reader.fields)})") + logger.info(f"* Converting fields ({len(reader.fields)})") for idx, field in enumerate(reader.fields.values()): - print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") + logger.info(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") for part in field.parts: part.byteswap(inplace=True) - print(f"\n* Converting tensors ({len(reader.tensors)})") - for idx, tensor in enumerate(reader.tensors): - print( - f" - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, " - f"elements={tensor.n_elements}... ", - end="", + logger.info(f"* Converting tensors ({len(reader.tensors)})") + + for idx, tensor in enumerate(pbar := tqdm(reader.tensors, desc="Converting tensor")): + log_message = ( + f"Converting tensor {repr(tensor.name)}, " + f"type={tensor.tensor_type.name}, " + f"elements={tensor.n_elements} " ) - tensor_type = tensor.tensor_type + + # Byte-swap each part of the tensor's field for part in tensor.field.parts: part.byteswap(inplace=True) - if tensor_type != gguf.GGMLQuantizationType.Q8_0: + + # Byte-swap tensor data if necessary + if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0: + # Handle Q8_0 tensor blocks (block_q8_0) + # Specific handling of block_q8_0 is required. + # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations. + + block_size = 34 # 34 bytes = + 32 * + + n_blocks = len(tensor.data) // block_size + for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): + block_offs = block_num * block_size + + # Byte-Swap f16 sized delta field + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + # Byte-Swap Q8 weights + if block_num % 100000 == 0: + inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") + + else: + # Handle other tensor types tensor.data.byteswap(inplace=True) - print() - continue - # A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes - block_size = 34 - n_blocks = len(tensor.data) // block_size - for block_num in range(n_blocks): - block_offs = block_num * block_size - # I know I said f16, but it doesn't matter here - any simple 16 bit type works. - delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) - delta.byteswap(inplace=True) - if block_num % 100000 == 0: - print(f"[{(n_blocks - block_num) // 1000}K]", end="") - sys.stdout.flush() - print() - print("* Completion") + + pbar.set_description(log_message) + + logger.info("* Completion") def main() -> None: @@ -102,8 +119,13 @@ def main() -> None: "--dry-run", action="store_true", help="Don't actually change anything", ) + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - print(f'* Loading: {args.model}') + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + logger.info(f'* Loading: {args.model}') reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+') convert_byteorder(reader, args) diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py index dbf891508..2d3c3943f 100755 --- a/gguf-py/scripts/gguf-dump.py +++ b/gguf-py/scripts/gguf-dump.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import sys @@ -15,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / from gguf import GGUFReader, GGUFValueType # noqa: E402 +logger = logging.getLogger("gguf-dump") + def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG' @@ -29,8 +32,8 @@ def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: # please see the comments in the modify_gguf.py example. def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: host_endian, file_endian = get_file_host_endian(reader) - print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') - print(f'\n* Dumping {len(reader.fields)} key/value pair(s)') + print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') # noqa: NP100 + print(f'* Dumping {len(reader.fields)} key/value pair(s)') # noqa: NP100 for n, field in enumerate(reader.fields.values(), 1): if not field.types: pretty_type = 'N/A' @@ -39,20 +42,21 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count else: pretty_type = str(field.types[-1].name) - print(f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '') + + log_message = f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}' if len(field.types) == 1: curr_type = field.types[0] if curr_type == GGUFValueType.STRING: - print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '') + log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])) elif field.types[0] in reader.gguf_scalar_to_np: - print(' = {0}'.format(field.parts[-1][0]), end = '') - print() + log_message += ' = {0}'.format(field.parts[-1][0]) + print(log_message) # noqa: NP100 if args.no_tensors: return - print(f'\n* Dumping {len(reader.tensors)} tensor(s)') + print(f'* Dumping {len(reader.tensors)} tensor(s)') # noqa: NP100 for n, tensor in enumerate(reader.tensors, 1): prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape))) - print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') + print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') # noqa: NP100 def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: @@ -103,10 +107,17 @@ def main() -> None: parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata") parser.add_argument("--json", action="store_true", help="Produce JSON output") parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + if not args.json: - print(f'* Loading: {args.model}') + logger.info(f'* Loading: {args.model}') + reader = GGUFReader(args.model, 'r') + if args.json: dump_metadata_json(reader, args) else: diff --git a/gguf-py/scripts/gguf-set-metadata.py b/gguf-py/scripts/gguf-set-metadata.py index 3ebdfa898..e35b651b8 100755 --- a/gguf-py/scripts/gguf-set-metadata.py +++ b/gguf-py/scripts/gguf-set-metadata.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import logging import argparse import os import sys @@ -10,6 +11,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / from gguf import GGUFReader # noqa: E402 +logger = logging.getLogger("gguf-set-metadata") + def minimal_example(filename: str) -> None: reader = GGUFReader(filename, 'r+') @@ -41,36 +44,33 @@ def minimal_example(filename: str) -> None: def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: field = reader.get_field(args.key) if field is None: - print(f'! Field {repr(args.key)} not found', file = sys.stderr) + logger.error(f'! Field {repr(args.key)} not found') sys.exit(1) # Note that field.types is a list of types. This is because the GGUF # format supports arrays. For example, an array of UINT32 would # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32] handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None if handler is None: - print( - f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}', - file = sys.stderr, - ) + logger.error(f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}') sys.exit(1) current_value = field.parts[field.data[0]][0] new_value = handler(args.value) - print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}') + logger.info(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}') if current_value == new_value: - print(f'- Key {repr(args.key)} already set to requested value {current_value}') + logger.info(f'- Key {repr(args.key)} already set to requested value {current_value}') sys.exit(0) if args.dry_run: sys.exit(0) if not args.force: - print('*** Warning *** Warning *** Warning **') - print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.') - print('* Enter exactly YES if you are positive you want to proceed:') + logger.warning('*** Warning *** Warning *** Warning **') + logger.warning('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.') + logger.warning('* Enter exactly YES if you are positive you want to proceed:') response = input('YES, I am sure> ') if response != 'YES': - print("You didn't enter YES. Okay then, see ya!") + logger.info("You didn't enter YES. Okay then, see ya!") sys.exit(0) field.parts[field.data[0]][0] = new_value - print('* Field changed. Successful completion.') + logger.info('* Field changed. Successful completion.') def main() -> None: @@ -80,8 +80,13 @@ def main() -> None: parser.add_argument("value", type=str, help="Metadata value to set") parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything") parser.add_argument("--force", action="store_true", help="Change the field without confirmation") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - print(f'* Loading: {args.model}') + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + logger.info(f'* Loading: {args.model}') reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+') set_metadata(reader, args) diff --git a/llama.cpp b/llama.cpp index 18d6297ce..aeb5c08df 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2359,7 +2359,7 @@ static bool llama_kv_cache_init( cache.recurrent = model.arch == LLM_ARCH_MAMBA; cache.v_trans = !cparams.flash_attn; - // TODO: support mixed reccurent Transformer architectues + // TODO: support mixed recurrent Transformer architectures // NOTE: (!a || b) is a logical implication (a -> b) GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s()); GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s()); @@ -4383,6 +4383,12 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "gpt-2") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2; + } else if ( + tokenizer_pre == "refact") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT; + } else if ( + tokenizer_pre == "command-r") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -11952,7 +11958,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); GGML_ASSERT(llama_is_byte_token(vocab, id)); - const auto& token_data = vocab.id_to_token.at(id); + const auto & token_data = vocab.id_to_token.at(id); switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { auto buf = token_data.text.substr(3, 2); @@ -12212,14 +12218,13 @@ struct llm_tokenizer_bpe { "\\s?\\p{L}+", "\\s?\\p{P}+", "[一-龥ࠀ-一가-퟿]+", - "\\p{N}+", + "\\p{N}", }); break; case LLAMA_VOCAB_PRE_TYPE_FALCON: word_collection = unicode_regex_split(text, { "[\\p{P}\\$\\+<=>\\^~\\|]+", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - "\\p{N}+", "[0-9][0-9][0-9]", }); break; @@ -12235,6 +12240,13 @@ struct llm_tokenizer_bpe { }); break; case LLAMA_VOCAB_PRE_TYPE_STARCODER: + case LLAMA_VOCAB_PRE_TYPE_REFACT: + case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: + word_collection = unicode_regex_split(text, { + "\\p{N}", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }); + break; case LLAMA_VOCAB_PRE_TYPE_GPT2: word_collection = unicode_regex_split(text, { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", @@ -17466,9 +17478,10 @@ int32_t llama_tokenize( static std::string llama_decode_text(const std::string & text) { std::string decoded_text; - auto unicode_sequences = unicode_cpts_from_utf8(text); - for (auto & unicode_sequence : unicode_sequences) { - decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence)); + + const auto cpts = unicode_cpts_from_utf8(text); + for (const auto cpt : cpts) { + decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt)); } return decoded_text; diff --git a/llama.h b/llama.h index 059d78f11..e2fd53ab7 100644 --- a/llama.h +++ b/llama.h @@ -79,6 +79,8 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_MPT = 5, LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, + LLAMA_VOCAB_PRE_TYPE_REFACT = 8, + LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, }; // note: these values should be synchronized with ggml_rope @@ -171,7 +173,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef bool (*llama_progress_callback)(float progress, void *ctx); + typedef bool (*llama_progress_callback)(float progress, void * user_data); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences diff --git a/models/ggml-vocab-bert-bge.gguf.inp b/models/ggml-vocab-bert-bge.gguf.inp index 0389f00c7..0a89107c6 100644 --- a/models/ggml-vocab-bert-bge.gguf.inp +++ b/models/ggml-vocab-bert-bge.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-bert-bge.gguf.out b/models/ggml-vocab-bert-bge.gguf.out index 969552e1c..e4a76cdb0 100644 --- a/models/ggml-vocab-bert-bge.gguf.out +++ b/models/ggml-vocab-bert-bge.gguf.out @@ -1,3 +1,5 @@ + 29464 2094 1018 1092 2706 + 11865 17875 diff --git a/models/ggml-vocab-command-r.gguf b/models/ggml-vocab-command-r.gguf new file mode 100644 index 000000000..b553eab33 Binary files /dev/null and b/models/ggml-vocab-command-r.gguf differ diff --git a/models/ggml-vocab-command-r.gguf.inp b/models/ggml-vocab-command-r.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-command-r.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-command-r.gguf.out b/models/ggml-vocab-command-r.gguf.out new file mode 100644 index 000000000..cc4277daa --- /dev/null +++ b/models/ggml-vocab-command-r.gguf.out @@ -0,0 +1,43 @@ + 2536 228 27 228 22957 6983 + 45 193433 + + 228 + 1667 + 1742 + 205 + 206 + 2126 + 11516 + 34777 + 28339 3845 + 46609 3845 + 28339 3930 + 46609 3930 + 46609 3930 8 + 28339 19 3845 8 + 46609 19 3845 8 + 2075 1801 11254 107 255 21 19317 + 94 23 27 31 228 30 21213 20752 39267 6405 9980 + 4929 40071 2196 3236 8750 1764 37097 41168 + 38111 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 38111 231 38111 257 38111 235 165 24629 38111 239 + 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 1737 10203 109160 1875 2222 2517 3342 12523 16 + 28339 + 46609 + 228 46609 + 1667 46609 + 1742 46609 + 1742 46609 1856 46609 + 1737 + 206 1857 + 14 4515 + 28339 19 1770 14 1954 8 4070 1955 1933 80503 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 + 26 + 26 26 + 26 26 26 + 26 26 26 26 + 26 26 26 26 26 + 26 26 26 26 26 26 + 26 26 26 26 26 26 26 + 26 26 26 26 26 26 26 26 + 26 26 26 26 26 26 26 26 26 + 127731 51628 205 57788 18494 97469 126134 206 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 11254 107 255 2226 107 255 228 26 228 26 26 228 26 26 26 228 26 26 26 26 228 26 26 26 26 26 228 26 26 26 26 26 26 228 26 26 26 26 26 26 26 228 26 26 26 26 26 26 26 26 228 26 21 26 228 26 2271 26 228 26 3834 26 182018 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 188568 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 8391 158343 3512 40071 2196 3236 8750 1764 37097 41168 29721 32797 25646 3802 4975 4975 116167 57178 10251 154048 27292 1767 5125 2632 2155 91 2378 1919 1914 2782 19 2155 3354 1933 5470 38 2155 52 2068 5470 1767 4961 3059 1894 19 2155 43 1933 3026 2725 23186 38 2930 14 20676 1671 14 83 51 diff --git a/models/ggml-vocab-deepseek-coder.gguf.inp b/models/ggml-vocab-deepseek-coder.gguf.inp index 0389f00c7..0a89107c6 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.inp +++ b/models/ggml-vocab-deepseek-coder.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-coder.gguf.out b/models/ggml-vocab-deepseek-coder.gguf.out index 8ef585c78..9ccc560d6 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.out +++ b/models/ggml-vocab-deepseek-coder.gguf.out @@ -1,3 +1,5 @@ + 1050 207 19 207 19192 4217 + 37 32009 71 6247 207 243 diff --git a/models/ggml-vocab-deepseek-llm.gguf.inp b/models/ggml-vocab-deepseek-llm.gguf.inp index 0389f00c7..0a89107c6 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.inp +++ b/models/ggml-vocab-deepseek-llm.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-llm.gguf.out b/models/ggml-vocab-deepseek-llm.gguf.out index 0ea9d66e3..fd94b896d 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.out +++ b/models/ggml-vocab-deepseek-llm.gguf.out @@ -1,3 +1,5 @@ + 1052 207 19 207 19109 4223 + 37 100014 71 6245 207 243 diff --git a/models/ggml-vocab-falcon.gguf.inp b/models/ggml-vocab-falcon.gguf.inp index 0389f00c7..0a89107c6 100644 --- a/models/ggml-vocab-falcon.gguf.inp +++ b/models/ggml-vocab-falcon.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-falcon.gguf.out b/models/ggml-vocab-falcon.gguf.out index cb8da7b1a..209b04cda 100644 --- a/models/ggml-vocab-falcon.gguf.out +++ b/models/ggml-vocab-falcon.gguf.out @@ -1,3 +1,5 @@ + 878 204 31 3068 133 2137 + 28611 132 30042 204 258 diff --git a/models/ggml-vocab-gpt-2.gguf.inp b/models/ggml-vocab-gpt-2.gguf.inp index 0389f00c7..0a89107c6 100644 --- a/models/ggml-vocab-gpt-2.gguf.inp +++ b/models/ggml-vocab-gpt-2.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-gpt-2.gguf.out b/models/ggml-vocab-gpt-2.gguf.out index 9986f38e4..78430f0d3 100644 --- a/models/ggml-vocab-gpt-2.gguf.out +++ b/models/ggml-vocab-gpt-2.gguf.out @@ -1,3 +1,5 @@ + 798 604 25208 1933 + 37 9116 71 11751 220 220 220 diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp index 0389f00c7..0a89107c6 100644 --- a/models/ggml-vocab-llama-bpe.gguf.inp +++ b/models/ggml-vocab-llama-bpe.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out index 4d903e1cd..1f00e3812 100644 --- a/models/ggml-vocab-llama-bpe.gguf.out +++ b/models/ggml-vocab-llama-bpe.gguf.out @@ -1,3 +1,5 @@ + 1142 220 19 220 27154 4038 + 37 51853 261 220 256 diff --git a/models/ggml-vocab-llama-spm.gguf.inp b/models/ggml-vocab-llama-spm.gguf.inp index 0389f00c7..0a89107c6 100644 --- a/models/ggml-vocab-llama-spm.gguf.inp +++ b/models/ggml-vocab-llama-spm.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-spm.gguf.out b/models/ggml-vocab-llama-spm.gguf.out index 15d00b106..9c3327cb5 100644 --- a/models/ggml-vocab-llama-spm.gguf.out +++ b/models/ggml-vocab-llama-spm.gguf.out @@ -1,3 +1,5 @@ + 474 287 29871 29946 29871 30226 7378 + 383 4000 261 259 1678 diff --git a/models/ggml-vocab-mpt.gguf.inp b/models/ggml-vocab-mpt.gguf.inp index 0389f00c7..0a89107c6 100644 --- a/models/ggml-vocab-mpt.gguf.inp +++ b/models/ggml-vocab-mpt.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-mpt.gguf.out b/models/ggml-vocab-mpt.gguf.out index 1f4b0eb3a..d8d0fe909 100644 --- a/models/ggml-vocab-mpt.gguf.out +++ b/models/ggml-vocab-mpt.gguf.out @@ -1,3 +1,5 @@ + 728 577 24142 2607 + 39 26288 6554 209 50276 diff --git a/models/ggml-vocab-phi-3.gguf b/models/ggml-vocab-phi-3.gguf index 72fdb409c..f8022a385 100644 Binary files a/models/ggml-vocab-phi-3.gguf and b/models/ggml-vocab-phi-3.gguf differ diff --git a/models/ggml-vocab-phi-3.gguf.inp b/models/ggml-vocab-phi-3.gguf.inp index 0389f00c7..0a89107c6 100644 --- a/models/ggml-vocab-phi-3.gguf.inp +++ b/models/ggml-vocab-phi-3.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-phi-3.gguf.out b/models/ggml-vocab-phi-3.gguf.out index 15d00b106..9c3327cb5 100644 --- a/models/ggml-vocab-phi-3.gguf.out +++ b/models/ggml-vocab-phi-3.gguf.out @@ -1,3 +1,5 @@ + 474 287 29871 29946 29871 30226 7378 + 383 4000 261 259 1678 diff --git a/models/ggml-vocab-refact.gguf b/models/ggml-vocab-refact.gguf index 8f26cfb76..52afcf01a 100644 Binary files a/models/ggml-vocab-refact.gguf and b/models/ggml-vocab-refact.gguf differ diff --git a/models/ggml-vocab-refact.gguf.inp b/models/ggml-vocab-refact.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-refact.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-refact.gguf.out b/models/ggml-vocab-refact.gguf.out new file mode 100644 index 000000000..06b15c090 --- /dev/null +++ b/models/ggml-vocab-refact.gguf.out @@ -0,0 +1,43 @@ + 4833 225 38 225 143 140 17723 + 56 2006 3935 265 + + 225 + 261 + 264 + 202 + 203 + 478 + 2831 + 15773 + 8279 5788 + 12000 5788 + 8279 10896 + 12000 10896 + 12000 10896 19 + 8279 30 5788 19 + 12000 30 5788 19 + 458 438 5945 118 252 32 3766 + 105 34 38 42 225 41 102 1707 12530 10180 1479 8278 + 39862 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 + 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 14574 228 14574 254 14574 232 30457 228 14574 236 + 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 308 2585 22680 688 1401 2819 4369 2404 27 + 8279 + 12000 + 225 12000 + 261 12000 + 264 12000 + 264 12000 284 12000 + 308 + 203 280 + 25 34666 + 8279 30 533 25 464 19 4971 884 844 18458 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 + 37 + 37 37 + 37 37 37 + 37 37 37 37 + 37 37 37 37 37 + 37 37 37 37 37 37 + 37 37 37 37 37 37 37 + 37 37 37 37 37 37 37 37 + 37 37 37 37 37 37 37 37 37 + 334 719 8878 202 10885 4222 16104 28570 203 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 5945 118 252 3807 118 252 225 37 225 37 37 225 37 37 37 225 37 37 37 37 225 37 37 37 37 37 225 37 37 37 37 37 37 225 37 37 37 37 37 37 37 225 37 37 37 37 37 37 37 37 225 37 32 37 225 37 497 37 225 37 1179 37 225 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 36628 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 20921 16623 13028 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 5881 9592 13299 914 31753 31359 9163 3202 35472 10397 439 4763 2583 330 102 1455 938 1182 2017 30 330 613 844 3654 49 330 63 646 3654 439 4621 1930 561 30 330 54 844 2124 1629 35993 49 2688 25 7709 312 25 94 62 diff --git a/models/ggml-vocab-starcoder.gguf.inp b/models/ggml-vocab-starcoder.gguf.inp index 0389f00c7..0a89107c6 100644 --- a/models/ggml-vocab-starcoder.gguf.inp +++ b/models/ggml-vocab-starcoder.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-starcoder.gguf.out b/models/ggml-vocab-starcoder.gguf.out index cd04254af..ccb55c7fe 100644 --- a/models/ggml-vocab-starcoder.gguf.out +++ b/models/ggml-vocab-starcoder.gguf.out @@ -1,3 +1,5 @@ + 4850 244 57 244 162 159 17722 + 75 2022 3943 284 244 280 diff --git a/requirements/requirements-convert.txt b/requirements/requirements-convert.txt index a3d6ecec0..5520ba732 100644 --- a/requirements/requirements-convert.txt +++ b/requirements/requirements-convert.txt @@ -1,5 +1,5 @@ numpy~=1.24.4 sentencepiece~=0.1.98 -transformers>=4.35.2,<5.0.0 +transformers>=4.40.1,<5.0.0 gguf>=0.1.0 protobuf>=4.21.0,<5.0.0 diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index ef7f19ecb..3892fd25c 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import logging import argparse import heapq import sys @@ -11,9 +12,11 @@ try: import git from tabulate import tabulate except ImportError as e: - print("ERROR: the following Python libraries are required: GitPython, tabulate.") + print("the following Python libraries are required: GitPython, tabulate.") # noqa: NP100 raise e +logger = logging.getLogger("compare-llama-bench") + # Properties by which to differentiate results per commit: KEY_PROPERTIES = [ "cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas", @@ -94,8 +97,7 @@ parser.add_argument("-s", "--show", help=help_s) known_args, unknown_args = parser.parse_known_args() if unknown_args: - print(f"ERROR: Received unknown args: {unknown_args}.") - print() + logger.error(f"Received unknown args: {unknown_args}.") parser.print_help() sys.exit(1) @@ -108,8 +110,7 @@ if input_file is None: input_file = sqlite_files[0] if input_file is None: - print("ERROR: Cannot find a suitable input file, please provide one.") - print() + logger.error("Cannot find a suitable input file, please provide one.") parser.print_help() sys.exit(1) @@ -194,23 +195,19 @@ if known_args.baseline is not None: hexsha8_baseline = get_commit_hexsha8(known_args.baseline) name_baseline = known_args.baseline if hexsha8_baseline is None: - print(f"ERROR: cannot find data for baseline={known_args.baseline}.") + logger.error(f"cannot find data for baseline={known_args.baseline}.") sys.exit(1) # Otherwise, search for the most recent parent of master for which there is data: elif repo is not None: hexsha8_baseline = find_parent_in_data(repo.heads.master.commit) if hexsha8_baseline is None: - print("ERROR: No baseline was provided and did not find data for any master branch commits.") - print() + logger.error("No baseline was provided and did not find data for any master branch commits.") parser.print_help() sys.exit(1) else: - print( - "ERROR: No baseline was provided and the current working directory " - "is not part of a git repository from which a baseline could be inferred." - ) - print() + logger.error("No baseline was provided and the current working directory " + "is not part of a git repository from which a baseline could be inferred.") parser.print_help() sys.exit(1) @@ -227,7 +224,7 @@ if known_args.compare is not None: hexsha8_compare = get_commit_hexsha8(known_args.compare) name_compare = known_args.compare if hexsha8_compare is None: - print(f"ERROR: cannot find data for compare={known_args.compare}.") + logger.error(f"cannot find data for compare={known_args.compare}.") sys.exit(1) # Otherwise, search for the commit for llama-bench was most recently run # and that is not a parent of master: @@ -241,16 +238,12 @@ elif repo is not None: break if hexsha8_compare is None: - print("ERROR: No compare target was provided and did not find data for any non-master commits.") - print() + logger.error("No compare target was provided and did not find data for any non-master commits.") parser.print_help() sys.exit(1) else: - print( - "ERROR: No compare target was provided and the current working directory " - "is not part of a git repository from which a compare target could be inferred." - ) - print() + logger.error("No compare target was provided and the current working directory " + "is not part of a git repository from which a compare target could be inferred.\n") parser.print_help() sys.exit(1) @@ -284,8 +277,7 @@ if known_args.show is not None: if prop not in KEY_PROPERTIES[:-2]: # Last two values are n_prompt, n_gen. unknown_cols.append(prop) if unknown_cols: - print(f"ERROR: Unknown values for --show: {', '.join(unknown_cols)}") - print() + logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}") parser.print_usage() sys.exit(1) rows_show = get_rows(show) @@ -369,7 +361,7 @@ if "gpu_info" in show: headers = [PRETTY_NAMES[p] for p in show] headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"] -print(tabulate( +logger.info(tabulate( table, headers=headers, floatfmt=".2f", diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py new file mode 100644 index 000000000..977e65613 --- /dev/null +++ b/scripts/gen-unicode-data.py @@ -0,0 +1,66 @@ +import regex + + +def cpt_to_utf8_str(cpt): + if cpt <= 0xFF: + return bytes([cpt, 0, 0, 0]) + elif cpt <= 0xFFFF: + return bytes([cpt & 0xFF, cpt >> 8, 0, 0]) + elif cpt <= 0xFFFFFF: + return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0]) + else: + return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24]) + + +def is_match(codepoint, regex_expr): + try: + res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32')) + return res is not None + except Exception: + return False + + +def get_matches(regex_expr): + unicode_ranges = [] + current_range = None + + for codepoint in range(0x110000): + if is_match(codepoint, regex_expr): + if current_range is None: + current_range = [codepoint, codepoint] + else: + current_range[1] = codepoint + elif current_range is not None: + unicode_ranges.append(tuple(current_range)) + current_range = None + + if current_range is not None: + unicode_ranges.append(tuple(current_range)) + + return unicode_ranges + + +def print_cat(cat, ranges): + print("const std::vector> unicode_ranges_{} = {{".format(cat)) # noqa: NP100 + cnt = 0 + for start, end in ranges: + if cnt % 4 != 0: + print(" ", end="") # noqa: NP100 + print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100 + if cnt % 4 == 3: + print("") # noqa: NP100 + cnt += 1 + + if cnt % 4 != 0: + print("") # noqa: NP100 + print("};") # noqa: NP100 + print("") # noqa: NP100 + + +print_cat("number", get_matches(r'\p{N}')) +print_cat("letter", get_matches(r'\p{L}')) +print_cat("whitespace", get_matches(r'\p{Z}')) +print_cat("accent_mark", get_matches(r'\p{M}')) +print_cat("punctuation", get_matches(r'\p{P}')) +print_cat("symbol", get_matches(r'\p{S}')) +print_cat("control", get_matches(r'\p{C}')) diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py index a18252730..e986a3604 100755 --- a/scripts/run-with-preset.py +++ b/scripts/run-with-preset.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import logging import argparse import os import subprocess @@ -7,6 +8,8 @@ import sys import yaml +logger = logging.getLogger("run-with-preset") + CLI_ARGS_MAIN_PERPLEXITY = [ "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", @@ -56,6 +59,7 @@ parser.add_argument("-bin", "--binary", help="The binary to run.") parser.add_argument("yaml_files", nargs="*", help="Arbitrary number of YAML files from which to read preset values. " "If two files specify the same values the later one will be used.") +parser.add_argument("--verbose", action="store_true", help="increase output verbosity") known_args, unknown_args = parser.parse_known_args() @@ -63,6 +67,8 @@ if not known_args.yaml_files and not unknown_args: parser.print_help() sys.exit(0) +logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO) + props = dict() for yaml_file in known_args.yaml_files: @@ -85,7 +91,7 @@ elif binary.lower().endswith("llama-bench"): elif binary.lower().endswith("server"): cli_args = CLI_ARGS_SERVER else: - print(f"Unknown binary: {binary}") + logger.error(f"Unknown binary: {binary}") sys.exit(1) command_list = [binary] @@ -121,11 +127,11 @@ for cli_arg in cli_args: num_unused = len(props) if num_unused > 10: - print(f"The preset file contained a total of {num_unused} unused properties.") + logger.info(f"The preset file contained a total of {num_unused} unused properties.") elif num_unused > 0: - print("The preset file contained the following unused properties:") + logger.info("The preset file contained the following unused properties:") for prop, value in props.items(): - print(f" {prop}: {value}") + logger.info(f" {prop}: {value}") command_list += unknown_args diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py index dff4b4734..0b5b9aafa 100755 --- a/scripts/verify-checksum-models.py +++ b/scripts/verify-checksum-models.py @@ -1,8 +1,11 @@ #!/usr/bin/env python3 +import logging import os import hashlib +logger = logging.getLogger("verify-checksum-models") + def sha256sum(file): block_size = 16 * 1024 * 1024 # 16 MB block size @@ -27,7 +30,7 @@ hash_list_file = os.path.join(llama_path, "SHA256SUMS") # Check if the hash list file exists if not os.path.exists(hash_list_file): - print(f"Hash list file not found: {hash_list_file}") + logger.error(f"Hash list file not found: {hash_list_file}") exit(1) # Read the hash file content and split it into an array of lines @@ -46,7 +49,7 @@ for line in hash_list: file_path = os.path.join(llama_path, filename) # Informing user of the progress of the integrity check - print(f"Verifying the checksum of {file_path}") + logger.info(f"Verifying the checksum of {file_path}") # Check if the file exists if os.path.exists(file_path): @@ -73,9 +76,9 @@ for line in hash_list: # Print column headers for results table -print("\n" + "filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20)) -print("-" * 80) +print("filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20)) # noqa: NP100 +print("-" * 80) # noqa: NP100 # Output the results as a table for r in results: - print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}") + print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}") # noqa: NP100 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d23e7f771..208f0cf7b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -74,13 +74,16 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) -llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf) -llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf) # TODO: enable when fixed +# https://github.com/ggerganov/llama.cpp/pull/7036 #llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) +#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf) +#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf) # build test-tokenizer-1-bpe target once and add many tests add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp) diff --git a/tests/test-tokenizer-0-bpe.py b/tests/test-tokenizer-0-bpe.py deleted file mode 100644 index 33a272441..000000000 --- a/tests/test-tokenizer-0-bpe.py +++ /dev/null @@ -1,117 +0,0 @@ -# tests with BPE tokenizer -# -# sample usage: -# -# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/ -# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/ -# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/ -# - -import argparse - -from transformers import AutoTokenizer - -parser = argparse.ArgumentParser() -parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") -parser.add_argument("--fname-tok", help="path to a text file to tokenize") -args = parser.parse_args() - -dir_tokenizer = args.dir_tokenizer - -tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) - -tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\n\n", - "\n\n\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is 🦙.cpp", - "w048 7tuijk dsdfhu", - "нещо на Български", - "កាន់តែពិសេសអាចខលចេញ", - "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", - " (", - "\n =", - "' era", - "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", - "3", - "33", - "333", - "3333", - "33333", - "333333", - "3333333", - "33333333", - "333333333", -] - -for text in tests: - print('text: ', text) - print(tokenizer.encode(text)) - print(tokenizer.decode(tokenizer.encode(text))) - -print("\n\ntests for C++:\n") -for text in tests: - res = tokenizer.encode(text) - - k = text.replace('\n', '\\n') - k = k.replace('\t', '\\t') - k = '"' + k + '"' - print("{ %-24s, { " % k, end='') - for x in res: - print("%7d," % x, end='') - print(" }, },") - -print(tokenizer.encode('hello')) -print(tokenizer.encode('world')) -print(tokenizer.encode(' world')) -print(tokenizer.encode('hello world')) - -fname_tok = args.fname_tok -if fname_tok: - print('tokenizing file: ', fname_tok) - fname_out = fname_tok + '.tok' - with open(fname_tok, 'r', encoding='utf-8') as f: - lines = f.readlines() - s = ''.join(lines) - res = tokenizer.encode(s) - # write to file - with open(fname_out, 'w', encoding='utf-8') as f: - for x in res: - # LLaMA v3 for some reason strips the space for these tokens (and others) - # if x == 662: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 1174: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 2564: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 758: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 949: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 5354: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # else: - # f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') - f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n') - print('len(res): ', len(res)) - print('len(lines): ', len(lines)) - print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0-spm.py b/tests/test-tokenizer-0-spm.py deleted file mode 100644 index be12a6b93..000000000 --- a/tests/test-tokenizer-0-spm.py +++ /dev/null @@ -1,114 +0,0 @@ -# tests with SPM tokenizer -# -# sample usage: -# -# python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/ -# python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/ -# - - -import argparse - -from sentencepiece import SentencePieceProcessor - -parser = argparse.ArgumentParser() -parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") -parser.add_argument("--fname-tok", help="path to a text file to tokenize") -args = parser.parse_args() - -dir_tokenizer = args.dir_tokenizer - -tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') - -tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\n\n", - "\n\n\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is 🦙.cpp", - "w048 7tuijk dsdfhu", - "нещо на Български", - "កាន់តែពិសេសអាចខលចេញ", - "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", - " (", - "\n =", - "' era", - "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", - "3", - "33", - "333", - "3333", - "33333", - "333333", - "3333333", - "33333333", - "333333333", -] - - -for text in tests: - print('text: ', text) - print('\nwith bos:') - print(tokenizer.encode(text, add_bos=True)) - print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) - print('\nwithout bos:') - print(tokenizer.encode(text, add_bos=False)) - print(tokenizer.decode(tokenizer.encode(text, add_bos=False))) - -print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello' -print("'" + tokenizer.id_to_piece(29871) + "'") # '_' -print("'" + tokenizer.decode([15043]) + "'") # 'Hello' -print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello' -print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello' -print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello' - -print("\n\ntests for C++:\n") -for text in tests: - res = tokenizer.encode(text, add_bos=False) - - k = text.replace('\n', '\\n') - k = k.replace('\t', '\\t') - k = '"' + k + '"' - print("{ %-24s, { " % k, end='') - for x in res: - print("%7d," % x, end='') - print(" }, },") - -print(tokenizer.encode('hello')) -print(tokenizer.encode('world')) -print(tokenizer.encode(' world')) -print(tokenizer.encode('hello world')) - -fname_tok = args.fname_tok -if fname_tok: - print('tokenizing file: ', fname_tok) - fname_out = fname_tok + '.tok' - with open(fname_tok, 'r', encoding='utf-8') as f: - lines = f.readlines() - s = ''.join(lines) - res = tokenizer.encode(s, add_bos=True) - # write to file - with open(fname_out, 'w', encoding='utf-8') as f: - for x in res: - f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') - print('len(res): ', len(res)) - print('len(lines): ', len(lines)) - print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 5122757cf..d478f1041 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -55,8 +55,10 @@ // return _k_tests; //} -static std::map> read_tests(const std::string & fname_inp, const std::string & fname_out) { - std::map> tests; +using llama_tests = std::map>; + +static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) { + llama_tests tests; std::ifstream ifs_inp(fname_inp); if (!ifs_inp) { @@ -175,12 +177,20 @@ int main(int argc, char **argv) { bool success = true; - const auto k_tests = read_tests(fname_inp, fname_out); + const auto k_tests = [&]() -> llama_tests { + if (!fname_text.empty()) { + return {}; + } - if (k_tests.empty()) { - fprintf(stderr, "%s : error: no tests found\n", __func__); - return 1; - } + const auto res = read_tests(fname_inp, fname_out); + + if (res.empty()) { + fprintf(stderr, "%s : error: no tests found\n", __func__); + exit(1); + } + + return res; + }(); const bool add_special = false; @@ -238,7 +248,17 @@ int main(int argc, char **argv) { fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); - const std::vector res = llama_tokenize(ctx, text, add_special); + std::vector res; + + { + const auto t_start = ggml_time_us(); + + res = llama_tokenize(ctx, text, add_special); + + const auto t_end = ggml_time_us(); + + fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0); + } fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); @@ -252,7 +272,8 @@ int main(int argc, char **argv) { } for (const auto & tok : res) { - ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector{tok})) << "'" << std::endl; + //ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector{tok})) << "'" << std::endl; + ofs << tok << "\n"; } } diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py new file mode 100644 index 000000000..cd760d1ce --- /dev/null +++ b/tests/test-tokenizer-0.py @@ -0,0 +1,46 @@ +import time +import argparse + +from transformers import AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True) +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer +fname_tok = args.fname_tok + +tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + +print('tokenizing file: ', fname_tok) # noqa: NP100 +fname_out = fname_tok + '.tok' +with open(fname_tok, 'r', encoding='utf-8') as f: + lines = f.readlines() + s = ''.join(lines) + t_start = time.time() + res = tokenizer.encode(s, add_special_tokens=False) + t_end = time.time() + print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100 + with open(fname_out, 'w', encoding='utf-8') as f: + for x in res: + # LLaMA v3 for some reason strips the space for these tokens (and others) + # if x == 662: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 1174: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 2564: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 758: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 949: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 5354: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # else: + # f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') + # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n') + f.write(str(x) + '\n') + print('len(res): ', len(res)) # noqa: NP100 + print('len(lines): ', len(lines)) # noqa: NP100 +print('results written to: ', fname_out) # noqa: NP100 diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh new file mode 100755 index 000000000..2fb8632d8 --- /dev/null +++ b/tests/test-tokenizer-0.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# +# Usage: +# +# test-tokenizer-0.sh +# + +if [ $# -ne 2 ]; then + printf "Usage: $0 \n" + exit 1 +fi + +name=$1 +input=$2 + +make -j tests/test-tokenizer-0 + +printf "Testing %s on %s ...\n" $name $input + +python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 +cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" + +./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 +cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" + +diff $input.tok $input.tokcpp > /dev/null 2>&1 + +if [ $? -eq 0 ]; then + printf "Tokenization is correct!\n" +else + diff $input.tok $input.tokcpp | head -n 32 + + printf "Tokenization differs!\n" +fi diff --git a/unicode-data.cpp b/unicode-data.cpp index e6bafb3a9..07bf02c45 100644 --- a/unicode-data.cpp +++ b/unicode-data.cpp @@ -5,27 +5,47 @@ #include #include -const std::vector> unicode_ranges_digit = { -{0x00000030, 0x00000039}, {0x000000B2, 0x000000B3}, {0x000000B9, 0x000000B9}, {0x00000660, 0x00000669}, -{0x000006F0, 0x000006F9}, {0x000007C0, 0x000007C9}, {0x00000966, 0x0000096F}, {0x000009E6, 0x000009EF}, -{0x00000A66, 0x00000A6F}, {0x00000AE6, 0x00000AEF}, {0x00000B66, 0x00000B6F}, {0x00000BE6, 0x00000BEF}, -{0x00000C66, 0x00000C6F}, {0x00000CE6, 0x00000CEF}, {0x00000D66, 0x00000D6F}, {0x00000DE6, 0x00000DEF}, -{0x00000E50, 0x00000E59}, {0x00000ED0, 0x00000ED9}, {0x00000F20, 0x00000F29}, {0x00001040, 0x00001049}, -{0x00001090, 0x00001099}, {0x00001369, 0x00001371}, {0x000017E0, 0x000017E9}, {0x00001810, 0x00001819}, -{0x00001946, 0x0000194F}, {0x000019D0, 0x000019DA}, {0x00001A80, 0x00001A89}, {0x00001A90, 0x00001A99}, -{0x00001B50, 0x00001B59}, {0x00001BB0, 0x00001BB9}, {0x00001C40, 0x00001C49}, {0x00001C50, 0x00001C59}, -{0x00002070, 0x00002070}, {0x00002074, 0x00002079}, {0x00002080, 0x00002089}, {0x00002460, 0x00002468}, -{0x00002474, 0x0000247C}, {0x00002488, 0x00002490}, {0x000024EA, 0x000024EA}, {0x000024F5, 0x000024FD}, -{0x000024FF, 0x000024FF}, {0x00002776, 0x0000277E}, {0x00002780, 0x00002788}, {0x0000278A, 0x00002792}, -{0x0000A620, 0x0000A629}, {0x0000A8D0, 0x0000A8D9}, {0x0000A900, 0x0000A909}, {0x0000A9D0, 0x0000A9D9}, -{0x0000A9F0, 0x0000A9F9}, {0x0000AA50, 0x0000AA59}, {0x0000ABF0, 0x0000ABF9}, {0x0000FF10, 0x0000FF19}, -{0x000104A0, 0x000104A9}, {0x00010A40, 0x00010A43}, {0x00010D30, 0x00010D39}, {0x00010E60, 0x00010E68}, -{0x00011052, 0x0001105A}, {0x00011066, 0x0001106F}, {0x000110F0, 0x000110F9}, {0x00011136, 0x0001113F}, -{0x000111D0, 0x000111D9}, {0x000112F0, 0x000112F9}, {0x00011450, 0x00011459}, {0x000114D0, 0x000114D9}, -{0x00011650, 0x00011659}, {0x000116C0, 0x000116C9}, {0x00011730, 0x00011739}, {0x000118E0, 0x000118E9}, -{0x00011950, 0x00011959}, {0x00011C50, 0x00011C59}, {0x00011D50, 0x00011D59}, {0x00011DA0, 0x00011DA9}, -{0x00016A60, 0x00016A69}, {0x00016B50, 0x00016B59}, {0x0001D7CE, 0x0001D7FF}, {0x0001E140, 0x0001E149}, -{0x0001E2F0, 0x0001E2F9}, {0x0001E950, 0x0001E959}, {0x0001F100, 0x0001F10A}, {0x0001FBF0, 0x0001FBF9}, +// generated with scripts/gen-unicode-data.py +// +// TODO: generate unicode_map_lowercase +// TODO: generate unicode_map_nfd + +const std::vector> unicode_ranges_number = { +{0x00000030, 0x00000039}, {0x000000B2, 0x000000B3}, {0x000000B9, 0x000000B9}, {0x000000BC, 0x000000BE}, +{0x00000660, 0x00000669}, {0x000006F0, 0x000006F9}, {0x000007C0, 0x000007C9}, {0x00000966, 0x0000096F}, +{0x000009E6, 0x000009EF}, {0x000009F4, 0x000009F9}, {0x00000A66, 0x00000A6F}, {0x00000AE6, 0x00000AEF}, +{0x00000B66, 0x00000B6F}, {0x00000B72, 0x00000B77}, {0x00000BE6, 0x00000BF2}, {0x00000C66, 0x00000C6F}, +{0x00000C78, 0x00000C7E}, {0x00000CE6, 0x00000CEF}, {0x00000D58, 0x00000D5E}, {0x00000D66, 0x00000D78}, +{0x00000DE6, 0x00000DEF}, {0x00000E50, 0x00000E59}, {0x00000ED0, 0x00000ED9}, {0x00000F20, 0x00000F33}, +{0x00001040, 0x00001049}, {0x00001090, 0x00001099}, {0x00001369, 0x0000137C}, {0x000016EE, 0x000016F0}, +{0x000017E0, 0x000017E9}, {0x000017F0, 0x000017F9}, {0x00001810, 0x00001819}, {0x00001946, 0x0000194F}, +{0x000019D0, 0x000019DA}, {0x00001A80, 0x00001A89}, {0x00001A90, 0x00001A99}, {0x00001B50, 0x00001B59}, +{0x00001BB0, 0x00001BB9}, {0x00001C40, 0x00001C49}, {0x00001C50, 0x00001C59}, {0x00002070, 0x00002070}, +{0x00002074, 0x00002079}, {0x00002080, 0x00002089}, {0x00002150, 0x00002182}, {0x00002185, 0x00002189}, +{0x00002460, 0x0000249B}, {0x000024EA, 0x000024FF}, {0x00002776, 0x00002793}, {0x00002CFD, 0x00002CFD}, +{0x00003007, 0x00003007}, {0x00003021, 0x00003029}, {0x00003038, 0x0000303A}, {0x00003192, 0x00003195}, +{0x00003220, 0x00003229}, {0x00003248, 0x0000324F}, {0x00003251, 0x0000325F}, {0x00003280, 0x00003289}, +{0x000032B1, 0x000032BF}, {0x0000A620, 0x0000A629}, {0x0000A6E6, 0x0000A6EF}, {0x0000A830, 0x0000A835}, +{0x0000A8D0, 0x0000A8D9}, {0x0000A900, 0x0000A909}, {0x0000A9D0, 0x0000A9D9}, {0x0000A9F0, 0x0000A9F9}, +{0x0000AA50, 0x0000AA59}, {0x0000ABF0, 0x0000ABF9}, {0x0000FF10, 0x0000FF19}, {0x00010107, 0x00010133}, +{0x00010140, 0x00010178}, {0x0001018A, 0x0001018B}, {0x000102E1, 0x000102FB}, {0x00010320, 0x00010323}, +{0x00010341, 0x00010341}, {0x0001034A, 0x0001034A}, {0x000103D1, 0x000103D5}, {0x000104A0, 0x000104A9}, +{0x00010858, 0x0001085F}, {0x00010879, 0x0001087F}, {0x000108A7, 0x000108AF}, {0x000108FB, 0x000108FF}, +{0x00010916, 0x0001091B}, {0x000109BC, 0x000109BD}, {0x000109C0, 0x000109CF}, {0x000109D2, 0x000109FF}, +{0x00010A40, 0x00010A48}, {0x00010A7D, 0x00010A7E}, {0x00010A9D, 0x00010A9F}, {0x00010AEB, 0x00010AEF}, +{0x00010B58, 0x00010B5F}, {0x00010B78, 0x00010B7F}, {0x00010BA9, 0x00010BAF}, {0x00010CFA, 0x00010CFF}, +{0x00010D30, 0x00010D39}, {0x00010E60, 0x00010E7E}, {0x00010F1D, 0x00010F26}, {0x00010F51, 0x00010F54}, +{0x00010FC5, 0x00010FCB}, {0x00011052, 0x0001106F}, {0x000110F0, 0x000110F9}, {0x00011136, 0x0001113F}, +{0x000111D0, 0x000111D9}, {0x000111E1, 0x000111F4}, {0x000112F0, 0x000112F9}, {0x00011450, 0x00011459}, +{0x000114D0, 0x000114D9}, {0x00011650, 0x00011659}, {0x000116C0, 0x000116C9}, {0x00011730, 0x0001173B}, +{0x000118E0, 0x000118F2}, {0x00011950, 0x00011959}, {0x00011C50, 0x00011C6C}, {0x00011D50, 0x00011D59}, +{0x00011DA0, 0x00011DA9}, {0x00011F50, 0x00011F59}, {0x00011FC0, 0x00011FD4}, {0x00012400, 0x0001246E}, +{0x00016A60, 0x00016A69}, {0x00016AC0, 0x00016AC9}, {0x00016B50, 0x00016B59}, {0x00016B5B, 0x00016B61}, +{0x00016E80, 0x00016E96}, {0x0001D2C0, 0x0001D2D3}, {0x0001D2E0, 0x0001D2F3}, {0x0001D360, 0x0001D378}, +{0x0001D7CE, 0x0001D7FF}, {0x0001E140, 0x0001E149}, {0x0001E2F0, 0x0001E2F9}, {0x0001E4F0, 0x0001E4F9}, +{0x0001E8C7, 0x0001E8CF}, {0x0001E950, 0x0001E959}, {0x0001EC71, 0x0001ECAB}, {0x0001ECAD, 0x0001ECAF}, +{0x0001ECB1, 0x0001ECB4}, {0x0001ED01, 0x0001ED2D}, {0x0001ED2F, 0x0001ED3D}, {0x0001F100, 0x0001F10C}, +{0x0001FBF0, 0x0001FBF9}, }; const std::vector> unicode_ranges_letter = { @@ -41,73 +61,73 @@ const std::vector> unicode_ranges_letter = { {0x00000710, 0x00000710}, {0x00000712, 0x0000072F}, {0x0000074D, 0x000007A5}, {0x000007B1, 0x000007B1}, {0x000007CA, 0x000007EA}, {0x000007F4, 0x000007F5}, {0x000007FA, 0x000007FA}, {0x00000800, 0x00000815}, {0x0000081A, 0x0000081A}, {0x00000824, 0x00000824}, {0x00000828, 0x00000828}, {0x00000840, 0x00000858}, -{0x00000860, 0x0000086A}, {0x000008A0, 0x000008B4}, {0x000008B6, 0x000008C7}, {0x00000904, 0x00000939}, -{0x0000093D, 0x0000093D}, {0x00000950, 0x00000950}, {0x00000958, 0x00000961}, {0x00000971, 0x00000980}, -{0x00000985, 0x0000098C}, {0x0000098F, 0x00000990}, {0x00000993, 0x000009A8}, {0x000009AA, 0x000009B0}, -{0x000009B2, 0x000009B2}, {0x000009B6, 0x000009B9}, {0x000009BD, 0x000009BD}, {0x000009CE, 0x000009CE}, -{0x000009DC, 0x000009DD}, {0x000009DF, 0x000009E1}, {0x000009F0, 0x000009F1}, {0x000009FC, 0x000009FC}, -{0x00000A05, 0x00000A0A}, {0x00000A0F, 0x00000A10}, {0x00000A13, 0x00000A28}, {0x00000A2A, 0x00000A30}, -{0x00000A32, 0x00000A33}, {0x00000A35, 0x00000A36}, {0x00000A38, 0x00000A39}, {0x00000A59, 0x00000A5C}, -{0x00000A5E, 0x00000A5E}, {0x00000A72, 0x00000A74}, {0x00000A85, 0x00000A8D}, {0x00000A8F, 0x00000A91}, -{0x00000A93, 0x00000AA8}, {0x00000AAA, 0x00000AB0}, {0x00000AB2, 0x00000AB3}, {0x00000AB5, 0x00000AB9}, -{0x00000ABD, 0x00000ABD}, {0x00000AD0, 0x00000AD0}, {0x00000AE0, 0x00000AE1}, {0x00000AF9, 0x00000AF9}, -{0x00000B05, 0x00000B0C}, {0x00000B0F, 0x00000B10}, {0x00000B13, 0x00000B28}, {0x00000B2A, 0x00000B30}, -{0x00000B32, 0x00000B33}, {0x00000B35, 0x00000B39}, {0x00000B3D, 0x00000B3D}, {0x00000B5C, 0x00000B5D}, -{0x00000B5F, 0x00000B61}, {0x00000B71, 0x00000B71}, {0x00000B83, 0x00000B83}, {0x00000B85, 0x00000B8A}, -{0x00000B8E, 0x00000B90}, {0x00000B92, 0x00000B95}, {0x00000B99, 0x00000B9A}, {0x00000B9C, 0x00000B9C}, -{0x00000B9E, 0x00000B9F}, {0x00000BA3, 0x00000BA4}, {0x00000BA8, 0x00000BAA}, {0x00000BAE, 0x00000BB9}, -{0x00000BD0, 0x00000BD0}, {0x00000C05, 0x00000C0C}, {0x00000C0E, 0x00000C10}, {0x00000C12, 0x00000C28}, -{0x00000C2A, 0x00000C39}, {0x00000C3D, 0x00000C3D}, {0x00000C58, 0x00000C5A}, {0x00000C60, 0x00000C61}, -{0x00000C80, 0x00000C80}, {0x00000C85, 0x00000C8C}, {0x00000C8E, 0x00000C90}, {0x00000C92, 0x00000CA8}, -{0x00000CAA, 0x00000CB3}, {0x00000CB5, 0x00000CB9}, {0x00000CBD, 0x00000CBD}, {0x00000CDE, 0x00000CDE}, -{0x00000CE0, 0x00000CE1}, {0x00000CF1, 0x00000CF2}, {0x00000D04, 0x00000D0C}, {0x00000D0E, 0x00000D10}, -{0x00000D12, 0x00000D3A}, {0x00000D3D, 0x00000D3D}, {0x00000D4E, 0x00000D4E}, {0x00000D54, 0x00000D56}, -{0x00000D5F, 0x00000D61}, {0x00000D7A, 0x00000D7F}, {0x00000D85, 0x00000D96}, {0x00000D9A, 0x00000DB1}, -{0x00000DB3, 0x00000DBB}, {0x00000DBD, 0x00000DBD}, {0x00000DC0, 0x00000DC6}, {0x00000E01, 0x00000E30}, -{0x00000E32, 0x00000E33}, {0x00000E40, 0x00000E46}, {0x00000E81, 0x00000E82}, {0x00000E84, 0x00000E84}, -{0x00000E86, 0x00000E8A}, {0x00000E8C, 0x00000EA3}, {0x00000EA5, 0x00000EA5}, {0x00000EA7, 0x00000EB0}, -{0x00000EB2, 0x00000EB3}, {0x00000EBD, 0x00000EBD}, {0x00000EC0, 0x00000EC4}, {0x00000EC6, 0x00000EC6}, -{0x00000EDC, 0x00000EDF}, {0x00000F00, 0x00000F00}, {0x00000F40, 0x00000F47}, {0x00000F49, 0x00000F6C}, -{0x00000F88, 0x00000F8C}, {0x00001000, 0x0000102A}, {0x0000103F, 0x0000103F}, {0x00001050, 0x00001055}, -{0x0000105A, 0x0000105D}, {0x00001061, 0x00001061}, {0x00001065, 0x00001066}, {0x0000106E, 0x00001070}, -{0x00001075, 0x00001081}, {0x0000108E, 0x0000108E}, {0x000010A0, 0x000010C5}, {0x000010C7, 0x000010C7}, -{0x000010CD, 0x000010CD}, {0x000010D0, 0x000010FA}, {0x000010FC, 0x00001248}, {0x0000124A, 0x0000124D}, -{0x00001250, 0x00001256}, {0x00001258, 0x00001258}, {0x0000125A, 0x0000125D}, {0x00001260, 0x00001288}, -{0x0000128A, 0x0000128D}, {0x00001290, 0x000012B0}, {0x000012B2, 0x000012B5}, {0x000012B8, 0x000012BE}, -{0x000012C0, 0x000012C0}, {0x000012C2, 0x000012C5}, {0x000012C8, 0x000012D6}, {0x000012D8, 0x00001310}, -{0x00001312, 0x00001315}, {0x00001318, 0x0000135A}, {0x00001380, 0x0000138F}, {0x000013A0, 0x000013F5}, -{0x000013F8, 0x000013FD}, {0x00001401, 0x0000166C}, {0x0000166F, 0x0000167F}, {0x00001681, 0x0000169A}, -{0x000016A0, 0x000016EA}, {0x000016F1, 0x000016F8}, {0x00001700, 0x0000170C}, {0x0000170E, 0x00001711}, -{0x00001720, 0x00001731}, {0x00001740, 0x00001751}, {0x00001760, 0x0000176C}, {0x0000176E, 0x00001770}, -{0x00001780, 0x000017B3}, {0x000017D7, 0x000017D7}, {0x000017DC, 0x000017DC}, {0x00001820, 0x00001878}, -{0x00001880, 0x00001884}, {0x00001887, 0x000018A8}, {0x000018AA, 0x000018AA}, {0x000018B0, 0x000018F5}, -{0x00001900, 0x0000191E}, {0x00001950, 0x0000196D}, {0x00001970, 0x00001974}, {0x00001980, 0x000019AB}, -{0x000019B0, 0x000019C9}, {0x00001A00, 0x00001A16}, {0x00001A20, 0x00001A54}, {0x00001AA7, 0x00001AA7}, -{0x00001B05, 0x00001B33}, {0x00001B45, 0x00001B4B}, {0x00001B83, 0x00001BA0}, {0x00001BAE, 0x00001BAF}, -{0x00001BBA, 0x00001BE5}, {0x00001C00, 0x00001C23}, {0x00001C4D, 0x00001C4F}, {0x00001C5A, 0x00001C7D}, -{0x00001C80, 0x00001C88}, {0x00001C90, 0x00001CBA}, {0x00001CBD, 0x00001CBF}, {0x00001CE9, 0x00001CEC}, -{0x00001CEE, 0x00001CF3}, {0x00001CF5, 0x00001CF6}, {0x00001CFA, 0x00001CFA}, {0x00001D00, 0x00001DBF}, -{0x00001E00, 0x00001F15}, {0x00001F18, 0x00001F1D}, {0x00001F20, 0x00001F45}, {0x00001F48, 0x00001F4D}, -{0x00001F50, 0x00001F57}, {0x00001F59, 0x00001F59}, {0x00001F5B, 0x00001F5B}, {0x00001F5D, 0x00001F5D}, -{0x00001F5F, 0x00001F7D}, {0x00001F80, 0x00001FB4}, {0x00001FB6, 0x00001FBC}, {0x00001FBE, 0x00001FBE}, -{0x00001FC2, 0x00001FC4}, {0x00001FC6, 0x00001FCC}, {0x00001FD0, 0x00001FD3}, {0x00001FD6, 0x00001FDB}, -{0x00001FE0, 0x00001FEC}, {0x00001FF2, 0x00001FF4}, {0x00001FF6, 0x00001FFC}, {0x00002071, 0x00002071}, -{0x0000207F, 0x0000207F}, {0x00002090, 0x0000209C}, {0x00002102, 0x00002102}, {0x00002107, 0x00002107}, -{0x0000210A, 0x00002113}, {0x00002115, 0x00002115}, {0x00002119, 0x0000211D}, {0x00002124, 0x00002124}, -{0x00002126, 0x00002126}, {0x00002128, 0x00002128}, {0x0000212A, 0x0000212D}, {0x0000212F, 0x00002139}, -{0x0000213C, 0x0000213F}, {0x00002145, 0x00002149}, {0x0000214E, 0x0000214E}, {0x00002183, 0x00002184}, -{0x00002C00, 0x00002C2E}, {0x00002C30, 0x00002C5E}, {0x00002C60, 0x00002CE4}, {0x00002CEB, 0x00002CEE}, -{0x00002CF2, 0x00002CF3}, {0x00002D00, 0x00002D25}, {0x00002D27, 0x00002D27}, {0x00002D2D, 0x00002D2D}, -{0x00002D30, 0x00002D67}, {0x00002D6F, 0x00002D6F}, {0x00002D80, 0x00002D96}, {0x00002DA0, 0x00002DA6}, -{0x00002DA8, 0x00002DAE}, {0x00002DB0, 0x00002DB6}, {0x00002DB8, 0x00002DBE}, {0x00002DC0, 0x00002DC6}, -{0x00002DC8, 0x00002DCE}, {0x00002DD0, 0x00002DD6}, {0x00002DD8, 0x00002DDE}, {0x00002E2F, 0x00002E2F}, -{0x00003005, 0x00003006}, {0x00003031, 0x00003035}, {0x0000303B, 0x0000303C}, {0x00003041, 0x00003096}, -{0x0000309D, 0x0000309F}, {0x000030A1, 0x000030FA}, {0x000030FC, 0x000030FF}, {0x00003105, 0x0000312F}, -{0x00003131, 0x0000318E}, {0x000031A0, 0x000031BF}, {0x000031F0, 0x000031FF}, {0x00003400, 0x00004DBF}, -{0x00004E00, 0x00009FFC}, {0x0000A000, 0x0000A48C}, {0x0000A4D0, 0x0000A4FD}, {0x0000A500, 0x0000A60C}, -{0x0000A610, 0x0000A61F}, {0x0000A62A, 0x0000A62B}, {0x0000A640, 0x0000A66E}, {0x0000A67F, 0x0000A69D}, -{0x0000A6A0, 0x0000A6E5}, {0x0000A717, 0x0000A71F}, {0x0000A722, 0x0000A788}, {0x0000A78B, 0x0000A7BF}, -{0x0000A7C2, 0x0000A7CA}, {0x0000A7F5, 0x0000A801}, {0x0000A803, 0x0000A805}, {0x0000A807, 0x0000A80A}, +{0x00000860, 0x0000086A}, {0x00000870, 0x00000887}, {0x00000889, 0x0000088E}, {0x000008A0, 0x000008C9}, +{0x00000904, 0x00000939}, {0x0000093D, 0x0000093D}, {0x00000950, 0x00000950}, {0x00000958, 0x00000961}, +{0x00000971, 0x00000980}, {0x00000985, 0x0000098C}, {0x0000098F, 0x00000990}, {0x00000993, 0x000009A8}, +{0x000009AA, 0x000009B0}, {0x000009B2, 0x000009B2}, {0x000009B6, 0x000009B9}, {0x000009BD, 0x000009BD}, +{0x000009CE, 0x000009CE}, {0x000009DC, 0x000009DD}, {0x000009DF, 0x000009E1}, {0x000009F0, 0x000009F1}, +{0x000009FC, 0x000009FC}, {0x00000A05, 0x00000A0A}, {0x00000A0F, 0x00000A10}, {0x00000A13, 0x00000A28}, +{0x00000A2A, 0x00000A30}, {0x00000A32, 0x00000A33}, {0x00000A35, 0x00000A36}, {0x00000A38, 0x00000A39}, +{0x00000A59, 0x00000A5C}, {0x00000A5E, 0x00000A5E}, {0x00000A72, 0x00000A74}, {0x00000A85, 0x00000A8D}, +{0x00000A8F, 0x00000A91}, {0x00000A93, 0x00000AA8}, {0x00000AAA, 0x00000AB0}, {0x00000AB2, 0x00000AB3}, +{0x00000AB5, 0x00000AB9}, {0x00000ABD, 0x00000ABD}, {0x00000AD0, 0x00000AD0}, {0x00000AE0, 0x00000AE1}, +{0x00000AF9, 0x00000AF9}, {0x00000B05, 0x00000B0C}, {0x00000B0F, 0x00000B10}, {0x00000B13, 0x00000B28}, +{0x00000B2A, 0x00000B30}, {0x00000B32, 0x00000B33}, {0x00000B35, 0x00000B39}, {0x00000B3D, 0x00000B3D}, +{0x00000B5C, 0x00000B5D}, {0x00000B5F, 0x00000B61}, {0x00000B71, 0x00000B71}, {0x00000B83, 0x00000B83}, +{0x00000B85, 0x00000B8A}, {0x00000B8E, 0x00000B90}, {0x00000B92, 0x00000B95}, {0x00000B99, 0x00000B9A}, +{0x00000B9C, 0x00000B9C}, {0x00000B9E, 0x00000B9F}, {0x00000BA3, 0x00000BA4}, {0x00000BA8, 0x00000BAA}, +{0x00000BAE, 0x00000BB9}, {0x00000BD0, 0x00000BD0}, {0x00000C05, 0x00000C0C}, {0x00000C0E, 0x00000C10}, +{0x00000C12, 0x00000C28}, {0x00000C2A, 0x00000C39}, {0x00000C3D, 0x00000C3D}, {0x00000C58, 0x00000C5A}, +{0x00000C5D, 0x00000C5D}, {0x00000C60, 0x00000C61}, {0x00000C80, 0x00000C80}, {0x00000C85, 0x00000C8C}, +{0x00000C8E, 0x00000C90}, {0x00000C92, 0x00000CA8}, {0x00000CAA, 0x00000CB3}, {0x00000CB5, 0x00000CB9}, +{0x00000CBD, 0x00000CBD}, {0x00000CDD, 0x00000CDE}, {0x00000CE0, 0x00000CE1}, {0x00000CF1, 0x00000CF2}, +{0x00000D04, 0x00000D0C}, {0x00000D0E, 0x00000D10}, {0x00000D12, 0x00000D3A}, {0x00000D3D, 0x00000D3D}, +{0x00000D4E, 0x00000D4E}, {0x00000D54, 0x00000D56}, {0x00000D5F, 0x00000D61}, {0x00000D7A, 0x00000D7F}, +{0x00000D85, 0x00000D96}, {0x00000D9A, 0x00000DB1}, {0x00000DB3, 0x00000DBB}, {0x00000DBD, 0x00000DBD}, +{0x00000DC0, 0x00000DC6}, {0x00000E01, 0x00000E30}, {0x00000E32, 0x00000E33}, {0x00000E40, 0x00000E46}, +{0x00000E81, 0x00000E82}, {0x00000E84, 0x00000E84}, {0x00000E86, 0x00000E8A}, {0x00000E8C, 0x00000EA3}, +{0x00000EA5, 0x00000EA5}, {0x00000EA7, 0x00000EB0}, {0x00000EB2, 0x00000EB3}, {0x00000EBD, 0x00000EBD}, +{0x00000EC0, 0x00000EC4}, {0x00000EC6, 0x00000EC6}, {0x00000EDC, 0x00000EDF}, {0x00000F00, 0x00000F00}, +{0x00000F40, 0x00000F47}, {0x00000F49, 0x00000F6C}, {0x00000F88, 0x00000F8C}, {0x00001000, 0x0000102A}, +{0x0000103F, 0x0000103F}, {0x00001050, 0x00001055}, {0x0000105A, 0x0000105D}, {0x00001061, 0x00001061}, +{0x00001065, 0x00001066}, {0x0000106E, 0x00001070}, {0x00001075, 0x00001081}, {0x0000108E, 0x0000108E}, +{0x000010A0, 0x000010C5}, {0x000010C7, 0x000010C7}, {0x000010CD, 0x000010CD}, {0x000010D0, 0x000010FA}, +{0x000010FC, 0x00001248}, {0x0000124A, 0x0000124D}, {0x00001250, 0x00001256}, {0x00001258, 0x00001258}, +{0x0000125A, 0x0000125D}, {0x00001260, 0x00001288}, {0x0000128A, 0x0000128D}, {0x00001290, 0x000012B0}, +{0x000012B2, 0x000012B5}, {0x000012B8, 0x000012BE}, {0x000012C0, 0x000012C0}, {0x000012C2, 0x000012C5}, +{0x000012C8, 0x000012D6}, {0x000012D8, 0x00001310}, {0x00001312, 0x00001315}, {0x00001318, 0x0000135A}, +{0x00001380, 0x0000138F}, {0x000013A0, 0x000013F5}, {0x000013F8, 0x000013FD}, {0x00001401, 0x0000166C}, +{0x0000166F, 0x0000167F}, {0x00001681, 0x0000169A}, {0x000016A0, 0x000016EA}, {0x000016F1, 0x000016F8}, +{0x00001700, 0x00001711}, {0x0000171F, 0x00001731}, {0x00001740, 0x00001751}, {0x00001760, 0x0000176C}, +{0x0000176E, 0x00001770}, {0x00001780, 0x000017B3}, {0x000017D7, 0x000017D7}, {0x000017DC, 0x000017DC}, +{0x00001820, 0x00001878}, {0x00001880, 0x00001884}, {0x00001887, 0x000018A8}, {0x000018AA, 0x000018AA}, +{0x000018B0, 0x000018F5}, {0x00001900, 0x0000191E}, {0x00001950, 0x0000196D}, {0x00001970, 0x00001974}, +{0x00001980, 0x000019AB}, {0x000019B0, 0x000019C9}, {0x00001A00, 0x00001A16}, {0x00001A20, 0x00001A54}, +{0x00001AA7, 0x00001AA7}, {0x00001B05, 0x00001B33}, {0x00001B45, 0x00001B4C}, {0x00001B83, 0x00001BA0}, +{0x00001BAE, 0x00001BAF}, {0x00001BBA, 0x00001BE5}, {0x00001C00, 0x00001C23}, {0x00001C4D, 0x00001C4F}, +{0x00001C5A, 0x00001C7D}, {0x00001C80, 0x00001C88}, {0x00001C90, 0x00001CBA}, {0x00001CBD, 0x00001CBF}, +{0x00001CE9, 0x00001CEC}, {0x00001CEE, 0x00001CF3}, {0x00001CF5, 0x00001CF6}, {0x00001CFA, 0x00001CFA}, +{0x00001D00, 0x00001DBF}, {0x00001E00, 0x00001F15}, {0x00001F18, 0x00001F1D}, {0x00001F20, 0x00001F45}, +{0x00001F48, 0x00001F4D}, {0x00001F50, 0x00001F57}, {0x00001F59, 0x00001F59}, {0x00001F5B, 0x00001F5B}, +{0x00001F5D, 0x00001F5D}, {0x00001F5F, 0x00001F7D}, {0x00001F80, 0x00001FB4}, {0x00001FB6, 0x00001FBC}, +{0x00001FBE, 0x00001FBE}, {0x00001FC2, 0x00001FC4}, {0x00001FC6, 0x00001FCC}, {0x00001FD0, 0x00001FD3}, +{0x00001FD6, 0x00001FDB}, {0x00001FE0, 0x00001FEC}, {0x00001FF2, 0x00001FF4}, {0x00001FF6, 0x00001FFC}, +{0x00002071, 0x00002071}, {0x0000207F, 0x0000207F}, {0x00002090, 0x0000209C}, {0x00002102, 0x00002102}, +{0x00002107, 0x00002107}, {0x0000210A, 0x00002113}, {0x00002115, 0x00002115}, {0x00002119, 0x0000211D}, +{0x00002124, 0x00002124}, {0x00002126, 0x00002126}, {0x00002128, 0x00002128}, {0x0000212A, 0x0000212D}, +{0x0000212F, 0x00002139}, {0x0000213C, 0x0000213F}, {0x00002145, 0x00002149}, {0x0000214E, 0x0000214E}, +{0x00002183, 0x00002184}, {0x00002C00, 0x00002CE4}, {0x00002CEB, 0x00002CEE}, {0x00002CF2, 0x00002CF3}, +{0x00002D00, 0x00002D25}, {0x00002D27, 0x00002D27}, {0x00002D2D, 0x00002D2D}, {0x00002D30, 0x00002D67}, +{0x00002D6F, 0x00002D6F}, {0x00002D80, 0x00002D96}, {0x00002DA0, 0x00002DA6}, {0x00002DA8, 0x00002DAE}, +{0x00002DB0, 0x00002DB6}, {0x00002DB8, 0x00002DBE}, {0x00002DC0, 0x00002DC6}, {0x00002DC8, 0x00002DCE}, +{0x00002DD0, 0x00002DD6}, {0x00002DD8, 0x00002DDE}, {0x00002E2F, 0x00002E2F}, {0x00003005, 0x00003006}, +{0x00003031, 0x00003035}, {0x0000303B, 0x0000303C}, {0x00003041, 0x00003096}, {0x0000309D, 0x0000309F}, +{0x000030A1, 0x000030FA}, {0x000030FC, 0x000030FF}, {0x00003105, 0x0000312F}, {0x00003131, 0x0000318E}, +{0x000031A0, 0x000031BF}, {0x000031F0, 0x000031FF}, {0x00003400, 0x00004DBF}, {0x00004E00, 0x0000A48C}, +{0x0000A4D0, 0x0000A4FD}, {0x0000A500, 0x0000A60C}, {0x0000A610, 0x0000A61F}, {0x0000A62A, 0x0000A62B}, +{0x0000A640, 0x0000A66E}, {0x0000A67F, 0x0000A69D}, {0x0000A6A0, 0x0000A6E5}, {0x0000A717, 0x0000A71F}, +{0x0000A722, 0x0000A788}, {0x0000A78B, 0x0000A7CA}, {0x0000A7D0, 0x0000A7D1}, {0x0000A7D3, 0x0000A7D3}, +{0x0000A7D5, 0x0000A7D9}, {0x0000A7F2, 0x0000A801}, {0x0000A803, 0x0000A805}, {0x0000A807, 0x0000A80A}, {0x0000A80C, 0x0000A822}, {0x0000A840, 0x0000A873}, {0x0000A882, 0x0000A8B3}, {0x0000A8F2, 0x0000A8F7}, {0x0000A8FB, 0x0000A8FB}, {0x0000A8FD, 0x0000A8FE}, {0x0000A90A, 0x0000A925}, {0x0000A930, 0x0000A946}, {0x0000A960, 0x0000A97C}, {0x0000A984, 0x0000A9B2}, {0x0000A9CF, 0x0000A9CF}, {0x0000A9E0, 0x0000A9E4}, @@ -129,51 +149,60 @@ const std::vector> unicode_ranges_letter = { {0x000102A0, 0x000102D0}, {0x00010300, 0x0001031F}, {0x0001032D, 0x00010340}, {0x00010342, 0x00010349}, {0x00010350, 0x00010375}, {0x00010380, 0x0001039D}, {0x000103A0, 0x000103C3}, {0x000103C8, 0x000103CF}, {0x00010400, 0x0001049D}, {0x000104B0, 0x000104D3}, {0x000104D8, 0x000104FB}, {0x00010500, 0x00010527}, -{0x00010530, 0x00010563}, {0x00010600, 0x00010736}, {0x00010740, 0x00010755}, {0x00010760, 0x00010767}, -{0x00010800, 0x00010805}, {0x00010808, 0x00010808}, {0x0001080A, 0x00010835}, {0x00010837, 0x00010838}, -{0x0001083C, 0x0001083C}, {0x0001083F, 0x00010855}, {0x00010860, 0x00010876}, {0x00010880, 0x0001089E}, -{0x000108E0, 0x000108F2}, {0x000108F4, 0x000108F5}, {0x00010900, 0x00010915}, {0x00010920, 0x00010939}, -{0x00010980, 0x000109B7}, {0x000109BE, 0x000109BF}, {0x00010A00, 0x00010A00}, {0x00010A10, 0x00010A13}, -{0x00010A15, 0x00010A17}, {0x00010A19, 0x00010A35}, {0x00010A60, 0x00010A7C}, {0x00010A80, 0x00010A9C}, -{0x00010AC0, 0x00010AC7}, {0x00010AC9, 0x00010AE4}, {0x00010B00, 0x00010B35}, {0x00010B40, 0x00010B55}, -{0x00010B60, 0x00010B72}, {0x00010B80, 0x00010B91}, {0x00010C00, 0x00010C48}, {0x00010C80, 0x00010CB2}, -{0x00010CC0, 0x00010CF2}, {0x00010D00, 0x00010D23}, {0x00010E80, 0x00010EA9}, {0x00010EB0, 0x00010EB1}, -{0x00010F00, 0x00010F1C}, {0x00010F27, 0x00010F27}, {0x00010F30, 0x00010F45}, {0x00010FB0, 0x00010FC4}, -{0x00010FE0, 0x00010FF6}, {0x00011003, 0x00011037}, {0x00011083, 0x000110AF}, {0x000110D0, 0x000110E8}, -{0x00011103, 0x00011126}, {0x00011144, 0x00011144}, {0x00011147, 0x00011147}, {0x00011150, 0x00011172}, -{0x00011176, 0x00011176}, {0x00011183, 0x000111B2}, {0x000111C1, 0x000111C4}, {0x000111DA, 0x000111DA}, -{0x000111DC, 0x000111DC}, {0x00011200, 0x00011211}, {0x00011213, 0x0001122B}, {0x00011280, 0x00011286}, -{0x00011288, 0x00011288}, {0x0001128A, 0x0001128D}, {0x0001128F, 0x0001129D}, {0x0001129F, 0x000112A8}, -{0x000112B0, 0x000112DE}, {0x00011305, 0x0001130C}, {0x0001130F, 0x00011310}, {0x00011313, 0x00011328}, -{0x0001132A, 0x00011330}, {0x00011332, 0x00011333}, {0x00011335, 0x00011339}, {0x0001133D, 0x0001133D}, -{0x00011350, 0x00011350}, {0x0001135D, 0x00011361}, {0x00011400, 0x00011434}, {0x00011447, 0x0001144A}, -{0x0001145F, 0x00011461}, {0x00011480, 0x000114AF}, {0x000114C4, 0x000114C5}, {0x000114C7, 0x000114C7}, -{0x00011580, 0x000115AE}, {0x000115D8, 0x000115DB}, {0x00011600, 0x0001162F}, {0x00011644, 0x00011644}, -{0x00011680, 0x000116AA}, {0x000116B8, 0x000116B8}, {0x00011700, 0x0001171A}, {0x00011800, 0x0001182B}, +{0x00010530, 0x00010563}, {0x00010570, 0x0001057A}, {0x0001057C, 0x0001058A}, {0x0001058C, 0x00010592}, +{0x00010594, 0x00010595}, {0x00010597, 0x000105A1}, {0x000105A3, 0x000105B1}, {0x000105B3, 0x000105B9}, +{0x000105BB, 0x000105BC}, {0x00010600, 0x00010736}, {0x00010740, 0x00010755}, {0x00010760, 0x00010767}, +{0x00010780, 0x00010785}, {0x00010787, 0x000107B0}, {0x000107B2, 0x000107BA}, {0x00010800, 0x00010805}, +{0x00010808, 0x00010808}, {0x0001080A, 0x00010835}, {0x00010837, 0x00010838}, {0x0001083C, 0x0001083C}, +{0x0001083F, 0x00010855}, {0x00010860, 0x00010876}, {0x00010880, 0x0001089E}, {0x000108E0, 0x000108F2}, +{0x000108F4, 0x000108F5}, {0x00010900, 0x00010915}, {0x00010920, 0x00010939}, {0x00010980, 0x000109B7}, +{0x000109BE, 0x000109BF}, {0x00010A00, 0x00010A00}, {0x00010A10, 0x00010A13}, {0x00010A15, 0x00010A17}, +{0x00010A19, 0x00010A35}, {0x00010A60, 0x00010A7C}, {0x00010A80, 0x00010A9C}, {0x00010AC0, 0x00010AC7}, +{0x00010AC9, 0x00010AE4}, {0x00010B00, 0x00010B35}, {0x00010B40, 0x00010B55}, {0x00010B60, 0x00010B72}, +{0x00010B80, 0x00010B91}, {0x00010C00, 0x00010C48}, {0x00010C80, 0x00010CB2}, {0x00010CC0, 0x00010CF2}, +{0x00010D00, 0x00010D23}, {0x00010E80, 0x00010EA9}, {0x00010EB0, 0x00010EB1}, {0x00010F00, 0x00010F1C}, +{0x00010F27, 0x00010F27}, {0x00010F30, 0x00010F45}, {0x00010F70, 0x00010F81}, {0x00010FB0, 0x00010FC4}, +{0x00010FE0, 0x00010FF6}, {0x00011003, 0x00011037}, {0x00011071, 0x00011072}, {0x00011075, 0x00011075}, +{0x00011083, 0x000110AF}, {0x000110D0, 0x000110E8}, {0x00011103, 0x00011126}, {0x00011144, 0x00011144}, +{0x00011147, 0x00011147}, {0x00011150, 0x00011172}, {0x00011176, 0x00011176}, {0x00011183, 0x000111B2}, +{0x000111C1, 0x000111C4}, {0x000111DA, 0x000111DA}, {0x000111DC, 0x000111DC}, {0x00011200, 0x00011211}, +{0x00011213, 0x0001122B}, {0x0001123F, 0x00011240}, {0x00011280, 0x00011286}, {0x00011288, 0x00011288}, +{0x0001128A, 0x0001128D}, {0x0001128F, 0x0001129D}, {0x0001129F, 0x000112A8}, {0x000112B0, 0x000112DE}, +{0x00011305, 0x0001130C}, {0x0001130F, 0x00011310}, {0x00011313, 0x00011328}, {0x0001132A, 0x00011330}, +{0x00011332, 0x00011333}, {0x00011335, 0x00011339}, {0x0001133D, 0x0001133D}, {0x00011350, 0x00011350}, +{0x0001135D, 0x00011361}, {0x00011400, 0x00011434}, {0x00011447, 0x0001144A}, {0x0001145F, 0x00011461}, +{0x00011480, 0x000114AF}, {0x000114C4, 0x000114C5}, {0x000114C7, 0x000114C7}, {0x00011580, 0x000115AE}, +{0x000115D8, 0x000115DB}, {0x00011600, 0x0001162F}, {0x00011644, 0x00011644}, {0x00011680, 0x000116AA}, +{0x000116B8, 0x000116B8}, {0x00011700, 0x0001171A}, {0x00011740, 0x00011746}, {0x00011800, 0x0001182B}, {0x000118A0, 0x000118DF}, {0x000118FF, 0x00011906}, {0x00011909, 0x00011909}, {0x0001190C, 0x00011913}, {0x00011915, 0x00011916}, {0x00011918, 0x0001192F}, {0x0001193F, 0x0001193F}, {0x00011941, 0x00011941}, {0x000119A0, 0x000119A7}, {0x000119AA, 0x000119D0}, {0x000119E1, 0x000119E1}, {0x000119E3, 0x000119E3}, {0x00011A00, 0x00011A00}, {0x00011A0B, 0x00011A32}, {0x00011A3A, 0x00011A3A}, {0x00011A50, 0x00011A50}, -{0x00011A5C, 0x00011A89}, {0x00011A9D, 0x00011A9D}, {0x00011AC0, 0x00011AF8}, {0x00011C00, 0x00011C08}, +{0x00011A5C, 0x00011A89}, {0x00011A9D, 0x00011A9D}, {0x00011AB0, 0x00011AF8}, {0x00011C00, 0x00011C08}, {0x00011C0A, 0x00011C2E}, {0x00011C40, 0x00011C40}, {0x00011C72, 0x00011C8F}, {0x00011D00, 0x00011D06}, {0x00011D08, 0x00011D09}, {0x00011D0B, 0x00011D30}, {0x00011D46, 0x00011D46}, {0x00011D60, 0x00011D65}, {0x00011D67, 0x00011D68}, {0x00011D6A, 0x00011D89}, {0x00011D98, 0x00011D98}, {0x00011EE0, 0x00011EF2}, -{0x00011FB0, 0x00011FB0}, {0x00012000, 0x00012399}, {0x00012480, 0x00012543}, {0x00013000, 0x0001342E}, -{0x00014400, 0x00014646}, {0x00016800, 0x00016A38}, {0x00016A40, 0x00016A5E}, {0x00016AD0, 0x00016AED}, -{0x00016B00, 0x00016B2F}, {0x00016B40, 0x00016B43}, {0x00016B63, 0x00016B77}, {0x00016B7D, 0x00016B8F}, -{0x00016E40, 0x00016E7F}, {0x00016F00, 0x00016F4A}, {0x00016F50, 0x00016F50}, {0x00016F93, 0x00016F9F}, -{0x00016FE0, 0x00016FE1}, {0x00016FE3, 0x00016FE3}, {0x00017000, 0x000187F7}, {0x00018800, 0x00018CD5}, -{0x00018D00, 0x00018D08}, {0x0001B000, 0x0001B11E}, {0x0001B150, 0x0001B152}, {0x0001B164, 0x0001B167}, -{0x0001B170, 0x0001B2FB}, {0x0001BC00, 0x0001BC6A}, {0x0001BC70, 0x0001BC7C}, {0x0001BC80, 0x0001BC88}, -{0x0001BC90, 0x0001BC99}, {0x0001D400, 0x0001D454}, {0x0001D456, 0x0001D49C}, {0x0001D49E, 0x0001D49F}, -{0x0001D4A2, 0x0001D4A2}, {0x0001D4A5, 0x0001D4A6}, {0x0001D4A9, 0x0001D4AC}, {0x0001D4AE, 0x0001D4B9}, -{0x0001D4BB, 0x0001D4BB}, {0x0001D4BD, 0x0001D4C3}, {0x0001D4C5, 0x0001D505}, {0x0001D507, 0x0001D50A}, -{0x0001D50D, 0x0001D514}, {0x0001D516, 0x0001D51C}, {0x0001D51E, 0x0001D539}, {0x0001D53B, 0x0001D53E}, -{0x0001D540, 0x0001D544}, {0x0001D546, 0x0001D546}, {0x0001D54A, 0x0001D550}, {0x0001D552, 0x0001D6A5}, -{0x0001D6A8, 0x0001D6C0}, {0x0001D6C2, 0x0001D6DA}, {0x0001D6DC, 0x0001D6FA}, {0x0001D6FC, 0x0001D714}, -{0x0001D716, 0x0001D734}, {0x0001D736, 0x0001D74E}, {0x0001D750, 0x0001D76E}, {0x0001D770, 0x0001D788}, -{0x0001D78A, 0x0001D7A8}, {0x0001D7AA, 0x0001D7C2}, {0x0001D7C4, 0x0001D7CB}, {0x0001E100, 0x0001E12C}, -{0x0001E137, 0x0001E13D}, {0x0001E14E, 0x0001E14E}, {0x0001E2C0, 0x0001E2EB}, {0x0001E800, 0x0001E8C4}, +{0x00011F02, 0x00011F02}, {0x00011F04, 0x00011F10}, {0x00011F12, 0x00011F33}, {0x00011FB0, 0x00011FB0}, +{0x00012000, 0x00012399}, {0x00012480, 0x00012543}, {0x00012F90, 0x00012FF0}, {0x00013000, 0x0001342F}, +{0x00013441, 0x00013446}, {0x00014400, 0x00014646}, {0x00016800, 0x00016A38}, {0x00016A40, 0x00016A5E}, +{0x00016A70, 0x00016ABE}, {0x00016AD0, 0x00016AED}, {0x00016B00, 0x00016B2F}, {0x00016B40, 0x00016B43}, +{0x00016B63, 0x00016B77}, {0x00016B7D, 0x00016B8F}, {0x00016E40, 0x00016E7F}, {0x00016F00, 0x00016F4A}, +{0x00016F50, 0x00016F50}, {0x00016F93, 0x00016F9F}, {0x00016FE0, 0x00016FE1}, {0x00016FE3, 0x00016FE3}, +{0x00017000, 0x000187F7}, {0x00018800, 0x00018CD5}, {0x00018D00, 0x00018D08}, {0x0001AFF0, 0x0001AFF3}, +{0x0001AFF5, 0x0001AFFB}, {0x0001AFFD, 0x0001AFFE}, {0x0001B000, 0x0001B122}, {0x0001B132, 0x0001B132}, +{0x0001B150, 0x0001B152}, {0x0001B155, 0x0001B155}, {0x0001B164, 0x0001B167}, {0x0001B170, 0x0001B2FB}, +{0x0001BC00, 0x0001BC6A}, {0x0001BC70, 0x0001BC7C}, {0x0001BC80, 0x0001BC88}, {0x0001BC90, 0x0001BC99}, +{0x0001D400, 0x0001D454}, {0x0001D456, 0x0001D49C}, {0x0001D49E, 0x0001D49F}, {0x0001D4A2, 0x0001D4A2}, +{0x0001D4A5, 0x0001D4A6}, {0x0001D4A9, 0x0001D4AC}, {0x0001D4AE, 0x0001D4B9}, {0x0001D4BB, 0x0001D4BB}, +{0x0001D4BD, 0x0001D4C3}, {0x0001D4C5, 0x0001D505}, {0x0001D507, 0x0001D50A}, {0x0001D50D, 0x0001D514}, +{0x0001D516, 0x0001D51C}, {0x0001D51E, 0x0001D539}, {0x0001D53B, 0x0001D53E}, {0x0001D540, 0x0001D544}, +{0x0001D546, 0x0001D546}, {0x0001D54A, 0x0001D550}, {0x0001D552, 0x0001D6A5}, {0x0001D6A8, 0x0001D6C0}, +{0x0001D6C2, 0x0001D6DA}, {0x0001D6DC, 0x0001D6FA}, {0x0001D6FC, 0x0001D714}, {0x0001D716, 0x0001D734}, +{0x0001D736, 0x0001D74E}, {0x0001D750, 0x0001D76E}, {0x0001D770, 0x0001D788}, {0x0001D78A, 0x0001D7A8}, +{0x0001D7AA, 0x0001D7C2}, {0x0001D7C4, 0x0001D7CB}, {0x0001DF00, 0x0001DF1E}, {0x0001DF25, 0x0001DF2A}, +{0x0001E030, 0x0001E06D}, {0x0001E100, 0x0001E12C}, {0x0001E137, 0x0001E13D}, {0x0001E14E, 0x0001E14E}, +{0x0001E290, 0x0001E2AD}, {0x0001E2C0, 0x0001E2EB}, {0x0001E4D0, 0x0001E4EB}, {0x0001E7E0, 0x0001E7E6}, +{0x0001E7E8, 0x0001E7EB}, {0x0001E7ED, 0x0001E7EE}, {0x0001E7F0, 0x0001E7FE}, {0x0001E800, 0x0001E8C4}, {0x0001E900, 0x0001E943}, {0x0001E94B, 0x0001E94B}, {0x0001EE00, 0x0001EE03}, {0x0001EE05, 0x0001EE1F}, {0x0001EE21, 0x0001EE22}, {0x0001EE24, 0x0001EE24}, {0x0001EE27, 0x0001EE27}, {0x0001EE29, 0x0001EE32}, {0x0001EE34, 0x0001EE37}, {0x0001EE39, 0x0001EE39}, {0x0001EE3B, 0x0001EE3B}, {0x0001EE42, 0x0001EE42}, @@ -182,15 +211,14 @@ const std::vector> unicode_ranges_letter = { {0x0001EE5B, 0x0001EE5B}, {0x0001EE5D, 0x0001EE5D}, {0x0001EE5F, 0x0001EE5F}, {0x0001EE61, 0x0001EE62}, {0x0001EE64, 0x0001EE64}, {0x0001EE67, 0x0001EE6A}, {0x0001EE6C, 0x0001EE72}, {0x0001EE74, 0x0001EE77}, {0x0001EE79, 0x0001EE7C}, {0x0001EE7E, 0x0001EE7E}, {0x0001EE80, 0x0001EE89}, {0x0001EE8B, 0x0001EE9B}, -{0x0001EEA1, 0x0001EEA3}, {0x0001EEA5, 0x0001EEA9}, {0x0001EEAB, 0x0001EEBB}, {0x00020000, 0x0002A6DD}, -{0x0002A700, 0x0002B734}, {0x0002B740, 0x0002B81D}, {0x0002B820, 0x0002CEA1}, {0x0002CEB0, 0x0002EBE0}, -{0x0002F800, 0x0002FA1D}, {0x00030000, 0x0003134A}, +{0x0001EEA1, 0x0001EEA3}, {0x0001EEA5, 0x0001EEA9}, {0x0001EEAB, 0x0001EEBB}, {0x00020000, 0x0002A6DF}, +{0x0002A700, 0x0002B739}, {0x0002B740, 0x0002B81D}, {0x0002B820, 0x0002CEA1}, {0x0002CEB0, 0x0002EBE0}, +{0x0002F800, 0x0002FA1D}, {0x00030000, 0x0003134A}, {0x00031350, 0x000323AF}, }; const std::vector> unicode_ranges_whitespace = { -{0x00000009, 0x0000000D}, {0x0000001C, 0x00000020}, {0x00000085, 0x00000085}, {0x000000A0, 0x000000A0}, -{0x00001680, 0x00001680}, {0x00002000, 0x0000200A}, {0x00002028, 0x00002029}, {0x0000202F, 0x0000202F}, -{0x0000205F, 0x0000205F}, {0x00003000, 0x00003000}, +{0x00000020, 0x00000020}, {0x000000A0, 0x000000A0}, {0x00001680, 0x00001680}, {0x00002000, 0x0000200A}, +{0x00002028, 0x00002029}, {0x0000202F, 0x0000202F}, {0x0000205F, 0x0000205F}, {0x00003000, 0x00003000}, }; const std::vector> unicode_ranges_accent_mark = { @@ -200,72 +228,77 @@ const std::vector> unicode_ranges_accent_mark = { {0x000006E7, 0x000006E8}, {0x000006EA, 0x000006ED}, {0x00000711, 0x00000711}, {0x00000730, 0x0000074A}, {0x000007A6, 0x000007B0}, {0x000007EB, 0x000007F3}, {0x000007FD, 0x000007FD}, {0x00000816, 0x00000819}, {0x0000081B, 0x00000823}, {0x00000825, 0x00000827}, {0x00000829, 0x0000082D}, {0x00000859, 0x0000085B}, -{0x000008D3, 0x000008E1}, {0x000008E3, 0x00000903}, {0x0000093A, 0x0000093C}, {0x0000093E, 0x0000094F}, -{0x00000951, 0x00000957}, {0x00000962, 0x00000963}, {0x00000981, 0x00000983}, {0x000009BC, 0x000009BC}, -{0x000009BE, 0x000009C4}, {0x000009C7, 0x000009C8}, {0x000009CB, 0x000009CD}, {0x000009D7, 0x000009D7}, -{0x000009E2, 0x000009E3}, {0x000009FE, 0x000009FE}, {0x00000A01, 0x00000A03}, {0x00000A3C, 0x00000A3C}, -{0x00000A3E, 0x00000A42}, {0x00000A47, 0x00000A48}, {0x00000A4B, 0x00000A4D}, {0x00000A51, 0x00000A51}, -{0x00000A70, 0x00000A71}, {0x00000A75, 0x00000A75}, {0x00000A81, 0x00000A83}, {0x00000ABC, 0x00000ABC}, -{0x00000ABE, 0x00000AC5}, {0x00000AC7, 0x00000AC9}, {0x00000ACB, 0x00000ACD}, {0x00000AE2, 0x00000AE3}, -{0x00000AFA, 0x00000AFF}, {0x00000B01, 0x00000B03}, {0x00000B3C, 0x00000B3C}, {0x00000B3E, 0x00000B44}, -{0x00000B47, 0x00000B48}, {0x00000B4B, 0x00000B4D}, {0x00000B55, 0x00000B57}, {0x00000B62, 0x00000B63}, -{0x00000B82, 0x00000B82}, {0x00000BBE, 0x00000BC2}, {0x00000BC6, 0x00000BC8}, {0x00000BCA, 0x00000BCD}, -{0x00000BD7, 0x00000BD7}, {0x00000C00, 0x00000C04}, {0x00000C3E, 0x00000C44}, {0x00000C46, 0x00000C48}, -{0x00000C4A, 0x00000C4D}, {0x00000C55, 0x00000C56}, {0x00000C62, 0x00000C63}, {0x00000C81, 0x00000C83}, -{0x00000CBC, 0x00000CBC}, {0x00000CBE, 0x00000CC4}, {0x00000CC6, 0x00000CC8}, {0x00000CCA, 0x00000CCD}, -{0x00000CD5, 0x00000CD6}, {0x00000CE2, 0x00000CE3}, {0x00000D00, 0x00000D03}, {0x00000D3B, 0x00000D3C}, -{0x00000D3E, 0x00000D44}, {0x00000D46, 0x00000D48}, {0x00000D4A, 0x00000D4D}, {0x00000D57, 0x00000D57}, -{0x00000D62, 0x00000D63}, {0x00000D81, 0x00000D83}, {0x00000DCA, 0x00000DCA}, {0x00000DCF, 0x00000DD4}, -{0x00000DD6, 0x00000DD6}, {0x00000DD8, 0x00000DDF}, {0x00000DF2, 0x00000DF3}, {0x00000E31, 0x00000E31}, -{0x00000E34, 0x00000E3A}, {0x00000E47, 0x00000E4E}, {0x00000EB1, 0x00000EB1}, {0x00000EB4, 0x00000EBC}, -{0x00000EC8, 0x00000ECD}, {0x00000F18, 0x00000F19}, {0x00000F35, 0x00000F35}, {0x00000F37, 0x00000F37}, -{0x00000F39, 0x00000F39}, {0x00000F3E, 0x00000F3F}, {0x00000F71, 0x00000F84}, {0x00000F86, 0x00000F87}, -{0x00000F8D, 0x00000F97}, {0x00000F99, 0x00000FBC}, {0x00000FC6, 0x00000FC6}, {0x0000102B, 0x0000103E}, -{0x00001056, 0x00001059}, {0x0000105E, 0x00001060}, {0x00001062, 0x00001064}, {0x00001067, 0x0000106D}, -{0x00001071, 0x00001074}, {0x00001082, 0x0000108D}, {0x0000108F, 0x0000108F}, {0x0000109A, 0x0000109D}, -{0x0000135D, 0x0000135F}, {0x00001712, 0x00001714}, {0x00001732, 0x00001734}, {0x00001752, 0x00001753}, -{0x00001772, 0x00001773}, {0x000017B4, 0x000017D3}, {0x000017DD, 0x000017DD}, {0x0000180B, 0x0000180D}, +{0x00000898, 0x0000089F}, {0x000008CA, 0x000008E1}, {0x000008E3, 0x00000903}, {0x0000093A, 0x0000093C}, +{0x0000093E, 0x0000094F}, {0x00000951, 0x00000957}, {0x00000962, 0x00000963}, {0x00000981, 0x00000983}, +{0x000009BC, 0x000009BC}, {0x000009BE, 0x000009C4}, {0x000009C7, 0x000009C8}, {0x000009CB, 0x000009CD}, +{0x000009D7, 0x000009D7}, {0x000009E2, 0x000009E3}, {0x000009FE, 0x000009FE}, {0x00000A01, 0x00000A03}, +{0x00000A3C, 0x00000A3C}, {0x00000A3E, 0x00000A42}, {0x00000A47, 0x00000A48}, {0x00000A4B, 0x00000A4D}, +{0x00000A51, 0x00000A51}, {0x00000A70, 0x00000A71}, {0x00000A75, 0x00000A75}, {0x00000A81, 0x00000A83}, +{0x00000ABC, 0x00000ABC}, {0x00000ABE, 0x00000AC5}, {0x00000AC7, 0x00000AC9}, {0x00000ACB, 0x00000ACD}, +{0x00000AE2, 0x00000AE3}, {0x00000AFA, 0x00000AFF}, {0x00000B01, 0x00000B03}, {0x00000B3C, 0x00000B3C}, +{0x00000B3E, 0x00000B44}, {0x00000B47, 0x00000B48}, {0x00000B4B, 0x00000B4D}, {0x00000B55, 0x00000B57}, +{0x00000B62, 0x00000B63}, {0x00000B82, 0x00000B82}, {0x00000BBE, 0x00000BC2}, {0x00000BC6, 0x00000BC8}, +{0x00000BCA, 0x00000BCD}, {0x00000BD7, 0x00000BD7}, {0x00000C00, 0x00000C04}, {0x00000C3C, 0x00000C3C}, +{0x00000C3E, 0x00000C44}, {0x00000C46, 0x00000C48}, {0x00000C4A, 0x00000C4D}, {0x00000C55, 0x00000C56}, +{0x00000C62, 0x00000C63}, {0x00000C81, 0x00000C83}, {0x00000CBC, 0x00000CBC}, {0x00000CBE, 0x00000CC4}, +{0x00000CC6, 0x00000CC8}, {0x00000CCA, 0x00000CCD}, {0x00000CD5, 0x00000CD6}, {0x00000CE2, 0x00000CE3}, +{0x00000CF3, 0x00000CF3}, {0x00000D00, 0x00000D03}, {0x00000D3B, 0x00000D3C}, {0x00000D3E, 0x00000D44}, +{0x00000D46, 0x00000D48}, {0x00000D4A, 0x00000D4D}, {0x00000D57, 0x00000D57}, {0x00000D62, 0x00000D63}, +{0x00000D81, 0x00000D83}, {0x00000DCA, 0x00000DCA}, {0x00000DCF, 0x00000DD4}, {0x00000DD6, 0x00000DD6}, +{0x00000DD8, 0x00000DDF}, {0x00000DF2, 0x00000DF3}, {0x00000E31, 0x00000E31}, {0x00000E34, 0x00000E3A}, +{0x00000E47, 0x00000E4E}, {0x00000EB1, 0x00000EB1}, {0x00000EB4, 0x00000EBC}, {0x00000EC8, 0x00000ECE}, +{0x00000F18, 0x00000F19}, {0x00000F35, 0x00000F35}, {0x00000F37, 0x00000F37}, {0x00000F39, 0x00000F39}, +{0x00000F3E, 0x00000F3F}, {0x00000F71, 0x00000F84}, {0x00000F86, 0x00000F87}, {0x00000F8D, 0x00000F97}, +{0x00000F99, 0x00000FBC}, {0x00000FC6, 0x00000FC6}, {0x0000102B, 0x0000103E}, {0x00001056, 0x00001059}, +{0x0000105E, 0x00001060}, {0x00001062, 0x00001064}, {0x00001067, 0x0000106D}, {0x00001071, 0x00001074}, +{0x00001082, 0x0000108D}, {0x0000108F, 0x0000108F}, {0x0000109A, 0x0000109D}, {0x0000135D, 0x0000135F}, +{0x00001712, 0x00001715}, {0x00001732, 0x00001734}, {0x00001752, 0x00001753}, {0x00001772, 0x00001773}, +{0x000017B4, 0x000017D3}, {0x000017DD, 0x000017DD}, {0x0000180B, 0x0000180D}, {0x0000180F, 0x0000180F}, {0x00001885, 0x00001886}, {0x000018A9, 0x000018A9}, {0x00001920, 0x0000192B}, {0x00001930, 0x0000193B}, {0x00001A17, 0x00001A1B}, {0x00001A55, 0x00001A5E}, {0x00001A60, 0x00001A7C}, {0x00001A7F, 0x00001A7F}, -{0x00001AB0, 0x00001AC0}, {0x00001B00, 0x00001B04}, {0x00001B34, 0x00001B44}, {0x00001B6B, 0x00001B73}, +{0x00001AB0, 0x00001ACE}, {0x00001B00, 0x00001B04}, {0x00001B34, 0x00001B44}, {0x00001B6B, 0x00001B73}, {0x00001B80, 0x00001B82}, {0x00001BA1, 0x00001BAD}, {0x00001BE6, 0x00001BF3}, {0x00001C24, 0x00001C37}, {0x00001CD0, 0x00001CD2}, {0x00001CD4, 0x00001CE8}, {0x00001CED, 0x00001CED}, {0x00001CF4, 0x00001CF4}, -{0x00001CF7, 0x00001CF9}, {0x00001DC0, 0x00001DF9}, {0x00001DFB, 0x00001DFF}, {0x000020D0, 0x000020F0}, -{0x00002CEF, 0x00002CF1}, {0x00002D7F, 0x00002D7F}, {0x00002DE0, 0x00002DFF}, {0x0000302A, 0x0000302F}, -{0x00003099, 0x0000309A}, {0x0000A66F, 0x0000A672}, {0x0000A674, 0x0000A67D}, {0x0000A69E, 0x0000A69F}, -{0x0000A6F0, 0x0000A6F1}, {0x0000A802, 0x0000A802}, {0x0000A806, 0x0000A806}, {0x0000A80B, 0x0000A80B}, -{0x0000A823, 0x0000A827}, {0x0000A82C, 0x0000A82C}, {0x0000A880, 0x0000A881}, {0x0000A8B4, 0x0000A8C5}, -{0x0000A8E0, 0x0000A8F1}, {0x0000A8FF, 0x0000A8FF}, {0x0000A926, 0x0000A92D}, {0x0000A947, 0x0000A953}, -{0x0000A980, 0x0000A983}, {0x0000A9B3, 0x0000A9C0}, {0x0000A9E5, 0x0000A9E5}, {0x0000AA29, 0x0000AA36}, -{0x0000AA43, 0x0000AA43}, {0x0000AA4C, 0x0000AA4D}, {0x0000AA7B, 0x0000AA7D}, {0x0000AAB0, 0x0000AAB0}, -{0x0000AAB2, 0x0000AAB4}, {0x0000AAB7, 0x0000AAB8}, {0x0000AABE, 0x0000AABF}, {0x0000AAC1, 0x0000AAC1}, -{0x0000AAEB, 0x0000AAEF}, {0x0000AAF5, 0x0000AAF6}, {0x0000ABE3, 0x0000ABEA}, {0x0000ABEC, 0x0000ABED}, -{0x0000FB1E, 0x0000FB1E}, {0x0000FE00, 0x0000FE0F}, {0x0000FE20, 0x0000FE2F}, {0x000101FD, 0x000101FD}, -{0x000102E0, 0x000102E0}, {0x00010376, 0x0001037A}, {0x00010A01, 0x00010A03}, {0x00010A05, 0x00010A06}, -{0x00010A0C, 0x00010A0F}, {0x00010A38, 0x00010A3A}, {0x00010A3F, 0x00010A3F}, {0x00010AE5, 0x00010AE6}, -{0x00010D24, 0x00010D27}, {0x00010EAB, 0x00010EAC}, {0x00010F46, 0x00010F50}, {0x00011000, 0x00011002}, -{0x00011038, 0x00011046}, {0x0001107F, 0x00011082}, {0x000110B0, 0x000110BA}, {0x00011100, 0x00011102}, +{0x00001CF7, 0x00001CF9}, {0x00001DC0, 0x00001DFF}, {0x000020D0, 0x000020F0}, {0x00002CEF, 0x00002CF1}, +{0x00002D7F, 0x00002D7F}, {0x00002DE0, 0x00002DFF}, {0x0000302A, 0x0000302F}, {0x00003099, 0x0000309A}, +{0x0000A66F, 0x0000A672}, {0x0000A674, 0x0000A67D}, {0x0000A69E, 0x0000A69F}, {0x0000A6F0, 0x0000A6F1}, +{0x0000A802, 0x0000A802}, {0x0000A806, 0x0000A806}, {0x0000A80B, 0x0000A80B}, {0x0000A823, 0x0000A827}, +{0x0000A82C, 0x0000A82C}, {0x0000A880, 0x0000A881}, {0x0000A8B4, 0x0000A8C5}, {0x0000A8E0, 0x0000A8F1}, +{0x0000A8FF, 0x0000A8FF}, {0x0000A926, 0x0000A92D}, {0x0000A947, 0x0000A953}, {0x0000A980, 0x0000A983}, +{0x0000A9B3, 0x0000A9C0}, {0x0000A9E5, 0x0000A9E5}, {0x0000AA29, 0x0000AA36}, {0x0000AA43, 0x0000AA43}, +{0x0000AA4C, 0x0000AA4D}, {0x0000AA7B, 0x0000AA7D}, {0x0000AAB0, 0x0000AAB0}, {0x0000AAB2, 0x0000AAB4}, +{0x0000AAB7, 0x0000AAB8}, {0x0000AABE, 0x0000AABF}, {0x0000AAC1, 0x0000AAC1}, {0x0000AAEB, 0x0000AAEF}, +{0x0000AAF5, 0x0000AAF6}, {0x0000ABE3, 0x0000ABEA}, {0x0000ABEC, 0x0000ABED}, {0x0000FB1E, 0x0000FB1E}, +{0x0000FE00, 0x0000FE0F}, {0x0000FE20, 0x0000FE2F}, {0x000101FD, 0x000101FD}, {0x000102E0, 0x000102E0}, +{0x00010376, 0x0001037A}, {0x00010A01, 0x00010A03}, {0x00010A05, 0x00010A06}, {0x00010A0C, 0x00010A0F}, +{0x00010A38, 0x00010A3A}, {0x00010A3F, 0x00010A3F}, {0x00010AE5, 0x00010AE6}, {0x00010D24, 0x00010D27}, +{0x00010EAB, 0x00010EAC}, {0x00010EFD, 0x00010EFF}, {0x00010F46, 0x00010F50}, {0x00010F82, 0x00010F85}, +{0x00011000, 0x00011002}, {0x00011038, 0x00011046}, {0x00011070, 0x00011070}, {0x00011073, 0x00011074}, +{0x0001107F, 0x00011082}, {0x000110B0, 0x000110BA}, {0x000110C2, 0x000110C2}, {0x00011100, 0x00011102}, {0x00011127, 0x00011134}, {0x00011145, 0x00011146}, {0x00011173, 0x00011173}, {0x00011180, 0x00011182}, {0x000111B3, 0x000111C0}, {0x000111C9, 0x000111CC}, {0x000111CE, 0x000111CF}, {0x0001122C, 0x00011237}, -{0x0001123E, 0x0001123E}, {0x000112DF, 0x000112EA}, {0x00011300, 0x00011303}, {0x0001133B, 0x0001133C}, -{0x0001133E, 0x00011344}, {0x00011347, 0x00011348}, {0x0001134B, 0x0001134D}, {0x00011357, 0x00011357}, -{0x00011362, 0x00011363}, {0x00011366, 0x0001136C}, {0x00011370, 0x00011374}, {0x00011435, 0x00011446}, -{0x0001145E, 0x0001145E}, {0x000114B0, 0x000114C3}, {0x000115AF, 0x000115B5}, {0x000115B8, 0x000115C0}, -{0x000115DC, 0x000115DD}, {0x00011630, 0x00011640}, {0x000116AB, 0x000116B7}, {0x0001171D, 0x0001172B}, -{0x0001182C, 0x0001183A}, {0x00011930, 0x00011935}, {0x00011937, 0x00011938}, {0x0001193B, 0x0001193E}, -{0x00011940, 0x00011940}, {0x00011942, 0x00011943}, {0x000119D1, 0x000119D7}, {0x000119DA, 0x000119E0}, -{0x000119E4, 0x000119E4}, {0x00011A01, 0x00011A0A}, {0x00011A33, 0x00011A39}, {0x00011A3B, 0x00011A3E}, -{0x00011A47, 0x00011A47}, {0x00011A51, 0x00011A5B}, {0x00011A8A, 0x00011A99}, {0x00011C2F, 0x00011C36}, -{0x00011C38, 0x00011C3F}, {0x00011C92, 0x00011CA7}, {0x00011CA9, 0x00011CB6}, {0x00011D31, 0x00011D36}, -{0x00011D3A, 0x00011D3A}, {0x00011D3C, 0x00011D3D}, {0x00011D3F, 0x00011D45}, {0x00011D47, 0x00011D47}, -{0x00011D8A, 0x00011D8E}, {0x00011D90, 0x00011D91}, {0x00011D93, 0x00011D97}, {0x00011EF3, 0x00011EF6}, -{0x00016AF0, 0x00016AF4}, {0x00016B30, 0x00016B36}, {0x00016F4F, 0x00016F4F}, {0x00016F51, 0x00016F87}, -{0x00016F8F, 0x00016F92}, {0x00016FE4, 0x00016FE4}, {0x00016FF0, 0x00016FF1}, {0x0001BC9D, 0x0001BC9E}, -{0x0001D165, 0x0001D169}, {0x0001D16D, 0x0001D172}, {0x0001D17B, 0x0001D182}, {0x0001D185, 0x0001D18B}, -{0x0001D1AA, 0x0001D1AD}, {0x0001D242, 0x0001D244}, {0x0001DA00, 0x0001DA36}, {0x0001DA3B, 0x0001DA6C}, -{0x0001DA75, 0x0001DA75}, {0x0001DA84, 0x0001DA84}, {0x0001DA9B, 0x0001DA9F}, {0x0001DAA1, 0x0001DAAF}, -{0x0001E000, 0x0001E006}, {0x0001E008, 0x0001E018}, {0x0001E01B, 0x0001E021}, {0x0001E023, 0x0001E024}, -{0x0001E026, 0x0001E02A}, {0x0001E130, 0x0001E136}, {0x0001E2EC, 0x0001E2EF}, {0x0001E8D0, 0x0001E8D6}, +{0x0001123E, 0x0001123E}, {0x00011241, 0x00011241}, {0x000112DF, 0x000112EA}, {0x00011300, 0x00011303}, +{0x0001133B, 0x0001133C}, {0x0001133E, 0x00011344}, {0x00011347, 0x00011348}, {0x0001134B, 0x0001134D}, +{0x00011357, 0x00011357}, {0x00011362, 0x00011363}, {0x00011366, 0x0001136C}, {0x00011370, 0x00011374}, +{0x00011435, 0x00011446}, {0x0001145E, 0x0001145E}, {0x000114B0, 0x000114C3}, {0x000115AF, 0x000115B5}, +{0x000115B8, 0x000115C0}, {0x000115DC, 0x000115DD}, {0x00011630, 0x00011640}, {0x000116AB, 0x000116B7}, +{0x0001171D, 0x0001172B}, {0x0001182C, 0x0001183A}, {0x00011930, 0x00011935}, {0x00011937, 0x00011938}, +{0x0001193B, 0x0001193E}, {0x00011940, 0x00011940}, {0x00011942, 0x00011943}, {0x000119D1, 0x000119D7}, +{0x000119DA, 0x000119E0}, {0x000119E4, 0x000119E4}, {0x00011A01, 0x00011A0A}, {0x00011A33, 0x00011A39}, +{0x00011A3B, 0x00011A3E}, {0x00011A47, 0x00011A47}, {0x00011A51, 0x00011A5B}, {0x00011A8A, 0x00011A99}, +{0x00011C2F, 0x00011C36}, {0x00011C38, 0x00011C3F}, {0x00011C92, 0x00011CA7}, {0x00011CA9, 0x00011CB6}, +{0x00011D31, 0x00011D36}, {0x00011D3A, 0x00011D3A}, {0x00011D3C, 0x00011D3D}, {0x00011D3F, 0x00011D45}, +{0x00011D47, 0x00011D47}, {0x00011D8A, 0x00011D8E}, {0x00011D90, 0x00011D91}, {0x00011D93, 0x00011D97}, +{0x00011EF3, 0x00011EF6}, {0x00011F00, 0x00011F01}, {0x00011F03, 0x00011F03}, {0x00011F34, 0x00011F3A}, +{0x00011F3E, 0x00011F42}, {0x00013440, 0x00013440}, {0x00013447, 0x00013455}, {0x00016AF0, 0x00016AF4}, +{0x00016B30, 0x00016B36}, {0x00016F4F, 0x00016F4F}, {0x00016F51, 0x00016F87}, {0x00016F8F, 0x00016F92}, +{0x00016FE4, 0x00016FE4}, {0x00016FF0, 0x00016FF1}, {0x0001BC9D, 0x0001BC9E}, {0x0001CF00, 0x0001CF2D}, +{0x0001CF30, 0x0001CF46}, {0x0001D165, 0x0001D169}, {0x0001D16D, 0x0001D172}, {0x0001D17B, 0x0001D182}, +{0x0001D185, 0x0001D18B}, {0x0001D1AA, 0x0001D1AD}, {0x0001D242, 0x0001D244}, {0x0001DA00, 0x0001DA36}, +{0x0001DA3B, 0x0001DA6C}, {0x0001DA75, 0x0001DA75}, {0x0001DA84, 0x0001DA84}, {0x0001DA9B, 0x0001DA9F}, +{0x0001DAA1, 0x0001DAAF}, {0x0001E000, 0x0001E006}, {0x0001E008, 0x0001E018}, {0x0001E01B, 0x0001E021}, +{0x0001E023, 0x0001E024}, {0x0001E026, 0x0001E02A}, {0x0001E08F, 0x0001E08F}, {0x0001E130, 0x0001E136}, +{0x0001E2AE, 0x0001E2AE}, {0x0001E2EC, 0x0001E2EF}, {0x0001E4EC, 0x0001E4EF}, {0x0001E8D0, 0x0001E8D6}, {0x0001E944, 0x0001E94A}, {0x000E0100, 0x000E01EF}, }; @@ -276,7 +309,7 @@ const std::vector> unicode_ranges_punctuation = { {0x000000B6, 0x000000B7}, {0x000000BB, 0x000000BB}, {0x000000BF, 0x000000BF}, {0x0000037E, 0x0000037E}, {0x00000387, 0x00000387}, {0x0000055A, 0x0000055F}, {0x00000589, 0x0000058A}, {0x000005BE, 0x000005BE}, {0x000005C0, 0x000005C0}, {0x000005C3, 0x000005C3}, {0x000005C6, 0x000005C6}, {0x000005F3, 0x000005F4}, -{0x00000609, 0x0000060A}, {0x0000060C, 0x0000060D}, {0x0000061B, 0x0000061B}, {0x0000061E, 0x0000061F}, +{0x00000609, 0x0000060A}, {0x0000060C, 0x0000060D}, {0x0000061B, 0x0000061B}, {0x0000061D, 0x0000061F}, {0x0000066A, 0x0000066D}, {0x000006D4, 0x000006D4}, {0x00000700, 0x0000070D}, {0x000007F7, 0x000007F9}, {0x00000830, 0x0000083E}, {0x0000085E, 0x0000085E}, {0x00000964, 0x00000965}, {0x00000970, 0x00000970}, {0x000009FD, 0x000009FD}, {0x00000A76, 0x00000A76}, {0x00000AF0, 0x00000AF0}, {0x00000C77, 0x00000C77}, @@ -286,37 +319,38 @@ const std::vector> unicode_ranges_punctuation = { {0x00001360, 0x00001368}, {0x00001400, 0x00001400}, {0x0000166E, 0x0000166E}, {0x0000169B, 0x0000169C}, {0x000016EB, 0x000016ED}, {0x00001735, 0x00001736}, {0x000017D4, 0x000017D6}, {0x000017D8, 0x000017DA}, {0x00001800, 0x0000180A}, {0x00001944, 0x00001945}, {0x00001A1E, 0x00001A1F}, {0x00001AA0, 0x00001AA6}, -{0x00001AA8, 0x00001AAD}, {0x00001B5A, 0x00001B60}, {0x00001BFC, 0x00001BFF}, {0x00001C3B, 0x00001C3F}, -{0x00001C7E, 0x00001C7F}, {0x00001CC0, 0x00001CC7}, {0x00001CD3, 0x00001CD3}, {0x00002010, 0x00002027}, -{0x00002030, 0x00002043}, {0x00002045, 0x00002051}, {0x00002053, 0x0000205E}, {0x0000207D, 0x0000207E}, -{0x0000208D, 0x0000208E}, {0x00002308, 0x0000230B}, {0x00002329, 0x0000232A}, {0x00002768, 0x00002775}, -{0x000027C5, 0x000027C6}, {0x000027E6, 0x000027EF}, {0x00002983, 0x00002998}, {0x000029D8, 0x000029DB}, -{0x000029FC, 0x000029FD}, {0x00002CF9, 0x00002CFC}, {0x00002CFE, 0x00002CFF}, {0x00002D70, 0x00002D70}, -{0x00002E00, 0x00002E2E}, {0x00002E30, 0x00002E4F}, {0x00002E52, 0x00002E52}, {0x00003001, 0x00003003}, -{0x00003008, 0x00003011}, {0x00003014, 0x0000301F}, {0x00003030, 0x00003030}, {0x0000303D, 0x0000303D}, -{0x000030A0, 0x000030A0}, {0x000030FB, 0x000030FB}, {0x0000A4FE, 0x0000A4FF}, {0x0000A60D, 0x0000A60F}, -{0x0000A673, 0x0000A673}, {0x0000A67E, 0x0000A67E}, {0x0000A6F2, 0x0000A6F7}, {0x0000A874, 0x0000A877}, -{0x0000A8CE, 0x0000A8CF}, {0x0000A8F8, 0x0000A8FA}, {0x0000A8FC, 0x0000A8FC}, {0x0000A92E, 0x0000A92F}, -{0x0000A95F, 0x0000A95F}, {0x0000A9C1, 0x0000A9CD}, {0x0000A9DE, 0x0000A9DF}, {0x0000AA5C, 0x0000AA5F}, -{0x0000AADE, 0x0000AADF}, {0x0000AAF0, 0x0000AAF1}, {0x0000ABEB, 0x0000ABEB}, {0x0000FD3E, 0x0000FD3F}, -{0x0000FE10, 0x0000FE19}, {0x0000FE30, 0x0000FE52}, {0x0000FE54, 0x0000FE61}, {0x0000FE63, 0x0000FE63}, -{0x0000FE68, 0x0000FE68}, {0x0000FE6A, 0x0000FE6B}, {0x0000FF01, 0x0000FF03}, {0x0000FF05, 0x0000FF0A}, -{0x0000FF0C, 0x0000FF0F}, {0x0000FF1A, 0x0000FF1B}, {0x0000FF1F, 0x0000FF20}, {0x0000FF3B, 0x0000FF3D}, -{0x0000FF3F, 0x0000FF3F}, {0x0000FF5B, 0x0000FF5B}, {0x0000FF5D, 0x0000FF5D}, {0x0000FF5F, 0x0000FF65}, -{0x00010100, 0x00010102}, {0x0001039F, 0x0001039F}, {0x000103D0, 0x000103D0}, {0x0001056F, 0x0001056F}, -{0x00010857, 0x00010857}, {0x0001091F, 0x0001091F}, {0x0001093F, 0x0001093F}, {0x00010A50, 0x00010A58}, -{0x00010A7F, 0x00010A7F}, {0x00010AF0, 0x00010AF6}, {0x00010B39, 0x00010B3F}, {0x00010B99, 0x00010B9C}, -{0x00010EAD, 0x00010EAD}, {0x00010F55, 0x00010F59}, {0x00011047, 0x0001104D}, {0x000110BB, 0x000110BC}, -{0x000110BE, 0x000110C1}, {0x00011140, 0x00011143}, {0x00011174, 0x00011175}, {0x000111C5, 0x000111C8}, -{0x000111CD, 0x000111CD}, {0x000111DB, 0x000111DB}, {0x000111DD, 0x000111DF}, {0x00011238, 0x0001123D}, -{0x000112A9, 0x000112A9}, {0x0001144B, 0x0001144F}, {0x0001145A, 0x0001145B}, {0x0001145D, 0x0001145D}, -{0x000114C6, 0x000114C6}, {0x000115C1, 0x000115D7}, {0x00011641, 0x00011643}, {0x00011660, 0x0001166C}, -{0x0001173C, 0x0001173E}, {0x0001183B, 0x0001183B}, {0x00011944, 0x00011946}, {0x000119E2, 0x000119E2}, -{0x00011A3F, 0x00011A46}, {0x00011A9A, 0x00011A9C}, {0x00011A9E, 0x00011AA2}, {0x00011C41, 0x00011C45}, -{0x00011C70, 0x00011C71}, {0x00011EF7, 0x00011EF8}, {0x00011FFF, 0x00011FFF}, {0x00012470, 0x00012474}, -{0x00016A6E, 0x00016A6F}, {0x00016AF5, 0x00016AF5}, {0x00016B37, 0x00016B3B}, {0x00016B44, 0x00016B44}, -{0x00016E97, 0x00016E9A}, {0x00016FE2, 0x00016FE2}, {0x0001BC9F, 0x0001BC9F}, {0x0001DA87, 0x0001DA8B}, -{0x0001E95E, 0x0001E95F}, +{0x00001AA8, 0x00001AAD}, {0x00001B5A, 0x00001B60}, {0x00001B7D, 0x00001B7E}, {0x00001BFC, 0x00001BFF}, +{0x00001C3B, 0x00001C3F}, {0x00001C7E, 0x00001C7F}, {0x00001CC0, 0x00001CC7}, {0x00001CD3, 0x00001CD3}, +{0x00002010, 0x00002027}, {0x00002030, 0x00002043}, {0x00002045, 0x00002051}, {0x00002053, 0x0000205E}, +{0x0000207D, 0x0000207E}, {0x0000208D, 0x0000208E}, {0x00002308, 0x0000230B}, {0x00002329, 0x0000232A}, +{0x00002768, 0x00002775}, {0x000027C5, 0x000027C6}, {0x000027E6, 0x000027EF}, {0x00002983, 0x00002998}, +{0x000029D8, 0x000029DB}, {0x000029FC, 0x000029FD}, {0x00002CF9, 0x00002CFC}, {0x00002CFE, 0x00002CFF}, +{0x00002D70, 0x00002D70}, {0x00002E00, 0x00002E2E}, {0x00002E30, 0x00002E4F}, {0x00002E52, 0x00002E5D}, +{0x00003001, 0x00003003}, {0x00003008, 0x00003011}, {0x00003014, 0x0000301F}, {0x00003030, 0x00003030}, +{0x0000303D, 0x0000303D}, {0x000030A0, 0x000030A0}, {0x000030FB, 0x000030FB}, {0x0000A4FE, 0x0000A4FF}, +{0x0000A60D, 0x0000A60F}, {0x0000A673, 0x0000A673}, {0x0000A67E, 0x0000A67E}, {0x0000A6F2, 0x0000A6F7}, +{0x0000A874, 0x0000A877}, {0x0000A8CE, 0x0000A8CF}, {0x0000A8F8, 0x0000A8FA}, {0x0000A8FC, 0x0000A8FC}, +{0x0000A92E, 0x0000A92F}, {0x0000A95F, 0x0000A95F}, {0x0000A9C1, 0x0000A9CD}, {0x0000A9DE, 0x0000A9DF}, +{0x0000AA5C, 0x0000AA5F}, {0x0000AADE, 0x0000AADF}, {0x0000AAF0, 0x0000AAF1}, {0x0000ABEB, 0x0000ABEB}, +{0x0000FD3E, 0x0000FD3F}, {0x0000FE10, 0x0000FE19}, {0x0000FE30, 0x0000FE52}, {0x0000FE54, 0x0000FE61}, +{0x0000FE63, 0x0000FE63}, {0x0000FE68, 0x0000FE68}, {0x0000FE6A, 0x0000FE6B}, {0x0000FF01, 0x0000FF03}, +{0x0000FF05, 0x0000FF0A}, {0x0000FF0C, 0x0000FF0F}, {0x0000FF1A, 0x0000FF1B}, {0x0000FF1F, 0x0000FF20}, +{0x0000FF3B, 0x0000FF3D}, {0x0000FF3F, 0x0000FF3F}, {0x0000FF5B, 0x0000FF5B}, {0x0000FF5D, 0x0000FF5D}, +{0x0000FF5F, 0x0000FF65}, {0x00010100, 0x00010102}, {0x0001039F, 0x0001039F}, {0x000103D0, 0x000103D0}, +{0x0001056F, 0x0001056F}, {0x00010857, 0x00010857}, {0x0001091F, 0x0001091F}, {0x0001093F, 0x0001093F}, +{0x00010A50, 0x00010A58}, {0x00010A7F, 0x00010A7F}, {0x00010AF0, 0x00010AF6}, {0x00010B39, 0x00010B3F}, +{0x00010B99, 0x00010B9C}, {0x00010EAD, 0x00010EAD}, {0x00010F55, 0x00010F59}, {0x00010F86, 0x00010F89}, +{0x00011047, 0x0001104D}, {0x000110BB, 0x000110BC}, {0x000110BE, 0x000110C1}, {0x00011140, 0x00011143}, +{0x00011174, 0x00011175}, {0x000111C5, 0x000111C8}, {0x000111CD, 0x000111CD}, {0x000111DB, 0x000111DB}, +{0x000111DD, 0x000111DF}, {0x00011238, 0x0001123D}, {0x000112A9, 0x000112A9}, {0x0001144B, 0x0001144F}, +{0x0001145A, 0x0001145B}, {0x0001145D, 0x0001145D}, {0x000114C6, 0x000114C6}, {0x000115C1, 0x000115D7}, +{0x00011641, 0x00011643}, {0x00011660, 0x0001166C}, {0x000116B9, 0x000116B9}, {0x0001173C, 0x0001173E}, +{0x0001183B, 0x0001183B}, {0x00011944, 0x00011946}, {0x000119E2, 0x000119E2}, {0x00011A3F, 0x00011A46}, +{0x00011A9A, 0x00011A9C}, {0x00011A9E, 0x00011AA2}, {0x00011B00, 0x00011B09}, {0x00011C41, 0x00011C45}, +{0x00011C70, 0x00011C71}, {0x00011EF7, 0x00011EF8}, {0x00011F43, 0x00011F4F}, {0x00011FFF, 0x00011FFF}, +{0x00012470, 0x00012474}, {0x00012FF1, 0x00012FF2}, {0x00016A6E, 0x00016A6F}, {0x00016AF5, 0x00016AF5}, +{0x00016B37, 0x00016B3B}, {0x00016B44, 0x00016B44}, {0x00016E97, 0x00016E9A}, {0x00016FE2, 0x00016FE2}, +{0x0001BC9F, 0x0001BC9F}, {0x0001DA87, 0x0001DA8B}, {0x0001E95E, 0x0001E95F}, }; const std::vector> unicode_ranges_symbol = { @@ -328,40 +362,41 @@ const std::vector> unicode_ranges_symbol = { {0x00000375, 0x00000375}, {0x00000384, 0x00000385}, {0x000003F6, 0x000003F6}, {0x00000482, 0x00000482}, {0x0000058D, 0x0000058F}, {0x00000606, 0x00000608}, {0x0000060B, 0x0000060B}, {0x0000060E, 0x0000060F}, {0x000006DE, 0x000006DE}, {0x000006E9, 0x000006E9}, {0x000006FD, 0x000006FE}, {0x000007F6, 0x000007F6}, -{0x000007FE, 0x000007FF}, {0x000009F2, 0x000009F3}, {0x000009FA, 0x000009FB}, {0x00000AF1, 0x00000AF1}, -{0x00000B70, 0x00000B70}, {0x00000BF3, 0x00000BFA}, {0x00000C7F, 0x00000C7F}, {0x00000D4F, 0x00000D4F}, -{0x00000D79, 0x00000D79}, {0x00000E3F, 0x00000E3F}, {0x00000F01, 0x00000F03}, {0x00000F13, 0x00000F13}, -{0x00000F15, 0x00000F17}, {0x00000F1A, 0x00000F1F}, {0x00000F34, 0x00000F34}, {0x00000F36, 0x00000F36}, -{0x00000F38, 0x00000F38}, {0x00000FBE, 0x00000FC5}, {0x00000FC7, 0x00000FCC}, {0x00000FCE, 0x00000FCF}, -{0x00000FD5, 0x00000FD8}, {0x0000109E, 0x0000109F}, {0x00001390, 0x00001399}, {0x0000166D, 0x0000166D}, -{0x000017DB, 0x000017DB}, {0x00001940, 0x00001940}, {0x000019DE, 0x000019FF}, {0x00001B61, 0x00001B6A}, -{0x00001B74, 0x00001B7C}, {0x00001FBD, 0x00001FBD}, {0x00001FBF, 0x00001FC1}, {0x00001FCD, 0x00001FCF}, -{0x00001FDD, 0x00001FDF}, {0x00001FED, 0x00001FEF}, {0x00001FFD, 0x00001FFE}, {0x00002044, 0x00002044}, -{0x00002052, 0x00002052}, {0x0000207A, 0x0000207C}, {0x0000208A, 0x0000208C}, {0x000020A0, 0x000020BF}, -{0x00002100, 0x00002101}, {0x00002103, 0x00002106}, {0x00002108, 0x00002109}, {0x00002114, 0x00002114}, -{0x00002116, 0x00002118}, {0x0000211E, 0x00002123}, {0x00002125, 0x00002125}, {0x00002127, 0x00002127}, -{0x00002129, 0x00002129}, {0x0000212E, 0x0000212E}, {0x0000213A, 0x0000213B}, {0x00002140, 0x00002144}, -{0x0000214A, 0x0000214D}, {0x0000214F, 0x0000214F}, {0x0000218A, 0x0000218B}, {0x00002190, 0x00002307}, -{0x0000230C, 0x00002328}, {0x0000232B, 0x00002426}, {0x00002440, 0x0000244A}, {0x0000249C, 0x000024E9}, -{0x00002500, 0x00002767}, {0x00002794, 0x000027C4}, {0x000027C7, 0x000027E5}, {0x000027F0, 0x00002982}, -{0x00002999, 0x000029D7}, {0x000029DC, 0x000029FB}, {0x000029FE, 0x00002B73}, {0x00002B76, 0x00002B95}, -{0x00002B97, 0x00002BFF}, {0x00002CE5, 0x00002CEA}, {0x00002E50, 0x00002E51}, {0x00002E80, 0x00002E99}, -{0x00002E9B, 0x00002EF3}, {0x00002F00, 0x00002FD5}, {0x00002FF0, 0x00002FFB}, {0x00003004, 0x00003004}, -{0x00003012, 0x00003013}, {0x00003020, 0x00003020}, {0x00003036, 0x00003037}, {0x0000303E, 0x0000303F}, -{0x0000309B, 0x0000309C}, {0x00003190, 0x00003191}, {0x00003196, 0x0000319F}, {0x000031C0, 0x000031E3}, -{0x00003200, 0x0000321E}, {0x0000322A, 0x00003247}, {0x00003250, 0x00003250}, {0x00003260, 0x0000327F}, -{0x0000328A, 0x000032B0}, {0x000032C0, 0x000033FF}, {0x00004DC0, 0x00004DFF}, {0x0000A490, 0x0000A4C6}, -{0x0000A700, 0x0000A716}, {0x0000A720, 0x0000A721}, {0x0000A789, 0x0000A78A}, {0x0000A828, 0x0000A82B}, -{0x0000A836, 0x0000A839}, {0x0000AA77, 0x0000AA79}, {0x0000AB5B, 0x0000AB5B}, {0x0000AB6A, 0x0000AB6B}, -{0x0000FB29, 0x0000FB29}, {0x0000FBB2, 0x0000FBC1}, {0x0000FDFC, 0x0000FDFD}, {0x0000FE62, 0x0000FE62}, -{0x0000FE64, 0x0000FE66}, {0x0000FE69, 0x0000FE69}, {0x0000FF04, 0x0000FF04}, {0x0000FF0B, 0x0000FF0B}, -{0x0000FF1C, 0x0000FF1E}, {0x0000FF3E, 0x0000FF3E}, {0x0000FF40, 0x0000FF40}, {0x0000FF5C, 0x0000FF5C}, -{0x0000FF5E, 0x0000FF5E}, {0x0000FFE0, 0x0000FFE6}, {0x0000FFE8, 0x0000FFEE}, {0x0000FFFC, 0x0000FFFD}, -{0x00010137, 0x0001013F}, {0x00010179, 0x00010189}, {0x0001018C, 0x0001018E}, {0x00010190, 0x0001019C}, -{0x000101A0, 0x000101A0}, {0x000101D0, 0x000101FC}, {0x00010877, 0x00010878}, {0x00010AC8, 0x00010AC8}, -{0x0001173F, 0x0001173F}, {0x00011FD5, 0x00011FF1}, {0x00016B3C, 0x00016B3F}, {0x00016B45, 0x00016B45}, -{0x0001BC9C, 0x0001BC9C}, {0x0001D000, 0x0001D0F5}, {0x0001D100, 0x0001D126}, {0x0001D129, 0x0001D164}, -{0x0001D16A, 0x0001D16C}, {0x0001D183, 0x0001D184}, {0x0001D18C, 0x0001D1A9}, {0x0001D1AE, 0x0001D1E8}, +{0x000007FE, 0x000007FF}, {0x00000888, 0x00000888}, {0x000009F2, 0x000009F3}, {0x000009FA, 0x000009FB}, +{0x00000AF1, 0x00000AF1}, {0x00000B70, 0x00000B70}, {0x00000BF3, 0x00000BFA}, {0x00000C7F, 0x00000C7F}, +{0x00000D4F, 0x00000D4F}, {0x00000D79, 0x00000D79}, {0x00000E3F, 0x00000E3F}, {0x00000F01, 0x00000F03}, +{0x00000F13, 0x00000F13}, {0x00000F15, 0x00000F17}, {0x00000F1A, 0x00000F1F}, {0x00000F34, 0x00000F34}, +{0x00000F36, 0x00000F36}, {0x00000F38, 0x00000F38}, {0x00000FBE, 0x00000FC5}, {0x00000FC7, 0x00000FCC}, +{0x00000FCE, 0x00000FCF}, {0x00000FD5, 0x00000FD8}, {0x0000109E, 0x0000109F}, {0x00001390, 0x00001399}, +{0x0000166D, 0x0000166D}, {0x000017DB, 0x000017DB}, {0x00001940, 0x00001940}, {0x000019DE, 0x000019FF}, +{0x00001B61, 0x00001B6A}, {0x00001B74, 0x00001B7C}, {0x00001FBD, 0x00001FBD}, {0x00001FBF, 0x00001FC1}, +{0x00001FCD, 0x00001FCF}, {0x00001FDD, 0x00001FDF}, {0x00001FED, 0x00001FEF}, {0x00001FFD, 0x00001FFE}, +{0x00002044, 0x00002044}, {0x00002052, 0x00002052}, {0x0000207A, 0x0000207C}, {0x0000208A, 0x0000208C}, +{0x000020A0, 0x000020C0}, {0x00002100, 0x00002101}, {0x00002103, 0x00002106}, {0x00002108, 0x00002109}, +{0x00002114, 0x00002114}, {0x00002116, 0x00002118}, {0x0000211E, 0x00002123}, {0x00002125, 0x00002125}, +{0x00002127, 0x00002127}, {0x00002129, 0x00002129}, {0x0000212E, 0x0000212E}, {0x0000213A, 0x0000213B}, +{0x00002140, 0x00002144}, {0x0000214A, 0x0000214D}, {0x0000214F, 0x0000214F}, {0x0000218A, 0x0000218B}, +{0x00002190, 0x00002307}, {0x0000230C, 0x00002328}, {0x0000232B, 0x00002426}, {0x00002440, 0x0000244A}, +{0x0000249C, 0x000024E9}, {0x00002500, 0x00002767}, {0x00002794, 0x000027C4}, {0x000027C7, 0x000027E5}, +{0x000027F0, 0x00002982}, {0x00002999, 0x000029D7}, {0x000029DC, 0x000029FB}, {0x000029FE, 0x00002B73}, +{0x00002B76, 0x00002B95}, {0x00002B97, 0x00002BFF}, {0x00002CE5, 0x00002CEA}, {0x00002E50, 0x00002E51}, +{0x00002E80, 0x00002E99}, {0x00002E9B, 0x00002EF3}, {0x00002F00, 0x00002FD5}, {0x00002FF0, 0x00002FFB}, +{0x00003004, 0x00003004}, {0x00003012, 0x00003013}, {0x00003020, 0x00003020}, {0x00003036, 0x00003037}, +{0x0000303E, 0x0000303F}, {0x0000309B, 0x0000309C}, {0x00003190, 0x00003191}, {0x00003196, 0x0000319F}, +{0x000031C0, 0x000031E3}, {0x00003200, 0x0000321E}, {0x0000322A, 0x00003247}, {0x00003250, 0x00003250}, +{0x00003260, 0x0000327F}, {0x0000328A, 0x000032B0}, {0x000032C0, 0x000033FF}, {0x00004DC0, 0x00004DFF}, +{0x0000A490, 0x0000A4C6}, {0x0000A700, 0x0000A716}, {0x0000A720, 0x0000A721}, {0x0000A789, 0x0000A78A}, +{0x0000A828, 0x0000A82B}, {0x0000A836, 0x0000A839}, {0x0000AA77, 0x0000AA79}, {0x0000AB5B, 0x0000AB5B}, +{0x0000AB6A, 0x0000AB6B}, {0x0000FB29, 0x0000FB29}, {0x0000FBB2, 0x0000FBC2}, {0x0000FD40, 0x0000FD4F}, +{0x0000FDCF, 0x0000FDCF}, {0x0000FDFC, 0x0000FDFF}, {0x0000FE62, 0x0000FE62}, {0x0000FE64, 0x0000FE66}, +{0x0000FE69, 0x0000FE69}, {0x0000FF04, 0x0000FF04}, {0x0000FF0B, 0x0000FF0B}, {0x0000FF1C, 0x0000FF1E}, +{0x0000FF3E, 0x0000FF3E}, {0x0000FF40, 0x0000FF40}, {0x0000FF5C, 0x0000FF5C}, {0x0000FF5E, 0x0000FF5E}, +{0x0000FFE0, 0x0000FFE6}, {0x0000FFE8, 0x0000FFEE}, {0x0000FFFC, 0x0000FFFD}, {0x00010137, 0x0001013F}, +{0x00010179, 0x00010189}, {0x0001018C, 0x0001018E}, {0x00010190, 0x0001019C}, {0x000101A0, 0x000101A0}, +{0x000101D0, 0x000101FC}, {0x00010877, 0x00010878}, {0x00010AC8, 0x00010AC8}, {0x0001173F, 0x0001173F}, +{0x00011FD5, 0x00011FF1}, {0x00016B3C, 0x00016B3F}, {0x00016B45, 0x00016B45}, {0x0001BC9C, 0x0001BC9C}, +{0x0001CF50, 0x0001CFC3}, {0x0001D000, 0x0001D0F5}, {0x0001D100, 0x0001D126}, {0x0001D129, 0x0001D164}, +{0x0001D16A, 0x0001D16C}, {0x0001D183, 0x0001D184}, {0x0001D18C, 0x0001D1A9}, {0x0001D1AE, 0x0001D1EA}, {0x0001D200, 0x0001D241}, {0x0001D245, 0x0001D245}, {0x0001D300, 0x0001D356}, {0x0001D6C1, 0x0001D6C1}, {0x0001D6DB, 0x0001D6DB}, {0x0001D6FB, 0x0001D6FB}, {0x0001D715, 0x0001D715}, {0x0001D735, 0x0001D735}, {0x0001D74F, 0x0001D74F}, {0x0001D76F, 0x0001D76F}, {0x0001D789, 0x0001D789}, {0x0001D7A9, 0x0001D7A9}, @@ -371,102 +406,100 @@ const std::vector> unicode_ranges_symbol = { {0x0001F000, 0x0001F02B}, {0x0001F030, 0x0001F093}, {0x0001F0A0, 0x0001F0AE}, {0x0001F0B1, 0x0001F0BF}, {0x0001F0C1, 0x0001F0CF}, {0x0001F0D1, 0x0001F0F5}, {0x0001F10D, 0x0001F1AD}, {0x0001F1E6, 0x0001F202}, {0x0001F210, 0x0001F23B}, {0x0001F240, 0x0001F248}, {0x0001F250, 0x0001F251}, {0x0001F260, 0x0001F265}, -{0x0001F300, 0x0001F6D7}, {0x0001F6E0, 0x0001F6EC}, {0x0001F6F0, 0x0001F6FC}, {0x0001F700, 0x0001F773}, -{0x0001F780, 0x0001F7D8}, {0x0001F7E0, 0x0001F7EB}, {0x0001F800, 0x0001F80B}, {0x0001F810, 0x0001F847}, -{0x0001F850, 0x0001F859}, {0x0001F860, 0x0001F887}, {0x0001F890, 0x0001F8AD}, {0x0001F8B0, 0x0001F8B1}, -{0x0001F900, 0x0001F978}, {0x0001F97A, 0x0001F9CB}, {0x0001F9CD, 0x0001FA53}, {0x0001FA60, 0x0001FA6D}, -{0x0001FA70, 0x0001FA74}, {0x0001FA78, 0x0001FA7A}, {0x0001FA80, 0x0001FA86}, {0x0001FA90, 0x0001FAA8}, -{0x0001FAB0, 0x0001FAB6}, {0x0001FAC0, 0x0001FAC2}, {0x0001FAD0, 0x0001FAD6}, {0x0001FB00, 0x0001FB92}, -{0x0001FB94, 0x0001FBCA}, +{0x0001F300, 0x0001F6D7}, {0x0001F6DC, 0x0001F6EC}, {0x0001F6F0, 0x0001F6FC}, {0x0001F700, 0x0001F776}, +{0x0001F77B, 0x0001F7D9}, {0x0001F7E0, 0x0001F7EB}, {0x0001F7F0, 0x0001F7F0}, {0x0001F800, 0x0001F80B}, +{0x0001F810, 0x0001F847}, {0x0001F850, 0x0001F859}, {0x0001F860, 0x0001F887}, {0x0001F890, 0x0001F8AD}, +{0x0001F8B0, 0x0001F8B1}, {0x0001F900, 0x0001FA53}, {0x0001FA60, 0x0001FA6D}, {0x0001FA70, 0x0001FA7C}, +{0x0001FA80, 0x0001FA88}, {0x0001FA90, 0x0001FABD}, {0x0001FABF, 0x0001FAC5}, {0x0001FACE, 0x0001FADB}, +{0x0001FAE0, 0x0001FAE8}, {0x0001FAF0, 0x0001FAF8}, {0x0001FB00, 0x0001FB92}, {0x0001FB94, 0x0001FBCA}, }; const std::vector> unicode_ranges_control = { -{0x00000000, 0x00000008}, {0x0000000E, 0x0000001B}, {0x0000007F, 0x00000084}, {0x00000086, 0x0000009F}, -{0x000000AD, 0x000000AD}, {0x00000378, 0x00000379}, {0x00000380, 0x00000383}, {0x0000038B, 0x0000038B}, -{0x0000038D, 0x0000038D}, {0x000003A2, 0x000003A2}, {0x00000530, 0x00000530}, {0x00000557, 0x00000558}, -{0x0000058B, 0x0000058C}, {0x00000590, 0x00000590}, {0x000005C8, 0x000005CF}, {0x000005EB, 0x000005EE}, -{0x000005F5, 0x00000605}, {0x0000061C, 0x0000061D}, {0x000006DD, 0x000006DD}, {0x0000070E, 0x0000070F}, -{0x0000074B, 0x0000074C}, {0x000007B2, 0x000007BF}, {0x000007FB, 0x000007FC}, {0x0000082E, 0x0000082F}, -{0x0000083F, 0x0000083F}, {0x0000085C, 0x0000085D}, {0x0000085F, 0x0000085F}, {0x0000086B, 0x0000089F}, -{0x000008B5, 0x000008B5}, {0x000008C8, 0x000008D2}, {0x000008E2, 0x000008E2}, {0x00000984, 0x00000984}, -{0x0000098D, 0x0000098E}, {0x00000991, 0x00000992}, {0x000009A9, 0x000009A9}, {0x000009B1, 0x000009B1}, -{0x000009B3, 0x000009B5}, {0x000009BA, 0x000009BB}, {0x000009C5, 0x000009C6}, {0x000009C9, 0x000009CA}, -{0x000009CF, 0x000009D6}, {0x000009D8, 0x000009DB}, {0x000009DE, 0x000009DE}, {0x000009E4, 0x000009E5}, -{0x000009FF, 0x00000A00}, {0x00000A04, 0x00000A04}, {0x00000A0B, 0x00000A0E}, {0x00000A11, 0x00000A12}, -{0x00000A29, 0x00000A29}, {0x00000A31, 0x00000A31}, {0x00000A34, 0x00000A34}, {0x00000A37, 0x00000A37}, -{0x00000A3A, 0x00000A3B}, {0x00000A3D, 0x00000A3D}, {0x00000A43, 0x00000A46}, {0x00000A49, 0x00000A4A}, -{0x00000A4E, 0x00000A50}, {0x00000A52, 0x00000A58}, {0x00000A5D, 0x00000A5D}, {0x00000A5F, 0x00000A65}, -{0x00000A77, 0x00000A80}, {0x00000A84, 0x00000A84}, {0x00000A8E, 0x00000A8E}, {0x00000A92, 0x00000A92}, -{0x00000AA9, 0x00000AA9}, {0x00000AB1, 0x00000AB1}, {0x00000AB4, 0x00000AB4}, {0x00000ABA, 0x00000ABB}, -{0x00000AC6, 0x00000AC6}, {0x00000ACA, 0x00000ACA}, {0x00000ACE, 0x00000ACF}, {0x00000AD1, 0x00000ADF}, -{0x00000AE4, 0x00000AE5}, {0x00000AF2, 0x00000AF8}, {0x00000B00, 0x00000B00}, {0x00000B04, 0x00000B04}, -{0x00000B0D, 0x00000B0E}, {0x00000B11, 0x00000B12}, {0x00000B29, 0x00000B29}, {0x00000B31, 0x00000B31}, -{0x00000B34, 0x00000B34}, {0x00000B3A, 0x00000B3B}, {0x00000B45, 0x00000B46}, {0x00000B49, 0x00000B4A}, -{0x00000B4E, 0x00000B54}, {0x00000B58, 0x00000B5B}, {0x00000B5E, 0x00000B5E}, {0x00000B64, 0x00000B65}, -{0x00000B78, 0x00000B81}, {0x00000B84, 0x00000B84}, {0x00000B8B, 0x00000B8D}, {0x00000B91, 0x00000B91}, -{0x00000B96, 0x00000B98}, {0x00000B9B, 0x00000B9B}, {0x00000B9D, 0x00000B9D}, {0x00000BA0, 0x00000BA2}, -{0x00000BA5, 0x00000BA7}, {0x00000BAB, 0x00000BAD}, {0x00000BBA, 0x00000BBD}, {0x00000BC3, 0x00000BC5}, -{0x00000BC9, 0x00000BC9}, {0x00000BCE, 0x00000BCF}, {0x00000BD1, 0x00000BD6}, {0x00000BD8, 0x00000BE5}, -{0x00000BFB, 0x00000BFF}, {0x00000C0D, 0x00000C0D}, {0x00000C11, 0x00000C11}, {0x00000C29, 0x00000C29}, -{0x00000C3A, 0x00000C3C}, {0x00000C45, 0x00000C45}, {0x00000C49, 0x00000C49}, {0x00000C4E, 0x00000C54}, -{0x00000C57, 0x00000C57}, {0x00000C5B, 0x00000C5F}, {0x00000C64, 0x00000C65}, {0x00000C70, 0x00000C76}, -{0x00000C8D, 0x00000C8D}, {0x00000C91, 0x00000C91}, {0x00000CA9, 0x00000CA9}, {0x00000CB4, 0x00000CB4}, -{0x00000CBA, 0x00000CBB}, {0x00000CC5, 0x00000CC5}, {0x00000CC9, 0x00000CC9}, {0x00000CCE, 0x00000CD4}, -{0x00000CD7, 0x00000CDD}, {0x00000CDF, 0x00000CDF}, {0x00000CE4, 0x00000CE5}, {0x00000CF0, 0x00000CF0}, -{0x00000CF3, 0x00000CFF}, {0x00000D0D, 0x00000D0D}, {0x00000D11, 0x00000D11}, {0x00000D45, 0x00000D45}, -{0x00000D49, 0x00000D49}, {0x00000D50, 0x00000D53}, {0x00000D64, 0x00000D65}, {0x00000D80, 0x00000D80}, -{0x00000D84, 0x00000D84}, {0x00000D97, 0x00000D99}, {0x00000DB2, 0x00000DB2}, {0x00000DBC, 0x00000DBC}, -{0x00000DBE, 0x00000DBF}, {0x00000DC7, 0x00000DC9}, {0x00000DCB, 0x00000DCE}, {0x00000DD5, 0x00000DD5}, -{0x00000DD7, 0x00000DD7}, {0x00000DE0, 0x00000DE5}, {0x00000DF0, 0x00000DF1}, {0x00000DF5, 0x00000E00}, -{0x00000E3B, 0x00000E3E}, {0x00000E5C, 0x00000E80}, {0x00000E83, 0x00000E83}, {0x00000E85, 0x00000E85}, -{0x00000E8B, 0x00000E8B}, {0x00000EA4, 0x00000EA4}, {0x00000EA6, 0x00000EA6}, {0x00000EBE, 0x00000EBF}, -{0x00000EC5, 0x00000EC5}, {0x00000EC7, 0x00000EC7}, {0x00000ECE, 0x00000ECF}, {0x00000EDA, 0x00000EDB}, -{0x00000EE0, 0x00000EFF}, {0x00000F48, 0x00000F48}, {0x00000F6D, 0x00000F70}, {0x00000F98, 0x00000F98}, -{0x00000FBD, 0x00000FBD}, {0x00000FCD, 0x00000FCD}, {0x00000FDB, 0x00000FFF}, {0x000010C6, 0x000010C6}, -{0x000010C8, 0x000010CC}, {0x000010CE, 0x000010CF}, {0x00001249, 0x00001249}, {0x0000124E, 0x0000124F}, -{0x00001257, 0x00001257}, {0x00001259, 0x00001259}, {0x0000125E, 0x0000125F}, {0x00001289, 0x00001289}, -{0x0000128E, 0x0000128F}, {0x000012B1, 0x000012B1}, {0x000012B6, 0x000012B7}, {0x000012BF, 0x000012BF}, -{0x000012C1, 0x000012C1}, {0x000012C6, 0x000012C7}, {0x000012D7, 0x000012D7}, {0x00001311, 0x00001311}, -{0x00001316, 0x00001317}, {0x0000135B, 0x0000135C}, {0x0000137D, 0x0000137F}, {0x0000139A, 0x0000139F}, -{0x000013F6, 0x000013F7}, {0x000013FE, 0x000013FF}, {0x0000169D, 0x0000169F}, {0x000016F9, 0x000016FF}, -{0x0000170D, 0x0000170D}, {0x00001715, 0x0000171F}, {0x00001737, 0x0000173F}, {0x00001754, 0x0000175F}, -{0x0000176D, 0x0000176D}, {0x00001771, 0x00001771}, {0x00001774, 0x0000177F}, {0x000017DE, 0x000017DF}, -{0x000017EA, 0x000017EF}, {0x000017FA, 0x000017FF}, {0x0000180E, 0x0000180F}, {0x0000181A, 0x0000181F}, -{0x00001879, 0x0000187F}, {0x000018AB, 0x000018AF}, {0x000018F6, 0x000018FF}, {0x0000191F, 0x0000191F}, -{0x0000192C, 0x0000192F}, {0x0000193C, 0x0000193F}, {0x00001941, 0x00001943}, {0x0000196E, 0x0000196F}, -{0x00001975, 0x0000197F}, {0x000019AC, 0x000019AF}, {0x000019CA, 0x000019CF}, {0x000019DB, 0x000019DD}, -{0x00001A1C, 0x00001A1D}, {0x00001A5F, 0x00001A5F}, {0x00001A7D, 0x00001A7E}, {0x00001A8A, 0x00001A8F}, -{0x00001A9A, 0x00001A9F}, {0x00001AAE, 0x00001AAF}, {0x00001AC1, 0x00001AFF}, {0x00001B4C, 0x00001B4F}, -{0x00001B7D, 0x00001B7F}, {0x00001BF4, 0x00001BFB}, {0x00001C38, 0x00001C3A}, {0x00001C4A, 0x00001C4C}, -{0x00001C89, 0x00001C8F}, {0x00001CBB, 0x00001CBC}, {0x00001CC8, 0x00001CCF}, {0x00001CFB, 0x00001CFF}, -{0x00001DFA, 0x00001DFA}, {0x00001F16, 0x00001F17}, {0x00001F1E, 0x00001F1F}, {0x00001F46, 0x00001F47}, +{0x00000000, 0x0000001F}, {0x0000007F, 0x0000009F}, {0x000000AD, 0x000000AD}, {0x00000378, 0x00000379}, +{0x00000380, 0x00000383}, {0x0000038B, 0x0000038B}, {0x0000038D, 0x0000038D}, {0x000003A2, 0x000003A2}, +{0x00000530, 0x00000530}, {0x00000557, 0x00000558}, {0x0000058B, 0x0000058C}, {0x00000590, 0x00000590}, +{0x000005C8, 0x000005CF}, {0x000005EB, 0x000005EE}, {0x000005F5, 0x00000605}, {0x0000061C, 0x0000061C}, +{0x000006DD, 0x000006DD}, {0x0000070E, 0x0000070F}, {0x0000074B, 0x0000074C}, {0x000007B2, 0x000007BF}, +{0x000007FB, 0x000007FC}, {0x0000082E, 0x0000082F}, {0x0000083F, 0x0000083F}, {0x0000085C, 0x0000085D}, +{0x0000085F, 0x0000085F}, {0x0000086B, 0x0000086F}, {0x0000088F, 0x00000897}, {0x000008E2, 0x000008E2}, +{0x00000984, 0x00000984}, {0x0000098D, 0x0000098E}, {0x00000991, 0x00000992}, {0x000009A9, 0x000009A9}, +{0x000009B1, 0x000009B1}, {0x000009B3, 0x000009B5}, {0x000009BA, 0x000009BB}, {0x000009C5, 0x000009C6}, +{0x000009C9, 0x000009CA}, {0x000009CF, 0x000009D6}, {0x000009D8, 0x000009DB}, {0x000009DE, 0x000009DE}, +{0x000009E4, 0x000009E5}, {0x000009FF, 0x00000A00}, {0x00000A04, 0x00000A04}, {0x00000A0B, 0x00000A0E}, +{0x00000A11, 0x00000A12}, {0x00000A29, 0x00000A29}, {0x00000A31, 0x00000A31}, {0x00000A34, 0x00000A34}, +{0x00000A37, 0x00000A37}, {0x00000A3A, 0x00000A3B}, {0x00000A3D, 0x00000A3D}, {0x00000A43, 0x00000A46}, +{0x00000A49, 0x00000A4A}, {0x00000A4E, 0x00000A50}, {0x00000A52, 0x00000A58}, {0x00000A5D, 0x00000A5D}, +{0x00000A5F, 0x00000A65}, {0x00000A77, 0x00000A80}, {0x00000A84, 0x00000A84}, {0x00000A8E, 0x00000A8E}, +{0x00000A92, 0x00000A92}, {0x00000AA9, 0x00000AA9}, {0x00000AB1, 0x00000AB1}, {0x00000AB4, 0x00000AB4}, +{0x00000ABA, 0x00000ABB}, {0x00000AC6, 0x00000AC6}, {0x00000ACA, 0x00000ACA}, {0x00000ACE, 0x00000ACF}, +{0x00000AD1, 0x00000ADF}, {0x00000AE4, 0x00000AE5}, {0x00000AF2, 0x00000AF8}, {0x00000B00, 0x00000B00}, +{0x00000B04, 0x00000B04}, {0x00000B0D, 0x00000B0E}, {0x00000B11, 0x00000B12}, {0x00000B29, 0x00000B29}, +{0x00000B31, 0x00000B31}, {0x00000B34, 0x00000B34}, {0x00000B3A, 0x00000B3B}, {0x00000B45, 0x00000B46}, +{0x00000B49, 0x00000B4A}, {0x00000B4E, 0x00000B54}, {0x00000B58, 0x00000B5B}, {0x00000B5E, 0x00000B5E}, +{0x00000B64, 0x00000B65}, {0x00000B78, 0x00000B81}, {0x00000B84, 0x00000B84}, {0x00000B8B, 0x00000B8D}, +{0x00000B91, 0x00000B91}, {0x00000B96, 0x00000B98}, {0x00000B9B, 0x00000B9B}, {0x00000B9D, 0x00000B9D}, +{0x00000BA0, 0x00000BA2}, {0x00000BA5, 0x00000BA7}, {0x00000BAB, 0x00000BAD}, {0x00000BBA, 0x00000BBD}, +{0x00000BC3, 0x00000BC5}, {0x00000BC9, 0x00000BC9}, {0x00000BCE, 0x00000BCF}, {0x00000BD1, 0x00000BD6}, +{0x00000BD8, 0x00000BE5}, {0x00000BFB, 0x00000BFF}, {0x00000C0D, 0x00000C0D}, {0x00000C11, 0x00000C11}, +{0x00000C29, 0x00000C29}, {0x00000C3A, 0x00000C3B}, {0x00000C45, 0x00000C45}, {0x00000C49, 0x00000C49}, +{0x00000C4E, 0x00000C54}, {0x00000C57, 0x00000C57}, {0x00000C5B, 0x00000C5C}, {0x00000C5E, 0x00000C5F}, +{0x00000C64, 0x00000C65}, {0x00000C70, 0x00000C76}, {0x00000C8D, 0x00000C8D}, {0x00000C91, 0x00000C91}, +{0x00000CA9, 0x00000CA9}, {0x00000CB4, 0x00000CB4}, {0x00000CBA, 0x00000CBB}, {0x00000CC5, 0x00000CC5}, +{0x00000CC9, 0x00000CC9}, {0x00000CCE, 0x00000CD4}, {0x00000CD7, 0x00000CDC}, {0x00000CDF, 0x00000CDF}, +{0x00000CE4, 0x00000CE5}, {0x00000CF0, 0x00000CF0}, {0x00000CF4, 0x00000CFF}, {0x00000D0D, 0x00000D0D}, +{0x00000D11, 0x00000D11}, {0x00000D45, 0x00000D45}, {0x00000D49, 0x00000D49}, {0x00000D50, 0x00000D53}, +{0x00000D64, 0x00000D65}, {0x00000D80, 0x00000D80}, {0x00000D84, 0x00000D84}, {0x00000D97, 0x00000D99}, +{0x00000DB2, 0x00000DB2}, {0x00000DBC, 0x00000DBC}, {0x00000DBE, 0x00000DBF}, {0x00000DC7, 0x00000DC9}, +{0x00000DCB, 0x00000DCE}, {0x00000DD5, 0x00000DD5}, {0x00000DD7, 0x00000DD7}, {0x00000DE0, 0x00000DE5}, +{0x00000DF0, 0x00000DF1}, {0x00000DF5, 0x00000E00}, {0x00000E3B, 0x00000E3E}, {0x00000E5C, 0x00000E80}, +{0x00000E83, 0x00000E83}, {0x00000E85, 0x00000E85}, {0x00000E8B, 0x00000E8B}, {0x00000EA4, 0x00000EA4}, +{0x00000EA6, 0x00000EA6}, {0x00000EBE, 0x00000EBF}, {0x00000EC5, 0x00000EC5}, {0x00000EC7, 0x00000EC7}, +{0x00000ECF, 0x00000ECF}, {0x00000EDA, 0x00000EDB}, {0x00000EE0, 0x00000EFF}, {0x00000F48, 0x00000F48}, +{0x00000F6D, 0x00000F70}, {0x00000F98, 0x00000F98}, {0x00000FBD, 0x00000FBD}, {0x00000FCD, 0x00000FCD}, +{0x00000FDB, 0x00000FFF}, {0x000010C6, 0x000010C6}, {0x000010C8, 0x000010CC}, {0x000010CE, 0x000010CF}, +{0x00001249, 0x00001249}, {0x0000124E, 0x0000124F}, {0x00001257, 0x00001257}, {0x00001259, 0x00001259}, +{0x0000125E, 0x0000125F}, {0x00001289, 0x00001289}, {0x0000128E, 0x0000128F}, {0x000012B1, 0x000012B1}, +{0x000012B6, 0x000012B7}, {0x000012BF, 0x000012BF}, {0x000012C1, 0x000012C1}, {0x000012C6, 0x000012C7}, +{0x000012D7, 0x000012D7}, {0x00001311, 0x00001311}, {0x00001316, 0x00001317}, {0x0000135B, 0x0000135C}, +{0x0000137D, 0x0000137F}, {0x0000139A, 0x0000139F}, {0x000013F6, 0x000013F7}, {0x000013FE, 0x000013FF}, +{0x0000169D, 0x0000169F}, {0x000016F9, 0x000016FF}, {0x00001716, 0x0000171E}, {0x00001737, 0x0000173F}, +{0x00001754, 0x0000175F}, {0x0000176D, 0x0000176D}, {0x00001771, 0x00001771}, {0x00001774, 0x0000177F}, +{0x000017DE, 0x000017DF}, {0x000017EA, 0x000017EF}, {0x000017FA, 0x000017FF}, {0x0000180E, 0x0000180E}, +{0x0000181A, 0x0000181F}, {0x00001879, 0x0000187F}, {0x000018AB, 0x000018AF}, {0x000018F6, 0x000018FF}, +{0x0000191F, 0x0000191F}, {0x0000192C, 0x0000192F}, {0x0000193C, 0x0000193F}, {0x00001941, 0x00001943}, +{0x0000196E, 0x0000196F}, {0x00001975, 0x0000197F}, {0x000019AC, 0x000019AF}, {0x000019CA, 0x000019CF}, +{0x000019DB, 0x000019DD}, {0x00001A1C, 0x00001A1D}, {0x00001A5F, 0x00001A5F}, {0x00001A7D, 0x00001A7E}, +{0x00001A8A, 0x00001A8F}, {0x00001A9A, 0x00001A9F}, {0x00001AAE, 0x00001AAF}, {0x00001ACF, 0x00001AFF}, +{0x00001B4D, 0x00001B4F}, {0x00001B7F, 0x00001B7F}, {0x00001BF4, 0x00001BFB}, {0x00001C38, 0x00001C3A}, +{0x00001C4A, 0x00001C4C}, {0x00001C89, 0x00001C8F}, {0x00001CBB, 0x00001CBC}, {0x00001CC8, 0x00001CCF}, +{0x00001CFB, 0x00001CFF}, {0x00001F16, 0x00001F17}, {0x00001F1E, 0x00001F1F}, {0x00001F46, 0x00001F47}, {0x00001F4E, 0x00001F4F}, {0x00001F58, 0x00001F58}, {0x00001F5A, 0x00001F5A}, {0x00001F5C, 0x00001F5C}, {0x00001F5E, 0x00001F5E}, {0x00001F7E, 0x00001F7F}, {0x00001FB5, 0x00001FB5}, {0x00001FC5, 0x00001FC5}, {0x00001FD4, 0x00001FD5}, {0x00001FDC, 0x00001FDC}, {0x00001FF0, 0x00001FF1}, {0x00001FF5, 0x00001FF5}, {0x00001FFF, 0x00001FFF}, {0x0000200B, 0x0000200F}, {0x0000202A, 0x0000202E}, {0x00002060, 0x0000206F}, -{0x00002072, 0x00002073}, {0x0000208F, 0x0000208F}, {0x0000209D, 0x0000209F}, {0x000020C0, 0x000020CF}, +{0x00002072, 0x00002073}, {0x0000208F, 0x0000208F}, {0x0000209D, 0x0000209F}, {0x000020C1, 0x000020CF}, {0x000020F1, 0x000020FF}, {0x0000218C, 0x0000218F}, {0x00002427, 0x0000243F}, {0x0000244B, 0x0000245F}, -{0x00002B74, 0x00002B75}, {0x00002B96, 0x00002B96}, {0x00002C2F, 0x00002C2F}, {0x00002C5F, 0x00002C5F}, -{0x00002CF4, 0x00002CF8}, {0x00002D26, 0x00002D26}, {0x00002D28, 0x00002D2C}, {0x00002D2E, 0x00002D2F}, -{0x00002D68, 0x00002D6E}, {0x00002D71, 0x00002D7E}, {0x00002D97, 0x00002D9F}, {0x00002DA7, 0x00002DA7}, -{0x00002DAF, 0x00002DAF}, {0x00002DB7, 0x00002DB7}, {0x00002DBF, 0x00002DBF}, {0x00002DC7, 0x00002DC7}, -{0x00002DCF, 0x00002DCF}, {0x00002DD7, 0x00002DD7}, {0x00002DDF, 0x00002DDF}, {0x00002E53, 0x00002E7F}, -{0x00002E9A, 0x00002E9A}, {0x00002EF4, 0x00002EFF}, {0x00002FD6, 0x00002FEF}, {0x00002FFC, 0x00002FFF}, -{0x00003040, 0x00003040}, {0x00003097, 0x00003098}, {0x00003100, 0x00003104}, {0x00003130, 0x00003130}, -{0x0000318F, 0x0000318F}, {0x000031E4, 0x000031EF}, {0x0000321F, 0x0000321F}, {0x00009FFD, 0x00009FFF}, -{0x0000A48D, 0x0000A48F}, {0x0000A4C7, 0x0000A4CF}, {0x0000A62C, 0x0000A63F}, {0x0000A6F8, 0x0000A6FF}, -{0x0000A7C0, 0x0000A7C1}, {0x0000A7CB, 0x0000A7F4}, {0x0000A82D, 0x0000A82F}, {0x0000A83A, 0x0000A83F}, -{0x0000A878, 0x0000A87F}, {0x0000A8C6, 0x0000A8CD}, {0x0000A8DA, 0x0000A8DF}, {0x0000A954, 0x0000A95E}, -{0x0000A97D, 0x0000A97F}, {0x0000A9CE, 0x0000A9CE}, {0x0000A9DA, 0x0000A9DD}, {0x0000A9FF, 0x0000A9FF}, -{0x0000AA37, 0x0000AA3F}, {0x0000AA4E, 0x0000AA4F}, {0x0000AA5A, 0x0000AA5B}, {0x0000AAC3, 0x0000AADA}, -{0x0000AAF7, 0x0000AB00}, {0x0000AB07, 0x0000AB08}, {0x0000AB0F, 0x0000AB10}, {0x0000AB17, 0x0000AB1F}, -{0x0000AB27, 0x0000AB27}, {0x0000AB2F, 0x0000AB2F}, {0x0000AB6C, 0x0000AB6F}, {0x0000ABEE, 0x0000ABEF}, -{0x0000ABFA, 0x0000ABFF}, {0x0000D7A4, 0x0000D7AF}, {0x0000D7C7, 0x0000D7CA}, {0x0000D7FC, 0x0000F8FF}, +{0x00002B74, 0x00002B75}, {0x00002B96, 0x00002B96}, {0x00002CF4, 0x00002CF8}, {0x00002D26, 0x00002D26}, +{0x00002D28, 0x00002D2C}, {0x00002D2E, 0x00002D2F}, {0x00002D68, 0x00002D6E}, {0x00002D71, 0x00002D7E}, +{0x00002D97, 0x00002D9F}, {0x00002DA7, 0x00002DA7}, {0x00002DAF, 0x00002DAF}, {0x00002DB7, 0x00002DB7}, +{0x00002DBF, 0x00002DBF}, {0x00002DC7, 0x00002DC7}, {0x00002DCF, 0x00002DCF}, {0x00002DD7, 0x00002DD7}, +{0x00002DDF, 0x00002DDF}, {0x00002E5E, 0x00002E7F}, {0x00002E9A, 0x00002E9A}, {0x00002EF4, 0x00002EFF}, +{0x00002FD6, 0x00002FEF}, {0x00002FFC, 0x00002FFF}, {0x00003040, 0x00003040}, {0x00003097, 0x00003098}, +{0x00003100, 0x00003104}, {0x00003130, 0x00003130}, {0x0000318F, 0x0000318F}, {0x000031E4, 0x000031EF}, +{0x0000321F, 0x0000321F}, {0x0000A48D, 0x0000A48F}, {0x0000A4C7, 0x0000A4CF}, {0x0000A62C, 0x0000A63F}, +{0x0000A6F8, 0x0000A6FF}, {0x0000A7CB, 0x0000A7CF}, {0x0000A7D2, 0x0000A7D2}, {0x0000A7D4, 0x0000A7D4}, +{0x0000A7DA, 0x0000A7F1}, {0x0000A82D, 0x0000A82F}, {0x0000A83A, 0x0000A83F}, {0x0000A878, 0x0000A87F}, +{0x0000A8C6, 0x0000A8CD}, {0x0000A8DA, 0x0000A8DF}, {0x0000A954, 0x0000A95E}, {0x0000A97D, 0x0000A97F}, +{0x0000A9CE, 0x0000A9CE}, {0x0000A9DA, 0x0000A9DD}, {0x0000A9FF, 0x0000A9FF}, {0x0000AA37, 0x0000AA3F}, +{0x0000AA4E, 0x0000AA4F}, {0x0000AA5A, 0x0000AA5B}, {0x0000AAC3, 0x0000AADA}, {0x0000AAF7, 0x0000AB00}, +{0x0000AB07, 0x0000AB08}, {0x0000AB0F, 0x0000AB10}, {0x0000AB17, 0x0000AB1F}, {0x0000AB27, 0x0000AB27}, +{0x0000AB2F, 0x0000AB2F}, {0x0000AB6C, 0x0000AB6F}, {0x0000ABEE, 0x0000ABEF}, {0x0000ABFA, 0x0000ABFF}, +{0x0000D7A4, 0x0000D7AF}, {0x0000D7C7, 0x0000D7CA}, {0x0000D7FC, 0x0000D7FF}, {0x0000E000, 0x0000F8FF}, {0x0000FA6E, 0x0000FA6F}, {0x0000FADA, 0x0000FAFF}, {0x0000FB07, 0x0000FB12}, {0x0000FB18, 0x0000FB1C}, {0x0000FB37, 0x0000FB37}, {0x0000FB3D, 0x0000FB3D}, {0x0000FB3F, 0x0000FB3F}, {0x0000FB42, 0x0000FB42}, -{0x0000FB45, 0x0000FB45}, {0x0000FBC2, 0x0000FBD2}, {0x0000FD40, 0x0000FD4F}, {0x0000FD90, 0x0000FD91}, -{0x0000FDC8, 0x0000FDEF}, {0x0000FDFE, 0x0000FDFF}, {0x0000FE1A, 0x0000FE1F}, {0x0000FE53, 0x0000FE53}, -{0x0000FE67, 0x0000FE67}, {0x0000FE6C, 0x0000FE6F}, {0x0000FE75, 0x0000FE75}, {0x0000FEFD, 0x0000FF00}, +{0x0000FB45, 0x0000FB45}, {0x0000FBC3, 0x0000FBD2}, {0x0000FD90, 0x0000FD91}, {0x0000FDC8, 0x0000FDCE}, +{0x0000FDD0, 0x0000FDEF}, {0x0000FE1A, 0x0000FE1F}, {0x0000FE53, 0x0000FE53}, {0x0000FE67, 0x0000FE67}, +{0x0000FE6C, 0x0000FE6F}, {0x0000FE75, 0x0000FE75}, {0x0000FEFD, 0x0000FEFE}, {0x0000FF00, 0x0000FF00}, {0x0000FFBF, 0x0000FFC1}, {0x0000FFC8, 0x0000FFC9}, {0x0000FFD0, 0x0000FFD1}, {0x0000FFD8, 0x0000FFD9}, {0x0000FFDD, 0x0000FFDF}, {0x0000FFE7, 0x0000FFE7}, {0x0000FFEF, 0x0000FFFB}, {0x0000FFFE, 0x0000FFFF}, {0x0001000C, 0x0001000C}, {0x00010027, 0x00010027}, {0x0001003B, 0x0001003B}, {0x0001003E, 0x0001003E}, @@ -476,82 +509,91 @@ const std::vector> unicode_ranges_control = { {0x00010324, 0x0001032C}, {0x0001034B, 0x0001034F}, {0x0001037B, 0x0001037F}, {0x0001039E, 0x0001039E}, {0x000103C4, 0x000103C7}, {0x000103D6, 0x000103FF}, {0x0001049E, 0x0001049F}, {0x000104AA, 0x000104AF}, {0x000104D4, 0x000104D7}, {0x000104FC, 0x000104FF}, {0x00010528, 0x0001052F}, {0x00010564, 0x0001056E}, -{0x00010570, 0x000105FF}, {0x00010737, 0x0001073F}, {0x00010756, 0x0001075F}, {0x00010768, 0x000107FF}, -{0x00010806, 0x00010807}, {0x00010809, 0x00010809}, {0x00010836, 0x00010836}, {0x00010839, 0x0001083B}, -{0x0001083D, 0x0001083E}, {0x00010856, 0x00010856}, {0x0001089F, 0x000108A6}, {0x000108B0, 0x000108DF}, -{0x000108F3, 0x000108F3}, {0x000108F6, 0x000108FA}, {0x0001091C, 0x0001091E}, {0x0001093A, 0x0001093E}, -{0x00010940, 0x0001097F}, {0x000109B8, 0x000109BB}, {0x000109D0, 0x000109D1}, {0x00010A04, 0x00010A04}, -{0x00010A07, 0x00010A0B}, {0x00010A14, 0x00010A14}, {0x00010A18, 0x00010A18}, {0x00010A36, 0x00010A37}, -{0x00010A3B, 0x00010A3E}, {0x00010A49, 0x00010A4F}, {0x00010A59, 0x00010A5F}, {0x00010AA0, 0x00010ABF}, -{0x00010AE7, 0x00010AEA}, {0x00010AF7, 0x00010AFF}, {0x00010B36, 0x00010B38}, {0x00010B56, 0x00010B57}, -{0x00010B73, 0x00010B77}, {0x00010B92, 0x00010B98}, {0x00010B9D, 0x00010BA8}, {0x00010BB0, 0x00010BFF}, -{0x00010C49, 0x00010C7F}, {0x00010CB3, 0x00010CBF}, {0x00010CF3, 0x00010CF9}, {0x00010D28, 0x00010D2F}, -{0x00010D3A, 0x00010E5F}, {0x00010E7F, 0x00010E7F}, {0x00010EAA, 0x00010EAA}, {0x00010EAE, 0x00010EAF}, -{0x00010EB2, 0x00010EFF}, {0x00010F28, 0x00010F2F}, {0x00010F5A, 0x00010FAF}, {0x00010FCC, 0x00010FDF}, -{0x00010FF7, 0x00010FFF}, {0x0001104E, 0x00011051}, {0x00011070, 0x0001107E}, {0x000110BD, 0x000110BD}, -{0x000110C2, 0x000110CF}, {0x000110E9, 0x000110EF}, {0x000110FA, 0x000110FF}, {0x00011135, 0x00011135}, -{0x00011148, 0x0001114F}, {0x00011177, 0x0001117F}, {0x000111E0, 0x000111E0}, {0x000111F5, 0x000111FF}, -{0x00011212, 0x00011212}, {0x0001123F, 0x0001127F}, {0x00011287, 0x00011287}, {0x00011289, 0x00011289}, -{0x0001128E, 0x0001128E}, {0x0001129E, 0x0001129E}, {0x000112AA, 0x000112AF}, {0x000112EB, 0x000112EF}, -{0x000112FA, 0x000112FF}, {0x00011304, 0x00011304}, {0x0001130D, 0x0001130E}, {0x00011311, 0x00011312}, -{0x00011329, 0x00011329}, {0x00011331, 0x00011331}, {0x00011334, 0x00011334}, {0x0001133A, 0x0001133A}, -{0x00011345, 0x00011346}, {0x00011349, 0x0001134A}, {0x0001134E, 0x0001134F}, {0x00011351, 0x00011356}, -{0x00011358, 0x0001135C}, {0x00011364, 0x00011365}, {0x0001136D, 0x0001136F}, {0x00011375, 0x000113FF}, -{0x0001145C, 0x0001145C}, {0x00011462, 0x0001147F}, {0x000114C8, 0x000114CF}, {0x000114DA, 0x0001157F}, -{0x000115B6, 0x000115B7}, {0x000115DE, 0x000115FF}, {0x00011645, 0x0001164F}, {0x0001165A, 0x0001165F}, -{0x0001166D, 0x0001167F}, {0x000116B9, 0x000116BF}, {0x000116CA, 0x000116FF}, {0x0001171B, 0x0001171C}, -{0x0001172C, 0x0001172F}, {0x00011740, 0x000117FF}, {0x0001183C, 0x0001189F}, {0x000118F3, 0x000118FE}, -{0x00011907, 0x00011908}, {0x0001190A, 0x0001190B}, {0x00011914, 0x00011914}, {0x00011917, 0x00011917}, -{0x00011936, 0x00011936}, {0x00011939, 0x0001193A}, {0x00011947, 0x0001194F}, {0x0001195A, 0x0001199F}, -{0x000119A8, 0x000119A9}, {0x000119D8, 0x000119D9}, {0x000119E5, 0x000119FF}, {0x00011A48, 0x00011A4F}, -{0x00011AA3, 0x00011ABF}, {0x00011AF9, 0x00011BFF}, {0x00011C09, 0x00011C09}, {0x00011C37, 0x00011C37}, +{0x0001057B, 0x0001057B}, {0x0001058B, 0x0001058B}, {0x00010593, 0x00010593}, {0x00010596, 0x00010596}, +{0x000105A2, 0x000105A2}, {0x000105B2, 0x000105B2}, {0x000105BA, 0x000105BA}, {0x000105BD, 0x000105FF}, +{0x00010737, 0x0001073F}, {0x00010756, 0x0001075F}, {0x00010768, 0x0001077F}, {0x00010786, 0x00010786}, +{0x000107B1, 0x000107B1}, {0x000107BB, 0x000107FF}, {0x00010806, 0x00010807}, {0x00010809, 0x00010809}, +{0x00010836, 0x00010836}, {0x00010839, 0x0001083B}, {0x0001083D, 0x0001083E}, {0x00010856, 0x00010856}, +{0x0001089F, 0x000108A6}, {0x000108B0, 0x000108DF}, {0x000108F3, 0x000108F3}, {0x000108F6, 0x000108FA}, +{0x0001091C, 0x0001091E}, {0x0001093A, 0x0001093E}, {0x00010940, 0x0001097F}, {0x000109B8, 0x000109BB}, +{0x000109D0, 0x000109D1}, {0x00010A04, 0x00010A04}, {0x00010A07, 0x00010A0B}, {0x00010A14, 0x00010A14}, +{0x00010A18, 0x00010A18}, {0x00010A36, 0x00010A37}, {0x00010A3B, 0x00010A3E}, {0x00010A49, 0x00010A4F}, +{0x00010A59, 0x00010A5F}, {0x00010AA0, 0x00010ABF}, {0x00010AE7, 0x00010AEA}, {0x00010AF7, 0x00010AFF}, +{0x00010B36, 0x00010B38}, {0x00010B56, 0x00010B57}, {0x00010B73, 0x00010B77}, {0x00010B92, 0x00010B98}, +{0x00010B9D, 0x00010BA8}, {0x00010BB0, 0x00010BFF}, {0x00010C49, 0x00010C7F}, {0x00010CB3, 0x00010CBF}, +{0x00010CF3, 0x00010CF9}, {0x00010D28, 0x00010D2F}, {0x00010D3A, 0x00010E5F}, {0x00010E7F, 0x00010E7F}, +{0x00010EAA, 0x00010EAA}, {0x00010EAE, 0x00010EAF}, {0x00010EB2, 0x00010EFC}, {0x00010F28, 0x00010F2F}, +{0x00010F5A, 0x00010F6F}, {0x00010F8A, 0x00010FAF}, {0x00010FCC, 0x00010FDF}, {0x00010FF7, 0x00010FFF}, +{0x0001104E, 0x00011051}, {0x00011076, 0x0001107E}, {0x000110BD, 0x000110BD}, {0x000110C3, 0x000110CF}, +{0x000110E9, 0x000110EF}, {0x000110FA, 0x000110FF}, {0x00011135, 0x00011135}, {0x00011148, 0x0001114F}, +{0x00011177, 0x0001117F}, {0x000111E0, 0x000111E0}, {0x000111F5, 0x000111FF}, {0x00011212, 0x00011212}, +{0x00011242, 0x0001127F}, {0x00011287, 0x00011287}, {0x00011289, 0x00011289}, {0x0001128E, 0x0001128E}, +{0x0001129E, 0x0001129E}, {0x000112AA, 0x000112AF}, {0x000112EB, 0x000112EF}, {0x000112FA, 0x000112FF}, +{0x00011304, 0x00011304}, {0x0001130D, 0x0001130E}, {0x00011311, 0x00011312}, {0x00011329, 0x00011329}, +{0x00011331, 0x00011331}, {0x00011334, 0x00011334}, {0x0001133A, 0x0001133A}, {0x00011345, 0x00011346}, +{0x00011349, 0x0001134A}, {0x0001134E, 0x0001134F}, {0x00011351, 0x00011356}, {0x00011358, 0x0001135C}, +{0x00011364, 0x00011365}, {0x0001136D, 0x0001136F}, {0x00011375, 0x000113FF}, {0x0001145C, 0x0001145C}, +{0x00011462, 0x0001147F}, {0x000114C8, 0x000114CF}, {0x000114DA, 0x0001157F}, {0x000115B6, 0x000115B7}, +{0x000115DE, 0x000115FF}, {0x00011645, 0x0001164F}, {0x0001165A, 0x0001165F}, {0x0001166D, 0x0001167F}, +{0x000116BA, 0x000116BF}, {0x000116CA, 0x000116FF}, {0x0001171B, 0x0001171C}, {0x0001172C, 0x0001172F}, +{0x00011747, 0x000117FF}, {0x0001183C, 0x0001189F}, {0x000118F3, 0x000118FE}, {0x00011907, 0x00011908}, +{0x0001190A, 0x0001190B}, {0x00011914, 0x00011914}, {0x00011917, 0x00011917}, {0x00011936, 0x00011936}, +{0x00011939, 0x0001193A}, {0x00011947, 0x0001194F}, {0x0001195A, 0x0001199F}, {0x000119A8, 0x000119A9}, +{0x000119D8, 0x000119D9}, {0x000119E5, 0x000119FF}, {0x00011A48, 0x00011A4F}, {0x00011AA3, 0x00011AAF}, +{0x00011AF9, 0x00011AFF}, {0x00011B0A, 0x00011BFF}, {0x00011C09, 0x00011C09}, {0x00011C37, 0x00011C37}, {0x00011C46, 0x00011C4F}, {0x00011C6D, 0x00011C6F}, {0x00011C90, 0x00011C91}, {0x00011CA8, 0x00011CA8}, {0x00011CB7, 0x00011CFF}, {0x00011D07, 0x00011D07}, {0x00011D0A, 0x00011D0A}, {0x00011D37, 0x00011D39}, {0x00011D3B, 0x00011D3B}, {0x00011D3E, 0x00011D3E}, {0x00011D48, 0x00011D4F}, {0x00011D5A, 0x00011D5F}, {0x00011D66, 0x00011D66}, {0x00011D69, 0x00011D69}, {0x00011D8F, 0x00011D8F}, {0x00011D92, 0x00011D92}, -{0x00011D99, 0x00011D9F}, {0x00011DAA, 0x00011EDF}, {0x00011EF9, 0x00011FAF}, {0x00011FB1, 0x00011FBF}, -{0x00011FF2, 0x00011FFE}, {0x0001239A, 0x000123FF}, {0x0001246F, 0x0001246F}, {0x00012475, 0x0001247F}, -{0x00012544, 0x00012FFF}, {0x0001342F, 0x000143FF}, {0x00014647, 0x000167FF}, {0x00016A39, 0x00016A3F}, -{0x00016A5F, 0x00016A5F}, {0x00016A6A, 0x00016A6D}, {0x00016A70, 0x00016ACF}, {0x00016AEE, 0x00016AEF}, -{0x00016AF6, 0x00016AFF}, {0x00016B46, 0x00016B4F}, {0x00016B5A, 0x00016B5A}, {0x00016B62, 0x00016B62}, -{0x00016B78, 0x00016B7C}, {0x00016B90, 0x00016E3F}, {0x00016E9B, 0x00016EFF}, {0x00016F4B, 0x00016F4E}, -{0x00016F88, 0x00016F8E}, {0x00016FA0, 0x00016FDF}, {0x00016FE5, 0x00016FEF}, {0x00016FF2, 0x00016FFF}, -{0x000187F8, 0x000187FF}, {0x00018CD6, 0x00018CFF}, {0x00018D09, 0x0001AFFF}, {0x0001B11F, 0x0001B14F}, -{0x0001B153, 0x0001B163}, {0x0001B168, 0x0001B16F}, {0x0001B2FC, 0x0001BBFF}, {0x0001BC6B, 0x0001BC6F}, -{0x0001BC7D, 0x0001BC7F}, {0x0001BC89, 0x0001BC8F}, {0x0001BC9A, 0x0001BC9B}, {0x0001BCA0, 0x0001CFFF}, -{0x0001D0F6, 0x0001D0FF}, {0x0001D127, 0x0001D128}, {0x0001D173, 0x0001D17A}, {0x0001D1E9, 0x0001D1FF}, -{0x0001D246, 0x0001D2DF}, {0x0001D2F4, 0x0001D2FF}, {0x0001D357, 0x0001D35F}, {0x0001D379, 0x0001D3FF}, -{0x0001D455, 0x0001D455}, {0x0001D49D, 0x0001D49D}, {0x0001D4A0, 0x0001D4A1}, {0x0001D4A3, 0x0001D4A4}, -{0x0001D4A7, 0x0001D4A8}, {0x0001D4AD, 0x0001D4AD}, {0x0001D4BA, 0x0001D4BA}, {0x0001D4BC, 0x0001D4BC}, -{0x0001D4C4, 0x0001D4C4}, {0x0001D506, 0x0001D506}, {0x0001D50B, 0x0001D50C}, {0x0001D515, 0x0001D515}, -{0x0001D51D, 0x0001D51D}, {0x0001D53A, 0x0001D53A}, {0x0001D53F, 0x0001D53F}, {0x0001D545, 0x0001D545}, -{0x0001D547, 0x0001D549}, {0x0001D551, 0x0001D551}, {0x0001D6A6, 0x0001D6A7}, {0x0001D7CC, 0x0001D7CD}, -{0x0001DA8C, 0x0001DA9A}, {0x0001DAA0, 0x0001DAA0}, {0x0001DAB0, 0x0001DFFF}, {0x0001E007, 0x0001E007}, -{0x0001E019, 0x0001E01A}, {0x0001E022, 0x0001E022}, {0x0001E025, 0x0001E025}, {0x0001E02B, 0x0001E0FF}, -{0x0001E12D, 0x0001E12F}, {0x0001E13E, 0x0001E13F}, {0x0001E14A, 0x0001E14D}, {0x0001E150, 0x0001E2BF}, -{0x0001E2FA, 0x0001E2FE}, {0x0001E300, 0x0001E7FF}, {0x0001E8C5, 0x0001E8C6}, {0x0001E8D7, 0x0001E8FF}, -{0x0001E94C, 0x0001E94F}, {0x0001E95A, 0x0001E95D}, {0x0001E960, 0x0001EC70}, {0x0001ECB5, 0x0001ED00}, -{0x0001ED3E, 0x0001EDFF}, {0x0001EE04, 0x0001EE04}, {0x0001EE20, 0x0001EE20}, {0x0001EE23, 0x0001EE23}, -{0x0001EE25, 0x0001EE26}, {0x0001EE28, 0x0001EE28}, {0x0001EE33, 0x0001EE33}, {0x0001EE38, 0x0001EE38}, -{0x0001EE3A, 0x0001EE3A}, {0x0001EE3C, 0x0001EE41}, {0x0001EE43, 0x0001EE46}, {0x0001EE48, 0x0001EE48}, -{0x0001EE4A, 0x0001EE4A}, {0x0001EE4C, 0x0001EE4C}, {0x0001EE50, 0x0001EE50}, {0x0001EE53, 0x0001EE53}, -{0x0001EE55, 0x0001EE56}, {0x0001EE58, 0x0001EE58}, {0x0001EE5A, 0x0001EE5A}, {0x0001EE5C, 0x0001EE5C}, -{0x0001EE5E, 0x0001EE5E}, {0x0001EE60, 0x0001EE60}, {0x0001EE63, 0x0001EE63}, {0x0001EE65, 0x0001EE66}, -{0x0001EE6B, 0x0001EE6B}, {0x0001EE73, 0x0001EE73}, {0x0001EE78, 0x0001EE78}, {0x0001EE7D, 0x0001EE7D}, -{0x0001EE7F, 0x0001EE7F}, {0x0001EE8A, 0x0001EE8A}, {0x0001EE9C, 0x0001EEA0}, {0x0001EEA4, 0x0001EEA4}, -{0x0001EEAA, 0x0001EEAA}, {0x0001EEBC, 0x0001EEEF}, {0x0001EEF2, 0x0001EFFF}, {0x0001F02C, 0x0001F02F}, -{0x0001F094, 0x0001F09F}, {0x0001F0AF, 0x0001F0B0}, {0x0001F0C0, 0x0001F0C0}, {0x0001F0D0, 0x0001F0D0}, -{0x0001F0F6, 0x0001F0FF}, {0x0001F1AE, 0x0001F1E5}, {0x0001F203, 0x0001F20F}, {0x0001F23C, 0x0001F23F}, -{0x0001F249, 0x0001F24F}, {0x0001F252, 0x0001F25F}, {0x0001F266, 0x0001F2FF}, {0x0001F6D8, 0x0001F6DF}, -{0x0001F6ED, 0x0001F6EF}, {0x0001F6FD, 0x0001F6FF}, {0x0001F774, 0x0001F77F}, {0x0001F7D9, 0x0001F7DF}, -{0x0001F7EC, 0x0001F7FF}, {0x0001F80C, 0x0001F80F}, {0x0001F848, 0x0001F84F}, {0x0001F85A, 0x0001F85F}, -{0x0001F888, 0x0001F88F}, {0x0001F8AE, 0x0001F8AF}, {0x0001F8B2, 0x0001F8FF}, {0x0001F979, 0x0001F979}, -{0x0001F9CC, 0x0001F9CC}, {0x0001FA54, 0x0001FA5F}, {0x0001FA6E, 0x0001FA6F}, {0x0001FA75, 0x0001FA77}, -{0x0001FA7B, 0x0001FA7F}, {0x0001FA87, 0x0001FA8F}, {0x0001FAA9, 0x0001FAAF}, {0x0001FAB7, 0x0001FABF}, -{0x0001FAC3, 0x0001FACF}, {0x0001FAD7, 0x0001FAFF}, {0x0001FB93, 0x0001FB93}, {0x0001FBCB, 0x0001FBEF}, -{0x0001FBFA, 0x0001FFFF}, {0x0002A6DE, 0x0002A6FF}, {0x0002B735, 0x0002B73F}, {0x0002B81E, 0x0002B81F}, -{0x0002CEA2, 0x0002CEAF}, {0x0002EBE1, 0x0002F7FF}, {0x0002FA1E, 0x0002FFFF}, {0x0003134B, 0x000E00FF}, -{0x000E01F0, 0x0010FFFF}, +{0x00011D99, 0x00011D9F}, {0x00011DAA, 0x00011EDF}, {0x00011EF9, 0x00011EFF}, {0x00011F11, 0x00011F11}, +{0x00011F3B, 0x00011F3D}, {0x00011F5A, 0x00011FAF}, {0x00011FB1, 0x00011FBF}, {0x00011FF2, 0x00011FFE}, +{0x0001239A, 0x000123FF}, {0x0001246F, 0x0001246F}, {0x00012475, 0x0001247F}, {0x00012544, 0x00012F8F}, +{0x00012FF3, 0x00012FFF}, {0x00013430, 0x0001343F}, {0x00013456, 0x000143FF}, {0x00014647, 0x000167FF}, +{0x00016A39, 0x00016A3F}, {0x00016A5F, 0x00016A5F}, {0x00016A6A, 0x00016A6D}, {0x00016ABF, 0x00016ABF}, +{0x00016ACA, 0x00016ACF}, {0x00016AEE, 0x00016AEF}, {0x00016AF6, 0x00016AFF}, {0x00016B46, 0x00016B4F}, +{0x00016B5A, 0x00016B5A}, {0x00016B62, 0x00016B62}, {0x00016B78, 0x00016B7C}, {0x00016B90, 0x00016E3F}, +{0x00016E9B, 0x00016EFF}, {0x00016F4B, 0x00016F4E}, {0x00016F88, 0x00016F8E}, {0x00016FA0, 0x00016FDF}, +{0x00016FE5, 0x00016FEF}, {0x00016FF2, 0x00016FFF}, {0x000187F8, 0x000187FF}, {0x00018CD6, 0x00018CFF}, +{0x00018D09, 0x0001AFEF}, {0x0001AFF4, 0x0001AFF4}, {0x0001AFFC, 0x0001AFFC}, {0x0001AFFF, 0x0001AFFF}, +{0x0001B123, 0x0001B131}, {0x0001B133, 0x0001B14F}, {0x0001B153, 0x0001B154}, {0x0001B156, 0x0001B163}, +{0x0001B168, 0x0001B16F}, {0x0001B2FC, 0x0001BBFF}, {0x0001BC6B, 0x0001BC6F}, {0x0001BC7D, 0x0001BC7F}, +{0x0001BC89, 0x0001BC8F}, {0x0001BC9A, 0x0001BC9B}, {0x0001BCA0, 0x0001CEFF}, {0x0001CF2E, 0x0001CF2F}, +{0x0001CF47, 0x0001CF4F}, {0x0001CFC4, 0x0001CFFF}, {0x0001D0F6, 0x0001D0FF}, {0x0001D127, 0x0001D128}, +{0x0001D173, 0x0001D17A}, {0x0001D1EB, 0x0001D1FF}, {0x0001D246, 0x0001D2BF}, {0x0001D2D4, 0x0001D2DF}, +{0x0001D2F4, 0x0001D2FF}, {0x0001D357, 0x0001D35F}, {0x0001D379, 0x0001D3FF}, {0x0001D455, 0x0001D455}, +{0x0001D49D, 0x0001D49D}, {0x0001D4A0, 0x0001D4A1}, {0x0001D4A3, 0x0001D4A4}, {0x0001D4A7, 0x0001D4A8}, +{0x0001D4AD, 0x0001D4AD}, {0x0001D4BA, 0x0001D4BA}, {0x0001D4BC, 0x0001D4BC}, {0x0001D4C4, 0x0001D4C4}, +{0x0001D506, 0x0001D506}, {0x0001D50B, 0x0001D50C}, {0x0001D515, 0x0001D515}, {0x0001D51D, 0x0001D51D}, +{0x0001D53A, 0x0001D53A}, {0x0001D53F, 0x0001D53F}, {0x0001D545, 0x0001D545}, {0x0001D547, 0x0001D549}, +{0x0001D551, 0x0001D551}, {0x0001D6A6, 0x0001D6A7}, {0x0001D7CC, 0x0001D7CD}, {0x0001DA8C, 0x0001DA9A}, +{0x0001DAA0, 0x0001DAA0}, {0x0001DAB0, 0x0001DEFF}, {0x0001DF1F, 0x0001DF24}, {0x0001DF2B, 0x0001DFFF}, +{0x0001E007, 0x0001E007}, {0x0001E019, 0x0001E01A}, {0x0001E022, 0x0001E022}, {0x0001E025, 0x0001E025}, +{0x0001E02B, 0x0001E02F}, {0x0001E06E, 0x0001E08E}, {0x0001E090, 0x0001E0FF}, {0x0001E12D, 0x0001E12F}, +{0x0001E13E, 0x0001E13F}, {0x0001E14A, 0x0001E14D}, {0x0001E150, 0x0001E28F}, {0x0001E2AF, 0x0001E2BF}, +{0x0001E2FA, 0x0001E2FE}, {0x0001E300, 0x0001E4CF}, {0x0001E4FA, 0x0001E7DF}, {0x0001E7E7, 0x0001E7E7}, +{0x0001E7EC, 0x0001E7EC}, {0x0001E7EF, 0x0001E7EF}, {0x0001E7FF, 0x0001E7FF}, {0x0001E8C5, 0x0001E8C6}, +{0x0001E8D7, 0x0001E8FF}, {0x0001E94C, 0x0001E94F}, {0x0001E95A, 0x0001E95D}, {0x0001E960, 0x0001EC70}, +{0x0001ECB5, 0x0001ED00}, {0x0001ED3E, 0x0001EDFF}, {0x0001EE04, 0x0001EE04}, {0x0001EE20, 0x0001EE20}, +{0x0001EE23, 0x0001EE23}, {0x0001EE25, 0x0001EE26}, {0x0001EE28, 0x0001EE28}, {0x0001EE33, 0x0001EE33}, +{0x0001EE38, 0x0001EE38}, {0x0001EE3A, 0x0001EE3A}, {0x0001EE3C, 0x0001EE41}, {0x0001EE43, 0x0001EE46}, +{0x0001EE48, 0x0001EE48}, {0x0001EE4A, 0x0001EE4A}, {0x0001EE4C, 0x0001EE4C}, {0x0001EE50, 0x0001EE50}, +{0x0001EE53, 0x0001EE53}, {0x0001EE55, 0x0001EE56}, {0x0001EE58, 0x0001EE58}, {0x0001EE5A, 0x0001EE5A}, +{0x0001EE5C, 0x0001EE5C}, {0x0001EE5E, 0x0001EE5E}, {0x0001EE60, 0x0001EE60}, {0x0001EE63, 0x0001EE63}, +{0x0001EE65, 0x0001EE66}, {0x0001EE6B, 0x0001EE6B}, {0x0001EE73, 0x0001EE73}, {0x0001EE78, 0x0001EE78}, +{0x0001EE7D, 0x0001EE7D}, {0x0001EE7F, 0x0001EE7F}, {0x0001EE8A, 0x0001EE8A}, {0x0001EE9C, 0x0001EEA0}, +{0x0001EEA4, 0x0001EEA4}, {0x0001EEAA, 0x0001EEAA}, {0x0001EEBC, 0x0001EEEF}, {0x0001EEF2, 0x0001EFFF}, +{0x0001F02C, 0x0001F02F}, {0x0001F094, 0x0001F09F}, {0x0001F0AF, 0x0001F0B0}, {0x0001F0C0, 0x0001F0C0}, +{0x0001F0D0, 0x0001F0D0}, {0x0001F0F6, 0x0001F0FF}, {0x0001F1AE, 0x0001F1E5}, {0x0001F203, 0x0001F20F}, +{0x0001F23C, 0x0001F23F}, {0x0001F249, 0x0001F24F}, {0x0001F252, 0x0001F25F}, {0x0001F266, 0x0001F2FF}, +{0x0001F6D8, 0x0001F6DB}, {0x0001F6ED, 0x0001F6EF}, {0x0001F6FD, 0x0001F6FF}, {0x0001F777, 0x0001F77A}, +{0x0001F7DA, 0x0001F7DF}, {0x0001F7EC, 0x0001F7EF}, {0x0001F7F1, 0x0001F7FF}, {0x0001F80C, 0x0001F80F}, +{0x0001F848, 0x0001F84F}, {0x0001F85A, 0x0001F85F}, {0x0001F888, 0x0001F88F}, {0x0001F8AE, 0x0001F8AF}, +{0x0001F8B2, 0x0001F8FF}, {0x0001FA54, 0x0001FA5F}, {0x0001FA6E, 0x0001FA6F}, {0x0001FA7D, 0x0001FA7F}, +{0x0001FA89, 0x0001FA8F}, {0x0001FABE, 0x0001FABE}, {0x0001FAC6, 0x0001FACD}, {0x0001FADC, 0x0001FADF}, +{0x0001FAE9, 0x0001FAEF}, {0x0001FAF9, 0x0001FAFF}, {0x0001FB93, 0x0001FB93}, {0x0001FBCB, 0x0001FBEF}, +{0x0001FBFA, 0x0001FFFF}, {0x0002A6E0, 0x0002A6FF}, {0x0002B73A, 0x0002B73F}, {0x0002B81E, 0x0002B81F}, +{0x0002CEA2, 0x0002CEAF}, {0x0002EBE1, 0x0002F7FF}, {0x0002FA1E, 0x0002FFFF}, {0x0003134B, 0x0003134F}, +{0x000323B0, 0x000E00FF}, {0x000E01F0, 0x0010FFFF}, }; const std::multimap unicode_map_nfd = { diff --git a/unicode-data.h b/unicode-data.h index cb9dd8aa5..3cf84117c 100644 --- a/unicode-data.h +++ b/unicode-data.h @@ -5,7 +5,7 @@ #include #include -extern const std::vector> unicode_ranges_digit; +extern const std::vector> unicode_ranges_number; extern const std::vector> unicode_ranges_letter; extern const std::vector> unicode_ranges_whitespace; extern const std::vector> unicode_ranges_accent_mark; diff --git a/unicode.cpp b/unicode.cpp index f2ccda05f..955c56965 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -110,9 +110,9 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) static std::unordered_map unicode_cpt_type_map() { std::unordered_map cpt_types; - for (auto p : unicode_ranges_digit) { + for (auto p : unicode_ranges_number) { for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_DIGIT; + cpt_types[i] = CODEPOINT_TYPE_NUMBER; } } for (auto p : unicode_ranges_letter) { @@ -300,13 +300,13 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t collecting_letter = true; collecting = true; } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) { collecting_numeric = true; collecting = true; } else if ( - ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || - (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) + ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || + (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) ) { collecting_special = true; collecting = true; @@ -323,13 +323,13 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) { split_condition = true; } - else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) { + else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) { split_condition = true; } - else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { + else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { split_condition = true; } - else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) { split_condition = true; } } @@ -524,19 +524,19 @@ char32_t unicode_tolower(char32_t cp) { std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { // unicode categories static const std::map k_ucat_enum = { - { "\\p{N}", CODEPOINT_TYPE_DIGIT }, + { "\\p{N}", CODEPOINT_TYPE_NUMBER }, { "\\p{L}", CODEPOINT_TYPE_LETTER }, { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION }, }; static const std::map k_ucat_cpt = { - { CODEPOINT_TYPE_DIGIT, 0xD1 }, + { CODEPOINT_TYPE_NUMBER, 0xD1 }, { CODEPOINT_TYPE_LETTER, 0xD2 }, { CODEPOINT_TYPE_PUNCTUATION, 0xD3 }, }; static const std::map k_ucat_map = { - { CODEPOINT_TYPE_DIGIT, "\x30-\x39" }, // 0-9 + { CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9 { CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} }; diff --git a/unicode.h b/unicode.h index ce2bcef5a..e9026dc81 100644 --- a/unicode.h +++ b/unicode.h @@ -5,7 +5,7 @@ #include #define CODEPOINT_TYPE_UNIDENTIFIED 0 -#define CODEPOINT_TYPE_DIGIT 1 +#define CODEPOINT_TYPE_NUMBER 1 #define CODEPOINT_TYPE_LETTER 2 #define CODEPOINT_TYPE_WHITESPACE 3 #define CODEPOINT_TYPE_ACCENT_MARK 4