diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index f4ae65495..c07b9aa83 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -20,5 +20,5 @@ jobs: - name: flake8 Lint uses: py-actions/flake8@v2 with: - ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503" + ignore: "E203,E211,E221,E222,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503" exclude: "examples/*,examples/*/**,*/**/__init__.py" diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 06fa9996d..af698a23a 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -406,7 +406,7 @@ class Model(ABC): res = "deepseek-coder" if res is None: - raise NotImplementedError(f"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") return res diff --git a/llama.cpp b/llama.cpp index d8e691c18..923a49877 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12033,6 +12033,8 @@ struct llm_tokenizer_bpe { word_collection = unicode_regex_split(text, { // TODO: ?????????????? //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+ + + // TODO: this is wrong - need to use ReFlex and update unicode.cpp to support the regex above "\\p{P}+", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "\\p{N}+",