lint : fix

This commit is contained in:
Georgi Gerganov 2024-04-26 21:12:05 +03:00
parent 1b9b79dd14
commit 8791e94e3c
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 4 additions and 2 deletions

View file

@@ -20,5 +20,5 @@ jobs:
- name: flake8 Lint - name: flake8 Lint
uses: py-actions/flake8@v2 uses: py-actions/flake8@v2
with: with:
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503" ignore: "E203,E211,E221,E222,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
exclude: "examples/*,examples/*/**,*/**/__init__.py" exclude: "examples/*,examples/*/**,*/**/__init__.py"

View file

@@ -406,7 +406,7 @@ class Model(ABC):
res = "deepseek-coder" res = "deepseek-coder"
if res is None: if res is None:
raise NotImplementedError(f"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
return res return res

View file

@@ -12033,6 +12033,8 @@ struct llm_tokenizer_bpe {
word_collection = unicode_regex_split(text, { word_collection = unicode_regex_split(text, {
// TODO: ?????????????? // TODO: ??????????????
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+
// TODO: this is wrong - need to use ReFlex and update unicode.cpp to support the regex above
"\\p{P}+", "\\p{P}+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+", "\\p{N}+",