*.py: Stylistic adjustments for python (#8233)
* Superflous parens in conditionals were removed. * Unused args in function were removed. * Replaced unused `idx` var with `_` * Initializing file_format and format_version attributes * Renaming constant to capitals * Preventing redefinition of the `f` var Signed-off-by: Jiri Podivin <jpodivin@redhat.com>
This commit is contained in:
parent
6f11a83e4e
commit
566daa5a5b
3 changed files with 17 additions and 13 deletions
|
@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
|
|||
|
||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||
# will be updated with time - contributions welcome
|
||||
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||
|
||||
if len(sys.argv) == 2:
|
||||
token = sys.argv[1]
|
||||
|
@ -100,8 +100,8 @@ def download_file_with_auth(url, token, save_path):
|
|||
response = sess.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
||||
with open(save_path, 'wb') as f:
|
||||
f.write(response.content)
|
||||
with open(save_path, 'wb') as downloaded_file:
|
||||
downloaded_file.write(response.content)
|
||||
logger.info(f"File {save_path} downloaded successfully")
|
||||
|
||||
|
||||
|
@ -160,7 +160,7 @@ for model in models:
|
|||
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
|
||||
continue # Skip to the next model if the tokenizer can't be loaded
|
||||
|
||||
chktok = tokenizer.encode(chktxt)
|
||||
chktok = tokenizer.encode(CHK_TXT)
|
||||
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||
|
||||
logger.info(f"model: {name}")
|
||||
|
@ -192,7 +192,7 @@ src_func = f"""
|
|||
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||
# use in llama.cpp to implement the same pre-tokenizer
|
||||
|
||||
chktxt = {repr(chktxt)}
|
||||
chktxt = {repr(CHK_TXT)}
|
||||
|
||||
chktok = tokenizer.encode(chktxt)
|
||||
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||
|
@ -288,7 +288,7 @@ tests = [
|
|||
"333333333",
|
||||
"Cửa Việt", # llama-bpe fails on this
|
||||
" discards",
|
||||
chktxt,
|
||||
CHK_TXT,
|
||||
]
|
||||
|
||||
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue