fix: protect against slow tokenizer

This commit is contained in:
Joan Martinez 2024-05-02 18:01:00 +02:00
parent 0f94ff7155
commit c7a6c32882

View file

@ -271,10 +271,14 @@ class Model(ABC):
         chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
-        pre_out = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(tokenizer.backend_tokenizer.normalizer.normalize_str(chktxt))
-        chkhsh = sha256(str(pre_out).encode()).hexdigest()
+        if hasattr(tokenizer, 'backend_tokenizer'):
+            chktok = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(tokenizer.backend_tokenizer.normalizer.normalize_str(chktxt))
+            chkhsh = sha256(str(chktok).encode()).hexdigest()
+        else:
+            chktok = tokenizer.encode(chktxt)
+            chkhsh = sha256(str(chktok).encode()).hexdigest()

-        print(f"pre_out: {pre_out}")
+        print(f"chktok: {chktok}")
         print(f"chkhsh: {chkhsh}")
         res = None