Switch unidecode over to the new anunidecode library and write some tests to validate results.

This commit is contained in:
Jake Moshenko 2014-08-01 15:50:25 -04:00
parent 2d21dc9293
commit 09917ff062
4 changed files with 54 additions and 5 deletions

View file

@ -30,7 +30,7 @@ reportlab==2.7
blinker blinker
raven raven
python-ldap python-ldap
unidecode
pycrypto pycrypto
logentries logentries
git+https://github.com/DevTable/aniso8601-fake.git git+https://github.com/DevTable/aniso8601-fake.git
git+https://github.com/DevTable/anunidecode.git

View file

@ -13,10 +13,10 @@ PyGithub==1.25.0
PyMySQL==0.6.2 PyMySQL==0.6.2
PyPDF2==1.22 PyPDF2==1.22
SQLAlchemy==0.9.7 SQLAlchemy==0.9.7
Unidecode==0.04.16
Werkzeug==0.9.6 Werkzeug==0.9.6
alembic==0.6.5 alembic==0.6.5
git+https://github.com/DevTable/aniso8601-fake.git git+https://github.com/DevTable/aniso8601-fake.git
git+https://github.com/DevTable/anunidecode.git
argparse==1.2.1 argparse==1.2.1
beautifulsoup4==4.3.2 beautifulsoup4==4.3.2
blinker==1.3 blinker==1.3

50
test/test_util.py Normal file
View file

@ -0,0 +1,50 @@
import unittest
from itertools import islice
from util.validation import generate_valid_usernames
class TestUsernameGenerator(unittest.TestCase):
    """Tests for the suggestions yielded by generate_valid_usernames."""

    def assert_generated_output(self, input_username, expected_output):
        """Assert that the first suggestion for input_username matches.

        Compares one-element lists rather than indexing the result, so a
        generator that yields nothing fails with a readable assertion
        message instead of an IndexError.
        """
        name_gen = generate_valid_usernames(input_username)
        first_suggestion = list(islice(name_gen, 1))
        self.assertEqual([expected_output], first_suggestion)

    def test_basic_ascii_names(self):
        """Lowercase ASCII names are expected back unchanged."""
        self.assert_generated_output('jake', 'jake')
        self.assert_generated_output('frank', 'frank')

    def test_names_with_caps(self):
        """Uppercase letters are expected to be lowercased."""
        self.assert_generated_output('Jake', 'jake')
        self.assert_generated_output('FranK', 'frank')

    def test_short_names(self):
        """Inputs of one to three characters are padded with '_' to four."""
        self.assert_generated_output('a', 'a___')
        self.assert_generated_output('ab', 'ab__')
        self.assert_generated_output('abc', 'abc_')

    def test_long_names(self):
        """A 36-character input is expected to be cut to its first 30."""
        self.assert_generated_output('abcdefghijklmnopqrstuvwxyz1234567890',
                                     'abcdefghijklmnopqrstuvwxyz1234')

    def test_unicode_transliteration(self):
        """Non-ASCII input is expected to be transliterated to ASCII."""
        # Latin letters with ligatures / diacritics.
        self.assert_generated_output(u'\xc6neid', 'aeneid')
        self.assert_generated_output(u'\xe9tude', 'etude')
        # CJK ideographs.
        self.assert_generated_output(u'\u5317\u4eb0', 'bei_jing')
        # Canadian Aboriginal syllabics.
        self.assert_generated_output(u'\u1515\u14c7\u14c7', 'shanana')
        # Cherokee.
        self.assert_generated_output(u'\u13d4\u13b5\u13c6', 'taliqua')
        # Syriac.
        self.assert_generated_output(u'\u0726\u071b\u073d\u0710\u073a',
                                     'ptu_i')
        # Devanagari, Bengali and Malayalam spellings that are expected to
        # produce the same suggestion.
        self.assert_generated_output(u'\u0905\u092d\u093f\u091c\u0940\u0924',
                                     'abhijiit')
        self.assert_generated_output(u'\u0985\u09ad\u09bf\u099c\u09c0\u09a4',
                                     'abhijiit')
        self.assert_generated_output(u'\u0d05\u0d2d\u0d3f\u0d1c\u0d40\u0d24',
                                     'abhijiit')
        self.assert_generated_output(
            u'\u0d2e\u0d32\u0d2f\u0d3e\u0d32\u0d2e\u0d4d', 'mlyaalm')
        # Code points expected to produce no usable characters, leaving
        # only filler.
        self.assert_generated_output(u'\ue000', '____')
        self.assert_generated_output(u'\u03ff', '____')

    def test_multiple_suggestions(self):
        """The first four suggestions for 'a' follow the expected order."""
        name_gen = generate_valid_usernames('a')
        generated_output = list(islice(name_gen, 4))
        # One list comparison reports the full actual output on failure and
        # cannot raise IndexError when fewer than four names are yielded.
        self.assertEqual(['a___', 'a__0', 'a__1', 'a__2'], generated_output)

View file

@ -1,7 +1,6 @@
import re import re
import string import string
import anunidecode
from unidecode import unidecode
INVALID_PASSWORD_MESSAGE = 'Invalid password, password must be at least ' + \ INVALID_PASSWORD_MESSAGE = 'Invalid password, password must be at least ' + \
@ -49,7 +48,7 @@ def _gen_filler_chars(num_filler_chars):
def generate_valid_usernames(input_username): def generate_valid_usernames(input_username):
normalized = unidecode(input_username).strip().lower() normalized = input_username.encode('unidecode', 'ignore').strip().lower()
prefix = re.sub(INVALID_USERNAME_CHARACTERS, '_', normalized)[:30] prefix = re.sub(INVALID_USERNAME_CHARACTERS, '_', normalized)[:30]
num_filler_chars = max(0, MIN_LENGTH - len(prefix)) num_filler_chars = max(0, MIN_LENGTH - len(prefix))