From 09917ff062e08d72afa0619b26e8a9d982c1da95 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Fri, 1 Aug 2014 15:50:25 -0400 Subject: [PATCH] Switch unidecode over to the new anunidecode library and write some tests to validate results. --- requirements-nover.txt | 2 +- requirements.txt | 2 +- test/test_util.py | 50 ++++++++++++++++++++++++++++++++++++++++++ util/validation.py | 5 ++--- 4 files changed, 54 insertions(+), 5 deletions(-) create mode 100644 test/test_util.py diff --git a/requirements-nover.txt b/requirements-nover.txt index 6e21f23ae..c0979629b 100644 --- a/requirements-nover.txt +++ b/requirements-nover.txt @@ -30,7 +30,7 @@ reportlab==2.7 blinker raven python-ldap -unidecode pycrypto logentries git+https://github.com/DevTable/aniso8601-fake.git +git+https://github.com/DevTable/anunidecode.git diff --git a/requirements.txt b/requirements.txt index 165dd2cb9..090ade690 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,10 +13,10 @@ PyGithub==1.25.0 PyMySQL==0.6.2 PyPDF2==1.22 SQLAlchemy==0.9.7 -Unidecode==0.04.16 Werkzeug==0.9.6 alembic==0.6.5 git+https://github.com/DevTable/aniso8601-fake.git +git+https://github.com/DevTable/anunidecode.git argparse==1.2.1 beautifulsoup4==4.3.2 blinker==1.3 diff --git a/test/test_util.py b/test/test_util.py new file mode 100644 index 000000000..ae27c670a --- /dev/null +++ b/test/test_util.py @@ -0,0 +1,50 @@ +import unittest + +from itertools import islice + +from util.validation import generate_valid_usernames + +class TestUsernameGenerator(unittest.TestCase): + def assert_generated_output(self, input_username, expected_output): + name_gen = generate_valid_usernames(input_username) + generated_output = list(islice(name_gen, 1))[0] + self.assertEquals(expected_output, generated_output) + + def test_basic_ascii_names(self): + self.assert_generated_output('jake', 'jake') + self.assert_generated_output('frank', 'frank') + + def test_names_with_caps(self): + self.assert_generated_output('Jake', 'jake') + self.assert_generated_output('FranK', 'frank') + + def test_short_names(self): + self.assert_generated_output('a', 'a___') + self.assert_generated_output('ab', 'ab__') + self.assert_generated_output('abc', 'abc_') + + def test_long_names(self): + self.assert_generated_output('abcdefghijklmnopqrstuvwxyz1234567890', + 'abcdefghijklmnopqrstuvwxyz1234') + + def test_unicode_transliteration(self): + self.assert_generated_output(u'\xc6neid', 'aeneid') + self.assert_generated_output(u'\xe9tude', 'etude') + self.assert_generated_output(u'\u5317\u4eb0', 'bei_jing') + self.assert_generated_output(u'\u1515\u14c7\u14c7', 'shanana') + self.assert_generated_output(u'\u13d4\u13b5\u13c6', 'taliqua') + self.assert_generated_output(u'\u0726\u071b\u073d\u0710\u073a', 'ptu_i') + self.assert_generated_output(u'\u0905\u092d\u093f\u091c\u0940\u0924', 'abhijiit') + self.assert_generated_output(u'\u0985\u09ad\u09bf\u099c\u09c0\u09a4', 'abhijiit') + self.assert_generated_output(u'\u0d05\u0d2d\u0d3f\u0d1c\u0d40\u0d24', 'abhijiit') + self.assert_generated_output(u'\u0d2e\u0d32\u0d2f\u0d3e\u0d32\u0d2e\u0d4d', 'mlyaalm') + self.assert_generated_output(u'\ue000', '____') + self.assert_generated_output(u'\u03ff', '____') + + def test_multiple_suggestions(self): + name_gen = generate_valid_usernames('a') + generated_output = list(islice(name_gen, 4)) + self.assertEquals('a___', generated_output[0]) + self.assertEquals('a__0', generated_output[1]) + self.assertEquals('a__1', generated_output[2]) + self.assertEquals('a__2', generated_output[3]) diff --git a/util/validation.py b/util/validation.py index 511c57fe7..e9b954281 100644 --- a/util/validation.py +++ b/util/validation.py @@ -1,7 +1,6 @@ import re import string - -from unidecode import unidecode +import anunidecode INVALID_PASSWORD_MESSAGE = 'Invalid password, password must be at least ' + \ @@ -49,7 +48,7 @@ def _gen_filler_chars(num_filler_chars): def generate_valid_usernames(input_username): - normalized = unidecode(input_username).strip().lower() + normalized = input_username.encode('unidecode', 'ignore').strip().lower() prefix = re.sub(INVALID_USERNAME_CHARACTERS, '_', normalized)[:30] num_filler_chars = max(0, MIN_LENGTH - len(prefix))