Further fixes for unicode handling in manifests
We were occasionally trying to compute schema 2 version 1 signatures on the *unicode* representation of a manifest, which caused the signature check to fail. This PR adds a new wrapper type called `Bytes`, which all manifests must take in, and which handles the conversion between unicode strings and UTF-8 encoded byte strings in a central location. This PR also adds a test for the manifest that was breaking in production.
This commit is contained in:
parent
05fa2bcbe0
commit
171c7e5238
28 changed files with 275 additions and 106 deletions
32
util/bytes.py
Normal file
32
util/bytes.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
class Bytes(object):
    """ Wrapper around byte strings and unicode objects to ensure we are always using
        the correct encoded or decoded data.

        The wrapped value is always stored as an encoded byte string. The constructor
        does not check that the bytes are valid UTF-8; `for_string_or_unicode` does.
    """

    def __init__(self, data):
        """ Wraps an already-encoded byte string.

            `data` must be a byte string (`str` on Python 2, `bytes` on Python 3). Use
            `for_string_or_unicode` when the input may be a unicode string.
        """
        # `bytes` is an alias of `str` on Python 2, so this is the same check there,
        # while on Python 3 it correctly rejects text strings.
        assert isinstance(data, bytes), 'data must be an encoded byte string'
        self._encoded_data = data

    @classmethod
    def for_string_or_unicode(cls, input):
        """ Builds an instance from either a unicode string or a UTF-8 encoded byte string.

            Unicode input is encoded as UTF-8; byte-string input is checked to be valid
            UTF-8 and wrapped unchanged.

            Raises UnicodeDecodeError if a byte string is not valid UTF-8.
        """
        # Resolve the text type at call time so the same code runs on Python 2
        # (`unicode`) and Python 3 (`str`).
        try:
            text_type = unicode  # noqa: F821 (Python 2 only)
        except NameError:
            text_type = str

        # If the string is a unicode string, then encode its data as UTF-8. Note that
        # we don't catch any encode exceptions here, as we want those to be raised.
        if isinstance(input, text_type):
            return cls(input.encode('utf-8'))

        # Otherwise the data must already be UTF-8 encoded. Decode it purely as a
        # validity check and let UnicodeDecodeError propagate: re-encoding bytes that
        # are not valid UTF-8 cannot succeed, and previously failed with a misleading
        # 'ascii' codec error.
        input.decode('utf-8')
        return cls(input)

    def as_encoded_str(self):
        """ Returns the wrapped data as an encoded byte string. """
        return self._encoded_data

    def as_unicode(self):
        """ Returns the wrapped data decoded from UTF-8 into a unicode string.

            Raises UnicodeDecodeError if the wrapped bytes are not valid UTF-8.
        """
        return self._encoded_data.decode('utf-8')
|
Reference in a new issue