From eb9ca8e8a837c3a85ebf0ea5c1d5d55ccb493edb Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 10 Jan 2019 16:34:56 -0500 Subject: [PATCH] Fix handling of four byte utf8 manifests - Adds the charset: utf-8 to all the manifest responses - Makes sure we connect to MySQL in utf8mb4 mode, to ensure we can properly read and write 4-byte utf8 strings - Adds tests for all of the above --- data/database.py | 10 +++++++ data/registry_model/test/test_interface.py | 32 ++++++++++++++++++++++ endpoints/v2/manifest.py | 4 +-- image/docker/test/test_schema1.py | 19 +++++++++++++ test/fixtures.py | 3 ++ test/registry/protocol_fixtures.py | 14 ++++++++++ test/registry/protocol_v2.py | 17 ++++++------ test/registry/registry_tests.py | 32 ++++++++++++++++++++++ 8 files changed, 120 insertions(+), 11 deletions(-) diff --git a/data/database.py b/data/database.py index 120aee6f3..ac25421ab 100644 --- a/data/database.py +++ b/data/database.py @@ -70,6 +70,12 @@ SCHEME_RANDOM_FUNCTION = { } +_EXTRA_ARGS = { + 'mysql': dict(charset='utf8mb4'), + 'mysql+pymysql': dict(charset='utf8mb4'), +} + + def pipes_concat(arg1, arg2, *extra_args): """ Concat function for sqlite, since it doesn't support fn.Concat. Concatenates clauses with || characters. @@ -315,6 +321,10 @@ def _db_from_url(url, db_kwargs, connect_timeout=DEFAULT_DB_CONNECT_TIMEOUT, db_kwargs.pop('stale_timeout', None) db_kwargs.pop('max_connections', None) + for key, value in _EXTRA_ARGS.get(parsed_url.drivername, {}).iteritems(): + if key not in db_kwargs: + db_kwargs[key] = value + if allow_retry: driver = _wrap_for_retry(driver) diff --git a/data/registry_model/test/test_interface.py b/data/registry_model/test/test_interface.py index c1787faf5..e8c6054fb 100644 --- a/data/registry_model/test/test_interface.py +++ b/data/registry_model/test/test_interface.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + import hashlib import json import uuid @@ -864,3 +866,33 @@ def test_known_issue_schema1(registry_model): assert found.digest == digest assert found.internal_manifest_bytes.as_encoded_str() == manifest.bytes.as_encoded_str() assert found.get_parsed_manifest().digest == digest + + +def test_unicode_emoji(registry_model): + builder = DockerSchema1ManifestBuilder('devtable', 'simple', 'latest') + builder.add_layer('sha256:abcde', json.dumps({ + 'id': 'someid', + 'author': u'😱', + }, ensure_ascii=False)) + + manifest = builder.build(ensure_ascii=False) + manifest._validate() + + for blob_digest in manifest.local_blob_digests: + _populate_blob(blob_digest) + + # Create the manifest in the database. + repository_ref = registry_model.lookup_repository('devtable', 'simple') + created_manifest, _ = registry_model.create_manifest_and_retarget_tag(repository_ref, manifest, + 'latest', storage) + assert created_manifest + assert created_manifest.digest == manifest.digest + assert (created_manifest.internal_manifest_bytes.as_encoded_str() == + manifest.bytes.as_encoded_str()) + + # Look it up again and validate. + found = registry_model.lookup_manifest_by_digest(repository_ref, manifest.digest, allow_dead=True) + assert found + assert found.digest == manifest.digest + assert found.internal_manifest_bytes.as_encoded_str() == manifest.bytes.as_encoded_str() + assert found.get_parsed_manifest().digest == manifest.digest diff --git a/endpoints/v2/manifest.py b/endpoints/v2/manifest.py index 58e012a44..75f818736 100644 --- a/endpoints/v2/manifest.py +++ b/endpoints/v2/manifest.py @@ -76,7 +76,7 @@ def fetch_manifest_by_tagname(namespace_name, repo_name, manifest_ref): supported.bytes.as_unicode(), status=200, headers={ - 'Content-Type': supported.media_type, + 'Content-Type': '%s; charset=utf-8' % supported.media_type, 'Docker-Content-Digest': supported.digest, }, ) @@ -111,7 +111,7 @@ def fetch_manifest_by_digest(namespace_name, repo_name, manifest_ref): metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'v2', True]) return Response(supported.bytes.as_unicode(), status=200, headers={ - 'Content-Type': supported.media_type, + 'Content-Type': '%s; charset=utf-8' % supported.media_type, 'Docker-Content-Digest': supported.digest, }) diff --git a/image/docker/test/test_schema1.py b/image/docker/test/test_schema1.py index 2c290e467..07c323ffc 100644 --- a/image/docker/test/test_schema1.py +++ b/image/docker/test/test_schema1.py @@ -178,3 +178,22 @@ def test_validate_manifest_known_issue(): layers = list(manifest.get_layers(None)) assert layers[-1].author is None + + +@pytest.mark.parametrize('with_key', [ + None, + docker_v2_signing_key, +]) +def test_validate_manifest_with_emoji(with_key): + builder = DockerSchema1ManifestBuilder('somenamespace', 'somerepo', 'sometag') + builder.add_layer('sha256:abcde', json.dumps({ + 'id': 'someid', + 'author': u'😱', + }, ensure_ascii=False)) + + built = builder.build(with_key, ensure_ascii=False) + built._validate() + + # Ensure the manifest can be reloaded. + built_bytes = built.bytes.as_encoded_str() + DockerSchema1Manifest(Bytes.for_string_or_unicode(built_bytes)) diff --git a/test/fixtures.py b/test/fixtures.py index 3c0c1cde5..22a37788b 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -83,6 +83,7 @@ def _init_db_path_sqlite(tmpdir_factory): initialize_database() db.obj.execute_sql('PRAGMA foreign_keys = ON;') + db.obj.execute_sql('PRAGMA encoding="UTF-8";') populate_database() close_db_filter(None) @@ -157,7 +158,9 @@ def initialized_db(appconfig): if not under_test_real_database: # Make absolutely sure foreign key constraints are on. db.obj.execute_sql('PRAGMA foreign_keys = ON;') + db.obj.execute_sql('PRAGMA encoding="UTF-8";') assert db.obj.execute_sql('PRAGMA foreign_keys;').fetchone()[0] == 1 + assert db.obj.execute_sql('PRAGMA encoding;').fetchone()[0] == 'UTF-8' # If under a test *real* database, setup a savepoint. if under_test_real_database: diff --git a/test/registry/protocol_fixtures.py b/test/registry/protocol_fixtures.py index f9ec7e0bd..cefdab22f 100644 --- a/test/registry/protocol_fixtures.py +++ b/test/registry/protocol_fixtures.py @@ -138,6 +138,20 @@ def images_with_empty_layer(): ] +@pytest.fixture(scope="session") +def unicode_emoji_images(): + """ Returns basic images for push and pull testing that contain unicode in the image metadata. """ + # Note: order is from base layer down to leaf. + parent_bytes = layer_bytes_for_contents('parent contents') + image_bytes = layer_bytes_for_contents('some contents') + return [ + Image(id='parentid', bytes=parent_bytes, parent_id=None), + Image(id='someid', bytes=image_bytes, parent_id='parentid', + config={'comment': u'😱', + 'author': u'Sômé guy'}), + ] + + @pytest.fixture(scope="session") def jwk(): return RSAKey(key=RSA.generate(2048)) diff --git a/test/registry/protocol_v2.py b/test/registry/protocol_v2.py index 2b69c663a..1e2580202 100644 --- a/test/registry/protocol_v2.py +++ b/test/registry/protocol_v2.py @@ -168,9 +168,9 @@ class V2Protocol(RegistryProtocol): return None # Parse the returned manifest list and ensure it matches. - assert response.headers['Content-Type'] == DOCKER_SCHEMA2_MANIFESTLIST_CONTENT_TYPE - retrieved = parse_manifest_from_bytes(Bytes.for_string_or_unicode(response.text), - response.headers['Content-Type']) + ct, _ = response.headers['Content-Type'].split(';', 1) + assert ct == DOCKER_SCHEMA2_MANIFESTLIST_CONTENT_TYPE + retrieved = parse_manifest_from_bytes(Bytes.for_string_or_unicode(response.text), ct) assert retrieved.schema_version == 2 assert retrieved.is_manifest_list assert retrieved.digest == manifestlist.digest @@ -185,9 +185,8 @@ class V2Protocol(RegistryProtocol): headers=headers) if expected_failure is not None: return None - - manifest = parse_manifest_from_bytes(Bytes.for_string_or_unicode(response.text), - response.headers['Content-Type']) + ct, _ = response.headers['Content-Type'].split(';', 1) + manifest = parse_manifest_from_bytes(Bytes.for_string_or_unicode(response.text), ct) assert not manifest.is_manifest_list assert manifest.digest == manifest_digest @@ -546,11 +545,11 @@ class V2Protocol(RegistryProtocol): return None # Ensure the manifest returned by us is valid. + ct, _ = response.headers['Content-Type'].split(';', 1) if not self.schema2: - assert response.headers['Content-Type'] in DOCKER_SCHEMA1_CONTENT_TYPES + assert ct in DOCKER_SCHEMA1_CONTENT_TYPES - manifest = parse_manifest_from_bytes(Bytes.for_string_or_unicode(response.text), - response.headers['Content-Type']) + manifest = parse_manifest_from_bytes(Bytes.for_string_or_unicode(response.text), ct) manifests[tag_name] = manifest if manifest.schema_version == 1: diff --git a/test/registry/registry_tests.py b/test/registry/registry_tests.py index 122da98ac..ea5b0ffad 100644 --- a/test/registry/registry_tests.py +++ b/test/registry/registry_tests.py @@ -1822,3 +1822,35 @@ def test_push_legacy_pull_not_allowed(v22_protocol, v1_protocol, remote_images, # Attempt to pull. Should fail with a 404. v1_protocol.pull(liveserver_session, 'devtable', 'newrepo', 'latest', remote_images, credentials=credentials, expected_failure=Failures.UNKNOWN_TAG) + + +def test_push_pull_emoji_unicode(pusher, puller, unicode_emoji_images, liveserver_session, + app_reloader): + """ Test: Push an image with unicode inside and then pull it. """ + credentials = ('devtable', 'password') + + # Push a new repository. + pusher.push(liveserver_session, 'devtable', 'newrepo', 'latest', unicode_emoji_images, + credentials=credentials) + + # Pull the repository to verify. + puller.pull(liveserver_session, 'devtable', 'newrepo', 'latest', unicode_emoji_images, + credentials=credentials) + + +def test_push_pull_emoji_unicode_direct(pusher, puller, unicode_emoji_images, liveserver_session, + app_reloader): + """ Test: Push an image with *unescaped* unicode inside and then pull it. """ + credentials = ('devtable', 'password') + + # Turn off automatic unicode encoding when building the manifests. + options = ProtocolOptions() + options.ensure_ascii = False + + # Push a new repository. + pusher.push(liveserver_session, 'devtable', 'newrepo', 'latest', unicode_emoji_images, + credentials=credentials, options=options) + + # Pull the repository to verify. + puller.pull(liveserver_session, 'devtable', 'newrepo', 'latest', unicode_emoji_images, + credentials=credentials, options=options)