From a43b741f1ba7ee9d00179ef66b8f9a7f7aad4f08 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Mon, 6 Jun 2016 15:38:29 -0400 Subject: [PATCH] Add a uniqueness hash to derived image storage to break caching over tags This allows converted ACIs and squashed images to be unique based on the specified tag. Fixes #92 --- data/database.py | 3 +- ...add_uniqueness_hash_column_for_derived_.py | 29 ++++++++++ data/model/image.py | 26 ++++++--- endpoints/verbs.py | 9 ++-- test/registry_tests.py | 53 +++++++++++++++---- util/canonicaljson.py | 18 +++++++ util/security/fingerprint.py | 21 +------- 7 files changed, 119 insertions(+), 40 deletions(-) create mode 100644 data/migrations/versions/1093d8b212bb_add_uniqueness_hash_column_for_derived_.py create mode 100644 util/canonicaljson.py diff --git a/data/database.py b/data/database.py index fea50c69a..a00b8a139 100644 --- a/data/database.py +++ b/data/database.py @@ -617,12 +617,13 @@ class DerivedStorageForImage(BaseModel): source_image = ForeignKeyField(Image) derivative = ForeignKeyField(ImageStorage) transformation = ForeignKeyField(ImageStorageTransformation) + uniqueness_hash = CharField(null=True) class Meta: database = db read_slaves = (read_slave,) indexes = ( - (('source_image', 'transformation'), True), + (('source_image', 'transformation', 'uniqueness_hash'), True), ) diff --git a/data/migrations/versions/1093d8b212bb_add_uniqueness_hash_column_for_derived_.py b/data/migrations/versions/1093d8b212bb_add_uniqueness_hash_column_for_derived_.py new file mode 100644 index 000000000..cc2c70e2d --- /dev/null +++ b/data/migrations/versions/1093d8b212bb_add_uniqueness_hash_column_for_derived_.py @@ -0,0 +1,29 @@ +"""Add uniqueness hash column for derived image storage + +Revision ID: 1093d8b212bb +Revises: 0f17d94d11eb +Create Date: 2016-06-06 15:27:21.735669 + +""" + +# revision identifiers, used by Alembic. +revision = '1093d8b212bb' +down_revision = '0f17d94d11eb' + +from alembic import op +import sqlalchemy as sa + +def upgrade(tables): + ### commands auto generated by Alembic - please adjust! ### + op.drop_index('derivedstorageforimage_source_image_id_transformation_id', table_name='derivedstorageforimage') + op.add_column('derivedstorageforimage', sa.Column('uniqueness_hash', sa.String(length=255), nullable=True)) + op.create_index('uniqueness_index', 'derivedstorageforimage', ['source_image_id', 'transformation_id', 'uniqueness_hash'], unique=True) + ### end Alembic commands ### + + +def downgrade(tables): + ### commands auto generated by Alembic - please adjust! ### + op.drop_index('uniqueness_index', table_name='derivedstorageforimage') + op.drop_column('derivedstorageforimage', 'uniqueness_hash') + op.create_index('derivedstorageforimage_source_image_id_transformation_id', 'derivedstorageforimage', ['source_image_id', 'transformation_id'], unique=True) + ### end Alembic commands ### diff --git a/data/model/image.py b/data/model/image.py index b865c048b..d5cca8721 100644 --- a/data/model/image.py +++ b/data/model/image.py @@ -1,6 +1,7 @@ import logging import dateutil.parser -import random +import hashlib +import json from peewee import JOIN_LEFT_OUTER, IntegrityError from datetime import datetime @@ -11,6 +12,7 @@ from data.database import (Image, Repository, ImageStoragePlacement, Namespace, ImageStorageLocation, RepositoryPermission, DerivedStorageForImage, ImageStorageTransformation, db_random_func) +from util.canonicaljson import canonicalize logger = logging.getLogger(__name__) @@ -458,27 +460,39 @@ def set_secscan_status(image, indexed, version): .execute()) != 0 -def find_or_create_derived_storage(source_image, transformation_name, preferred_location): - existing = find_derived_storage_for_image(source_image, transformation_name) +def _get_uniqueness_hash(varying_metadata): + if not varying_metadata: + return None + + return hashlib.sha256(json.dumps(canonicalize(varying_metadata))).hexdigest() + + +def find_or_create_derived_storage(source_image, transformation_name, preferred_location, + varying_metadata=None): + existing = find_derived_storage_for_image(source_image, transformation_name, varying_metadata) if existing is not None: return existing logger.debug('Creating storage dervied from source image: %s', source_image.id) + uniqueness_hash = _get_uniqueness_hash(varying_metadata) trans = ImageStorageTransformation.get(name=transformation_name) new_storage = storage.create_v1_storage(preferred_location) DerivedStorageForImage.create(source_image=source_image, derivative=new_storage, - transformation=trans) + transformation=trans, uniqueness_hash=uniqueness_hash) return new_storage -def find_derived_storage_for_image(source_image, transformation_name): +def find_derived_storage_for_image(source_image, transformation_name, varying_metadata=None): + uniqueness_hash = _get_uniqueness_hash(varying_metadata) + try: found = (ImageStorage .select(ImageStorage, DerivedStorageForImage) .join(DerivedStorageForImage) .join(ImageStorageTransformation) .where(DerivedStorageForImage.source_image == source_image, - ImageStorageTransformation.name == transformation_name) + ImageStorageTransformation.name == transformation_name, + DerivedStorageForImage.uniqueness_hash == uniqueness_hash) .get()) found.locations = {placement.location.name for placement in found.imagestorageplacement_set} diff --git a/endpoints/verbs.py b/endpoints/verbs.py index 518cb38be..b4e19c427 100644 --- a/endpoints/verbs.py +++ b/endpoints/verbs.py @@ -171,7 +171,8 @@ def _torrent_repo_verb(repo_image, tag, verb, **kwargs): # Lookup an *existing* derived storage for the verb. If the verb's image storage doesn't exist, # we cannot create it here, so we 406. - derived = model.image.find_derived_storage_for_image(repo_image, verb) + derived = model.image.find_derived_storage_for_image(repo_image, verb, + varying_metadata={'tag': tag}) if not derived: abort(406) @@ -221,7 +222,8 @@ def _repo_verb_signature(namespace, repository, tag, verb, checker=None, **kwarg (repo_image, _, _) = result # Lookup the derived image storage for the verb. - derived = model.image.find_derived_storage_for_image(repo_image, verb) + derived = model.image.find_derived_storage_for_image(repo_image, verb, + varying_metadata={'tag': tag}) if derived is None or derived.uploading: return make_response('', 202) @@ -253,7 +255,8 @@ def _repo_verb(namespace, repository, tag, verb, formatter, sign=False, checker= # Lookup/create the derived image storage for the verb and repo image. derived = model.image.find_or_create_derived_storage(repo_image, verb, - storage.preferred_locations[0]) + storage.preferred_locations[0], + varying_metadata={'tag': tag}) if not derived.uploading: logger.debug('Derived %s image %s exists in storage', verb, derived.uuid) diff --git a/test/registry_tests.py b/test/registry_tests.py index 25e59b632..07c8eb280 100644 --- a/test/registry_tests.py +++ b/test/registry_tests.py @@ -12,7 +12,7 @@ import gpgme import Crypto.Random from cachetools import lru_cache -from flask import request, jsonify, abort +from flask import request, jsonify from flask.blueprints import Blueprint from flask.ext.testing import LiveServerTestCase @@ -342,7 +342,7 @@ class V1RegistryPushMixin(V1RegistryMixin): push_version = 'v1' def do_push(self, namespace, repository, username, password, images=None, expect_failure=None, - munge_shas=False): + munge_shas=False, tag_names=None): images = images or self._get_default_images() auth = (username, password) repo_name = _get_repo_name(namespace, repository) @@ -361,7 +361,6 @@ class V1RegistryPushMixin(V1RegistryMixin): for image_data in images: image_id = image_data['id'] - last_image_id = image_id # PUT /v1/images/{imageID}/json image_json_data = {'id': image_id} @@ -385,9 +384,10 @@ class V1RegistryPushMixin(V1RegistryMixin): headers={'X-Docker-Checksum-Payload': checksum}, auth='sig') - # PUT /v1/repositories/{namespace}/{repository}/tags/latest - self.do_tag(namespace, repository, 'latest', images[-1]['id']) + tag_names = tag_names or ['latest'] + for tag_name in tag_names: + self.do_tag(namespace, repository, tag_name, images[-1]['id']) # PUT /v1/repositories/{namespace}/{repository}/images self.conduct('PUT', '/v1/repositories/%s/images' % repo_name, @@ -1500,19 +1500,21 @@ class TorrentV2PushTests(RegistryTestCaseMixin, TorrentTestMixin, V2RegistryPush class ACIConversionTests(RegistryTestCaseMixin, V1RegistryPushMixin, LiveServerTestCase): """ Tests for registry ACI conversion. """ - def get_converted_image(self): - response = self.conduct('GET', '/c1/aci/localhost:5000/devtable/newrepo/latest/aci/linux/amd64/', auth='sig') + def get_converted_image(self, tag_name='latest'): + url = '/c1/aci/localhost:5000/devtable/newrepo/' + tag_name + '/aci/linux/amd64/' + response = self.conduct('GET', url, auth='sig') tar = tarfile.open(fileobj=StringIO(response.content)) return tar, response.content - def get_converted_signature(self): + def get_converted_signature(self, tag_name='latest'): counter = 0 # Give time for the signature to be written before continuing. As we don't exactly know when # this is (based on CPU conditions when the test is being run), we try a backoff and sleep # approach. while counter < 10: - response = self.conduct('GET', '/c1/aci/localhost:5000/devtable/newrepo/latest/aci.asc/linux/amd64/', auth='sig', expected_code=None) + url = '/c1/aci/localhost:5000/devtable/newrepo/' + tag_name + '/aci.asc/linux/amd64/' + response = self.conduct('GET', url, auth='sig', expected_code=None) if response.status_code == 202 or response.status_code == 404: counter += 1 time.sleep(counter * 2) @@ -1574,7 +1576,8 @@ class ACIConversionTests(RegistryTestCaseMixin, V1RegistryPushMixin, LiveServerT "annotations": [ {"name": "created", "value": ""}, {"name": "homepage", "value": "http://localhost:5000/devtable/newrepo:latest"}, - {"name": "quay.io/derived-image", "value": "fa916d5ca4da5348628dfffcfc943288a0cca521cd21a6d2981a85ec1d7f7a3a"} + {"name": "quay.io/derived-image", + "value": "fa916d5ca4da5348628dfffcfc943288a0cca521cd21a6d2981a85ec1d7f7a3a"} ] }, "labels": [ @@ -1609,6 +1612,36 @@ class ACIConversionTests(RegistryTestCaseMixin, V1RegistryPushMixin, LiveServerT self._verify_signature(signature_again, converted) self._verify_signature(signature_again, converted_again) + def assertHasDerivedImage(self, manifest, expected): + for annotation in manifest['app']['annotations']: + if annotation['name'] == 'homepage': + self.assertEqual(expected, annotation['value']) + return + + self.fail('Derived image annotation not found in metadata') + + def test_conversion_different_tags(self): + initial_images = [ + { + 'id': 'initialid', + 'contents': 'the initial image', + }, + ] + + # Create the repo. + self.do_push('devtable', 'newrepo', 'devtable', 'password', images=initial_images, + tag_names=['latest', 'sometag']) + + # Pull the squashed version of tag latest. + latest_tar, _ = self.get_converted_image(tag_name='latest') + latest_manifest = json.loads(latest_tar.extractfile(latest_tar.getmember('manifest')).read()) + self.assertHasDerivedImage(latest_manifest, 'http://localhost:5000/devtable/newrepo:latest') + + # Pull the squashed version of tag sometag. + sometag_tar, _ = self.get_converted_image(tag_name='sometag') + sometag_manifest = json.loads(sometag_tar.extractfile(sometag_tar.getmember('manifest')).read()) + self.assertHasDerivedImage(sometag_manifest, 'http://localhost:5000/devtable/newrepo:sometag') + def test_multilayer_conversion(self): images = [ diff --git a/util/canonicaljson.py b/util/canonicaljson.py new file mode 100644 index 000000000..17f61df8d --- /dev/null +++ b/util/canonicaljson.py @@ -0,0 +1,18 @@ +import collections + +def canonicalize(json_obj): + """This function canonicalizes a Python object that will be serialized as JSON. + + Args: + json_obj (object): the Python object that will later be serialized as JSON. + + Returns: + object: json_obj now sorted to its canonical form. + + """ + if isinstance(json_obj, collections.MutableMapping): + sorted_obj = sorted({key: canonicalize(val) for key, val in json_obj.items()}.items()) + return collections.OrderedDict(sorted_obj) + elif isinstance(json_obj, (list, tuple)): + return [canonicalize(val) for val in json_obj] + return json_obj diff --git a/util/security/fingerprint.py b/util/security/fingerprint.py index 1341d5780..51212aae7 100644 --- a/util/security/fingerprint.py +++ b/util/security/fingerprint.py @@ -1,26 +1,7 @@ -import collections import json from hashlib import sha256 - - -def canonicalize(json_obj): - """This function canonicalizes a Python object that will be serialized as JSON. - - Args: - json_obj (object): the Python object that will later be serialized as JSON. - - Returns: - object: json_obj now sorted to its canonical form. - - """ - if isinstance(json_obj, collections.MutableMapping): - sorted_obj = sorted({key: canonicalize(val) for key, val in json_obj.items()}.items()) - return collections.OrderedDict(sorted_obj) - elif isinstance(json_obj, (list, tuple)): - return [canonicalize(val) for val in json_obj] - return json_obj - +from util.canonicaljson import canonicalize def canonical_kid(jwk): """This function returns the SHA256 hash of a canonical JWK.