From e97328939734fa2eb6983593bc44c2c320d8f9fb Mon Sep 17 00:00:00 2001
From: Jimmy Zelinskie
Date: Fri, 23 Oct 2015 15:24:47 -0400
Subject: [PATCH] Revert "Revert "Merge pull request #682 from
 jzelinskie/revertrevert""

This reverts commit 278bc736e30dc893917f6368e1fceb80c1d71a81.
---
 buildman/jobutil/buildjob.py                   |  5 +-
 ...15d01_backfill_image_fields_from_image_.py  | 24 +++++
 data/model/image.py                            | 11 ++-
 digest/checksums.py                            |  2 +-
 endpoints/api/image.py                         | 12 +--
 endpoints/api/repository.py                    |  2 +-
 endpoints/v1/registry.py                       | 32 +++----
 endpoints/verbs.py                             |  9 +-
 util/migrate/backfill_aggregate_sizes.py       | 50 ++++++-----
 util/migrate/backfill_image_fields.py          | 87 +++++++++++++++++++
 util/migrate/backfill_v1_metadata.py           | 67 ++++++++++++++
 11 files changed, 236 insertions(+), 65 deletions(-)
 create mode 100644 data/migrations/versions/2e0380215d01_backfill_image_fields_from_image_.py
 create mode 100644 util/migrate/backfill_image_fields.py
 create mode 100644 util/migrate/backfill_v1_metadata.py

diff --git a/buildman/jobutil/buildjob.py b/buildman/jobutil/buildjob.py
index f6291f62b..dbbb8113f 100644
--- a/buildman/jobutil/buildjob.py
+++ b/buildman/jobutil/buildjob.py
@@ -128,10 +128,9 @@ class BuildJob(object):
         return False
 
       full_command = '["/bin/sh", "-c", "%s"]' % cache_commands[step]
-      logger.debug('Checking step #%s: %s, %s == %s', step, image.id,
-                   image.storage.command, full_command)
+      logger.debug('Checking step #%s: %s, %s == %s', step, image.id, image.command, full_command)
 
-      return image.storage.command == full_command
+      return image.command == full_command
 
     path = tree.find_longest_path(base_image.id, checker)
     if not path:
diff --git a/data/migrations/versions/2e0380215d01_backfill_image_fields_from_image_.py b/data/migrations/versions/2e0380215d01_backfill_image_fields_from_image_.py
new file mode 100644
index 000000000..93d89ed6e
--- /dev/null
+++ b/data/migrations/versions/2e0380215d01_backfill_image_fields_from_image_.py
@@ -0,0 +1,24 @@
+"""Backfill image fields from image storages
+
+Revision ID: 2e0380215d01
+Revises: 3ff4fbc94644
+Create Date: 2015-09-15 16:57:42.850246
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = '2e0380215d01'
+down_revision = '3ff4fbc94644'
+
+from alembic import op
+import sqlalchemy as sa
+from util.migrate.backfill_image_fields import backfill_image_fields
+from util.migrate.backfill_v1_metadata import backfill_v1_metadata
+
+
+def upgrade(tables):
+  backfill_image_fields()
+  backfill_v1_metadata()
+
+def downgrade(tables):
+  pass
diff --git a/data/model/image.py b/data/model/image.py
index 00707d176..82953cd86 100644
--- a/data/model/image.py
+++ b/data/model/image.py
@@ -79,7 +79,14 @@ def get_repository_images_base(namespace_name, repository_name, query_modifier)
 
   query = query_modifier(query)
 
-  location_list = list(query)
+  return invert_placement_query_results(query)
+
+
+def invert_placement_query_results(placement_query):
+  """ This method will take a query which returns placements, storages, and images, and have it
+      return images and their storages, along with the placement set on each storage.
+  """
+  location_list = list(placement_query)
 
   images = {}
   for location in location_list:
@@ -334,7 +341,6 @@ def set_image_size(docker_image_id, namespace_name, repository_name, image_size
   try:
     # TODO(jschorr): Switch to this faster route once we have full ancestor aggregate_size
     # parent_image = Image.get(Image.id == ancestors[-1])
-    # total_size = image_size + parent_image.storage.aggregate_size
     ancestor_size = (ImageStorage
                      .select(fn.Sum(ImageStorage.image_size))
                      .join(Image)
@@ -343,6 +349,7 @@ def set_image_size(docker_image_id, namespace_name, repository_name, image_size
 
     # TODO stop writing to storage when all readers are removed
     if ancestor_size is not None:
+      # total_size = image_size + parent_image.storage.aggregate_size
      total_size = ancestor_size + image_size
       image.storage.aggregate_size = total_size
       image.aggregate_size = total_size
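
Note: the new invert_placement_query_results helper takes a query that yields one row per
storage placement (each row joining placement, storage, and image) and collapses it into a
list of unique images, attaching the full set of placement locations to each image's storage.
A minimal sketch of that inversion, using stand-in objects rather than peewee rows (attribute
names here are illustrative, not the patch's exact code):

    # Placement-level rows (one per placement) collapse into unique images;
    # each image's storage accumulates the location names of all placements
    # that reference it.
    def invert_placements(placement_rows):
      images = {}
      for placement in placement_rows:
        image = placement.storage.image   # stand-in backref; the real query joins Image
        if image.id not in images:
          image.storage.locations = set()
          images[image.id] = image
        images[image.id].storage.locations.add(placement.location.name)
      return list(images.values())
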
+ """ + location_list = list(placement_query) images = {} for location in location_list: @@ -334,7 +341,6 @@ def set_image_size(docker_image_id, namespace_name, repository_name, image_size, try: # TODO(jschorr): Switch to this faster route once we have full ancestor aggregate_size # parent_image = Image.get(Image.id == ancestors[-1]) - # total_size = image_size + parent_image.storage.aggregate_size ancestor_size = (ImageStorage .select(fn.Sum(ImageStorage.image_size)) .join(Image) @@ -343,6 +349,7 @@ def set_image_size(docker_image_id, namespace_name, repository_name, image_size, # TODO stop writing to storage when all readers are removed if ancestor_size is not None: + # total_size = image_size + parent_image.storage.aggregate_size total_size = ancestor_size + image_size image.storage.aggregate_size = total_size image.aggregate_size = total_size diff --git a/digest/checksums.py b/digest/checksums.py index 154907823..ea30e4dc1 100644 --- a/digest/checksums.py +++ b/digest/checksums.py @@ -68,7 +68,7 @@ def compute_tarsum(fp, json_data): def simple_checksum_handler(json_data): - h = hashlib.sha256(json_data + '\n') + h = hashlib.sha256(json_data.encode('utf8') + '\n') def fn(buf): h.update(buf) diff --git a/endpoints/api/image.py b/endpoints/api/image.py index 014bed412..057e02cae 100644 --- a/endpoints/api/image.py +++ b/endpoints/api/image.py @@ -12,11 +12,7 @@ from util.cache import cache_control_flask_restful def image_view(image, image_map, include_ancestors=True): - extended_props = image - if image.storage and image.storage.id: - extended_props = image.storage - - command = extended_props.command + command = image.command def docker_id(aid): if not aid or not aid in image_map: @@ -26,10 +22,10 @@ def image_view(image, image_map, include_ancestors=True): image_data = { 'id': image.docker_image_id, - 'created': format_date(extended_props.created), - 'comment': extended_props.comment, + 'created': format_date(image.created), + 'comment': image.comment, 'command': json.loads(command) if command else None, - 'size': extended_props.image_size, + 'size': image.storage.image_size, 'uploading': image.storage.uploading, 'sort_index': len(image.ancestors), } diff --git a/endpoints/api/repository.py b/endpoints/api/repository.py index d177403c5..b9664864e 100644 --- a/endpoints/api/repository.py +++ b/endpoints/api/repository.py @@ -254,7 +254,7 @@ class Repository(RepositoryParamResource): tag_info = { 'name': tag.name, 'image_id': tag.image.docker_image_id, - 'size': tag.image.storage.aggregate_size + 'size': tag.image.aggregate_size } if tag.lifetime_start_ts > 0: diff --git a/endpoints/v1/registry.py b/endpoints/v1/registry.py index 082a07864..636da6f30 100644 --- a/endpoints/v1/registry.py +++ b/endpoints/v1/registry.py @@ -211,11 +211,10 @@ def put_image_layer(namespace, repository, image_id): try: logger.debug('Retrieving image data') uuid = repo_image.storage.uuid - json_data = store.get_content(repo_image.storage.locations, store.image_json_path(uuid)) - except (IOError, AttributeError): + json_data = repo_image.v1_json_metadata + except (AttributeError): logger.exception('Exception when retrieving image data') - abort(404, 'Image %(image_id)s not found', issue='unknown-image', - image_id=image_id) + abort(404, 'Image %(image_id)s not found', issue='unknown-image', image_id=image_id) logger.debug('Retrieving image path info') layer_path = store.image_layer_path(uuid) @@ -332,8 +331,7 @@ def put_image_checksum(namespace, repository, image_id): abort(404, 'Image not found: %(image_id)s', 
diff --git a/endpoints/v1/registry.py b/endpoints/v1/registry.py
index 082a07864..636da6f30 100644
--- a/endpoints/v1/registry.py
+++ b/endpoints/v1/registry.py
@@ -211,11 +211,10 @@ def put_image_layer(namespace, repository, image_id):
   try:
     logger.debug('Retrieving image data')
     uuid = repo_image.storage.uuid
-    json_data = store.get_content(repo_image.storage.locations, store.image_json_path(uuid))
-  except (IOError, AttributeError):
+    json_data = repo_image.v1_json_metadata
+  except AttributeError:
     logger.exception('Exception when retrieving image data')
-    abort(404, 'Image %(image_id)s not found', issue='unknown-image',
-          image_id=image_id)
+    abort(404, 'Image %(image_id)s not found', issue='unknown-image', image_id=image_id)
 
   logger.debug('Retrieving image path info')
   layer_path = store.image_layer_path(uuid)
@@ -332,8 +331,7 @@ def put_image_checksum(namespace, repository, image_id):
     abort(404, 'Image not found: %(image_id)s',
           issue='unknown-image', image_id=image_id)
 
   logger.debug('Looking up repo layer data')
-  uuid = repo_image.storage.uuid
-  if not store.exists(repo_image.storage.locations, store.image_json_path(uuid)):
+  if not repo_image.v1_json_metadata:
     abort(404, 'Image not found: %(image_id)s', issue='unknown-image', image_id=image_id)
 
   logger.debug('Marking image path')
@@ -373,12 +371,7 @@ def get_image_json(namespace, repository, image_id, headers):
 
   logger.debug('Looking up repo image')
   repo_image = model.image.get_repo_image_extended(namespace, repository, image_id)
-
-  logger.debug('Looking up repo layer data')
-  try:
-    uuid = repo_image.storage.uuid
-    data = store.get_content(repo_image.storage.locations, store.image_json_path(uuid))
-  except (IOError, AttributeError):
+  if repo_image is None:
     flask_abort(404)
 
   logger.debug('Looking up repo layer size')
@@ -388,7 +381,7 @@ def get_image_json(namespace, repository, image_id, headers):
 
   # so handle this case rather than failing.
   headers['X-Docker-Size'] = str(size)
 
-  response = make_response(data, 200)
+  response = make_response(repo_image.v1_json_metadata, 200)
   response.headers.extend(headers)
   return response
@@ -491,8 +484,6 @@ def put_image_json(namespace, repository, image_id):
     model.tag.create_temporary_hidden_tag(repo, repo_image,
                                           app.config['PUSH_TEMP_TAG_EXPIRATION_SEC'])
 
-  uuid = repo_image.storage.uuid
-
   if image_id != data['id']:
     abort(400, 'JSON data contains invalid id for image: %(image_id)s',
           issue='invalid-request', image_id=image_id)
@@ -510,17 +501,12 @@ def put_image_json(namespace, repository, image_id):
 
   if parent_id:
     logger.debug('Looking up parent image data')
-    if (parent_id and not
-        store.exists(parent_locations, store.image_json_path(parent_uuid))):
+    if parent_id and not parent_image.v1_json_metadata:
       abort(400, 'Image %(image_id)s depends on non existing parent image %(parent_id)s',
             issue='invalid-request', image_id=image_id, parent_id=parent_id)
 
-  logger.debug('Looking up image storage paths')
-  json_path = store.image_json_path(uuid)
-
   logger.debug('Checking if image already exists')
-  if (store.exists(repo_image.storage.locations, json_path) and not
-      image_is_uploading(repo_image)):
+  if repo_image.v1_json_metadata and not image_is_uploading(repo_image):
     exact_abort(409, 'Image already exists')
 
   set_uploading_flag(repo_image, True)
@@ -536,6 +522,8 @@ def put_image_json(namespace, repository, image_id):
                                data.get('comment'), command, v1_metadata, parent_image)
 
   logger.debug('Putting json path')
+  uuid = repo_image.storage.uuid
+  json_path = store.image_json_path(uuid)
   store.put_content(repo_image.storage.locations, json_path, request.data)
 
   logger.debug('Generating image ancestry')
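
Note: every registry.py change above swaps an object-storage read of the image JSON
(store.get_content on store.image_json_path(uuid)) for the new denormalized
v1_json_metadata column, so GETs no longer need a storage round-trip; put_image_json still
writes the JSON to storage as well. Schematically, the new read path looks like this
(hypothetical helper, not the patch's exact code; the explicit None check on the column is an
assumption mirroring put_image_checksum):

    # Serve the v1 image JSON straight from the database row.
    def get_image_json_sketch(repo_image, headers):
      if repo_image is None or repo_image.v1_json_metadata is None:
        return 'Image not found', 404, headers
      headers['X-Docker-Size'] = str(repo_image.storage.image_size)
      return repo_image.v1_json_metadata, 200, headers
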
diff --git a/endpoints/verbs.py b/endpoints/verbs.py
index be0067d1d..6201bd897 100644
--- a/endpoints/verbs.py
+++ b/endpoints/verbs.py
@@ -114,17 +114,15 @@ def _verify_repo_verb(store, namespace, repository, tag, verb, checker=None):
     abort(404)
 
   # Lookup the tag's image and storage.
-  repo_image = model.image.get_repo_image_extended(namespace, repository, tag_image.docker_image_id)
+  repo_image = model.image.get_repo_image(namespace, repository, tag_image.docker_image_id)
   if not repo_image:
     abort(404)
 
   # If there is a data checker, call it first.
-  uuid = repo_image.storage.uuid
   image_json = None
   if checker is not None:
-    image_json_data = store.get_content(repo_image.storage.locations, store.image_json_path(uuid))
-    image_json = json.loads(image_json_data)
+    image_json = json.loads(repo_image.v1_json_metadata)
 
     if not checker(image_json):
       logger.debug('Check mismatch on %s/%s:%s, verb %s', namespace, repository, tag, verb)
@@ -193,8 +191,7 @@ def _repo_verb(namespace, repository, tag, verb, formatter, sign=False, checker
 
   # Load the image's JSON layer.
   if not image_json:
-    image_json_data = store.get_content(repo_image.storage.locations, store.image_json_path(uuid))
-    image_json = json.loads(image_json_data)
+    image_json = json.loads(repo_image.v1_json_metadata)
 
   # Calculate a synthetic image ID.
   synthetic_image_id = hashlib.sha256(tag_image.docker_image_id + ':' + verb).hexdigest()
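
Note: _repo_verb derives the synthetic image ID for a verb-generated image by hashing the tag
image's Docker ID together with the verb name, which yields a stable, distinct ID per
(image, verb) pair without colliding with real v1 image IDs. For example (Python 2 string
semantics, as in the surrounding code; the image ID and verb strings are made up):

    import hashlib

    docker_image_id = 'a' * 64   # illustrative 64-hex-char v1 image ID
    squash_id = hashlib.sha256(docker_image_id + ':' + 'squash').hexdigest()
    other_id = hashlib.sha256(docker_image_id + ':' + 'other').hexdigest()
    # Same image, different verbs: different but reproducible synthetic IDs.
    assert squash_id != other_id
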
diff --git a/util/migrate/backfill_aggregate_sizes.py b/util/migrate/backfill_aggregate_sizes.py
index 8e624753b..db6f195f2 100644
--- a/util/migrate/backfill_aggregate_sizes.py
+++ b/util/migrate/backfill_aggregate_sizes.py
@@ -1,44 +1,50 @@
 import logging
 
-from data.database import ImageStorage, Image, db
+from data.database import ImageStorage, Image, db, db_for_update
 from app import app
 
-LOGGER = logging.getLogger(__name__)
+
+logger = logging.getLogger(__name__)
+
 
 def backfill_aggregate_sizes():
   """ Generates aggregate sizes for any image storage entries without them """
-  LOGGER.setLevel(logging.DEBUG)
-  LOGGER.debug('Aggregate sizes backfill: Began execution')
+  logger.debug('Aggregate sizes backfill: Began execution')
   while True:
-    batch_storage_ids = list(ImageStorage
-                             .select(ImageStorage.id)
-                             .where(ImageStorage.aggregate_size >> None)
-                             .limit(10))
+    batch_image_ids = list(Image
+                           .select(Image.id)
+                           .where(Image.aggregate_size >> None)
+                           .limit(100))
 
-    if len(batch_storage_ids) == 0:
+    if len(batch_image_ids) == 0:
       # There are no storages left to backfill. We're done!
-      LOGGER.debug('Aggregate sizes backfill: Backfill completed')
+      logger.debug('Aggregate sizes backfill: Backfill completed')
       return
 
-    LOGGER.debug('Aggregate sizes backfill: Found %s records to update', len(batch_storage_ids))
-    for image_storage_id in batch_storage_ids:
-      LOGGER.debug('Updating image storage: %s', image_storage_id.id)
+    logger.debug('Aggregate sizes backfill: Found %s records to update', len(batch_image_ids))
+    for image_id in batch_image_ids:
+      logger.debug('Updating image: %s', image_id.id)
 
       with app.config['DB_TRANSACTION_FACTORY'](db):
         try:
-          storage = ImageStorage.select().where(ImageStorage.id == image_storage_id.id).get()
-          image = Image.select().where(Image.storage == storage).get()
+          image = (Image
+                   .select(Image, ImageStorage)
+                   .join(ImageStorage)
+                   .where(Image.id == image_id)
+                   .get())
+
+          aggregate_size = image.storage.image_size
 
           image_ids = image.ancestors.split('/')[1:-1]
-          aggregate_size = storage.image_size
           for image_id in image_ids:
-            current_image = Image.select().where(Image.id == image_id).join(ImageStorage)
-            aggregate_size += image.storage.image_size
+            to_add = db_for_update(Image
+                                   .select(Image, ImageStorage)
+                                   .join(ImageStorage)
+                                   .where(Image.id == image_id)).get()
+            aggregate_size += to_add.storage.image_size
 
-          storage.aggregate_size = aggregate_size
-          storage.save()
-        except ImageStorage.DoesNotExist:
-          pass
+          image.aggregate_size = aggregate_size
+          image.save()
         except Image.DoesNotExist:
           pass
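
Note: an image's aggregate size is its own storage size plus the storage size of every
ancestor, and the ancestors column stores '/'-delimited database IDs with leading and
trailing slashes (e.g. /1/2/6/7/10/, per the model comment in the vendored Image class
below). The [1:-1] slice drops the empty strings those sentinel slashes produce. A worked
example with invented sizes:

    ancestors = '/1/2/6/7/10/'
    ancestor_ids = ancestors.split('/')[1:-1]
    assert ancestor_ids == ['1', '2', '6', '7', '10']

    # Aggregate size = own storage size + each ancestor's storage size.
    sizes = {'1': 10, '2': 20, '6': 5, '7': 7, '10': 3}
    own_size = 100
    aggregate_size = own_size + sum(sizes[i] for i in ancestor_ids)
    assert aggregate_size == 145
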
""" + logger.debug('Image metadata backfill: Began execution') + while True: + batch_image_ids = list(Image + .select(Image.id) + .join(ImageStorage) + .where(Image.created >> None, Image.comment >> None, + Image.command >> None, Image.aggregate_size >> None, + ImageStorage.uploading == False, + ~((ImageStorage.created >> None) & + (ImageStorage.comment >> None) & + (ImageStorage.command >> None) & + (ImageStorage.aggregate_size >> None))) + .limit(100)) + + if len(batch_image_ids) == 0: + logger.debug('Image metadata backfill: Backfill completed') + return + + logger.debug('Image metadata backfill: Found %s records to update', len(batch_image_ids)) + for image_id in batch_image_ids: + logger.debug('Updating image: %s', image_id.id) + + with app.config['DB_TRANSACTION_FACTORY'](db): + try: + image = db_for_update(Image + .select(Image, ImageStorage) + .join(ImageStorage) + .where(Image.id == image_id.id)).get() + + image.created = image.storage.created + image.comment = image.storage.comment + image.command = image.storage.command + image.aggregate_size = image.storage.aggregate_size + image.save() + except Image.DoesNotExist: + pass + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + logging.getLogger('peewee').setLevel(logging.CRITICAL) + backfill_image_fields() diff --git a/util/migrate/backfill_v1_metadata.py b/util/migrate/backfill_v1_metadata.py new file mode 100644 index 000000000..be7a37c93 --- /dev/null +++ b/util/migrate/backfill_v1_metadata.py @@ -0,0 +1,67 @@ +import logging + +from peewee import JOIN_LEFT_OUTER + +from data.database import (Image, ImageStorage, ImageStoragePlacement, ImageStorageLocation, db, + db_for_update) +from app import app, storage +from data import model + + +logger = logging.getLogger(__name__) + + +def backfill_v1_metadata(): + """ Copies metadata from image storages to their images. 
""" + logger.debug('Image v1 metadata backfill: Began execution') + while True: + batch_image_ids = list(Image + .select(Image.id) + .join(ImageStorage) + .where(Image.v1_json_metadata >> None, ImageStorage.uploading == False) + .limit(100)) + + if len(batch_image_ids) == 0: + logger.debug('Image v1 metadata backfill: Backfill completed') + return + + logger.debug('Image v1 metadata backfill: Found %s records to update', len(batch_image_ids)) + for one_id in batch_image_ids: + with app.config['DB_TRANSACTION_FACTORY'](db): + try: + logger.debug('Loading image: %s', one_id.id) + + raw_query = (ImageStoragePlacement + .select(ImageStoragePlacement, Image, ImageStorage, ImageStorageLocation) + .join(ImageStorageLocation) + .switch(ImageStoragePlacement) + .join(ImageStorage, JOIN_LEFT_OUTER) + .join(Image) + .where(Image.id == one_id.id)) + + placement_query = db_for_update(raw_query) + + repo_image_list = model.image.invert_placement_query_results(placement_query) + if len(repo_image_list) > 1: + logger.error('Found more images than we requested, something is wrong with the query') + return + + repo_image = repo_image_list[0] + uuid = repo_image.storage.uuid + json_path = storage.image_json_path(uuid) + + logger.debug('Updating image: %s from: %s', repo_image.id, json_path) + try: + data = storage.get_content(repo_image.storage.locations, json_path) + except IOError: + data = None + logger.exception('failed to find v1 metadata, defaulting to None') + repo_image.v1_json_metadata = data + repo_image.save() + except ImageStoragePlacement.DoesNotExist: + pass + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + # logging.getLogger('peewee').setLevel(logging.CRITICAL) + backfill_v1_metadata()