diff --git a/data/model/storage.py b/data/model/storage.py
index d3e452d9f..ef983ecc9 100644
--- a/data/model/storage.py
+++ b/data/model/storage.py
@@ -1,6 +1,6 @@
 import logging
 
-from peewee import JOIN_LEFT_OUTER, fn
+from peewee import JOIN_LEFT_OUTER, fn, SQL
 
 from data.model import config, db_transaction, InvalidImageException
 from data.database import (ImageStorage, Image, DerivedImageStorage, ImageStoragePlacement,
@@ -218,13 +218,31 @@ def get_layer_path(storage_record):
 
   return store.blob_path(storage_record.content_checksum)
 
+
 def lookup_repo_storages_by_content_checksum(repo, checksums):
   """ Looks up repository storages (without placements) matching the given repository
       and checksum. """
-  return (ImageStorage
-          .select()
-          .join(Image)
-          .where(Image.repository == repo, ImageStorage.content_checksum << checksums))
+  # There may be many duplicates of the checksums, so for performance reasons we are going
+  # to use a union to select just one storage with each checksum
+  unique_checksums = set(checksums)
+  if not unique_checksums:
+    # Nothing to look up; reduce() over an empty list (no initializer) would raise TypeError.
+    return []
+
+  queries = []
+
+  for checksum in unique_checksums:
+    candidate_subq = (ImageStorage
+                      .select(ImageStorage.id, ImageStorage.content_checksum)
+                      .join(Image)
+                      .where(Image.repository == repo, ImageStorage.content_checksum == checksum)
+                      .limit(1))
+    queries.append(ImageStorage
+                   .select(SQL('*'))
+                   .from_(candidate_subq))
+
+  return reduce(lambda l, r: l.union_all(r), queries)
+
 
 def get_storage_locations(uuid):
   query = (ImageStoragePlacement
diff --git a/data/model/tag.py b/data/model/tag.py
index 6883a39e3..96b3ffef4 100644
--- a/data/model/tag.py
+++ b/data/model/tag.py
@@ -240,10 +240,10 @@ def load_manifest_by_digest(namespace, repo_name, digest):
 
 
 def _load_repo_manifests(namespace, repo_name):
-  return (TagManifest
-          .select(TagManifest, RepositoryTag)
-          .join(RepositoryTag)
-          .join(Image)
-          .join(Repository)
-          .join(Namespace, on=(Namespace.id == Repository.namespace_user))
-          .where(Repository.name == repo_name, Namespace.username == namespace))
+  return _tag_alive(TagManifest
+                    .select(TagManifest, RepositoryTag)
+                    .join(RepositoryTag)
+                    .join(Image)
+                    .join(Repository)
+                    .join(Namespace, on=(Namespace.id == Repository.namespace_user))
+                    .where(Repository.name == repo_name, Namespace.username == namespace))
diff --git a/endpoints/v2/manifest.py b/endpoints/v2/manifest.py
index 935045734..1ba9be99f 100644
--- a/endpoints/v2/manifest.py
+++ b/endpoints/v2/manifest.py
@@ -287,16 +287,16 @@ def _write_manifest(namespace, repo_name, manifest):
   # know which V1 images we need to synthesize and which ones are invalid.
   layers = list(manifest.layers)
 
-  docker_image_ids = [mdata.v1_metadata.docker_id for mdata in layers]
-  parent_image_ids = [mdata.v1_metadata.parent for mdata in layers
-                      if mdata.v1_metadata.parent]
-  all_image_ids = list(set(docker_image_ids + parent_image_ids))
+  docker_image_ids = {mdata.v1_metadata.docker_id for mdata in layers}
+  parent_image_ids = {mdata.v1_metadata.parent for mdata in layers
+                      if mdata.v1_metadata.parent}
+  all_image_ids = list(docker_image_ids | parent_image_ids)
 
   images_query = model.image.lookup_repository_images(repo, all_image_ids)
   images_map = {image.docker_image_id: image for image in images_query}
 
   # Lookup the storages associated with each blob in the manifest.
-  checksums = [str(mdata.digest) for mdata in manifest.layers]
+  checksums = list({str(mdata.digest) for mdata in manifest.layers})
   storage_query = model.storage.lookup_repo_storages_by_content_checksum(repo, checksums)
   storage_map = {storage.content_checksum: storage for storage in storage_query}
 