diff --git a/data/model/blob.py b/data/model/blob.py index 2c087f754..e97badedf 100644 --- a/data/model/blob.py +++ b/data/model/blob.py @@ -164,6 +164,18 @@ def initiate_upload(namespace, repo_name, uuid, location_name, storage_metadata) storage_metadata=storage_metadata) +def get_shared_blob(digest): + """ Returns the ImageStorage blob with the given digest or, if not present, + returns None. This method is *only* to be used for shared blobs that are + globally accessible, such as the special empty gzipped tar layer that Docker + no longer pushes to us. + """ + try: + return ImageStorage.get(content_checksum=digest, uploading=False) + except ImageStorage.DoesNotExist: + return None + + def get_or_create_shared_blob(digest, byte_data, storage): """ Returns the ImageStorage blob with the given digest or, if not present, adds a row and writes the given byte data to the storage engine. diff --git a/data/model/oci/manifest.py b/data/model/oci/manifest.py index 299f86b8b..b1b1d8148 100644 --- a/data/model/oci/manifest.py +++ b/data/model/oci/manifest.py @@ -7,7 +7,7 @@ from peewee import IntegrityError, JOIN from data.database import (Tag, Manifest, ManifestBlob, ManifestLegacyImage, ManifestChild, db_transaction) from data.model import BlobDoesNotExist -from data.model.blob import get_or_create_shared_blob +from data.model.blob import get_or_create_shared_blob, get_shared_blob from data.model.oci.tag import filter_to_alive_tags from data.model.oci.label import create_manifest_label from data.model.oci.retriever import RepositoryContentRetriever @@ -108,9 +108,20 @@ def _create_manifest(repository_id, manifest_interface_instance, storage): # Ensure all the blobs in the manifest exist. digests = set(manifest_interface_instance.local_blob_digests) blob_map = {} + + # If the special empty layer is required, simply load it directly. This is much faster + # than trying to load it on a per repository basis, and that is unnecessary anyway since + # this layer is predefined. + if EMPTY_LAYER_BLOB_DIGEST in digests: + digests.remove(EMPTY_LAYER_BLOB_DIGEST) + blob_map[EMPTY_LAYER_BLOB_DIGEST] = get_shared_blob(EMPTY_LAYER_BLOB_DIGEST) + if not blob_map[EMPTY_LAYER_BLOB_DIGEST]: + logger.warning('Could not find the special empty blob in storage') + return None + if digests: query = lookup_repo_storages_by_content_checksum(repository_id, digests) - blob_map = {s.content_checksum: s for s in query} + blob_map.update({s.content_checksum: s for s in query}) for digest_str in digests: if digest_str not in blob_map: logger.warning('Unknown blob `%s` under manifest `%s` for repository `%s`', digest_str, @@ -120,11 +131,12 @@ def _create_manifest(repository_id, manifest_interface_instance, storage): # Special check: If the empty layer blob is needed for this manifest, add it to the # blob map. This is necessary because Docker decided to elide sending of this special # empty layer in schema version 2, but we need to have it referenced for GC and schema version 1. - if manifest_interface_instance.get_requires_empty_layer_blob(retriever): - shared_blob = get_or_create_shared_blob(EMPTY_LAYER_BLOB_DIGEST, EMPTY_LAYER_BYTES, storage) - assert not shared_blob.uploading - assert shared_blob.content_checksum == EMPTY_LAYER_BLOB_DIGEST - blob_map[EMPTY_LAYER_BLOB_DIGEST] = shared_blob + if EMPTY_LAYER_BLOB_DIGEST not in blob_map: + if manifest_interface_instance.get_requires_empty_layer_blob(retriever): + shared_blob = get_or_create_shared_blob(EMPTY_LAYER_BLOB_DIGEST, EMPTY_LAYER_BYTES, storage) + assert not shared_blob.uploading + assert shared_blob.content_checksum == EMPTY_LAYER_BLOB_DIGEST + blob_map[EMPTY_LAYER_BLOB_DIGEST] = shared_blob # Determine and populate the legacy image if necessary. Manifest lists will not have a legacy # image. diff --git a/data/model/storage.py b/data/model/storage.py index 04f8b965b..e1c9541d4 100644 --- a/data/model/storage.py +++ b/data/model/storage.py @@ -264,6 +264,9 @@ def get_layer_path_for_storage(storage_uuid, cas_path, content_checksum): def lookup_repo_storages_by_content_checksum(repo, checksums, by_manifest=False): """ Looks up repository storages (without placements) matching the given repository and checksum. """ + if not checksums: + return [] + # There may be many duplicates of the checksums, so for performance reasons we are going # to use a union to select just one storage with each checksum queries = [] diff --git a/data/model/test/test_model_blob.py b/data/model/test/test_model_blob.py index d322d57cc..343a3c72c 100644 --- a/data/model/test/test_model_blob.py +++ b/data/model/test/test_model_blob.py @@ -1,3 +1,4 @@ +from app import storage from data import model, database from test.fixtures import * @@ -30,3 +31,19 @@ def test_store_blob(initialized_db): assert blob_storage3.id != blob_storage.id assert blob_storage3.image_size == 1234 assert blob_storage3.uncompressed_size == 5678 + + +def test_get_or_create_shared_blob(initialized_db): + shared = model.blob.get_or_create_shared_blob('sha256:abcdef', 'somecontent', storage) + assert shared.content_checksum == 'sha256:abcdef' + + again = model.blob.get_or_create_shared_blob('sha256:abcdef', 'somecontent', storage) + assert shared == again + + +def test_lookup_repo_storages_by_content_checksum(initialized_db): + for image in database.Image.select(): + found = model.storage.lookup_repo_storages_by_content_checksum(image.repository, + [image.storage.content_checksum]) + assert len(found) == 1 + assert found[0].content_checksum == image.storage.content_checksum diff --git a/data/registry_model/registry_oci_model.py b/data/registry_model/registry_oci_model.py index 2fd73cad1..0d27a3dbb 100644 --- a/data/registry_model/registry_oci_model.py +++ b/data/registry_model/registry_oci_model.py @@ -565,11 +565,13 @@ class OCIModel(SharedModel, RegistryDataInterface): there may be multiple records in the same repository for the same blob digest, so the return value of this function may change. """ - image_storage = oci.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest) + image_storage = self._get_shared_storage(blob_digest) if image_storage is None: - return None + image_storage = oci.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest) + if image_storage is None: + return None - assert image_storage.cas_path is not None + assert image_storage.cas_path is not None placements = None if include_placements: diff --git a/data/registry_model/registry_pre_oci_model.py b/data/registry_model/registry_pre_oci_model.py index 62cd1d31e..ce40f12ea 100644 --- a/data/registry_model/registry_pre_oci_model.py +++ b/data/registry_model/registry_pre_oci_model.py @@ -120,7 +120,7 @@ class PreOCIModel(SharedModel, RegistryDataInterface): # Ensure all the blobs in the manifest exist. digests = manifest_interface_instance.checksums - query = model.storage.lookup_repo_storages_by_content_checksum(repository_ref._db_id, digests) + query = self._lookup_repo_storages_by_content_checksum(repository_ref._db_id, digests) blob_map = {s.content_checksum: s for s in query} for layer in manifest_interface_instance.layers: digest_str = str(layer.digest) @@ -481,9 +481,7 @@ class PreOCIModel(SharedModel, RegistryDataInterface): if manifest is None: return None - blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo, - manifest.checksums) - + blob_query = self._lookup_repo_storages_by_content_checksum(repo, manifest.checksums) storage_map = {blob.content_checksum: blob.id for blob in blob_query} try: tag_manifest, _ = model.tag.associate_generated_tag_manifest_with_tag(tag_obj, manifest, @@ -585,10 +583,12 @@ class PreOCIModel(SharedModel, RegistryDataInterface): there may be multiple records in the same repository for the same blob digest, so the return value of this function may change. """ - try: - image_storage = model.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest) - except model.BlobDoesNotExist: - return None + image_storage = self._get_shared_storage(blob_digest) + if image_storage is None: + try: + image_storage = model.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest) + except model.BlobDoesNotExist: + return None assert image_storage.cas_path is not None diff --git a/data/registry_model/shared.py b/data/registry_model/shared.py index 091061526..ae0155f63 100644 --- a/data/registry_model/shared.py +++ b/data/registry_model/shared.py @@ -8,6 +8,7 @@ from data import database from data import model from data.cache import cache_key from data.model.oci.retriever import RepositoryContentRetriever +from data.model.blob import get_shared_blob from data.registry_model.datatype import FromDictionaryException from data.registry_model.datatypes import (RepositoryReference, Blob, TorrentInfo, BlobUpload, LegacyImage, ManifestLayer, DerivedImage) @@ -323,9 +324,8 @@ class SharedModel: if not len(local_blob_digests): return [] - blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo_id, - local_blob_digests, - by_manifest=by_manifest) + blob_query = self._lookup_repo_storages_by_content_checksum(repo_id, local_blob_digests, + by_manifest=by_manifest) blobs = [] for image_storage in blob_query: placements = None @@ -356,9 +356,8 @@ class SharedModel: blob_digests.append(EMPTY_LAYER_BLOB_DIGEST) if blob_digests: - blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo_id, - blob_digests, - by_manifest=by_manifest) + blob_query = self._lookup_repo_storages_by_content_checksum(repo_id, blob_digests, + by_manifest=by_manifest) storage_map = {blob.content_checksum: blob for blob in blob_query} @@ -441,3 +440,31 @@ class SharedModel: # Sign the manifest with our signing key. return builder.build(docker_v2_signing_key) + + def _get_shared_storage(self, blob_digest): + """ Returns an ImageStorage row for the blob digest if it is a globally shared storage. """ + # If the EMPTY_LAYER_BLOB_DIGEST is in the checksums, look it up directly. Since we have + # so many duplicate copies in the database currently, looking it up bound to a repository + # can be incredibly slow, and, since it is defined as a globally shared layer, this is extra + # work we don't need to do. + if blob_digest == EMPTY_LAYER_BLOB_DIGEST: + return get_shared_blob(EMPTY_LAYER_BLOB_DIGEST) + + return None + + def _lookup_repo_storages_by_content_checksum(self, repo, checksums, by_manifest=False): + checksums = set(checksums) + + # Load any shared storages first. + extra_storages = [] + for checksum in list(checksums): + shared_storage = self._get_shared_storage(checksum) + if shared_storage is not None: + extra_storages.append(shared_storage) + checksums.remove(checksum) + + found = [] + if checksums: + found = list(model.storage.lookup_repo_storages_by_content_checksum(repo, checksums, + by_manifest=by_manifest)) + return found + extra_storages diff --git a/endpoints/v2/blob.py b/endpoints/v2/blob.py index 99f5ef0e8..85ba873bb 100644 --- a/endpoints/v2/blob.py +++ b/endpoints/v2/blob.py @@ -132,7 +132,8 @@ def _try_to_mount_blob(repository_ref, mount_blob_digest): return None # Lookup if the mount blob's digest exists in the repository. - mount_blob = registry_model.get_repo_blob_by_digest(from_repository_ref, mount_blob_digest) + mount_blob = registry_model.get_cached_repo_blob(model_cache, from_namespace, from_repo_name, + mount_blob_digest) if mount_blob is None: logger.debug('Blob `%s` under repository `%s` not found', mount_blob_digest, from_repo) return None diff --git a/test/registry/registry_tests.py b/test/registry/registry_tests.py index f22ebfcca..da88aac9a 100644 --- a/test/registry/registry_tests.py +++ b/test/registry/registry_tests.py @@ -1204,7 +1204,8 @@ def test_blob_mounting(push_user, push_namespace, push_repo, mount_repo_name, ex options.mount_blobs = {'sha256:' + hashlib.sha256(image.bytes).hexdigest(): mount_repo_name for image in basic_images} - manifest_protocol.push(liveserver_session, 'devtable', 'newrepo', 'latest', basic_images, + manifest_protocol.push(liveserver_session, 'devtable', 'newrepo', 'latest', + basic_images, credentials=('devtable', 'password'), options=options, expected_failure=expected_failure) @@ -1215,6 +1216,26 @@ def test_blob_mounting(push_user, push_namespace, push_repo, mount_repo_name, ex credentials=('devtable', 'password')) +def test_blob_mounting_with_empty_layers(manifest_protocol, pusher, puller, images_with_empty_layer, + liveserver_session, app_reloader): + # Push an image so we can attempt to mount it. + pusher.push(liveserver_session, 'devtable', 'simple', 'latest', images_with_empty_layer, + credentials=('devtable', 'password')) + + # Push again, trying to mount the image layer(s) from the mount repo. + options = ProtocolOptions() + options.scopes = ['repository:devtable/newrepo:push,pull', + 'repository:%s:pull' % ('devtable/simple')] + options.mount_blobs = {'sha256:' + hashlib.sha256(image.bytes).hexdigest(): 'devtable/simple' + for image in images_with_empty_layer} + options.skip_head_checks = True + + manifest_protocol.push(liveserver_session, 'devtable', 'newrepo', 'latest', + images_with_empty_layer, + credentials=('devtable', 'password'), + options=options) + + def get_robot_password(api_caller): api_caller.conduct_auth('devtable', 'password') resp = api_caller.get('/api/v1/organization/buynlarge/robots/ownerbot')