From f75f31503768a53c6ebd5d9e3d378a3c5c3410ef Mon Sep 17 00:00:00 2001
From: Joseph Schorr
Date: Thu, 14 Feb 2019 12:46:42 -0500
Subject: [PATCH] Optimize lookup of shared global blobs

Currently we only have one such blob (the shared empty layer), but this
should make blob lookups for repositories significantly faster, as we
won't need to do the massive join.
---
 data/model/blob.py                            | 12 ++++++
 data/model/oci/manifest.py                    | 26 +++++++++----
 data/model/storage.py                         |  3 ++
 data/registry_model/registry_oci_model.py     |  8 ++--
 data/registry_model/registry_pre_oci_model.py | 16 ++++----
 data/registry_model/shared.py                 | 37 ++++++++++++++++---
 6 files changed, 78 insertions(+), 24 deletions(-)

diff --git a/data/model/blob.py b/data/model/blob.py
index 2c087f754..e97badedf 100644
--- a/data/model/blob.py
+++ b/data/model/blob.py
@@ -164,6 +164,18 @@ def initiate_upload(namespace, repo_name, uuid, location_name, storage_metadata)
                            storage_metadata=storage_metadata)
 
 
+def get_shared_blob(digest):
+  """ Returns the ImageStorage blob with the given digest or, if not present,
+      returns None. This method is *only* to be used for shared blobs that are
+      globally accessible, such as the special empty gzipped tar layer that Docker
+      no longer pushes to us.
+  """
+  try:
+    return ImageStorage.get(content_checksum=digest, uploading=False)
+  except ImageStorage.DoesNotExist:
+    return None
+
+
 def get_or_create_shared_blob(digest, byte_data, storage):
   """ Returns the ImageStorage blob with the given digest or, if not present,
       adds a row and writes the given byte data to the storage engine.
diff --git a/data/model/oci/manifest.py b/data/model/oci/manifest.py
index 299f86b8b..b1b1d8148 100644
--- a/data/model/oci/manifest.py
+++ b/data/model/oci/manifest.py
@@ -7,7 +7,7 @@ from peewee import IntegrityError, JOIN
 from data.database import (Tag, Manifest, ManifestBlob, ManifestLegacyImage, ManifestChild,
                            db_transaction)
 from data.model import BlobDoesNotExist
-from data.model.blob import get_or_create_shared_blob
+from data.model.blob import get_or_create_shared_blob, get_shared_blob
 from data.model.oci.tag import filter_to_alive_tags
 from data.model.oci.label import create_manifest_label
 from data.model.oci.retriever import RepositoryContentRetriever
@@ -108,9 +108,20 @@ def _create_manifest(repository_id, manifest_interface_instance, storage):
   # Ensure all the blobs in the manifest exist.
   digests = set(manifest_interface_instance.local_blob_digests)
   blob_map = {}
+
+  # If the special empty layer is required, simply load it directly. This is much faster
+  # than trying to load it on a per repository basis, and that is unnecessary anyway since
+  # this layer is predefined.
+  if EMPTY_LAYER_BLOB_DIGEST in digests:
+    digests.remove(EMPTY_LAYER_BLOB_DIGEST)
+    blob_map[EMPTY_LAYER_BLOB_DIGEST] = get_shared_blob(EMPTY_LAYER_BLOB_DIGEST)
+    if not blob_map[EMPTY_LAYER_BLOB_DIGEST]:
+      logger.warning('Could not find the special empty blob in storage')
+      return None
+
   if digests:
     query = lookup_repo_storages_by_content_checksum(repository_id, digests)
-    blob_map = {s.content_checksum: s for s in query}
+    blob_map.update({s.content_checksum: s for s in query})
     for digest_str in digests:
       if digest_str not in blob_map:
         logger.warning('Unknown blob `%s` under manifest `%s` for repository `%s`', digest_str,
@@ -120,11 +131,12 @@ def _create_manifest(repository_id, manifest_interface_instance, storage):
   # Special check: If the empty layer blob is needed for this manifest, add it to the
   # blob map. This is necessary because Docker decided to elide sending of this special
   # empty layer in schema version 2, but we need to have it referenced for GC and schema version 1.
-  if manifest_interface_instance.get_requires_empty_layer_blob(retriever):
-    shared_blob = get_or_create_shared_blob(EMPTY_LAYER_BLOB_DIGEST, EMPTY_LAYER_BYTES, storage)
-    assert not shared_blob.uploading
-    assert shared_blob.content_checksum == EMPTY_LAYER_BLOB_DIGEST
-    blob_map[EMPTY_LAYER_BLOB_DIGEST] = shared_blob
+  if EMPTY_LAYER_BLOB_DIGEST not in blob_map:
+    if manifest_interface_instance.get_requires_empty_layer_blob(retriever):
+      shared_blob = get_or_create_shared_blob(EMPTY_LAYER_BLOB_DIGEST, EMPTY_LAYER_BYTES, storage)
+      assert not shared_blob.uploading
+      assert shared_blob.content_checksum == EMPTY_LAYER_BLOB_DIGEST
+      blob_map[EMPTY_LAYER_BLOB_DIGEST] = shared_blob
 
   # Determine and populate the legacy image if necessary. Manifest lists will not have a legacy
   # image.
diff --git a/data/model/storage.py b/data/model/storage.py
index 04f8b965b..e1c9541d4 100644
--- a/data/model/storage.py
+++ b/data/model/storage.py
@@ -264,6 +264,9 @@ def get_layer_path_for_storage(storage_uuid, cas_path, content_checksum):
 def lookup_repo_storages_by_content_checksum(repo, checksums, by_manifest=False):
   """ Looks up repository storages (without placements) matching the given repository
       and checksum. """
+  if not checksums:
+    return []
+
   # There may be many duplicates of the checksums, so for performance reasons we are going
   # to use a union to select just one storage with each checksum
   queries = []
diff --git a/data/registry_model/registry_oci_model.py b/data/registry_model/registry_oci_model.py
index 2fd73cad1..0d27a3dbb 100644
--- a/data/registry_model/registry_oci_model.py
+++ b/data/registry_model/registry_oci_model.py
@@ -565,11 +565,13 @@ class OCIModel(SharedModel, RegistryDataInterface):
         there may be multiple records in the same repository for the same blob digest, so
         the return value of this function may change.
     """
-    image_storage = oci.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
+    image_storage = self._get_shared_storage(blob_digest)
     if image_storage is None:
-      return None
+      image_storage = oci.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
+      if image_storage is None:
+        return None
 
-    assert image_storage.cas_path is not None
+    assert image_storage.cas_path is not None
 
     placements = None
     if include_placements:
diff --git a/data/registry_model/registry_pre_oci_model.py b/data/registry_model/registry_pre_oci_model.py
index 62cd1d31e..ce40f12ea 100644
--- a/data/registry_model/registry_pre_oci_model.py
+++ b/data/registry_model/registry_pre_oci_model.py
@@ -120,7 +120,7 @@ class PreOCIModel(SharedModel, RegistryDataInterface):
 
     # Ensure all the blobs in the manifest exist.
     digests = manifest_interface_instance.checksums
-    query = model.storage.lookup_repo_storages_by_content_checksum(repository_ref._db_id, digests)
+    query = self._lookup_repo_storages_by_content_checksum(repository_ref._db_id, digests)
     blob_map = {s.content_checksum: s for s in query}
     for layer in manifest_interface_instance.layers:
       digest_str = str(layer.digest)
@@ -481,9 +481,7 @@ class PreOCIModel(SharedModel, RegistryDataInterface):
     if manifest is None:
       return None
 
-    blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo,
-                                                                        manifest.checksums)
-
+    blob_query = self._lookup_repo_storages_by_content_checksum(repo, manifest.checksums)
     storage_map = {blob.content_checksum: blob.id for blob in blob_query}
     try:
       tag_manifest, _ = model.tag.associate_generated_tag_manifest_with_tag(tag_obj, manifest,
@@ -585,10 +583,12 @@ class PreOCIModel(SharedModel, RegistryDataInterface):
         there may be multiple records in the same repository for the same blob digest, so
         the return value of this function may change.
     """
-    try:
-      image_storage = model.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
-    except model.BlobDoesNotExist:
-      return None
+    image_storage = self._get_shared_storage(blob_digest)
+    if image_storage is None:
+      try:
+        image_storage = model.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
+      except model.BlobDoesNotExist:
+        return None
 
     assert image_storage.cas_path is not None
 
diff --git a/data/registry_model/shared.py b/data/registry_model/shared.py
index 091061526..509a0bcc5 100644
--- a/data/registry_model/shared.py
+++ b/data/registry_model/shared.py
@@ -8,6 +8,7 @@ from data import database
 from data import model
 from data.cache import cache_key
 from data.model.oci.retriever import RepositoryContentRetriever
+from data.model.blob import get_shared_blob
 from data.registry_model.datatype import FromDictionaryException
 from data.registry_model.datatypes import (RepositoryReference, Blob, TorrentInfo, BlobUpload,
                                            LegacyImage, ManifestLayer, DerivedImage)
@@ -323,9 +324,8 @@ class SharedModel:
     if not len(local_blob_digests):
       return []
 
-    blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo_id,
-                                                                        local_blob_digests,
-                                                                        by_manifest=by_manifest)
+    blob_query = self._lookup_repo_storages_by_content_checksum(repo_id, local_blob_digests,
+                                                                by_manifest=by_manifest)
     blobs = []
     for image_storage in blob_query:
       placements = None
@@ -356,9 +356,8 @@ class SharedModel:
       blob_digests.append(EMPTY_LAYER_BLOB_DIGEST)
 
     if blob_digests:
-      blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo_id,
-                                                                          blob_digests,
-                                                                          by_manifest=by_manifest)
+      blob_query = self._lookup_repo_storages_by_content_checksum(repo_id, blob_digests,
+                                                                  by_manifest=by_manifest)
       storage_map = {blob.content_checksum: blob for blob in blob_query}
 
 
@@ -441,3 +440,29 @@ class SharedModel:
 
     # Sign the manifest with our signing key.
     return builder.build(docker_v2_signing_key)
+
+  def _get_shared_storage(self, blob_digest):
+    """ Returns an ImageStorage row for the blob digest if it is a globally shared storage. """
+    # If the EMPTY_LAYER_BLOB_DIGEST is in the checksums, look it up directly. Since we have
+    # so many duplicate copies in the database currently, looking it up bound to a repository
+    # can be incredibly slow, and, since it is defined as a globally shared layer, this is extra
+    # work we don't need to do.
+    if blob_digest == EMPTY_LAYER_BLOB_DIGEST:
+      return get_shared_blob(EMPTY_LAYER_BLOB_DIGEST)
+
+    return None
+
+  def _lookup_repo_storages_by_content_checksum(self, repo, checksums, by_manifest=False):
+    # Load any shared storages first.
+    extra_storages = []
+    for checksum in list(checksums):
+      shared_storage = self._get_shared_storage(checksum)
+      if shared_storage is not None:
+        extra_storages.append(shared_storage)
+        checksums.remove(checksum)
+
+    found = []
+    if checksums:
+      found = list(model.storage.lookup_repo_storages_by_content_checksum(repo, checksums,
+                                                                          by_manifest=by_manifest))
+    return found + extra_storages
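
Reviewer note: the following is a minimal, self-contained sketch of the lookup strategy this patch introduces, shown outside the Quay models. Digests of globally shared blobs (today only the special empty layer) are resolved from a single global table keyed by content checksum, and only the remaining checksums fall through to the expensive per-repository join. The names used below (SHARED_BLOBS, repo_join_lookup, and the placeholder digest string) are illustrative stand-ins, not part of the codebase.

# Illustrative sketch only: all names here are hypothetical stand-ins, not the
# actual Quay models or helpers touched by this patch.

# Placeholder for the well-known digest of the shared empty layer; the real
# constant is EMPTY_LAYER_BLOB_DIGEST in the Quay codebase.
EMPTY_LAYER_BLOB_DIGEST = 'sha256:<empty-gzipped-tar-layer-digest>'

# Globally shared blobs keyed by digest. In the patch this role is played by a
# direct ImageStorage lookup on content_checksum alone (no repository join).
SHARED_BLOBS = {
  EMPTY_LAYER_BLOB_DIGEST: {'content_checksum': EMPTY_LAYER_BLOB_DIGEST, 'uploading': False},
}


def get_shared_storage(blob_digest):
  """ Returns the globally shared storage for the digest, or None if it is not shared. """
  return SHARED_BLOBS.get(blob_digest)


def lookup_storages_by_content_checksum(repo, checksums, repo_join_lookup):
  """ Resolves shared digests directly and sends only the remaining checksums through
      the (slow) per-repository join, mirroring _lookup_repo_storages_by_content_checksum.
  """
  remaining = set(checksums)
  extra_storages = []
  for checksum in list(remaining):
    shared = get_shared_storage(checksum)
    if shared is not None:
      extra_storages.append(shared)
      remaining.remove(checksum)

  found = list(repo_join_lookup(repo, remaining)) if remaining else []
  return found + extra_storages


if __name__ == '__main__':
  # A fake per-repository lookup standing in for the expensive union/join query.
  def fake_repo_join_lookup(repo, checksums):
    return [{'content_checksum': c, 'uploading': False} for c in checksums]

  digests = {EMPTY_LAYER_BLOB_DIGEST, 'sha256:abc123'}
  storages = lookup_storages_by_content_checksum('some/repo', digests, fake_repo_join_lookup)
  assert len(storages) == 2  # one from the join, one from the shared fast path

The important property is that the shared blob is resolved by checksum alone, so its cost no longer grows with the number of duplicate storage rows reachable through a repository join.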