Optimize lookup of shared global blobs

Currently, we only have one (the shared empty layer), but this should make the blob lookups for repositories significantly faster, as we won't need to do the massive join.
This commit is contained in:
Joseph Schorr 2019-02-14 12:46:42 -05:00
parent 7beac643ec
commit f75f315037
6 changed files with 78 additions and 24 deletions

View file

@ -565,11 +565,13 @@ class OCIModel(SharedModel, RegistryDataInterface):
there may be multiple records in the same repository for the same blob digest, so the return
value of this function may change.
"""
image_storage = oci.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
image_storage = self._get_shared_storage(blob_digest)
if image_storage is None:
return None
image_storage = oci.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
if image_storage is None:
return None
assert image_storage.cas_path is not None
assert image_storage.cas_path is not None
placements = None
if include_placements:

View file

@ -120,7 +120,7 @@ class PreOCIModel(SharedModel, RegistryDataInterface):
# Ensure all the blobs in the manifest exist.
digests = manifest_interface_instance.checksums
query = model.storage.lookup_repo_storages_by_content_checksum(repository_ref._db_id, digests)
query = self._lookup_repo_storages_by_content_checksum(repository_ref._db_id, digests)
blob_map = {s.content_checksum: s for s in query}
for layer in manifest_interface_instance.layers:
digest_str = str(layer.digest)
@ -481,9 +481,7 @@ class PreOCIModel(SharedModel, RegistryDataInterface):
if manifest is None:
return None
blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo,
manifest.checksums)
blob_query = self._lookup_repo_storages_by_content_checksum(repo, manifest.checksums)
storage_map = {blob.content_checksum: blob.id for blob in blob_query}
try:
tag_manifest, _ = model.tag.associate_generated_tag_manifest_with_tag(tag_obj, manifest,
@ -585,10 +583,12 @@ class PreOCIModel(SharedModel, RegistryDataInterface):
there may be multiple records in the same repository for the same blob digest, so the return
value of this function may change.
"""
try:
image_storage = model.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
except model.BlobDoesNotExist:
return None
image_storage = self._get_shared_storage(blob_digest)
if image_storage is None:
try:
image_storage = model.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
except model.BlobDoesNotExist:
return None
assert image_storage.cas_path is not None

View file

@ -8,6 +8,7 @@ from data import database
from data import model
from data.cache import cache_key
from data.model.oci.retriever import RepositoryContentRetriever
from data.model.blob import get_shared_blob
from data.registry_model.datatype import FromDictionaryException
from data.registry_model.datatypes import (RepositoryReference, Blob, TorrentInfo, BlobUpload,
LegacyImage, ManifestLayer, DerivedImage)
@ -323,9 +324,8 @@ class SharedModel:
if not len(local_blob_digests):
return []
blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo_id,
local_blob_digests,
by_manifest=by_manifest)
blob_query = self._lookup_repo_storages_by_content_checksum(repo_id, local_blob_digests,
by_manifest=by_manifest)
blobs = []
for image_storage in blob_query:
placements = None
@ -356,9 +356,8 @@ class SharedModel:
blob_digests.append(EMPTY_LAYER_BLOB_DIGEST)
if blob_digests:
blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo_id,
blob_digests,
by_manifest=by_manifest)
blob_query = self._lookup_repo_storages_by_content_checksum(repo_id, blob_digests,
by_manifest=by_manifest)
storage_map = {blob.content_checksum: blob for blob in blob_query}
@ -441,3 +440,29 @@ class SharedModel:
# Sign the manifest with our signing key.
return builder.build(docker_v2_signing_key)
def _get_shared_storage(self, blob_digest):
""" Returns an ImageStorage row for the blob digest if it is a globally shared storage. """
# If the EMPTY_LAYER_BLOB_DIGEST is in the checksums, look it up directly. Since we have
# so many duplicate copies in the database currently, looking it up bound to a repository
# can be incredibly slow, and, since it is defined as a globally shared layer, this is extra
# work we don't need to do.
if blob_digest == EMPTY_LAYER_BLOB_DIGEST:
return get_shared_blob(EMPTY_LAYER_BLOB_DIGEST)
return None
def _lookup_repo_storages_by_content_checksum(self, repo, checksums, by_manifest=False):
# Load any shared storages first.
extra_storages = []
for checksum in list(checksums):
shared_storage = self._get_shared_storage(checksum)
if shared_storage is not None:
extra_storages.append(shared_storage)
checksums.remove(checksum)
found = []
if checksums:
found = list(model.storage.lookup_repo_storages_by_content_checksum(repo, checksums,
by_manifest=by_manifest))
return found + extra_storages