Optimize lookup of shared global blobs

Currently, there is only one such blob (the shared empty layer), but looking shared blobs up directly should make the blob lookups for repositories significantly faster, as we won't need to do the massive join for them.
This commit is contained in:
Joseph Schorr 2019-02-14 12:46:42 -05:00
parent 7beac643ec
commit f75f315037
6 changed files with 78 additions and 24 deletions

View file

@ -8,6 +8,7 @@ from data import database
from data import model
from data.cache import cache_key
from data.model.oci.retriever import RepositoryContentRetriever
from data.model.blob import get_shared_blob
from data.registry_model.datatype import FromDictionaryException
from data.registry_model.datatypes import (RepositoryReference, Blob, TorrentInfo, BlobUpload,
LegacyImage, ManifestLayer, DerivedImage)
@ -323,9 +324,8 @@ class SharedModel:
if not len(local_blob_digests):
return []
blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo_id,
local_blob_digests,
by_manifest=by_manifest)
blob_query = self._lookup_repo_storages_by_content_checksum(repo_id, local_blob_digests,
by_manifest=by_manifest)
blobs = []
for image_storage in blob_query:
placements = None
@ -356,9 +356,8 @@ class SharedModel:
blob_digests.append(EMPTY_LAYER_BLOB_DIGEST)
if blob_digests:
blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo_id,
blob_digests,
by_manifest=by_manifest)
blob_query = self._lookup_repo_storages_by_content_checksum(repo_id, blob_digests,
by_manifest=by_manifest)
storage_map = {blob.content_checksum: blob for blob in blob_query}
@ -441,3 +440,29 @@ class SharedModel:
# Sign the manifest with our signing key.
return builder.build(docker_v2_signing_key)
def _get_shared_storage(self, blob_digest):
    """ Looks up the globally shared ImageStorage row for the given blob digest.

    Returns the result of get_shared_blob for the empty layer digest, or None when the
    digest is not a globally shared one.
    """
    # The empty layer is the only globally shared blob handled here. It has many duplicate
    # copies in the database, so resolving it through a repository-bound query can be
    # incredibly slow; being globally shared, it can be fetched directly instead.
    is_empty_layer = blob_digest == EMPTY_LAYER_BLOB_DIGEST
    if not is_empty_layer:
        return None

    return get_shared_blob(EMPTY_LAYER_BLOB_DIGEST)
def _lookup_repo_storages_by_content_checksum(self, repo, checksums, by_manifest=False):
    """ Returns the storages matching the given content checksums.

    Checksums that _get_shared_storage resolves are taken from it directly; all other
    checksums are looked up through model.storage.lookup_repo_storages_by_content_checksum
    for the given repo, forwarding by_manifest unchanged.

    The caller's `checksums` collection is never modified, and any iterable of checksums
    is accepted.

    Returns a list containing the repository-bound results followed by the shared storages.
    """
    # Partition the checksums into fresh lists rather than removing entries from the
    # caller's collection: callers keep using their digest lists after this call, and
    # mutating them here would silently drop the shared digests from those lists.
    shared_storages = []
    repo_checksums = []
    for checksum in checksums:
        shared_storage = self._get_shared_storage(checksum)
        if shared_storage is None:
            repo_checksums.append(checksum)
        else:
            shared_storages.append(shared_storage)

    # Only issue the repository-bound query if something is left to look up.
    found = []
    if repo_checksums:
        found = list(model.storage.lookup_repo_storages_by_content_checksum(
            repo, repo_checksums, by_manifest=by_manifest))

    return found + shared_storages