Merge pull request #3362 from quay/shared-layer-fixes

Shared blob fixes and optimizations
This commit is contained in:
Joseph Schorr 2019-02-14 14:45:54 -05:00 committed by GitHub
commit 39db907172
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 121 additions and 26 deletions

View file

@@ -164,6 +164,18 @@ def initiate_upload(namespace, repo_name, uuid, location_name, storage_metadata)
storage_metadata=storage_metadata)
def get_shared_blob(digest):
    """ Returns the ImageStorage blob with the given digest or, if not present,
    returns None. This method is *only* to be used for shared blobs that are
    globally accessible, such as the special empty gzipped tar layer that Docker
    no longer pushes to us.
    """
    # Deliberately unscoped by repository: shared blobs are global, so a plain
    # lookup by checksum (excluding in-flight uploads) is sufficient.
    try:
        blob = ImageStorage.get(content_checksum=digest, uploading=False)
    except ImageStorage.DoesNotExist:
        return None
    return blob
def get_or_create_shared_blob(digest, byte_data, storage):
""" Returns the ImageStorage blob with the given digest or, if not present,
adds a row and writes the given byte data to the storage engine.

View file

@@ -7,7 +7,7 @@ from peewee import IntegrityError, JOIN
from data.database import (Tag, Manifest, ManifestBlob, ManifestLegacyImage, ManifestChild,
db_transaction)
from data.model import BlobDoesNotExist
from data.model.blob import get_or_create_shared_blob
from data.model.blob import get_or_create_shared_blob, get_shared_blob
from data.model.oci.tag import filter_to_alive_tags
from data.model.oci.label import create_manifest_label
from data.model.oci.retriever import RepositoryContentRetriever
@@ -108,9 +108,20 @@ def _create_manifest(repository_id, manifest_interface_instance, storage):
# Ensure all the blobs in the manifest exist.
digests = set(manifest_interface_instance.local_blob_digests)
blob_map = {}
# If the special empty layer is required, simply load it directly. This is much faster
# than trying to load it on a per repository basis, and that is unnecessary anyway since
# this layer is predefined.
if EMPTY_LAYER_BLOB_DIGEST in digests:
digests.remove(EMPTY_LAYER_BLOB_DIGEST)
blob_map[EMPTY_LAYER_BLOB_DIGEST] = get_shared_blob(EMPTY_LAYER_BLOB_DIGEST)
if not blob_map[EMPTY_LAYER_BLOB_DIGEST]:
logger.warning('Could not find the special empty blob in storage')
return None
if digests:
query = lookup_repo_storages_by_content_checksum(repository_id, digests)
blob_map = {s.content_checksum: s for s in query}
blob_map.update({s.content_checksum: s for s in query})
for digest_str in digests:
if digest_str not in blob_map:
logger.warning('Unknown blob `%s` under manifest `%s` for repository `%s`', digest_str,
@@ -120,11 +131,12 @@ def _create_manifest(repository_id, manifest_interface_instance, storage):
# Special check: If the empty layer blob is needed for this manifest, add it to the
# blob map. This is necessary because Docker decided to elide sending of this special
# empty layer in schema version 2, but we need to have it referenced for GC and schema version 1.
if manifest_interface_instance.get_requires_empty_layer_blob(retriever):
shared_blob = get_or_create_shared_blob(EMPTY_LAYER_BLOB_DIGEST, EMPTY_LAYER_BYTES, storage)
assert not shared_blob.uploading
assert shared_blob.content_checksum == EMPTY_LAYER_BLOB_DIGEST
blob_map[EMPTY_LAYER_BLOB_DIGEST] = shared_blob
if EMPTY_LAYER_BLOB_DIGEST not in blob_map:
if manifest_interface_instance.get_requires_empty_layer_blob(retriever):
shared_blob = get_or_create_shared_blob(EMPTY_LAYER_BLOB_DIGEST, EMPTY_LAYER_BYTES, storage)
assert not shared_blob.uploading
assert shared_blob.content_checksum == EMPTY_LAYER_BLOB_DIGEST
blob_map[EMPTY_LAYER_BLOB_DIGEST] = shared_blob
# Determine and populate the legacy image if necessary. Manifest lists will not have a legacy
# image.

View file

@@ -264,6 +264,9 @@ def get_layer_path_for_storage(storage_uuid, cas_path, content_checksum):
def lookup_repo_storages_by_content_checksum(repo, checksums, by_manifest=False):
""" Looks up repository storages (without placements) matching the given repository
and checksum. """
if not checksums:
return []
# There may be many duplicates of the checksums, so for performance reasons we are going
# to use a union to select just one storage with each checksum
queries = []

View file

@@ -1,3 +1,4 @@
from app import storage
from data import model, database
from test.fixtures import *
@@ -30,3 +31,19 @@ def test_store_blob(initialized_db):
assert blob_storage3.id != blob_storage.id
assert blob_storage3.image_size == 1234
assert blob_storage3.uncompressed_size == 5678
def test_get_or_create_shared_blob(initialized_db):
    """ Requesting the same shared-blob digest twice must yield the identical row. """
    first = model.blob.get_or_create_shared_blob('sha256:abcdef', 'somecontent', storage)
    assert first.content_checksum == 'sha256:abcdef'
    second = model.blob.get_or_create_shared_blob('sha256:abcdef', 'somecontent', storage)
    assert second == first
def test_lookup_repo_storages_by_content_checksum(initialized_db):
    """ Each image's storage must be findable via its checksum within its own repository. """
    for img in database.Image.select():
        checksum = img.storage.content_checksum
        results = model.storage.lookup_repo_storages_by_content_checksum(img.repository,
                                                                         [checksum])
        assert len(results) == 1
        assert results[0].content_checksum == checksum

View file

@@ -565,11 +565,13 @@ class OCIModel(SharedModel, RegistryDataInterface):
there may be multiple records in the same repository for the same blob digest, so the return
value of this function may change.
"""
image_storage = oci.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
image_storage = self._get_shared_storage(blob_digest)
if image_storage is None:
return None
image_storage = oci.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
if image_storage is None:
return None
assert image_storage.cas_path is not None
assert image_storage.cas_path is not None
placements = None
if include_placements:

View file

@@ -120,7 +120,7 @@ class PreOCIModel(SharedModel, RegistryDataInterface):
# Ensure all the blobs in the manifest exist.
digests = manifest_interface_instance.checksums
query = model.storage.lookup_repo_storages_by_content_checksum(repository_ref._db_id, digests)
query = self._lookup_repo_storages_by_content_checksum(repository_ref._db_id, digests)
blob_map = {s.content_checksum: s for s in query}
for layer in manifest_interface_instance.layers:
digest_str = str(layer.digest)
@@ -481,9 +481,7 @@ class PreOCIModel(SharedModel, RegistryDataInterface):
if manifest is None:
return None
blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo,
manifest.checksums)
blob_query = self._lookup_repo_storages_by_content_checksum(repo, manifest.checksums)
storage_map = {blob.content_checksum: blob.id for blob in blob_query}
try:
tag_manifest, _ = model.tag.associate_generated_tag_manifest_with_tag(tag_obj, manifest,
@@ -585,10 +583,12 @@ class PreOCIModel(SharedModel, RegistryDataInterface):
there may be multiple records in the same repository for the same blob digest, so the return
value of this function may change.
"""
try:
image_storage = model.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
except model.BlobDoesNotExist:
return None
image_storage = self._get_shared_storage(blob_digest)
if image_storage is None:
try:
image_storage = model.blob.get_repository_blob_by_digest(repository_ref._db_id, blob_digest)
except model.BlobDoesNotExist:
return None
assert image_storage.cas_path is not None

View file

@@ -8,6 +8,7 @@ from data import database
from data import model
from data.cache import cache_key
from data.model.oci.retriever import RepositoryContentRetriever
from data.model.blob import get_shared_blob
from data.registry_model.datatype import FromDictionaryException
from data.registry_model.datatypes import (RepositoryReference, Blob, TorrentInfo, BlobUpload,
LegacyImage, ManifestLayer, DerivedImage)
@@ -323,9 +324,8 @@ class SharedModel:
if not len(local_blob_digests):
return []
blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo_id,
local_blob_digests,
by_manifest=by_manifest)
blob_query = self._lookup_repo_storages_by_content_checksum(repo_id, local_blob_digests,
by_manifest=by_manifest)
blobs = []
for image_storage in blob_query:
placements = None
@@ -356,9 +356,8 @@ class SharedModel:
blob_digests.append(EMPTY_LAYER_BLOB_DIGEST)
if blob_digests:
blob_query = model.storage.lookup_repo_storages_by_content_checksum(repo_id,
blob_digests,
by_manifest=by_manifest)
blob_query = self._lookup_repo_storages_by_content_checksum(repo_id, blob_digests,
by_manifest=by_manifest)
storage_map = {blob.content_checksum: blob for blob in blob_query}
@@ -441,3 +440,31 @@ class SharedModel:
# Sign the manifest with our signing key.
return builder.build(docker_v2_signing_key)
def _get_shared_storage(self, blob_digest):
    """ Returns an ImageStorage row for the blob digest if it is a globally shared storage. """
    # Only the special empty-layer blob is treated as globally shared. Resolving it
    # bound to a repository can be incredibly slow given how many duplicate copies
    # exist in the database, and it is unnecessary since the layer is predefined,
    # so it is looked up directly instead.
    if blob_digest != EMPTY_LAYER_BLOB_DIGEST:
        return None
    return get_shared_blob(EMPTY_LAYER_BLOB_DIGEST)
def _lookup_repo_storages_by_content_checksum(self, repo, checksums, by_manifest=False):
    """ Looks up the storages for the given checksums under the repository, serving any
    globally shared blobs directly rather than via the per-repository query.
    """
    remaining = set(checksums)
    # Peel off checksums backed by shared storages first; only the rest need the
    # (expensive) repository-scoped lookup.
    shared_rows = []
    for digest in list(remaining):
        row = self._get_shared_storage(digest)
        if row is None:
            continue
        shared_rows.append(row)
        remaining.discard(digest)
    repo_rows = []
    if remaining:
        repo_rows = list(model.storage.lookup_repo_storages_by_content_checksum(
            repo, remaining, by_manifest=by_manifest))
    return repo_rows + shared_rows

View file

@@ -132,7 +132,8 @@ def _try_to_mount_blob(repository_ref, mount_blob_digest):
return None
# Lookup if the mount blob's digest exists in the repository.
mount_blob = registry_model.get_repo_blob_by_digest(from_repository_ref, mount_blob_digest)
mount_blob = registry_model.get_cached_repo_blob(model_cache, from_namespace, from_repo_name,
mount_blob_digest)
if mount_blob is None:
logger.debug('Blob `%s` under repository `%s` not found', mount_blob_digest, from_repo)
return None

View file

@@ -1204,7 +1204,8 @@ def test_blob_mounting(push_user, push_namespace, push_repo, mount_repo_name, ex
options.mount_blobs = {'sha256:' + hashlib.sha256(image.bytes).hexdigest(): mount_repo_name
for image in basic_images}
manifest_protocol.push(liveserver_session, 'devtable', 'newrepo', 'latest', basic_images,
manifest_protocol.push(liveserver_session, 'devtable', 'newrepo', 'latest',
basic_images,
credentials=('devtable', 'password'),
options=options,
expected_failure=expected_failure)
@@ -1215,6 +1216,26 @@ def test_blob_mounting(push_user, push_namespace, push_repo, mount_repo_name, ex
credentials=('devtable', 'password'))
def test_blob_mounting_with_empty_layers(manifest_protocol, pusher, puller, images_with_empty_layer,
                                         liveserver_session, app_reloader):
    """ Mounting blobs across repositories must work for images carrying the shared empty layer. """
    creds = ('devtable', 'password')
    # Push an image so we can attempt to mount it.
    pusher.push(liveserver_session, 'devtable', 'simple', 'latest', images_with_empty_layer,
                credentials=creds)
    # Push again, trying to mount the image layer(s) from the mount repo.
    opts = ProtocolOptions()
    opts.scopes = ['repository:devtable/newrepo:push,pull',
                   'repository:%s:pull' % ('devtable/simple')]
    opts.mount_blobs = {'sha256:' + hashlib.sha256(image.bytes).hexdigest(): 'devtable/simple'
                        for image in images_with_empty_layer}
    opts.skip_head_checks = True
    manifest_protocol.push(liveserver_session, 'devtable', 'newrepo', 'latest',
                           images_with_empty_layer,
                           credentials=creds,
                           options=opts)
def get_robot_password(api_caller):
api_caller.conduct_auth('devtable', 'password')
resp = api_caller.get('/api/v1/organization/buynlarge/robots/ownerbot')