Audit the number of SQL queries we make when writing manifests, and significantly reduce them in the common case

Instead of the 41 queries previously made for the simple manifest, we are now down to 14.

The biggest changes:
  - Only synthesize the V1 image rows if we haven't already found them in the database
  - Thread the repository object through to the other model method calls, and use it instead of loading again and again
This commit is contained in:
Joseph Schorr 2018-01-11 16:25:38 -05:00
parent 4f47808e99
commit 9e16a989f5
8 changed files with 145 additions and 66 deletions

View file

@ -421,8 +421,8 @@ def get_image_layers(image):
return image_list return image_list
def synthesize_v1_image(repo, image_storage, docker_image_id, created_date_str, def synthesize_v1_image(repo, image_storage_id, storage_image_size, docker_image_id,
comment, command, v1_json_metadata, parent_image=None): created_date_str, comment, command, v1_json_metadata, parent_image=None):
""" Find an existing image with this docker image id, and if none exists, write one with the """ Find an existing image with this docker image id, and if none exists, write one with the
specified metadata. specified metadata.
""" """
@ -439,13 +439,13 @@ def synthesize_v1_image(repo, image_storage, docker_image_id, created_date_str,
pass pass
# Get the aggregate size for the image. # Get the aggregate size for the image.
aggregate_size = _basequery.calculate_image_aggregate_size(ancestors, image_storage.image_size, aggregate_size = _basequery.calculate_image_aggregate_size(ancestors, storage_image_size,
parent_image) parent_image)
try: try:
return Image.create(docker_image_id=docker_image_id, ancestors=ancestors, comment=comment, return Image.create(docker_image_id=docker_image_id, ancestors=ancestors, comment=comment,
command=command, v1_json_metadata=v1_json_metadata, created=created, command=command, v1_json_metadata=v1_json_metadata, created=created,
storage=image_storage, repository=repo, parent=parent_image, storage=image_storage_id, repository=repo, parent=parent_image,
aggregate_size=aggregate_size) aggregate_size=aggregate_size)
except IntegrityError: except IntegrityError:
return Image.get(docker_image_id=docker_image_id, repository=repo) return Image.get(docker_image_id=docker_image_id, repository=repo)

View file

@ -246,13 +246,16 @@ def create_or_update_tag(namespace_name, repository_name, tag_name, tag_docker_i
except Repository.DoesNotExist: except Repository.DoesNotExist:
raise DataModelException('Invalid repository %s/%s' % (namespace_name, repository_name)) raise DataModelException('Invalid repository %s/%s' % (namespace_name, repository_name))
return create_or_update_tag_for_repo(repo.id, tag_name, tag_docker_image_id, reversion=reversion)
def create_or_update_tag_for_repo(repository_id, tag_name, tag_docker_image_id, reversion=False):
now_ts = get_epoch_timestamp() now_ts = get_epoch_timestamp()
with db_transaction(): with db_transaction():
try: try:
tag = db_for_update(_tag_alive(RepositoryTag tag = db_for_update(_tag_alive(RepositoryTag
.select() .select()
.where(RepositoryTag.repository == repo, .where(RepositoryTag.repository == repository_id,
RepositoryTag.name == tag_name), now_ts)).get() RepositoryTag.name == tag_name), now_ts)).get()
tag.lifetime_end_ts = now_ts tag.lifetime_end_ts = now_ts
tag.save() tag.save()
@ -263,16 +266,17 @@ def create_or_update_tag(namespace_name, repository_name, tag_name, tag_docker_i
raise StaleTagException(msg % tag_name) raise StaleTagException(msg % tag_name)
try: try:
image_obj = Image.get(Image.docker_image_id == tag_docker_image_id, Image.repository == repo) image_obj = Image.get(Image.docker_image_id == tag_docker_image_id,
Image.repository == repository_id)
except Image.DoesNotExist: except Image.DoesNotExist:
raise DataModelException('Invalid image with id: %s' % tag_docker_image_id) raise DataModelException('Invalid image with id: %s' % tag_docker_image_id)
try: try:
return RepositoryTag.create(repository=repo, image=image_obj, name=tag_name, return RepositoryTag.create(repository=repository_id, image=image_obj, name=tag_name,
lifetime_start_ts=now_ts, reversion=reversion) lifetime_start_ts=now_ts, reversion=reversion)
except IntegrityError: except IntegrityError:
msg = 'Tag with name %s and lifetime start %s under repository %s/%s already exists' msg = 'Tag with name %s and lifetime start %s already exists'
raise TagAlreadyCreatedException(msg % (tag_name, now_ts, namespace_name, repository_name)) raise TagAlreadyCreatedException(msg % (tag_name, now_ts))
def create_temporary_hidden_tag(repo, image_obj, expiration_s): def create_temporary_hidden_tag(repo, image_obj, expiration_s):
@ -504,13 +508,27 @@ def restore_tag_to_image(repo_obj, tag_name, docker_image_id):
return existing_image return existing_image
def store_tag_manifest(namespace_name, repository_name, tag_name, docker_image_id, manifest_digest,
                       manifest_data, reversion=False):
  """ Stores a tag manifest for a specific tag name in the database. Returns the TagManifest
      object, as well as a boolean indicating whether the TagManifest was created.

      Raises DataModelException if the referenced repository does not exist.
  """
  try:
    repo = _basequery.get_existing_repository(namespace_name, repository_name)
  except Repository.DoesNotExist:
    raise DataModelException('Invalid repository %s/%s' % (namespace_name, repository_name))

  # Forward the caller's `reversion` flag rather than hardcoding False; otherwise tags
  # restored via a reversion would never be recorded as reversions.
  return store_tag_manifest_for_repo(repo.id, tag_name, docker_image_id, manifest_digest,
                                     manifest_data, reversion=reversion)
def store_tag_manifest_for_repo(repository_id, tag_name, docker_image_id, manifest_digest,
manifest_data, reversion=False):
""" Stores a tag manifest for a specific tag name in the database. Returns the TagManifest
object, as well as a boolean indicating whether the TagManifest was created.
"""
with db_transaction(): with db_transaction():
tag = create_or_update_tag(namespace, repo_name, tag_name, docker_image_id, reversion=reversion) tag = create_or_update_tag_for_repo(repository_id, tag_name, docker_image_id,
reversion=reversion)
try: try:
manifest = TagManifest.get(digest=manifest_digest) manifest = TagManifest.get(digest=manifest_digest)

View file

@ -56,7 +56,7 @@ def gen_basic_auth(username, password):
def conduct_call(client, resource, url_for, method, params, body=None, expected_code=200, def conduct_call(client, resource, url_for, method, params, body=None, expected_code=200,
headers=None): headers=None, raw_body=None):
""" Conducts a call to a Flask endpoint. """ """ Conducts a call to a Flask endpoint. """
params = add_csrf_param(params) params = add_csrf_param(params)
@ -68,6 +68,9 @@ def conduct_call(client, resource, url_for, method, params, body=None, expected_
if body is not None: if body is not None:
body = json.dumps(body) body = json.dumps(body)
if raw_body is not None:
body = raw_body
# Required for anonymous calls to not exception. # Required for anonymous calls to not exception.
g.identity = Identity(None, 'none') g.identity = Identity(None, 'none')

View file

@ -157,7 +157,7 @@ def _write_manifest(namespace_name, repo_name, manifest):
raise ManifestInvalid(detail={'message': 'manifest does not reference any layers'}) raise ManifestInvalid(detail={'message': 'manifest does not reference any layers'})
# Ensure all the blobs in the manifest exist. # Ensure all the blobs in the manifest exist.
storage_map = model.lookup_blobs_by_digest(namespace_name, repo_name, manifest.checksums) storage_map = model.lookup_blobs_by_digest(repo, manifest.checksums)
for layer in manifest.layers: for layer in manifest.layers:
digest_str = str(layer.digest) digest_str = str(layer.digest)
if digest_str not in storage_map: if digest_str not in storage_map:
@ -166,30 +166,35 @@ def _write_manifest(namespace_name, repo_name, manifest):
# Lookup all the images and their parent images (if any) inside the manifest. # Lookup all the images and their parent images (if any) inside the manifest.
# This will let us know which v1 images we need to synthesize and which ones are invalid. # This will let us know which v1 images we need to synthesize and which ones are invalid.
all_image_ids = list(manifest.parent_image_ids | manifest.image_ids) all_image_ids = list(manifest.parent_image_ids | manifest.image_ids)
images_map = model.get_docker_v1_metadata_by_image_id(namespace_name, repo_name, all_image_ids) images_map = model.get_docker_v1_metadata_by_image_id(repo, all_image_ids)
# Rewrite any v1 image IDs that do not match the checksum in the database. # Rewrite any v1 image IDs that do not match the checksum in the database.
try: try:
# TODO: make this batch and read the parent image from the previous iteration, rather than
# reloading it.
rewritten_images = list(manifest.rewrite_invalid_image_ids(images_map)) rewritten_images = list(manifest.rewrite_invalid_image_ids(images_map))
for rewritten_image in rewritten_images: for rewritten_image in rewritten_images:
model.synthesize_v1_image( if not rewritten_image.image_id in images_map:
repo, model.synthesize_v1_image(
storage_map[rewritten_image.content_checksum], repo,
rewritten_image.image_id, storage_map[rewritten_image.content_checksum],
rewritten_image.created, rewritten_image.image_id,
rewritten_image.comment, rewritten_image.created,
rewritten_image.command, rewritten_image.comment,
rewritten_image.compat_json, rewritten_image.command,
rewritten_image.parent_image_id,) rewritten_image.compat_json,
rewritten_image.parent_image_id,
)
except ManifestException as me: except ManifestException as me:
logger.exception("exception when rewriting v1 metadata") logger.exception("exception when rewriting v1 metadata")
raise ManifestInvalid(detail={'message': 'failed synthesizing v1 metadata: %s' % me.message}) raise ManifestInvalid(detail={'message': 'failed synthesizing v1 metadata: %s' % me.message})
# Store the manifest pointing to the tag. # Store the manifest pointing to the tag.
leaf_layer_id = rewritten_images[-1].image_id leaf_layer_id = rewritten_images[-1].image_id
newly_created = model.save_manifest(namespace_name, repo_name, manifest.tag, leaf_layer_id, newly_created = model.save_manifest(repo, manifest.tag, leaf_layer_id, manifest.digest,
manifest.digest, manifest.bytes) manifest.bytes)
if newly_created: if newly_created:
# TODO: make this batch
labels = [] labels = []
for key, value in manifest.layers[-1].v1_metadata.labels.iteritems(): for key, value in manifest.layers[-1].v1_metadata.labels.iteritems():
media_type = 'application/json' if is_json(value) else 'text/plain' media_type = 'application/json' if is_json(value) else 'text/plain'
@ -219,11 +224,12 @@ def _write_manifest_and_log(namespace_name, repo_name, manifest):
'OK', 'OK',
status=202, status=202,
headers={ headers={
'Docker-Content-Digest': 'Docker-Content-Digest': manifest.digest,
manifest.digest,
'Location': 'Location':
url_for('v2.fetch_manifest_by_digest', repository='%s/%s' % (namespace_name, repo_name), url_for('v2.fetch_manifest_by_digest', repository='%s/%s' % (namespace_name, repo_name),
manifest_ref=manifest.digest),},) manifest_ref=manifest.digest),
},
)
@v2_bp.route(MANIFEST_DIGEST_ROUTE, methods=['DELETE']) @v2_bp.route(MANIFEST_DIGEST_ROUTE, methods=['DELETE'])

View file

@ -41,7 +41,7 @@ class BlobUpload(
""" """
class Blob(namedtuple('Blob', ['uuid', 'digest', 'size', 'locations', 'cas_path'])): class Blob(namedtuple('Blob', ['id', 'uuid', 'digest', 'size', 'locations', 'cas_path'])):
""" """
Blob represents an opaque binary blob saved to the storage system. Blob represents an opaque binary blob saved to the storage system.
""" """
@ -122,7 +122,7 @@ class DockerRegistryV2DataInterface(object):
pass pass
@abstractmethod @abstractmethod
def get_docker_v1_metadata_by_image_id(self, namespace_name, repo_name, docker_image_ids): def get_docker_v1_metadata_by_image_id(self, repository, docker_image_ids):
""" """
Returns a map of Docker V1 metadata for each given image ID, matched under the repository with Returns a map of Docker V1 metadata for each given image ID, matched under the repository with
the given namespace and name. Returns an empty map if the matching repository was not found. the given namespace and name. Returns an empty map if the matching repository was not found.
@ -156,8 +156,8 @@ class DockerRegistryV2DataInterface(object):
pass pass
@abstractmethod @abstractmethod
def save_manifest(self, namespace_name, repo_name, tag_name, leaf_layer_docker_id, def save_manifest(self, repository, tag_name, leaf_layer_docker_id, manifest_digest,
manifest_digest, manifest_bytes): manifest_bytes):
""" """
Saves a manifest pointing to the given leaf image, with the given manifest, under the matching Saves a manifest pointing to the given leaf image, with the given manifest, under the matching
repository as a tag with the given name. repository as a tag with the given name.
@ -264,3 +264,9 @@ class DockerRegistryV2DataInterface(object):
""" """
pass pass
  @abstractmethod
  def lookup_blobs_by_digest(self, repository, digests):
    """
    Looks up all blobs with the matching digests under the given repository.

    Returns a map keyed by content checksum (digest) to the matching Blob; digests with
    no matching blob in the repository are simply absent from the map.
    """
    pass

View file

@ -75,15 +75,12 @@ class PreOCIModel(DockerRegistryV2DataInterface):
except DataModelException: except DataModelException:
return None return None
def get_docker_v1_metadata_by_image_id(self, namespace_name, repo_name, docker_image_ids): def get_docker_v1_metadata_by_image_id(self, repository, docker_image_ids):
repo = model.repository.get_repository(namespace_name, repo_name) images_query = model.image.lookup_repository_images(repository.id, docker_image_ids)
if repo is None:
return {}
images_query = model.image.lookup_repository_images(repo, docker_image_ids)
return { return {
image.docker_image_id: _docker_v1_metadata(namespace_name, repo_name, image) image.docker_image_id: _docker_v1_metadata(repository.namespace_name, repository.name, image)
for image in images_query} for image in images_query
}
def get_parents_docker_v1_metadata(self, namespace_name, repo_name, docker_image_id): def get_parents_docker_v1_metadata(self, namespace_name, repo_name, docker_image_id):
repo_image = model.image.get_repo_image(namespace_name, repo_name, docker_image_id) repo_image = model.image.get_repo_image(namespace_name, repo_name, docker_image_id)
@ -104,29 +101,21 @@ class PreOCIModel(DockerRegistryV2DataInterface):
def synthesize_v1_image(self, repository, storage, image_id, created, comment, command, def synthesize_v1_image(self, repository, storage, image_id, created, comment, command,
compat_json, parent_image_id): compat_json, parent_image_id):
repo = model.repository.get_repository(repository.namespace_name, repository.name)
if repo is None:
raise DataModelException('Unknown repository: %s/%s' % (repository.namespace_name,
repository.name))
parent_image = None parent_image = None
if parent_image_id is not None: if parent_image_id is not None:
parent_image = model.image.get_image(repo, parent_image_id) parent_image = model.image.get_image(repository.id, parent_image_id)
if parent_image is None: if parent_image is None:
raise DataModelException('Unknown parent image: %s' % parent_image_id) raise DataModelException('Unknown parent image: %s' % parent_image_id)
storage_obj = model.storage.get_storage_by_uuid(storage.uuid) repo_image = model.image.synthesize_v1_image(repository.id, storage.id, storage.size,
if storage_obj is None: image_id, created, comment, command, compat_json,
raise DataModelException('Unknown storage: %s' % storage.uuid) parent_image)
return _docker_v1_metadata(repository.namespace_name, repository.name, repo_image)
repo_image = model.image.synthesize_v1_image(repo, storage_obj, image_id, created, comment, def save_manifest(self, repository, tag_name, leaf_layer_docker_id, manifest_digest,
command, compat_json, parent_image) manifest_bytes):
return _docker_v1_metadata(repo.namespace_user.username, repo.name, repo_image) (_, newly_created) = model.tag.store_tag_manifest_for_repo(
repository.id, tag_name, leaf_layer_docker_id, manifest_digest, manifest_bytes)
def save_manifest(self, namespace_name, repo_name, tag_name, leaf_layer_docker_id,
manifest_digest, manifest_bytes):
(_, newly_created) = model.tag.store_tag_manifest(
namespace_name, repo_name, tag_name, leaf_layer_docker_id, manifest_digest, manifest_bytes)
return newly_created return newly_created
def repository_tags(self, namespace_name, repo_name, limit, offset): def repository_tags(self, namespace_name, repo_name, limit, offset):
@ -208,15 +197,18 @@ class PreOCIModel(DockerRegistryV2DataInterface):
namespace_name, repo_name, blob_digest, location_obj.id, blob_upload.byte_count, namespace_name, repo_name, blob_digest, location_obj.id, blob_upload.byte_count,
expiration_sec, blob_upload.uncompressed_byte_count) expiration_sec, blob_upload.uncompressed_byte_count)
return Blob( return Blob(
id=blob_record.id,
uuid=blob_record.uuid, uuid=blob_record.uuid,
digest=blob_digest, digest=blob_digest,
size=blob_upload.byte_count, size=blob_upload.byte_count,
locations=[blob_upload.location_name], locations=[blob_upload.location_name],
cas_path=blob_record.cas_path) cas_path=blob_record.cas_path
)
def lookup_blobs_by_digest(self, namespace_name, repo_name, digests): def lookup_blobs_by_digest(self, repository, digests):
def _blob_view(blob_record): def _blob_view(blob_record):
return Blob( return Blob(
id=blob_record.id,
uuid=blob_record.uuid, uuid=blob_record.uuid,
digest=blob_record.content_checksum, digest=blob_record.content_checksum,
size=blob_record.image_size, size=blob_record.image_size,
@ -224,21 +216,20 @@ class PreOCIModel(DockerRegistryV2DataInterface):
locations=None, # Note: Locations is None in this case. locations=None, # Note: Locations is None in this case.
) )
repo = model.repository.get_repository(namespace_name, repo_name) query = model.storage.lookup_repo_storages_by_content_checksum(repository.id, digests)
if repo is None:
return {}
query = model.storage.lookup_repo_storages_by_content_checksum(repo, digests)
return {storage.content_checksum: _blob_view(storage) for storage in query} return {storage.content_checksum: _blob_view(storage) for storage in query}
def get_blob_by_digest(self, namespace_name, repo_name, digest): def get_blob_by_digest(self, namespace_name, repo_name, digest):
try: try:
blob_record = model.blob.get_repo_blob_by_digest(namespace_name, repo_name, digest) blob_record = model.blob.get_repo_blob_by_digest(namespace_name, repo_name, digest)
return Blob( return Blob(
id=blob_record.id,
uuid=blob_record.uuid, uuid=blob_record.uuid,
digest=digest, digest=digest,
size=blob_record.image_size, size=blob_record.image_size,
locations=blob_record.locations, locations=blob_record.locations,
cas_path=blob_record.cas_path) cas_path=blob_record.cas_path
)
except model.BlobDoesNotExist: except model.BlobDoesNotExist:
return None return None
@ -286,7 +277,8 @@ def _docker_v1_metadata(namespace_name, repo_name, repo_image):
comment=repo_image.comment, comment=repo_image.comment,
command=repo_image.command, command=repo_image.command,
# TODO: make sure this isn't needed anywhere, as it is expensive to lookup # TODO: make sure this isn't needed anywhere, as it is expensive to lookup
parent_image_id=None,) parent_image_id=None,
)
def _repository_for_repo(repo): def _repository_for_repo(repo):

View file

@ -0,0 +1,54 @@
import hashlib
import pytest
import time
from mock import patch
from flask import url_for
from playhouse.test_utils import count_queries
from app import instance_keys, app as realapp
from data import model
from endpoints.test.shared import conduct_call
from util.security.registry_jwt import generate_bearer_token, build_context_and_subject
from test.fixtures import *
def test_e2e_query_count_manifest_norewrite(client, app):
  """ End-to-end check that re-pushing an unchanged manifest stays under the expected
      SQL query budget (< 15 queries) once caches are primed.
  """
  tag_manifest = model.tag.load_tag_manifest('devtable', 'simple', 'latest')

  params = {
    'repository': 'devtable/simple',
    'manifest_ref': tag_manifest.digest,
  }

  user = model.user.get_user('devtable')
  access = [{
    'type': 'repository',
    'name': 'devtable/simple',
    'actions': ['pull', 'push'],
  }]

  context, subject = build_context_and_subject(user=user)
  token = generate_bearer_token(realapp.config['SERVER_HOSTNAME'], subject, context, access, 600,
                                instance_keys)

  headers = {
    'Authorization': 'Bearer %s' % token,
  }

  # Conduct a call to prime the instance key and other caches.
  conduct_call(client, 'v2.write_manifest_by_digest', url_for, 'PUT', params, expected_code=202,
               headers=headers, raw_body=tag_manifest.json_data)

  timecode = time.time()

  def get_time():
    return timecode + 10

  with patch('time.time', get_time):
    # Necessary so that the tag updates do not occur in the same second, which is the
    # finest granularity currently supported.
    with count_queries() as counter:
      conduct_call(client, 'v2.write_manifest_by_digest', url_for, 'PUT', params, expected_code=202,
                   headers=headers, raw_body=tag_manifest.json_data)

    assert counter.count < 15

View file

@ -165,7 +165,7 @@ class DockerSchema1Manifest(object):
try: try:
validate_schema(self._parsed, DockerSchema1Manifest.METASCHEMA) validate_schema(self._parsed, DockerSchema1Manifest.METASCHEMA)
except ValidationError as ve: except ValidationError as ve:
raise MalformedSchema1Manifest('malformed manifest data: %s' % ve) raise MalformedSchema1Manifest('manifest data does not match schema: %s' % ve)
self._signatures = self._parsed[DOCKER_SCHEMA1_SIGNATURES_KEY] self._signatures = self._parsed[DOCKER_SCHEMA1_SIGNATURES_KEY]
self._tag = self._parsed[DOCKER_SCHEMA1_REPO_TAG_KEY] self._tag = self._parsed[DOCKER_SCHEMA1_REPO_TAG_KEY]