Audit the number of SQL queries we make in writing manifests, and significantly reduce in the common case

Instead of 41 queries now for the simple manifest, we are down to 14. The biggest changes: - Only synthesize the V1 image rows if we haven't already found them in the database - Thread the repository object through to the other model method calls, and use it instead of loading again and again
2018-01-11 16:25:38 -05:00 · 2018-01-11 16:25:38 -05:00 · 9e16a989f5
commit 9e16a989f5
parent 4f47808e99
8 changed files with 145 additions and 66 deletions
--- a/endpoints/v2/manifest.py
+++ b/endpoints/v2/manifest.py
@ -157,7 +157,7 @@ def _write_manifest(namespace_name, repo_name, manifest):
    raise ManifestInvalid(detail={'message': 'manifest does not reference any layers'})

  # Ensure all the blobs in the manifest exist.
-  storage_map = model.lookup_blobs_by_digest(namespace_name, repo_name, manifest.checksums)
+  storage_map = model.lookup_blobs_by_digest(repo, manifest.checksums)
  for layer in manifest.layers:
    digest_str = str(layer.digest)
    if digest_str not in storage_map:
@ -166,30 +166,35 @@ def _write_manifest(namespace_name, repo_name, manifest):
  # Lookup all the images and their parent images (if any) inside the manifest.
  # This will let us know which v1 images we need to synthesize and which ones are invalid.
  all_image_ids = list(manifest.parent_image_ids | manifest.image_ids)
-  images_map = model.get_docker_v1_metadata_by_image_id(namespace_name, repo_name, all_image_ids)
+  images_map = model.get_docker_v1_metadata_by_image_id(repo, all_image_ids)

  # Rewrite any v1 image IDs that do not match the checksum in the database.
  try:
+    # TODO: make this batch and read the parent image from the previous iteration, rather than
+    # reloading it.
    rewritten_images = list(manifest.rewrite_invalid_image_ids(images_map))
    for rewritten_image in rewritten_images:
-      model.synthesize_v1_image(
-        repo,
-        storage_map[rewritten_image.content_checksum],
-        rewritten_image.image_id,
-        rewritten_image.created,
-        rewritten_image.comment,
-        rewritten_image.command,
-        rewritten_image.compat_json,
-        rewritten_image.parent_image_id,)
+      if not rewritten_image.image_id in images_map:
+        model.synthesize_v1_image(
+          repo,
+          storage_map[rewritten_image.content_checksum],
+          rewritten_image.image_id,
+          rewritten_image.created,
+          rewritten_image.comment,
+          rewritten_image.command,
+          rewritten_image.compat_json,
+          rewritten_image.parent_image_id,
+        )
  except ManifestException as me:
    logger.exception("exception when rewriting v1 metadata")
    raise ManifestInvalid(detail={'message': 'failed synthesizing v1 metadata: %s' % me.message})

  # Store the manifest pointing to the tag.
  leaf_layer_id = rewritten_images[-1].image_id
-  newly_created = model.save_manifest(namespace_name, repo_name, manifest.tag, leaf_layer_id,
-                                      manifest.digest, manifest.bytes)
+  newly_created = model.save_manifest(repo, manifest.tag, leaf_layer_id, manifest.digest,
+                                      manifest.bytes)
  if newly_created:
+    # TODO: make this batch
    labels = []
    for key, value in manifest.layers[-1].v1_metadata.labels.iteritems():
      media_type = 'application/json' if is_json(value) else 'text/plain'
@ -219,11 +224,12 @@ def _write_manifest_and_log(namespace_name, repo_name, manifest):
    'OK',
    status=202,
    headers={
-      'Docker-Content-Digest':
-        manifest.digest,
+      'Docker-Content-Digest': manifest.digest,
      'Location':
        url_for('v2.fetch_manifest_by_digest', repository='%s/%s' % (namespace_name, repo_name),
-                manifest_ref=manifest.digest),},)
+                manifest_ref=manifest.digest),
+    },
+  )


@v2_bp.route(MANIFEST_DIGEST_ROUTE, methods=['DELETE'])
--- a/endpoints/v2/models_interface.py
+++ b/endpoints/v2/models_interface.py
@ -41,7 +41,7 @@ class BlobUpload(
  """


-class Blob(namedtuple('Blob', ['uuid', 'digest', 'size', 'locations', 'cas_path'])):
+class Blob(namedtuple('Blob', ['id', 'uuid', 'digest', 'size', 'locations', 'cas_path'])):
  """
  Blob represents an opaque binary blob saved to the storage system.
  """
@ -122,7 +122,7 @@ class DockerRegistryV2DataInterface(object):
    pass

  @abstractmethod
-  def get_docker_v1_metadata_by_image_id(self, namespace_name, repo_name, docker_image_ids):
+  def get_docker_v1_metadata_by_image_id(self, repository, docker_image_ids):
    """
    Returns a map of Docker V1 metadata for each given image ID, matched under the repository with
    the given namespace and name. Returns an empty map if the matching repository was not found.
@ -156,8 +156,8 @@ class DockerRegistryV2DataInterface(object):
    pass

  @abstractmethod
-  def save_manifest(self, namespace_name, repo_name, tag_name, leaf_layer_docker_id,
-                    manifest_digest, manifest_bytes):
+  def save_manifest(self, repository, tag_name, leaf_layer_docker_id, manifest_digest,
+                    manifest_bytes):
    """
    Saves a manifest pointing to the given leaf image, with the given manifest, under the matching
    repository as a tag with the given name.
@ -264,3 +264,9 @@ class DockerRegistryV2DataInterface(object):
    """
    pass

+  @abstractmethod
+  def lookup_blobs_by_digest(self, repository, digests):
+    """
+    Looks up all blobs with the matching digests under the given repository.
+    """
+    pass
--- a/endpoints/v2/models_pre_oci.py
+++ b/endpoints/v2/models_pre_oci.py
@ -75,15 +75,12 @@ class PreOCIModel(DockerRegistryV2DataInterface):
    except DataModelException:
      return None

-  def get_docker_v1_metadata_by_image_id(self, namespace_name, repo_name, docker_image_ids):
-    repo = model.repository.get_repository(namespace_name, repo_name)
-    if repo is None:
-      return {}
-
-    images_query = model.image.lookup_repository_images(repo, docker_image_ids)
+  def get_docker_v1_metadata_by_image_id(self, repository, docker_image_ids):
+    images_query = model.image.lookup_repository_images(repository.id, docker_image_ids)
    return {
-      image.docker_image_id: _docker_v1_metadata(namespace_name, repo_name, image)
-      for image in images_query}
+      image.docker_image_id: _docker_v1_metadata(repository.namespace_name, repository.name, image)
+      for image in images_query
+    }

  def get_parents_docker_v1_metadata(self, namespace_name, repo_name, docker_image_id):
    repo_image = model.image.get_repo_image(namespace_name, repo_name, docker_image_id)
@ -104,29 +101,21 @@ class PreOCIModel(DockerRegistryV2DataInterface):

  def synthesize_v1_image(self, repository, storage, image_id, created, comment, command,
                          compat_json, parent_image_id):
-    repo = model.repository.get_repository(repository.namespace_name, repository.name)
-    if repo is None:
-      raise DataModelException('Unknown repository: %s/%s' % (repository.namespace_name,
-                                                              repository.name))
-
    parent_image = None
    if parent_image_id is not None:
-      parent_image = model.image.get_image(repo, parent_image_id)
+      parent_image = model.image.get_image(repository.id, parent_image_id)
      if parent_image is None:
        raise DataModelException('Unknown parent image: %s' % parent_image_id)

-    storage_obj = model.storage.get_storage_by_uuid(storage.uuid)
-    if storage_obj is None:
-      raise DataModelException('Unknown storage: %s' % storage.uuid)
+    repo_image = model.image.synthesize_v1_image(repository.id, storage.id, storage.size,
+                                                 image_id, created, comment, command, compat_json,
+                                                 parent_image)
+    return _docker_v1_metadata(repository.namespace_name, repository.name, repo_image)

-    repo_image = model.image.synthesize_v1_image(repo, storage_obj, image_id, created, comment,
-                                                 command, compat_json, parent_image)
-    return _docker_v1_metadata(repo.namespace_user.username, repo.name, repo_image)
-
-  def save_manifest(self, namespace_name, repo_name, tag_name, leaf_layer_docker_id,
-                    manifest_digest, manifest_bytes):
-    (_, newly_created) = model.tag.store_tag_manifest(
-      namespace_name, repo_name, tag_name, leaf_layer_docker_id, manifest_digest, manifest_bytes)
+  def save_manifest(self, repository, tag_name, leaf_layer_docker_id, manifest_digest,
+                    manifest_bytes):
+    (_, newly_created) = model.tag.store_tag_manifest_for_repo(
+      repository.id, tag_name, leaf_layer_docker_id, manifest_digest, manifest_bytes)
    return newly_created

  def repository_tags(self, namespace_name, repo_name, limit, offset):
@ -208,15 +197,18 @@ class PreOCIModel(DockerRegistryV2DataInterface):
      namespace_name, repo_name, blob_digest, location_obj.id, blob_upload.byte_count,
      expiration_sec, blob_upload.uncompressed_byte_count)
    return Blob(
+      id=blob_record.id,
      uuid=blob_record.uuid,
      digest=blob_digest,
      size=blob_upload.byte_count,
      locations=[blob_upload.location_name],
-      cas_path=blob_record.cas_path)
+      cas_path=blob_record.cas_path
+    )

-  def lookup_blobs_by_digest(self, namespace_name, repo_name, digests):
+  def lookup_blobs_by_digest(self, repository, digests):
    def _blob_view(blob_record):
      return Blob(
+        id=blob_record.id,
        uuid=blob_record.uuid,
        digest=blob_record.content_checksum,
        size=blob_record.image_size,
@ -224,21 +216,20 @@ class PreOCIModel(DockerRegistryV2DataInterface):
        locations=None,  # Note: Locations is None in this case.
      )

-    repo = model.repository.get_repository(namespace_name, repo_name)
-    if repo is None:
-      return {}
-    query = model.storage.lookup_repo_storages_by_content_checksum(repo, digests)
+    query = model.storage.lookup_repo_storages_by_content_checksum(repository.id, digests)
    return {storage.content_checksum: _blob_view(storage) for storage in query}

  def get_blob_by_digest(self, namespace_name, repo_name, digest):
    try:
      blob_record = model.blob.get_repo_blob_by_digest(namespace_name, repo_name, digest)
      return Blob(
+        id=blob_record.id,
        uuid=blob_record.uuid,
        digest=digest,
        size=blob_record.image_size,
        locations=blob_record.locations,
-        cas_path=blob_record.cas_path)
+        cas_path=blob_record.cas_path
+      )
    except model.BlobDoesNotExist:
      return None

@ -286,7 +277,8 @@ def _docker_v1_metadata(namespace_name, repo_name, repo_image):
    comment=repo_image.comment,
    command=repo_image.command,
    # TODO: make sure this isn't needed anywhere, as it is expensive to lookup
-    parent_image_id=None,)
+    parent_image_id=None,
+  )


 def _repository_for_repo(repo):
--- a/endpoints/v2/test/test_manifest.py
+++ b/endpoints/v2/test/test_manifest.py
@ -0,0 +1,54 @@
+import hashlib
+import pytest
+import time
+
+from mock import patch
+
+from flask import url_for
+from playhouse.test_utils import count_queries
+
+from app import instance_keys, app as realapp
+from data import model
+from endpoints.test.shared import conduct_call
+from util.security.registry_jwt import generate_bearer_token, build_context_and_subject
+from test.fixtures import *
+
+def test_e2e_query_count_manifest_norewrite(client, app):
+  tag_manifest = model.tag.load_tag_manifest('devtable', 'simple', 'latest')
+
+  params = {
+    'repository': 'devtable/simple',
+    'manifest_ref': tag_manifest.digest,
+  }
+
+  user = model.user.get_user('devtable')
+  access = [{
+    'type': 'repository',
+    'name': 'devtable/simple',
+    'actions': ['pull', 'push'],
+  }]
+
+  context, subject = build_context_and_subject(user=user)
+  token = generate_bearer_token(realapp.config['SERVER_HOSTNAME'], subject, context, access, 600,
+                                instance_keys)
+
+  headers = {
+    'Authorization': 'Bearer %s' % token,
+  }
+
+  # Conduct a call to prime the instance key and other caches.
+  conduct_call(client, 'v2.write_manifest_by_digest', url_for, 'PUT', params, expected_code=202,
+               headers=headers, raw_body=tag_manifest.json_data)
+
+  timecode = time.time()
+  def get_time():
+    return timecode + 10
+
+  with patch('time.time', get_time):
+    # Necessary in order to have the tag updates not occurr in the same second, which is the
+    # granularity supported currently.
+    with count_queries() as counter:
+      conduct_call(client, 'v2.write_manifest_by_digest', url_for, 'PUT', params, expected_code=202,
+                   headers=headers, raw_body=tag_manifest.json_data)
+
+    assert counter.count < 15