V1 Docker ID <-> V2 layer SHA mismatch fix

Fix handling of V1 Docker ID <-> V2 layer SHA mismatches by dynamically rewriting the manifest to use newly synthesized IDs for all layers above the mismatch. Also adds a number of tests for this and other cases, fixes a bug around manifest digest uniqueness, and fixes the 5.5 migration for MySQL.
Joseph Schorr 2016-02-12 17:39:27 +02:00
parent 8b61c69dad
commit abd2e3c234
6 changed files with 240 additions and 53 deletions
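
The core of the fix is deterministic ID synthesis: when a pushed layer's blob digest no longer matches the storage recorded for its V1 Docker ID, a new ID is derived from the layer's V1 JSON plus the new digest, and every layer above the mismatch is rewritten to match. A minimal sketch of the derivation used in _write_manifest below, with illustrative values:

import hashlib

v1_metadata_str = '{"id": "baseid", ...}'  # layer's V1 compatibility JSON (illustrative)
digest_str = 'sha256:0123abcd...'          # the mismatched blob digest (illustrative)

# The same (metadata, digest) pair always yields the same synthetic ID, so
# repeated pushes of the same content converge on one rewritten manifest.
new_synthetic_id = hashlib.sha256(v1_metadata_str + '@' + digest_str).hexdigest()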


@@ -642,7 +642,7 @@ class RepositoryTag(BaseModel):
class TagManifest(BaseModel):
tag = ForeignKeyField(RepositoryTag, index=True, unique=True)
digest = CharField(index=True, unique=True)
digest = CharField(index=True)
json_data = TextField()
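
Dropping unique=True matters because the same manifest bytes, and therefore the same digest, can legitimately be stored for more than one tag. A hypothetical illustration of the failure under the old constraint:

# Hypothetical: two tags reference identical manifest content.
TagManifest.create(tag=tag_a, digest='sha256:abc...', json_data=manifest_bytes)
TagManifest.create(tag=tag_b, digest='sha256:abc...', json_data=manifest_bytes)
# Under the old unique index on digest, the second create raises IntegrityError.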


@@ -16,7 +16,7 @@ from datetime import datetime
def upgrade(tables):
### commands auto generated by Alembic - please adjust! ###
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
now = datetime.now().strftime("'%Y-%m-%d %H:%M:%S'")
op.add_column('blobupload', sa.Column('created', sa.DateTime(), nullable=False, server_default=sa.text(now)))
op.create_index('blobupload_created', 'blobupload', ['created'], unique=False)
### end Alembic commands ###
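
The MySQL 5.5 fix is the quoting change above: sa.text() emits its argument verbatim, so the timestamp must already carry SQL quotes to render as a string literal. A small sketch of the difference, assuming a MySQL target:

from datetime import datetime
import sqlalchemy as sa

now = datetime.now().strftime("'%Y-%m-%d %H:%M:%S'")  # note the embedded single quotes
sa.Column('created', sa.DateTime(), nullable=False, server_default=sa.text(now))
# Renders roughly as: created DATETIME NOT NULL DEFAULT '2016-02-12 17:22:48'
# Without the embedded quotes the DEFAULT is emitted bare and the DDL fails.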


@@ -0,0 +1,28 @@
"""Remove uniqueness constraint on the TagManifest digest column
Revision ID: e4129c93e477
Revises: 956a0833223
Create Date: 2016-02-12 17:22:48.039791
"""
# revision identifiers, used by Alembic.
revision = 'e4129c93e477'
down_revision = '956a0833223'
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import mysql
def upgrade(tables):
### commands auto generated by Alembic - please adjust! ###
op.drop_index('tagmanifest_digest', table_name='tagmanifest')
op.create_index('tagmanifest_digest', 'tagmanifest', ['digest'], unique=False)
### end Alembic commands ###
def downgrade(tables):
### commands auto generated by Alembic - please adjust! ###
op.drop_index('tagmanifest_digest', table_name='tagmanifest')
op.create_index('tagmanifest_digest', 'tagmanifest', ['digest'], unique=True)
### end Alembic commands ###


@@ -2,7 +2,7 @@ import logging
import dateutil.parser
import random
from peewee import JOIN_LEFT_OUTER, SQL
from peewee import JOIN_LEFT_OUTER, IntegrityError
from datetime import datetime
from data.model import (DataModelException, db_transaction, _basequery, storage,
@@ -405,10 +405,13 @@ def synthesize_v1_image(repo, image_storage, docker_image_id, created_date_str,
aggregate_size = _basequery.calculate_image_aggregate_size(ancestors, image_storage.image_size,
parent_image)
return Image.create(docker_image_id=docker_image_id, ancestors=ancestors, comment=comment,
command=command, v1_json_metadata=v1_json_metadata, created=created,
storage=image_storage, repository=repo, parent=parent_image,
aggregate_size=aggregate_size)
try:
return Image.create(docker_image_id=docker_image_id, ancestors=ancestors, comment=comment,
command=command, v1_json_metadata=v1_json_metadata, created=created,
storage=image_storage, repository=repo, parent=parent_image,
aggregate_size=aggregate_size)
except IntegrityError:
return Image.get(docker_image_id=docker_image_id, repository=repo)
def ensure_image_locations(*names):
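
The try/except above turns Image.create into a get-or-create: two concurrent pushes of the same manifest can both decide a layer is missing and race to insert it. A sketch of the interleaving the except branch absorbs:

# Request A                           | Request B
# ------------------------------------|------------------------------------
# sees no Image row                   | sees no Image row
# Image.create(...) inserts the row   | Image.create(...) -> IntegrityError
#                                     | Image.get(...) returns A's row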


@@ -2,9 +2,10 @@ import logging
import jwt.utils
import json
import features
import hashlib
from peewee import IntegrityError
from flask import make_response, request, url_for, abort
from flask import make_response, request, url_for
from collections import namedtuple, OrderedDict
from jwkest.jws import SIGNER_ALGS, keyrep
from datetime import datetime
@@ -126,7 +127,8 @@ class SignedManifest(object):
try:
image_digest = digest_tools.Digest.parse_digest(blob_sum_obj[_BLOB_SUM_KEY])
except digest_tools.InvalidDigestException:
raise ManifestInvalid()
err_message = 'could not parse manifest digest: %s' % blob_sum_obj[_BLOB_SUM_KEY]
raise ManifestInvalid(detail={'message': err_message})
metadata_string = history_obj[_V1_COMPAT_KEY]
@@ -134,6 +136,9 @@ class SignedManifest(object):
command_list = v1_metadata.get('container_config', {}).get('Cmd', None)
command = json.dumps(command_list) if command_list else None
if 'id' not in v1_metadata:
raise ManifestInvalid(detail={'message': 'invalid manifest v1 history'})
extracted = ExtractedV1Metadata(v1_metadata['id'], v1_metadata.get('parent'),
v1_metadata.get('created'), v1_metadata.get('comment'),
command)
@@ -180,6 +185,14 @@ class SignedManifestBuilder(object):
_V1_COMPAT_KEY: v1_json_metadata,
})
def add_top_layer(self, layer_digest, v1_json_metadata):
self._fs_layer_digests.insert(0, {
_BLOB_SUM_KEY: layer_digest,
})
self._history.insert(0, {
_V1_COMPAT_KEY: v1_json_metadata,
})
def build(self, json_web_key):
""" Build the payload and sign it, returning a SignedManifest object.
"""
@@ -301,8 +314,8 @@ def _reject_manifest2_schema2(func):
def write_manifest_by_tagname(namespace, repo_name, manifest_ref):
try:
manifest = SignedManifest(request.data)
except ValueError as ve:
raise ManifestInvalid()
except ValueError:
raise ManifestInvalid(detail={'message': 'could not parse manifest'})
if manifest.tag != manifest_ref:
raise TagInvalid()
@@ -320,14 +333,29 @@ def write_manifest_by_digest(namespace, repo_name, manifest_ref):
try:
manifest = SignedManifest(request.data)
except ValueError:
raise ManifestInvalid()
raise ManifestInvalid(detail={'message': 'could not parse manifest'})
if manifest.digest != manifest_ref:
raise ManifestInvalid()
raise ManifestInvalid(detail={'message': 'manifest digest mismatch'})
return _write_manifest(namespace, repo_name, manifest)
def _updated_v1_metadata(v1_metadata_json, updated_id_map):
parsed = json.loads(v1_metadata_json)
parsed['id'] = updated_id_map[parsed['id']]
if parsed.get('parent'):
parsed['parent'] = updated_id_map[parsed['parent']]
if parsed.get('container_config', {}).get('Image'):
existing_image = parsed['container_config']['Image']
if existing_image in updated_id_map:
parsed['container_config']['Image'] = updated_id_map[existing_image]
return json.dumps(parsed)
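
The rewrite touches the three places a V1 ID can appear in the compatibility JSON: the layer's own id, its parent pointer, and container_config.Image. A worked example with hypothetical IDs:

old = json.dumps({'id': 'baseid', 'parent': 'rootid',
                  'container_config': {'Image': 'rootid'}})
updated = _updated_v1_metadata(old, {'baseid': 'new-base', 'rootid': 'new-root'})
# json.loads(updated) == {'id': 'new-base', 'parent': 'new-root',
#                         'container_config': {'Image': 'new-root'}}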
def _write_manifest(namespace, repo_name, manifest):
# Ensure that the manifest is for this repository. If the manifest's namespace is empty, then
# it is for the library namespace and we need an extra check.
@@ -363,22 +391,56 @@ def _write_manifest(namespace, repo_name, manifest):
storage_query = model.storage.lookup_repo_storages_by_content_checksum(repo, checksums)
storage_map = {storage.content_checksum: storage for storage in storage_query}
# Synthesize the V1 metadata for each layer.
manifest_digest = manifest.digest
# Ensure that we have valid V1 Docker IDs. If Docker gives us a V1 layer ID pointing to
# a storage whose content checksum differs from the existing one, then we need to rewrite
# the Docker ID to ensure consistency.
tag_name = manifest.tag
updated_manifest_builder = SignedManifestBuilder(namespace, repo_name, tag_name)
has_updated_manifest = False
updated_id_map = {}
for mdata in layers:
digest_str = str(mdata.digest)
v1_mdata = mdata.v1_metadata
updated_id_map[v1_mdata.docker_id] = v1_mdata.docker_id
# Ensure that all blobs exist.
blob_storage = storage_map.get(digest_str)
if blob_storage is None:
raise BlobUnknown(detail={'digest': digest_str})
# If there is already a V1 image for this layer, nothing more to do.
if v1_mdata.docker_id in images_map:
# Ensure that the V1 image's storage matches the V2 blob. If not, we've found
# a data inconsistency and need to fail.
# a data inconsistency and need to create a new layer ID for the V1 image.
v1_image = images_map[v1_mdata.docker_id]
if v1_image.storage.content_checksum != digest_str:
logger.error('Checksum mismatch on V1 layer %s (#%s): Expected digest %s, found %s',
v1_mdata.docker_id, v1_image.id, digest_str, v1_image.storage.content_checksum)
if has_updated_manifest or v1_image.storage.content_checksum != digest_str:
new_synthetic_id = hashlib.sha256(mdata.v1_metadata_str + '@' + digest_str).hexdigest()
logger.debug('Got content mismatch for layer %s under repo %s/%s. New ID: %s',
v1_mdata.docker_id, namespace, repo_name, new_synthetic_id)
updated_id_map[v1_mdata.docker_id] = new_synthetic_id
has_updated_manifest = True
# Update the manifest with the new ID (if any).
v1_metadata_json = mdata.v1_metadata_str
if has_updated_manifest:
v1_metadata_json = _updated_v1_metadata(mdata.v1_metadata_str, updated_id_map)
updated_manifest_builder.add_top_layer(digest_str, v1_metadata_json)
# If the manifest was changed due to an updated layer ID, then create a new manifest
# based on the updated data.
if has_updated_manifest:
manifest = updated_manifest_builder.build(docker_v2_signing_key)
layers = list(manifest.layers)
# Synthesize the V1 metadata for each layer.
for mdata in layers:
v1_mdata = mdata.v1_metadata
# If the layer with the V1 id already exists, then nothing more to do. We've ensured
# it points to the correct content SHA above.
if v1_mdata.docker_id in images_map:
continue
# Lookup the parent image for the layer, if any.
@@ -390,10 +452,8 @@ def _write_manifest(namespace, repo_name, manifest):
raise ManifestInvalid(detail={'message': msg})
# Synthesize and store the v1 metadata in the db.
blob_storage = storage_map.get(digest_str)
if blob_storage is None:
raise BlobUnknown(detail={'digest': digest_str})
digest_str = str(mdata.digest)
blob_storage = storage_map[digest_str]
image = model.image.synthesize_v1_image(repo, blob_storage, v1_mdata.docker_id,
v1_mdata.created, v1_mdata.comment, v1_mdata.command,
mdata.v1_metadata_str, parent_image)
@@ -405,9 +465,10 @@ def _write_manifest(namespace, repo_name, manifest):
raise ManifestInvalid(detail={'message': 'manifest does not reference any layers'})
# Store the manifest pointing to the tag.
manifest_digest = manifest.digest
leaf_layer = layers[-1]
model.tag.store_tag_manifest(namespace, repo_name, tag_name, leaf_layer.v1_metadata.docker_id,
manifest_digest, request.data)
manifest_digest, manifest.bytes)
# Spawn the repo_push event.
event_data = {
@@ -481,9 +542,11 @@ def _generate_and_store_manifest(namespace, repo_name, tag_name):
try:
return model.tag.associate_generated_tag_manifest(namespace, repo_name, tag_name,
manifest.digest, manifest.bytes)
except IntegrityError:
except IntegrityError as ie:
logger.debug('Got integrity error: %s', ie)
try:
return model.tag.load_tag_manifest(namespace, repo_name, tag_name)
except model.InvalidManifestException:
logger.exception('Exception when generating manifest')
raise model.DataModelException('Could not load or generate manifest')


@@ -6,6 +6,7 @@ import random
import string
import resumablehashlib
import binascii
import uuid
import Crypto.Random
from cachetools import lru_cache
@@ -166,23 +167,29 @@ def _get_repo_name(namespace, name):
return '%s/%s' % (namespace, name)
def _get_full_contents(image_data):
def _get_full_contents(image_data, additional_fields=False):
if 'chunks' in image_data:
# Data is just for chunking; no need for a real TAR.
return image_data['contents']
layer_data = StringIO()
tar_file_info = tarfile.TarInfo(name='contents')
tar_file_info.type = tarfile.REGTYPE
tar_file_info.size = len(image_data['contents'])
def add_file(name, contents):
tar_file_info = tarfile.TarInfo(name=name)
tar_file_info.type = tarfile.REGTYPE
tar_file_info.size = len(contents)
tar_file = tarfile.open(fileobj=layer_data, mode='w|gz')
tar_file.addfile(tar_file_info, StringIO(image_data['contents']))
tar_file.close()
tar_file = tarfile.open(fileobj=layer_data, mode='w|gz')
tar_file.addfile(tar_file_info, StringIO(contents))
tar_file.close()
add_file('contents', image_data['contents'])
if additional_fields:
add_file('anotherfile', str(uuid.uuid4()))
layer_bytes = layer_data.getvalue()
layer_data.close()
return layer_bytes
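
The additional_fields flag exists to force the mismatch the new tests exercise: appending a random extra file changes the layer bytes, so the content SHA changes while the Docker V1 ID in the test data stays the same. For example:

# Same image ID, different bytes -> different sha256, exercising the rewrite path.
contents_a = _get_full_contents({'id': 'baseid', 'contents': 'The base image'})
contents_b = _get_full_contents({'id': 'baseid', 'contents': 'The base image'},
                                additional_fields=True)
assert hashlib.sha256(contents_a).hexdigest() != hashlib.sha256(contents_b).hexdigest()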
@@ -240,10 +247,10 @@ class RegistryTestCaseMixin(LiveServerTestCase):
self.csrf_token = ''
self.csrf_token = self.conduct('GET', '/__test/csrf').text
def do_tag(self, namespace, repository, tag, image_id, expected_code=200):
def do_tag(self, namespace, repository, tag, image_id, expected_code=200, auth='sig'):
repo_name = _get_repo_name(namespace, repository)
self.conduct('PUT', '/v1/repositories/%s/tags/%s' % (repo_name, tag),
data='"%s"' % image_id, expected_code=expected_code, auth='sig')
data='"%s"' % image_id, expected_code=expected_code, auth=auth)
def conduct_api_login(self, username, password):
self.conduct('POST', '/api/v1/signin',
@@ -256,6 +263,13 @@ class RegistryTestCaseMixin(LiveServerTestCase):
data=json.dumps(dict(visibility=visibility)),
headers={'Content-Type': 'application/json'})
def assertContents(self, image_data, response):
if 'chunks' in image_data:
return
tar = tarfile.open(fileobj=StringIO(response.content))
self.assertEquals(tar.extractfile('contents').read(), image_data['contents'])
class BaseRegistryMixin(object):
def conduct(self, method, url, headers=None, data=None, auth=None, params=None, expected_code=200,
@@ -311,7 +325,10 @@ class V1RegistryMixin(BaseRegistryMixin):
class V1RegistryPushMixin(V1RegistryMixin):
def do_push(self, namespace, repository, username, password, images=None, expect_failure=None):
push_version = 'v1'
def do_push(self, namespace, repository, username, password, images=None, expect_failure=None,
munge_shas=False):
images = images or self._get_default_images()
auth = (username, password)
repo_name = _get_repo_name(namespace, repository)
@@ -328,7 +345,6 @@ class V1RegistryPushMixin(V1RegistryMixin):
if expected_code != 201:
return
last_image_id = None
for image_data in images:
image_id = image_data['id']
last_image_id = image_id
@@ -363,8 +379,10 @@ class V1RegistryPushMixin(V1RegistryMixin):
class V1RegistryPullMixin(V1RegistryMixin):
pull_version = 'v1'
def do_pull(self, namespace, repository, username=None, password='password', expect_failure=None,
images=None):
images=None, munge_shas=False):
images = images or self._get_default_images()
repo_name = _get_repo_name(namespace, repository)
@@ -377,27 +395,37 @@ class V1RegistryPullMixin(V1RegistryMixin):
prefix = '/v1/repositories/%s/' % repo_name
# GET /v1/repositories/{namespace}/{repository}/
# GET /v1/repositories/{namespace}/{repository}/images
expected_code = _get_expected_code(expect_failure, 1, 200)
self.conduct('GET', prefix + 'images', auth=auth, expected_code=expected_code)
if expected_code != 200:
return
# GET /v1/repositories/{namespace}/{repository}/
result = json.loads(self.conduct('GET', prefix + 'tags', auth='sig').text)
# GET /v1/repositories/{namespace}/{repository}/tags
tags_result = json.loads(self.conduct('GET', prefix + 'tags', auth='sig').text)
self.assertEquals(1, len(tags_result.values()))
self.assertEquals(len(images), len(result.values()))
# Ensure we do (or do not) have a matching image ID.
tag_image_id = tags_result['latest']
known_ids = [item['id'] for item in images]
for image_data in images:
image_id = image_data['id']
self.assertIn(image_id, result.values())
self.assertEquals(not munge_shas, tag_image_id in known_ids)
# Retrieve the ancestry of the tag image.
image_prefix = '/v1/images/%s/' % tag_image_id
ancestors = self.conduct('GET', image_prefix + 'ancestry', auth='sig').json()
for index, image_id in enumerate(ancestors):
# /v1/images/{imageID}/{ancestry, json, layer}
image_prefix = '/v1/images/%s/' % image_id
self.conduct('GET', image_prefix + 'ancestry', auth='sig')
self.conduct('GET', image_prefix + 'json', auth='sig')
self.conduct('GET', image_prefix + 'layer', auth='sig')
response = self.conduct('GET', image_prefix + 'json', auth='sig')
self.assertEquals(image_id, response.json()['id'])
response = self.conduct('GET', image_prefix + 'layer', auth='sig')
# Ensure we can parse the layer bytes and that they contain the contents.
self.assertContents(images[index], response)
class V2RegistryMixin(BaseRegistryMixin):
@@ -474,8 +502,11 @@ class V2RegistryMixin(BaseRegistryMixin):
class V2RegistryPushMixin(V2RegistryMixin):
push_version = 'v2'
def do_push(self, namespace, repository, username, password, images=None, tag_name=None,
cancel=False, invalid=False, expect_failure=None, scopes=None):
cancel=False, invalid=False, expect_failure=None, scopes=None,
munge_shas=False):
images = images or self._get_default_images()
repo_name = _get_repo_name(namespace, repository)
@@ -499,8 +530,7 @@ class V2RegistryPushMixin(V2RegistryMixin):
full_contents = {}
for image_data in images:
full_contents[image_data['id']] = _get_full_contents(image_data)
full_contents[image_data['id']] = _get_full_contents(image_data, additional_fields=munge_shas)
checksum = 'sha256:' + hashlib.sha256(full_contents[image_data['id']]).hexdigest()
if invalid:
checksum = 'sha256:' + hashlib.sha256('foobarbaz').hexdigest()
@@ -595,8 +625,10 @@ class V2RegistryPushMixin(V2RegistryMixin):
class V2RegistryPullMixin(V2RegistryMixin):
pull_version = 'v2'
def do_pull(self, namespace, repository, username=None, password='password', expect_failure=None,
manifest_id=None, images=None):
manifest_id=None, images=None, munge_shas=False):
images = images or self._get_default_images()
repo_name = _get_repo_name(namespace, repository)
@@ -630,12 +662,13 @@ class V2RegistryPullMixin(V2RegistryMixin):
# Verify the layers.
blobs = {}
for layer in manifest_data['fsLayers']:
for index, layer in enumerate(manifest_data['fsLayers']):
blob_id = layer['blobSum']
result = self.conduct('GET', '/v2/%s/blobs/%s' % (repo_name, blob_id),
expected_code=200, auth='jwt')
blobs[blob_id] = result.content
self.assertContents(images[index], result)
# Verify the V1 metadata is present for each expected image.
found_v1_layers = set()
@@ -645,7 +678,7 @@ class V2RegistryPullMixin(V2RegistryMixin):
found_v1_layers.add(v1_history['id'])
for image in images:
self.assertIn(image['id'], found_v1_layers)
self.assertEquals(not munge_shas, image['id'] in found_v1_layers)
return blobs
@@ -687,6 +720,35 @@ class V2RegistryLoginMixin(object):
class RegistryTestsMixin(object):
def test_push_same_ids_different_sha(self):
if self.push_version == 'v1':
# No SHAs to munge in V1.
return
images = [
{
'id': 'latestid',
'contents': 'the latest image',
'parent': 'baseid',
},
{
'id': 'baseid',
'contents': 'The base image',
}
]
# Push a new repository.
self.do_push('public', 'newrepo', 'public', 'password', images=images)
# Pull the repository.
self.do_pull('public', 'newrepo', 'public', 'password', images=images)
# Push the repository again, but with different SHAs.
self.do_push('public', 'newrepo', 'public', 'password', images=images, munge_shas=True)
# Pull the repository.
self.do_pull('public', 'newrepo', 'public', 'password', images=images, munge_shas=True)
def test_push_pull_logging(self):
# Push a new repository.
self.do_push('public', 'newrepo', 'public', 'password')
@@ -986,6 +1048,27 @@ class V2RegistryTests(V2RegistryPullMixin, V2RegistryPushMixin, RegistryTestsMix
RegistryTestCaseMixin, LiveServerTestCase):
""" Tests for V2 registry. """
def test_invalid_blob(self):
namespace = 'devtable'
repository = 'somerepo'
tag_name = 'sometag'
repo_name = _get_repo_name(namespace, repository)
self.v2_ping()
self.do_auth('devtable', 'password', namespace, repository, scopes=['push', 'pull'])
# Build a fake manifest.
builder = SignedManifestBuilder(namespace, repository, tag_name)
builder.add_layer('sha256:' + hashlib.sha256('invalid').hexdigest(), json.dumps({'id': 'foo'}))
manifest = builder.build(_JWK)
response = self.conduct('PUT', '/v2/%s/manifests/%s' % (repo_name, tag_name),
data=manifest.bytes, expected_code=404,
headers={'Content-Type': 'application/json'}, auth='jwt')
self.assertEquals('BLOB_UNKNOWN', response.json()['errors'][0]['code'])
def test_delete_manifest(self):
# Push a new repo with the latest tag.
(_, digest) = self.do_push('devtable', 'newrepo', 'devtable', 'password')
@@ -1213,6 +1296,16 @@ class V1PushV2PullRegistryTests(V2RegistryPullMixin, V1RegistryPushMixin, Regist
RegistryTestCaseMixin, LiveServerTestCase):
""" Tests for V1 push, V2 pull registry. """
def test_multiple_tag_with_pull(self):
""" Tagging the same exact V1 tag multiple times and then pulling with V2. """
images = self._get_default_images()
self.do_push('devtable', 'newrepo', 'devtable', 'password', images=images)
self.do_pull('devtable', 'newrepo', 'devtable', 'password', images=images)
self.do_tag('devtable', 'newrepo', 'latest', images[0]['id'], auth=('devtable', 'password'))
self.do_pull('devtable', 'newrepo', 'devtable', 'password', images=images)
class V1PullV2PushRegistryTests(V1RegistryPullMixin, V2RegistryPushMixin, RegistryTestsMixin,
RegistryTestCaseMixin, LiveServerTestCase):