V1 Docker ID <-> V2 layer SHA mismatch fix

Fix handling of V1 Docker ID <-> V2 layer SHA mismatches by dynamically rewriting the manifest to use newly synthesized IDs for all layers above the mismatch. Also adds a number of tests for this and other cases, fixes a bug around manifest digest uniqueness, and fixes the 5.5 migration for MySQL.
Joseph Schorr 2016-02-12 17:39:27 +02:00
parent 8b61c69dad
commit abd2e3c234
6 changed files with 240 additions and 53 deletions
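
The core of the fix is deterministic ID synthesis: when a pushed layer's blob digest no longer matches the storage recorded for its V1 Docker ID, a new ID is derived from the layer's V1 JSON plus the new digest, and every layer above the mismatch is rewritten to match. A minimal sketch of the derivation used in _write_manifest below, with illustrative values:

import hashlib

v1_metadata_str = '{"id": "baseid", ...}'  # layer's V1 compatibility JSON (illustrative)
digest_str = 'sha256:0123abcd...'          # the mismatched blob digest (illustrative)

# The same (metadata, digest) pair always yields the same synthetic ID, so
# repeated pushes of the same content converge on one rewritten manifest.
new_synthetic_id = hashlib.sha256(v1_metadata_str + '@' + digest_str).hexdigest()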


@@ -642,7 +642,7 @@ class RepositoryTag(BaseModel):
class TagManifest(BaseModel):
tag = ForeignKeyField(RepositoryTag, index=True, unique=True)
digest = CharField(index=True, unique=True)
digest = CharField(index=True)
json_data = TextField()
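
Dropping unique=True matters because the same manifest bytes, and therefore the same digest, can legitimately be stored for more than one tag. A hypothetical illustration of the failure under the old constraint:

# Hypothetical: two tags reference identical manifest content.
TagManifest.create(tag=tag_a, digest='sha256:abc...', json_data=manifest_bytes)
TagManifest.create(tag=tag_b, digest='sha256:abc...', json_data=manifest_bytes)
# Under the old unique index on digest, the second create raises IntegrityError.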


@@ -16,7 +16,7 @@ from datetime import datetime
def upgrade(tables):
### commands auto generated by Alembic - please adjust! ###
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
now = datetime.now().strftime("'%Y-%m-%d %H:%M:%S'")
op.add_column('blobupload', sa.Column('created', sa.DateTime(), nullable=False, server_default=sa.text(now)))
op.create_index('blobupload_created', 'blobupload', ['created'], unique=False)
### end Alembic commands ###
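
The MySQL 5.5 fix is the quoting change above: sa.text() emits its argument verbatim, so the timestamp must already carry SQL quotes to render as a string literal. A small sketch of the difference, assuming a MySQL target:

from datetime import datetime
import sqlalchemy as sa

now = datetime.now().strftime("'%Y-%m-%d %H:%M:%S'")  # note the embedded single quotes
sa.Column('created', sa.DateTime(), nullable=False, server_default=sa.text(now))
# Renders roughly as: created DATETIME NOT NULL DEFAULT '2016-02-12 17:22:48'
# Without the embedded quotes the DEFAULT is emitted bare and the DDL fails.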


@@ -0,0 +1,28 @@
"""Remove uniqueness constraint on the TagManifest digest column
Revision ID: e4129c93e477
Revises: 956a0833223
Create Date: 2016-02-12 17:22:48.039791
"""
# revision identifiers, used by Alembic.
revision = 'e4129c93e477'
down_revision = '956a0833223'
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import mysql
def upgrade(tables):
### commands auto generated by Alembic - please adjust! ###
op.drop_index('tagmanifest_digest', table_name='tagmanifest')
op.create_index('tagmanifest_digest', 'tagmanifest', ['digest'], unique=False)
### end Alembic commands ###
def downgrade(tables):
### commands auto generated by Alembic - please adjust! ###
op.drop_index('tagmanifest_digest', table_name='tagmanifest')
op.create_index('tagmanifest_digest', 'tagmanifest', ['digest'], unique=True)
### end Alembic commands ###


@@ -2,7 +2,7 @@ import logging
import dateutil.parser
import random
from peewee import JOIN_LEFT_OUTER, SQL
from peewee import JOIN_LEFT_OUTER, IntegrityError
from datetime import datetime
from data.model import (DataModelException, db_transaction, _basequery, storage,
@@ -405,10 +405,13 @@ def synthesize_v1_image(repo, image_storage, docker_image_id, created_date_str,
aggregate_size = _basequery.calculate_image_aggregate_size(ancestors, image_storage.image_size,
parent_image)
return Image.create(docker_image_id=docker_image_id, ancestors=ancestors, comment=comment,
command=command, v1_json_metadata=v1_json_metadata, created=created,
storage=image_storage, repository=repo, parent=parent_image,
aggregate_size=aggregate_size)
try:
return Image.create(docker_image_id=docker_image_id, ancestors=ancestors, comment=comment,
command=command, v1_json_metadata=v1_json_metadata, created=created,
storage=image_storage, repository=repo, parent=parent_image,
aggregate_size=aggregate_size)
except IntegrityError:
return Image.get(docker_image_id=docker_image_id, repository=repo)
def ensure_image_locations(*names):
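
The try/except above turns Image.create into a get-or-create: two concurrent pushes of the same manifest can both decide a layer is missing and race to insert it. A sketch of the interleaving the except branch absorbs:

# Request A                           | Request B
# ------------------------------------|------------------------------------
# sees no Image row                   | sees no Image row
# Image.create(...) inserts the row   | Image.create(...) -> IntegrityError
#                                     | Image.get(...) returns A's row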


@@ -2,9 +2,10 @@ import logging
import jwt.utils
import json
import features
import hashlib
from peewee import IntegrityError
from flask import make_response, request, url_for, abort
from flask import make_response, request, url_for
from collections import namedtuple, OrderedDict
from jwkest.jws import SIGNER_ALGS, keyrep
from datetime import datetime
@@ -126,7 +127,8 @@ class SignedManifest(object):
try:
image_digest = digest_tools.Digest.parse_digest(blob_sum_obj[_BLOB_SUM_KEY])
except digest_tools.InvalidDigestException:
raise ManifestInvalid()
err_message = 'could not parse manifest digest: %s' % blob_sum_obj[_BLOB_SUM_KEY]
raise ManifestInvalid(detail={'message': err_message})
metadata_string = history_obj[_V1_COMPAT_KEY]
@@ -134,6 +136,9 @@ class SignedManifest(object):
command_list = v1_metadata.get('container_config', {}).get('Cmd', None)
command = json.dumps(command_list) if command_list else None
if 'id' not in v1_metadata:
raise ManifestInvalid(detail={'message': 'invalid manifest v1 history'})
extracted = ExtractedV1Metadata(v1_metadata['id'], v1_metadata.get('parent'),
v1_metadata.get('created'), v1_metadata.get('comment'),
command)
@@ -180,6 +185,14 @@ class SignedManifestBuilder(object):
_V1_COMPAT_KEY: v1_json_metadata,
})
def add_top_layer(self, layer_digest, v1_json_metadata):
self._fs_layer_digests.insert(0, {
_BLOB_SUM_KEY: layer_digest,
})
self._history.insert(0, {
_V1_COMPAT_KEY: v1_json_metadata,
})
def build(self, json_web_key):
""" Build the payload and sign it, returning a SignedManifest object.
"""
@@ -301,8 +314,8 @@ def _reject_manifest2_schema2(func):
def write_manifest_by_tagname(namespace, repo_name, manifest_ref):
try:
manifest = SignedManifest(request.data)
except ValueError as ve:
raise ManifestInvalid()
except ValueError:
raise ManifestInvalid(detail={'message': 'could not parse manifest'})
if manifest.tag != manifest_ref:
raise TagInvalid()
@@ -320,14 +333,29 @@ def write_manifest_by_digest(namespace, repo_name, manifest_ref):
try:
manifest = SignedManifest(request.data)
except ValueError:
raise ManifestInvalid()
raise ManifestInvalid(detail={'message': 'could not parse manifest'})
if manifest.digest != manifest_ref:
raise ManifestInvalid()
raise ManifestInvalid(detail={'message': 'manifest digest mismatch'})
return _write_manifest(namespace, repo_name, manifest)
def _updated_v1_metadata(v1_metadata_json, updated_id_map):
parsed = json.loads(v1_metadata_json)
parsed['id'] = updated_id_map[parsed['id']]
if parsed.get('parent'):
parsed['parent'] = updated_id_map[parsed['parent']]
if parsed.get('container_config', {}).get('Image'):
existing_image = parsed['container_config']['Image']
if existing_image in updated_id_map:
parsed['container_config']['Image'] = updated_id_map[existing_image]
return json.dumps(parsed)
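
The rewrite touches the three places a V1 ID can appear in the compatibility JSON: the layer's own id, its parent pointer, and container_config.Image. A worked example with hypothetical IDs:

old = json.dumps({'id': 'baseid', 'parent': 'rootid',
                  'container_config': {'Image': 'rootid'}})
updated = _updated_v1_metadata(old, {'baseid': 'new-base', 'rootid': 'new-root'})
# json.loads(updated) == {'id': 'new-base', 'parent': 'new-root',
#                         'container_config': {'Image': 'new-root'}}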
def _write_manifest(namespace, repo_name, manifest):
# Ensure that the manifest is for this repository. If the manifest's namespace is empty, then
# it is for the library namespace and we need an extra check.
@@ -363,22 +391,56 @@ def _write_manifest(namespace, repo_name, manifest):
storage_query = model.storage.lookup_repo_storages_by_content_checksum(repo, checksums)
storage_map = {storage.content_checksum: storage for storage in storage_query}
# Synthesize the V1 metadata for each layer.
manifest_digest = manifest.digest
# Ensure that we have valid V1 Docker IDs. If Docker gives us a V1 layer ID pointing to
# a storage whose content checksum differs from the existing one, then we need to rewrite
# the Docker ID to ensure consistency.
tag_name = manifest.tag
updated_manifest_builder = SignedManifestBuilder(namespace, repo_name, tag_name)
has_updated_manifest = False
updated_id_map = {}
for mdata in layers:
digest_str = str(mdata.digest)
v1_mdata = mdata.v1_metadata
updated_id_map[v1_mdata.docker_id] = v1_mdata.docker_id
# Ensure that all blobs exist.
blob_storage = storage_map.get(digest_str)
if blob_storage is None:
raise BlobUnknown(detail={'digest': digest_str})
# If there is already a V1 image for this layer, nothing more to do.
if v1_mdata.docker_id in images_map:
# Ensure that the V1 image's storage matches the V2 blob. If not, we've found
# a data inconsistency and need to fail.
# a data inconsistency and need to create a new layer ID for the V1 image.
v1_image = images_map[v1_mdata.docker_id]
if v1_image.storage.content_checksum != digest_str:
logger.error('Checksum mismatch on V1 layer %s (#%s): Expected digest %s, found %s',
v1_mdata.docker_id, v1_image.id, digest_str, v1_image.storage.content_checksum)
if has_updated_manifest or v1_image.storage.content_checksum != digest_str:
new_synthetic_id = hashlib.sha256(mdata.v1_metadata_str + '@' + digest_str).hexdigest()
logger.debug('Got content mismatch for layer %s under repo %s/%s. New ID: %s',
v1_mdata.docker_id, namespace, repo_name, new_synthetic_id)
updated_id_map[v1_mdata.docker_id] = new_synthetic_id
has_updated_manifest = True
# Update the manifest with the new ID (if any).
v1_metadata_json = mdata.v1_metadata_str
if has_updated_manifest:
v1_metadata_json = _updated_v1_metadata(mdata.v1_metadata_str, updated_id_map)
updated_manifest_builder.add_top_layer(digest_str, v1_metadata_json)
# If the manifest was changed due to an updated layer ID, then create a new manifest
# based on the updated data.
if has_updated_manifest:
manifest = updated_manifest_builder.build(docker_v2_signing_key)
layers = list(manifest.layers)
# Synthesize the V1 metadata for each layer.
for mdata in layers:
v1_mdata = mdata.v1_metadata
# If the layer with the V1 id already exists, then nothing more to do. We've ensured
# it points to the correct content SHA above.
if v1_mdata.docker_id in images_map:
continue
# Lookup the parent image for the layer, if any.
@@ -390,10 +452,8 @@ def _write_manifest(namespace, repo_name, manifest):
raise ManifestInvalid(detail={'message': msg})
# Synthesize and store the v1 metadata in the db.
blob_storage = storage_map.get(digest_str)
if blob_storage is None:
raise BlobUnknown(detail={'digest': digest_str})
digest_str = str(mdata.digest)
blob_storage = storage_map[digest_str]
image = model.image.synthesize_v1_image(repo, blob_storage, v1_mdata.docker_id,
v1_mdata.created, v1_mdata.comment, v1_mdata.command,
mdata.v1_metadata_str, parent_image)
@@ -405,9 +465,10 @@ def _write_manifest(namespace, repo_name, manifest):
raise ManifestInvalid(detail={'message': 'manifest does not reference any layers'})
# Store the manifest pointing to the tag.
manifest_digest = manifest.digest
leaf_layer = layers[-1]
model.tag.store_tag_manifest(namespace, repo_name, tag_name, leaf_layer.v1_metadata.docker_id,
manifest_digest, request.data)
manifest_digest, manifest.bytes)
# Spawn the repo_push event.
event_data = {
@@ -481,9 +542,11 @@ def _generate_and_store_manifest(namespace, repo_name, tag_name):
try:
return model.tag.associate_generated_tag_manifest(namespace, repo_name, tag_name,
manifest.digest, manifest.bytes)
except IntegrityError:
except IntegrityError as ie:
logger.debug('Got integrity error: %s', ie)
try:
return model.tag.load_tag_manifest(namespace, repo_name, tag_name)
except model.InvalidManifestException:
logger.exception('Exception when generating manifest')
raise model.DataModelException('Could not load or generate manifest')


@@ -6,6 +6,7 @@ import random
import string
import resumablehashlib
import binascii
import uuid
import Crypto.Random
from cachetools import lru_cache
@@ -166,23 +167,29 @@ def _get_repo_name(namespace, name):
return '%s/%s' % (namespace, name)
def _get_full_contents(image_data):
def _get_full_contents(image_data, additional_fields=False):
if 'chunks' in image_data:
# Data is just for chunking; no need for a real TAR.
return image_data['contents']
layer_data = StringIO()
tar_file_info = tarfile.TarInfo(name='contents')
tar_file_info.type = tarfile.REGTYPE
tar_file_info.size = len(image_data['contents'])
def add_file(name, contents):
tar_file_info = tarfile.TarInfo(name=name)
tar_file_info.type = tarfile.REGTYPE
tar_file_info.size = len(contents)
tar_file = tarfile.open(fileobj=layer_data, mode='w|gz')
tar_file.addfile(tar_file_info, StringIO(image_data['contents']))
tar_file.close()
tar_file = tarfile.open(fileobj=layer_data, mode='w|gz')
tar_file.addfile(tar_file_info, StringIO(contents))
tar_file.close()
add_file('contents', image_data['contents'])
if additional_fields:
add_file('anotherfile', str(uuid.uuid4()))
layer_bytes = layer_data.getvalue()
layer_data.close()
return layer_bytes
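
The additional_fields flag exists to force the mismatch the new tests exercise: appending a random extra file changes the layer bytes, so the content SHA changes while the Docker V1 ID in the test data stays the same. For example:

# Same image ID, different bytes -> different sha256, exercising the rewrite path.
contents_a = _get_full_contents({'id': 'baseid', 'contents': 'The base image'})
contents_b = _get_full_contents({'id': 'baseid', 'contents': 'The base image'},
                                additional_fields=True)
assert hashlib.sha256(contents_a).hexdigest() != hashlib.sha256(contents_b).hexdigest()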
@@ -240,10 +247,10 @@ class RegistryTestCaseMixin(LiveServerTestCase):
self.csrf_token = ''
self.csrf_token = self.conduct('GET', '/__test/csrf').text
def do_tag(self, namespace, repository, tag, image_id, expected_code=200):
def do_tag(self, namespace, repository, tag, image_id, expected_code=200, auth='sig'):
repo_name = _get_repo_name(namespace, repository)
self.conduct('PUT', '/v1/repositories/%s/tags/%s' % (repo_name, tag),
data='"%s"' % image_id, expected_code=expected_code, auth='sig')
data='"%s"' % image_id, expected_code=expected_code, auth=auth)
def conduct_api_login(self, username, password):
self.conduct('POST', '/api/v1/signin',
@@ -256,6 +263,13 @@ class RegistryTestCaseMixin(LiveServerTestCase):
data=json.dumps(dict(visibility=visibility)),
headers={'Content-Type': 'application/json'})
def assertContents(self, image_data, response):
if 'chunks' in image_data:
return
tar = tarfile.open(fileobj=StringIO(response.content))
self.assertEquals(tar.extractfile('contents').read(), image_data['contents'])
class BaseRegistryMixin(object):
def conduct(self, method, url, headers=None, data=None, auth=None, params=None, expected_code=200,
@@ -311,7 +325,10 @@ class V1RegistryMixin(BaseRegistryMixin):
class V1RegistryPushMixin(V1RegistryMixin):
def do_push(self, namespace, repository, username, password, images=None, expect_failure=None):
push_version = 'v1'
def do_push(self, namespace, repository, username, password, images=None, expect_failure=None,
munge_shas=False):
images = images or self._get_default_images()
auth = (username, password)
repo_name = _get_repo_name(namespace, repository)
@@ -328,7 +345,6 @@ class V1RegistryPushMixin(V1RegistryMixin):
if expected_code != 201:
return
last_image_id = None
for image_data in images:
image_id = image_data['id']
last_image_id = image_id
@@ -363,8 +379,10 @@ class V1RegistryPushMixin(V1RegistryMixin):
class V1RegistryPullMixin(V1RegistryMixin):
pull_version = 'v1'
def do_pull(self, namespace, repository, username=None, password='password', expect_failure=None,
images=None):
images=None, munge_shas=False):
images = images or self._get_default_images()
repo_name = _get_repo_name(namespace, repository)
@@ -377,27 +395,37 @@ class V1RegistryPullMixin(V1RegistryMixin):
prefix = '/v1/repositories/%s/' % repo_name
# GET /v1/repositories/{namespace}/{repository}/
# GET /v1/repositories/{namespace}/{repository}/images
expected_code = _get_expected_code(expect_failure, 1, 200)
self.conduct('GET', prefix + 'images', auth=auth, expected_code=expected_code)
if expected_code != 200:
return
# GET /v1/repositories/{namespace}/{repository}/
result = json.loads(self.conduct('GET', prefix + 'tags', auth='sig').text)
# GET /v1/repositories/{namespace}/{repository}/tags
tags_result = json.loads(self.conduct('GET', prefix + 'tags', auth='sig').text)
self.assertEquals(1, len(tags_result.values()))
self.assertEquals(len(images), len(result.values()))
# Ensure we do (or do not) have a matching image ID.
tag_image_id = tags_result['latest']
known_ids = [item['id'] for item in images]
for image_data in images:
image_id = image_data['id']
self.assertIn(image_id, result.values())
self.assertEquals(not munge_shas, tag_image_id in known_ids)
# Retrieve the ancestry of the tag image.
image_prefix = '/v1/images/%s/' % tag_image_id
ancestors = self.conduct('GET', image_prefix + 'ancestry', auth='sig').json()
for index, image_id in enumerate(ancestors):
# /v1/images/{imageID}/{ancestry, json, layer}
image_prefix = '/v1/images/%s/' % image_id
self.conduct('GET', image_prefix + 'ancestry', auth='sig')
self.conduct('GET', image_prefix + 'json', auth='sig')
self.conduct('GET', image_prefix + 'layer', auth='sig')
response = self.conduct('GET', image_prefix + 'json', auth='sig')
self.assertEquals(image_id, response.json()['id'])
response = self.conduct('GET', image_prefix + 'layer', auth='sig')
# Ensure we can parse the layer bytes and that they contain the contents.
self.assertContents(images[index], response)
class V2RegistryMixin(BaseRegistryMixin):
@@ -474,8 +502,11 @@ class V2RegistryMixin(BaseRegistryMixin):
class V2RegistryPushMixin(V2RegistryMixin):
push_version = 'v2'
def do_push(self, namespace, repository, username, password, images=None, tag_name=None,
cancel=False, invalid=False, expect_failure=None, scopes=None):
cancel=False, invalid=False, expect_failure=None, scopes=None,
munge_shas=False):
images = images or self._get_default_images()
repo_name = _get_repo_name(namespace, repository)
@@ -499,8 +530,7 @@ class V2RegistryPushMixin(V2RegistryMixin):
full_contents = {}
for image_data in images:
full_contents[image_data['id']] = _get_full_contents(image_data)
full_contents[image_data['id']] = _get_full_contents(image_data, additional_fields=munge_shas)
checksum = 'sha256:' + hashlib.sha256(full_contents[image_data['id']]).hexdigest()
if invalid:
checksum = 'sha256:' + hashlib.sha256('foobarbaz').hexdigest()
@@ -595,8 +625,10 @@ class V2RegistryPushMixin(V2RegistryMixin):
class V2RegistryPullMixin(V2RegistryMixin):
pull_version = 'v2'
def do_pull(self, namespace, repository, username=None, password='password', expect_failure=None,
manifest_id=None, images=None):
manifest_id=None, images=None, munge_shas=False):
images = images or self._get_default_images()
repo_name = _get_repo_name(namespace, repository)
@@ -630,12 +662,13 @@ class V2RegistryPullMixin(V2RegistryMixin):
# Verify the layers.
blobs = {}
for layer in manifest_data['fsLayers']:
for index, layer in enumerate(manifest_data['fsLayers']):
blob_id = layer['blobSum']
result = self.conduct('GET', '/v2/%s/blobs/%s' % (repo_name, blob_id),
expected_code=200, auth='jwt')
blobs[blob_id] = result.content
self.assertContents(images[index], result)
# Verify the V1 metadata is present for each expected image.
found_v1_layers = set()
@@ -645,7 +678,7 @@ class V2RegistryPullMixin(V2RegistryMixin):
found_v1_layers.add(v1_history['id'])
for image in images:
self.assertIn(image['id'], found_v1_layers)
self.assertEquals(not munge_shas, image['id'] in found_v1_layers)
return blobs
@@ -687,6 +720,35 @@ class V2RegistryLoginMixin(object):
class RegistryTestsMixin(object):
def test_push_same_ids_different_sha(self):
if self.push_version == 'v1':
# No SHAs to munge in V1.
return
images = [
{
'id': 'latestid',
'contents': 'the latest image',
'parent': 'baseid',
},
{
'id': 'baseid',
'contents': 'The base image',
}
]
# Push a new repository.
self.do_push('public', 'newrepo', 'public', 'password', images=images)
# Pull the repository.
self.do_pull('public', 'newrepo', 'public', 'password', images=images)
# Push the repository again, but with different SHAs.
self.do_push('public', 'newrepo', 'public', 'password', images=images, munge_shas=True)
# Pull the repository.
self.do_pull('public', 'newrepo', 'public', 'password', images=images, munge_shas=True)
def test_push_pull_logging(self):
# Push a new repository.
self.do_push('public', 'newrepo', 'public', 'password')
@@ -986,6 +1048,27 @@ class V2RegistryTests(V2RegistryPullMixin, V2RegistryPushMixin, RegistryTestsMix
RegistryTestCaseMixin, LiveServerTestCase):
""" Tests for V2 registry. """
def test_invalid_blob(self):
namespace = 'devtable'
repository = 'somerepo'
tag_name = 'sometag'
repo_name = _get_repo_name(namespace, repository)
self.v2_ping()
self.do_auth('devtable', 'password', namespace, repository, scopes=['push', 'pull'])
# Build a fake manifest.
builder = SignedManifestBuilder(namespace, repository, tag_name)
builder.add_layer('sha256:' + hashlib.sha256('invalid').hexdigest(), json.dumps({'id': 'foo'}))
manifest = builder.build(_JWK)
response = self.conduct('PUT', '/v2/%s/manifests/%s' % (repo_name, tag_name),
data=manifest.bytes, expected_code=404,
headers={'Content-Type': 'application/json'}, auth='jwt')
self.assertEquals('BLOB_UNKNOWN', response.json()['errors'][0]['code'])
def test_delete_manifest(self):
# Push a new repo with the latest tag.
(_, digest) = self.do_push('devtable', 'newrepo', 'devtable', 'password')
@@ -1213,6 +1296,16 @@ class V1PushV2PullRegistryTests(V2RegistryPullMixin, V1RegistryPushMixin, Regist
RegistryTestCaseMixin, LiveServerTestCase):
""" Tests for V1 push, V2 pull registry. """
def test_multiple_tag_with_pull(self):
""" Tagging the same exact V1 tag multiple times and then pulling with V2. """
images = self._get_default_images()
self.do_push('devtable', 'newrepo', 'devtable', 'password', images=images)
self.do_pull('devtable', 'newrepo', 'devtable', 'password', images=images)
self.do_tag('devtable', 'newrepo', 'latest', images[0]['id'], auth=('devtable', 'password'))
self.do_pull('devtable', 'newrepo', 'devtable', 'password', images=images)
class V1PullV2PushRegistryTests(V1RegistryPullMixin, V2RegistryPushMixin, RegistryTestsMixin,
RegistryTestCaseMixin, LiveServerTestCase):