From e30b746aeffee4968105a317f37b2f00b914797b Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Sun, 19 Aug 2018 23:50:18 -0400 Subject: [PATCH 1/3] Fix TagManifests with shared digests under the same repository. TagManifests can (apparently, in very rare scenarios) share manifests with the exact same digests, so we need to support that case in the backfill worker. We also need to remove a unique constraint on the manifest column in the mapping table to support this case. --- data/database.py | 2 +- ...emove_unique_from_tagmanifesttomanifest.py | 43 +++++++++++++++++++ workers/manifestbackfillworker.py | 21 +++++++-- workers/test/test_manifestbackfillworker.py | 30 +++++++++++++ 4 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 data/migrations/versions/13411de1c0ff_remove_unique_from_tagmanifesttomanifest.py diff --git a/data/database.py b/data/database.py index 866542c2b..6eb320400 100644 --- a/data/database.py +++ b/data/database.py @@ -1435,7 +1435,7 @@ class TagManifest(BaseModel): class TagManifestToManifest(BaseModel): """ NOTE: Only used for the duration of the migrations. """ tag_manifest = ForeignKeyField(TagManifest, index=True, unique=True) - manifest = ForeignKeyField(Manifest, index=True, unique=True) + manifest = ForeignKeyField(Manifest, index=True) broken = BooleanField(index=True, default=False) diff --git a/data/migrations/versions/13411de1c0ff_remove_unique_from_tagmanifesttomanifest.py b/data/migrations/versions/13411de1c0ff_remove_unique_from_tagmanifesttomanifest.py new file mode 100644 index 000000000..6aa576e31 --- /dev/null +++ b/data/migrations/versions/13411de1c0ff_remove_unique_from_tagmanifesttomanifest.py @@ -0,0 +1,43 @@ +"""Remove unique from TagManifestToManifest + +Revision ID: 13411de1c0ff +Revises: 654e6df88b71 +Create Date: 2018-08-19 23:30:24.969549 + +""" + +# revision identifiers, used by Alembic. +revision = '13411de1c0ff' +down_revision = '654e6df88b71' + +from alembic import op +import sqlalchemy as sa + +def upgrade(tables, tester): + # Note: Because of a restriction in MySQL, we cannot simply remove the index and re-add + # it without the unique=False, nor can we simply alter the index. To make it work, we'd have to + # remove the primary key on the field, so instead we simply drop the table entirely and + # recreate it with the modified index. The backfill will re-fill this in. + op.drop_table('tagmanifesttomanifest') + + op.create_table('tagmanifesttomanifest', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('tag_manifest_id', sa.Integer(), nullable=False), + sa.Column('manifest_id', sa.Integer(), nullable=False), + sa.Column('broken', sa.Boolean(), nullable=False, server_default=sa.sql.expression.false()), + sa.ForeignKeyConstraint(['manifest_id'], ['manifest.id'], name=op.f('fk_tagmanifesttomanifest_manifest_id_manifest')), + sa.ForeignKeyConstraint(['tag_manifest_id'], ['tagmanifest.id'], name=op.f('fk_tagmanifesttomanifest_tag_manifest_id_tagmanifest')), + sa.PrimaryKeyConstraint('id', name=op.f('pk_tagmanifesttomanifest')) + ) + op.create_index('tagmanifesttomanifest_broken', 'tagmanifesttomanifest', ['broken'], unique=False) + op.create_index('tagmanifesttomanifest_manifest_id', 'tagmanifesttomanifest', ['manifest_id'], unique=False) + op.create_index('tagmanifesttomanifest_tag_manifest_id', 'tagmanifesttomanifest', ['tag_manifest_id'], unique=True) + + tester.populate_table('tagmanifesttomanifest', [ + ('manifest_id', tester.TestDataType.Foreign('manifest')), + ('tag_manifest_id', tester.TestDataType.Foreign('tagmanifest')), + ]) + + +def downgrade(tables, tester): + pass diff --git a/workers/manifestbackfillworker.py b/workers/manifestbackfillworker.py index 3a6aab4c6..19139cd38 100644 --- a/workers/manifestbackfillworker.py +++ b/workers/manifestbackfillworker.py @@ -5,7 +5,7 @@ from peewee import JOIN, fn, IntegrityError from app import app from data.database import (UseThenDisconnect, TagManifest, TagManifestToManifest, Image, - db_transaction) + Manifest, db_transaction) from data.model.image import get_parent_images from data.model.tag import populate_manifest from data.model.blob import get_repo_blob_by_digest, BlobDoesNotExist @@ -158,9 +158,22 @@ def backfill_manifest(tag_manifest): except TagManifest.DoesNotExist: return True - # Create the new-style rows for the manifest. - manifest_row = populate_manifest(tag_manifest.tag.repository, manifest, - tag_manifest.tag.image, storage_ids) + # Ensure it wasn't already created. + if lookup_map_row(tag_manifest): + return False + + # Check for a pre-existing manifest matching the digest in the repository. This can happen + # if we've already created the manifest row (typically for tag reverision). + try: + manifest_row = Manifest.get(digest=manifest.digest, repository=tag_manifest.tag.repository) + except Manifest.DoesNotExist: + # Create the new-style rows for the manifest. + try: + manifest_row = populate_manifest(tag_manifest.tag.repository, manifest, + tag_manifest.tag.image, storage_ids) + except IntegrityError: + # Pre-empted. + return False # Create the mapping row. If we find another was created for this tag manifest in the # meantime, then we've been preempted. diff --git a/workers/test/test_manifestbackfillworker.py b/workers/test/test_manifestbackfillworker.py index 0a63325de..4d8a9182e 100644 --- a/workers/test/test_manifestbackfillworker.py +++ b/workers/test/test_manifestbackfillworker.py @@ -138,3 +138,33 @@ def test_manifestbackfillworker_mislinked_invalid_manifest(clear_rows, initializ manifest_blobs = list(ManifestBlob.select().where(ManifestBlob.manifest == manifest_row)) assert len(manifest_blobs) == 0 + + +def test_manifestbackfillworker_repeat_digest(clear_rows, initialized_db): + """ Tests that a manifest with a shared digest will be properly linked. """ + # Delete existing tag manifest so we can reuse the tag. + TagManifestLabel.delete().execute() + TagManifest.delete().execute() + + repo = model.repository.get_repository('devtable', 'gargantuan') + tag_v30 = model.tag.get_active_tag('devtable', 'gargantuan', 'v3.0') + tag_v50 = model.tag.get_active_tag('devtable', 'gargantuan', 'v5.0') + + # Build a manifest and assign it to both tags (this is allowed in the old model). + builder = DockerSchema1ManifestBuilder('devtable', 'gargantuan', 'sometag') + builder.add_layer('sha256:deadbeef', '{"id": "foo"}') + manifest = builder.build(docker_v2_signing_key) + + manifest_1 = TagManifest.create(json_data=manifest.bytes, digest=manifest.digest, + tag=tag_v30) + manifest_2 = TagManifest.create(json_data=manifest.bytes, digest=manifest.digest, + tag=tag_v50) + + # Backfill "both" manifests and ensure both are pointed to by a single resulting row. + assert backfill_manifest(manifest_1) + assert backfill_manifest(manifest_2) + + map_row1 = TagManifestToManifest.get(tag_manifest=manifest_1) + map_row2 = TagManifestToManifest.get(tag_manifest=manifest_2) + + assert map_row1.manifest == map_row2.manifest From ed897626a2fcac4c3956fb46a72def4317b2d5d2 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Mon, 20 Aug 2018 00:00:16 -0400 Subject: [PATCH 2/3] Handle data model exceptions when loading parent images in manifest backfill --- workers/manifestbackfillworker.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/workers/manifestbackfillworker.py b/workers/manifestbackfillworker.py index 19139cd38..1d9be2992 100644 --- a/workers/manifestbackfillworker.py +++ b/workers/manifestbackfillworker.py @@ -6,6 +6,7 @@ from peewee import JOIN, fn, IntegrityError from app import app from data.database import (UseThenDisconnect, TagManifest, TagManifestToManifest, Image, Manifest, db_transaction) +from data.model import DataModelException from data.model.image import get_parent_images from data.model.tag import populate_manifest from data.model.blob import get_repo_blob_by_digest, BlobDoesNotExist @@ -129,7 +130,16 @@ def backfill_manifest(tag_manifest): repository = tag_manifest.tag.repository image_storage_id_map = {root_image.storage.content_checksum: root_image.storage.id} - parent_images = get_parent_images(repository.namespace_user.username, repository.name, root_image) + + try: + parent_images = get_parent_images(repository.namespace_user.username, repository.name, + root_image) + except DataModelException: + logger.exception('Exception when trying to load parent images for manifest `%s`', + tag_manifest.id) + parent_images = {} + is_broken = True + for parent_image in parent_images: image_storage_id_map[parent_image.storage.content_checksum] = parent_image.storage.id From 301532279cb1097441eca4fc9146aa2ca4412f99 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Mon, 20 Aug 2018 11:36:04 -0400 Subject: [PATCH 3/3] Fix broken registry data interface tests --- data/registry_model/test/test_pre_oci_model.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/data/registry_model/test/test_pre_oci_model.py b/data/registry_model/test/test_pre_oci_model.py index 3d7140475..6e81c3fff 100644 --- a/data/registry_model/test/test_pre_oci_model.py +++ b/data/registry_model/test/test_pre_oci_model.py @@ -12,9 +12,10 @@ def pre_oci_model(initialized_db): @pytest.mark.parametrize('names, expected', [ (['unknown'], None), - (['latest'], 'latest'), - (['latest', 'prod'], 'latest'), - (['foo', 'prod'], 'prod'), + (['latest'], {'latest'}), + (['latest', 'prod'], {'latest', 'prod'}), + (['latest', 'prod', 'another'], {'latest', 'prod'}), + (['foo', 'prod'], {'prod'}), ]) def test_find_matching_tag(names, expected, pre_oci_model): repo = model.repository.get_repository('devtable', 'simple') @@ -23,12 +24,12 @@ def test_find_matching_tag(names, expected, pre_oci_model): if expected is None: assert found is None else: - assert found.name == expected + assert found.name in expected @pytest.mark.parametrize('repo_namespace, repo_name, expected', [ - ('devtable', 'simple', 'latest'), - ('buynlarge', 'orgrepo', 'latest'), + ('devtable', 'simple', {'latest'}), + ('buynlarge', 'orgrepo', {'latest', 'prod'}), ]) def test_get_most_recent_tag(repo_namespace, repo_name, expected, pre_oci_model): repo = model.repository.get_repository(repo_namespace, repo_name) @@ -37,4 +38,4 @@ def test_get_most_recent_tag(repo_namespace, repo_name, expected, pre_oci_model) if expected is None: assert found is None else: - assert found.name == expected + assert found.name in expected