From e9a95874ee074ded0c4dd387305b33bfefcb1140 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 22 Jun 2017 16:24:38 -0400 Subject: [PATCH 1/4] Move GC tests into pytest --- data/model/test/test_gc.py | 615 +++++++++++++++++++++++++++++++++++++ test/test_gc.py | 585 ----------------------------------- 2 files changed, 615 insertions(+), 585 deletions(-) create mode 100644 data/model/test/test_gc.py delete mode 100644 test/test_gc.py diff --git a/data/model/test/test_gc.py b/data/model/test/test_gc.py new file mode 100644 index 000000000..2c1626c30 --- /dev/null +++ b/data/model/test/test_gc.py @@ -0,0 +1,615 @@ +import hashlib +import pytest +import time + +from mock import patch + +from app import storage +from contextlib import contextmanager +from playhouse.test_utils import assert_query_count + +from data import model, database +from data.database import Image, ImageStorage, DerivedStorageForImage, Label, TagManifestLabel, Blob +from test.fixtures import * + + +ADMIN_ACCESS_USER = 'devtable' +PUBLIC_USER = 'public' + +REPO = 'somerepo' + +def _set_tag_expiration_policy(namespace, expiration_s): + namespace_user = model.user.get_user(namespace) + model.user.change_user_tag_expiration(namespace_user, expiration_s) + + +@pytest.fixture() +def default_tag_policy(initialized_db): + _set_tag_expiration_policy(ADMIN_ACCESS_USER, 0) + _set_tag_expiration_policy(PUBLIC_USER, 0) + + +def create_image(docker_image_id, repository_obj, username): + preferred = storage.preferred_locations[0] + image = model.image.find_create_or_link_image(docker_image_id, repository_obj, username, {}, + preferred) + image.storage.uploading = False + image.storage.save() + + # Create derived images as well. + model.image.find_or_create_derived_storage(image, 'squash', preferred) + model.image.find_or_create_derived_storage(image, 'aci', preferred) + + # Add some torrent info. + try: + database.TorrentInfo.get(storage=image.storage) + except database.TorrentInfo.DoesNotExist: + model.storage.save_torrent_info(image.storage, 1, 'helloworld') + + # Add some additional placements to the image. + for location_name in ['local_eu']: + location = database.ImageStorageLocation.get(name=location_name) + + try: + database.ImageStoragePlacement.get(location=location, storage=image.storage) + except: + continue + + database.ImageStoragePlacement.create(location=location, storage=image.storage) + + return image.storage + + +def create_repository(namespace=ADMIN_ACCESS_USER, name=REPO, **kwargs): + user = model.user.get_user(namespace) + repo = model.repository.create_repository(namespace, name, user) + + # Populate the repository with the tags. + image_map = {} + for tag_name in kwargs: + image_ids = kwargs[tag_name] + parent = None + + for image_id in image_ids: + if not image_id in image_map: + image_map[image_id] = create_image(image_id, repo, namespace) + + v1_metadata = { + 'id': image_id, + } + if parent is not None: + v1_metadata['parent'] = parent.docker_image_id + + # Set the ancestors for the image. + parent = model.image.set_image_metadata(image_id, namespace, name, '', '', '', v1_metadata, + parent=parent) + + # Set the tag for the image. + tag_manifest, _ = model.tag.store_tag_manifest(namespace, name, tag_name, image_ids[-1], + 'sha:someshahere', '{}') + + # Add some labels to the tag. 
+ model.label.create_manifest_label(tag_manifest, 'foo', 'bar', 'manifest') + model.label.create_manifest_label(tag_manifest, 'meh', 'grah', 'manifest') + + return repo + + +def gc_now(repository): + assert model.repository.garbage_collect_repo(repository) + + +def delete_tag(repository, tag, perform_gc=True): + model.tag.delete_tag(repository.namespace_user.username, repository.name, tag) + if perform_gc: + assert model.repository.garbage_collect_repo(repository) + + +def move_tag(repository, tag, docker_image_id): + model.tag.create_or_update_tag(repository.namespace_user.username, repository.name, tag, + docker_image_id) + assert model.repository.garbage_collect_repo(repository) + + +def assert_not_deleted(repository, *args): + for docker_image_id in args: + assert model.image.get_image_by_id(repository.namespace_user.username, repository.name, + docker_image_id) + + +def assert_deleted(repository, *args): + for docker_image_id in args: + try: + # Verify the image is missing when accessed by the repository. + model.image.get_image_by_id(repository.namespace_user.username, repository.name, + docker_image_id) + except model.DataModelException: + return + + assert False, 'Expected image %s to be deleted' % docker_image_id + + +def _get_dangling_storage_count(): + storage_ids = set([current.id for current in ImageStorage.select()]) + referenced_by_image = set([image.storage_id for image in Image.select()]) + referenced_by_derived = set([derived.derivative_id + for derived in DerivedStorageForImage.select()]) + + return len(storage_ids - referenced_by_image - referenced_by_derived) + + +def _get_dangling_label_count(): + label_ids = set([current.id for current in Label.select()]) + referenced_by_manifest = set([mlabel.label_id for mlabel in TagManifestLabel.select()]) + return len(label_ids - referenced_by_manifest) + + +@contextmanager +def assert_gc_integrity(expect_storage_removed=True): + """ Specialized assertion for ensuring that GC cleans up all dangling storages + and labels, invokes the callback for images removed and doesn't invoke the + callback for images *not* removed. + """ + # Add a callback for when images are removed. + removed_image_storages = [] + model.config.register_image_cleanup_callback(removed_image_storages.extend) + + # Store the number of dangling storages and labels. + existing_storage_count = _get_dangling_storage_count() + existing_label_count = _get_dangling_label_count() + yield + + # Ensure the number of dangling storages and labels has not changed. + updated_storage_count = _get_dangling_storage_count() + assert updated_storage_count == existing_storage_count + + updated_label_count = _get_dangling_label_count() + assert updated_label_count == existing_label_count + + # Ensure that for each call to the image+storage cleanup callback, the image and its + # storage is not found *anywhere* in the database. + for removed_image_and_storage in removed_image_storages: + with pytest.raises(Image.DoesNotExist): + Image.get(id=removed_image_and_storage.id) + + with pytest.raises(ImageStorage.DoesNotExist): + ImageStorage.get(id=removed_image_and_storage.storage_id) + + with pytest.raises(ImageStorage.DoesNotExist): + ImageStorage.get(uuid=removed_image_and_storage.storage.uuid) + + assert expect_storage_removed == bool(removed_image_storages) + + # Ensure all CAS storage is in the storage engine. 
+ preferred = storage.preferred_locations[0] + for storage_row in ImageStorage.select(): + if storage_row.cas_path: + storage.get_content({preferred}, storage.blob_path(storage_row.content_checksum)) + + for blob_row in Blob.select(): + storage.get_content({preferred}, storage.blob_path(blob_row.digest)) + + +def test_has_garbage(default_tag_policy, initialized_db): + """ Remove all existing repositories, then add one without garbage, check, then add one with + garbage, and check again. + """ + # Delete all existing repos. + for repo in database.Repository.select().order_by(database.Repository.id): + assert model.repository.purge_repository(repo.namespace_user.username, repo.name) + + # Change the time machine expiration on the namespace. + (database.User + .update(removed_tag_expiration_s=1000000000) + .where(database.User.username == ADMIN_ACCESS_USER) + .execute()) + + # Create a repository without any garbage. + repository = create_repository(latest=['i1', 'i2', 'i3']) + + # Ensure that no repositories are returned by the has garbage check. + assert model.repository.find_repository_with_garbage(1000000000) is None + + # Delete a tag. + delete_tag(repository, 'latest', perform_gc=False) + + # There should still not be any repositories with garbage, due to time machine. + assert model.repository.find_repository_with_garbage(1000000000) is None + + # Change the time machine expiration on the namespace. + (database.User + .update(removed_tag_expiration_s=0) + .where(database.User.username == ADMIN_ACCESS_USER) + .execute()) + + # Now we should find the repository for GC. + repository = model.repository.find_repository_with_garbage(0) + assert repository is not None + assert repository.name == REPO + + # GC the repository. + assert model.repository.garbage_collect_repo(repository) + + # There should now be no repositories with garbage. + assert model.repository.find_repository_with_garbage(0) is None + + +def test_find_garbage_policy_functions(default_tag_policy, initialized_db): + with assert_query_count(1): + one_policy = model.repository.get_random_gc_policy() + all_policies = model.repository._get_gc_expiration_policies() + assert one_policy in all_policies + + +def test_one_tag(default_tag_policy, initialized_db): + """ Create a repository with a single tag, then remove that tag and verify that the repository + is now empty. """ + with assert_gc_integrity(): + repository = create_repository(latest=['i1', 'i2', 'i3']) + delete_tag(repository, 'latest') + assert_deleted(repository, 'i1', 'i2', 'i3') + + +def test_two_tags_unshared_images(default_tag_policy, initialized_db): + """ Repository has two tags with no shared images between them. """ + with assert_gc_integrity(): + repository = create_repository(latest=['i1', 'i2', 'i3'], other=['f1', 'f2']) + delete_tag(repository, 'latest') + assert_deleted(repository, 'i1', 'i2', 'i3') + assert_not_deleted(repository, 'f1', 'f2') + + +def test_two_tags_shared_images(default_tag_policy, initialized_db): + """ Repository has two tags with shared images. Deleting the tag should only remove the + unshared images. + """ + with assert_gc_integrity(): + repository = create_repository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) + delete_tag(repository, 'latest') + assert_deleted(repository, 'i2', 'i3') + assert_not_deleted(repository, 'i1', 'f1') + + +def test_unrelated_repositories(default_tag_policy, initialized_db): + """ Two repositories with different images. Removing the tag from one leaves the other's + images intact. 
+ """ + with assert_gc_integrity(): + repository1 = create_repository(latest=['i1', 'i2', 'i3'], name='repo1') + repository2 = create_repository(latest=['j1', 'j2', 'j3'], name='repo2') + + delete_tag(repository1, 'latest') + + assert_deleted(repository1, 'i1', 'i2', 'i3') + assert_not_deleted(repository2, 'j1', 'j2', 'j3') + + +def test_related_repositories(default_tag_policy, initialized_db): + """ Two repositories with shared images. Removing the tag from one leaves the other's + images intact. + """ + with assert_gc_integrity(): + repository1 = create_repository(latest=['i1', 'i2', 'i3'], name='repo1') + repository2 = create_repository(latest=['i1', 'i2', 'j1'], name='repo2') + + delete_tag(repository1, 'latest') + + assert_deleted(repository1, 'i3') + assert_not_deleted(repository2, 'i1', 'i2', 'j1') + + +def test_inaccessible_repositories(default_tag_policy, initialized_db): + """ Two repositories under different namespaces should result in the images being deleted + but not completely removed from the database. + """ + with assert_gc_integrity(): + repository1 = create_repository(namespace=ADMIN_ACCESS_USER, latest=['i1', 'i2', 'i3']) + repository2 = create_repository(namespace=PUBLIC_USER, latest=['i1', 'i2', 'i3']) + + delete_tag(repository1, 'latest') + assert_deleted(repository1, 'i1', 'i2', 'i3') + assert_not_deleted(repository2, 'i1', 'i2', 'i3') + + + +def test_many_multiple_shared_images(default_tag_policy, initialized_db): + """ Repository has multiple tags with shared images. Delete all but one tag. + """ + with assert_gc_integrity(): + repository = create_repository(latest=['i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7', 'i8', 'j0'], + master=['i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7', 'i8', 'j1']) + + # Delete tag latest. Should only delete j0, since it is not shared. + delete_tag(repository, 'latest') + + assert_deleted(repository, 'j0') + assert_not_deleted(repository, 'i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7', 'i8', 'j1') + + # Delete tag master. Should delete the rest of the images. + delete_tag(repository, 'master') + + assert_deleted(repository, 'i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7', 'i8', 'j1') + + +def test_multiple_shared_images(default_tag_policy, initialized_db): + """ Repository has multiple tags with shared images. Selectively deleting the tags, and + verifying at each step. + """ + with assert_gc_integrity(): + repository = create_repository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], + third=['t1', 't2', 't3'], fourth=['i1', 'f1']) + + # Current state: + # latest -> i3->i2->i1 + # other -> f2->f1->i1 + # third -> t3->t2->t1 + # fourth -> f1->i1 + + # Delete tag other. Should delete f2, since it is not shared. + delete_tag(repository, 'other') + assert_deleted(repository, 'f2') + assert_not_deleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3', 'f1') + + # Current state: + # latest -> i3->i2->i1 + # third -> t3->t2->t1 + # fourth -> f1->i1 + + # Move tag fourth to i3. This should remove f1 since it is no longer referenced. + move_tag(repository, 'fourth', 'i3') + assert_deleted(repository, 'f1') + assert_not_deleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3') + + # Current state: + # latest -> i3->i2->i1 + # third -> t3->t2->t1 + # fourth -> i3->i2->i1 + + # Delete tag 'latest'. This should do nothing since fourth is on the same branch. + delete_tag(repository, 'latest') + assert_not_deleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3') + + # Current state: + # third -> t3->t2->t1 + # fourth -> i3->i2->i1 + + # Delete tag 'third'. 
This should remove t1->t3. + delete_tag(repository, 'third') + assert_deleted(repository, 't1', 't2', 't3') + assert_not_deleted(repository, 'i1', 'i2', 'i3') + + # Current state: + # fourth -> i3->i2->i1 + + # Add tag to i1. + move_tag(repository, 'newtag', 'i1') + assert_not_deleted(repository, 'i1', 'i2', 'i3') + + # Current state: + # fourth -> i3->i2->i1 + # newtag -> i1 + + # Delete tag 'fourth'. This should remove i2 and i3. + delete_tag(repository, 'fourth') + assert_deleted(repository, 'i2', 'i3') + assert_not_deleted(repository, 'i1') + + # Current state: + # newtag -> i1 + + # Delete tag 'newtag'. This should remove the remaining image. + delete_tag(repository, 'newtag') + assert_deleted(repository, 'i1') + + # Current state: + # (Empty) + + +def test_empty_gc(default_tag_policy, initialized_db): + with assert_gc_integrity(expect_storage_removed=False): + repository = create_repository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], + third=['t1', 't2', 't3'], fourth=['i1', 'f1']) + + gc_now(repository) + assert_not_deleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3', 'f1', 'f2') + + +def test_time_machine_no_gc(default_tag_policy, initialized_db): + """ Repository has two tags with shared images. Deleting the tag should not remove any images + """ + with assert_gc_integrity(expect_storage_removed=False): + repository = create_repository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) + _set_tag_expiration_policy(repository.namespace_user.username, 60*60*24) + + delete_tag(repository, 'latest') + assert_not_deleted(repository, 'i2', 'i3') + assert_not_deleted(repository, 'i1', 'f1') + + +def test_time_machine_gc(default_tag_policy, initialized_db): + """ Repository has two tags with shared images. Deleting the second tag should cause the images + for the first deleted tag to gc. + """ + with assert_gc_integrity(): + repository = create_repository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) + + _set_tag_expiration_policy(repository.namespace_user.username, 1) + + delete_tag(repository, 'latest') + assert_not_deleted(repository, 'i2', 'i3') + assert_not_deleted(repository, 'i1', 'f1') + + time.sleep(2) + + # This will cause the images associated with latest to gc + delete_tag(repository, 'other') + assert_deleted(repository, 'i2', 'i3') + assert_not_deleted(repository, 'i1', 'f1') + + +def test_images_shared_storage(default_tag_policy, initialized_db): + """ Repository with two tags, both with the same shared storage. Deleting the first + tag should delete the first image, but *not* its storage. + """ + with assert_gc_integrity(expect_storage_removed=False): + repository = create_repository() + + # Add two tags, each with their own image, but with the same storage. + image_storage = model.storage.create_v1_storage(storage.preferred_locations[0]) + + first_image = Image.create(docker_image_id='i1', + repository=repository, storage=image_storage, + ancestors='/') + + second_image = Image.create(docker_image_id='i2', + repository=repository, storage=image_storage, + ancestors='/') + + model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, + 'first', first_image.docker_image_id, + 'sha:someshahere', '{}') + + model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, + 'second', second_image.docker_image_id, + 'sha:someshahere', '{}') + + # Delete the first tag. 
+ delete_tag(repository, 'first') + assert_deleted(repository, 'i1') + assert_not_deleted(repository, 'i2') + + +def test_image_with_cas(default_tag_policy, initialized_db): + """ A repository with a tag pointing to an image backed by CAS. Deleting and GCing the tag + should result in the storage and its CAS data being removed. + """ + with assert_gc_integrity(expect_storage_removed=True): + repository = create_repository() + + # Create an image storage record under CAS. + content = 'hello world' + digest = 'sha256:' + hashlib.sha256(content).hexdigest() + preferred = storage.preferred_locations[0] + storage.put_content({preferred}, storage.blob_path(digest), content) + + image_storage = database.ImageStorage.create(content_checksum=digest, uploading=False) + location = database.ImageStorageLocation.get(name=preferred) + database.ImageStoragePlacement.create(location=location, storage=image_storage) + + # Ensure the CAS path exists. + assert storage.exists({preferred}, storage.blob_path(digest)) + + # Create the image and the tag. + first_image = Image.create(docker_image_id='i1', + repository=repository, storage=image_storage, + ancestors='/') + + model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, + 'first', first_image.docker_image_id, + 'sha:someshahere1', '{}') + + assert_not_deleted(repository, 'i1') + + # Delete the tag. + delete_tag(repository, 'first') + assert_deleted(repository, 'i1') + + # Ensure the CAS path is gone. + assert not storage.exists({preferred}, storage.blob_path(digest)) + + +def test_images_shared_cas(default_tag_policy, initialized_db): + """ A repository, each two tags, pointing to the same image, which has image storage + with the same *CAS path*, but *distinct records*. Deleting the first tag should delete the + first image, and its storage, but not the file in storage, as it shares its CAS path. + """ + with assert_gc_integrity(expect_storage_removed=True): + repository = create_repository() + + # Create two image storage records with the same content checksum. + content = 'hello world' + digest = 'sha256:' + hashlib.sha256(content).hexdigest() + preferred = storage.preferred_locations[0] + storage.put_content({preferred}, storage.blob_path(digest), content) + + is1 = database.ImageStorage.create(content_checksum=digest, uploading=False) + is2 = database.ImageStorage.create(content_checksum=digest, uploading=False) + + location = database.ImageStorageLocation.get(name=preferred) + + database.ImageStoragePlacement.create(location=location, storage=is1) + database.ImageStoragePlacement.create(location=location, storage=is2) + + # Ensure the CAS path exists. + assert storage.exists({preferred}, storage.blob_path(digest)) + + # Create two images in the repository, and two tags, each pointing to one of the storages. + first_image = Image.create(docker_image_id='i1', + repository=repository, storage=is1, + ancestors='/') + + second_image = Image.create(docker_image_id='i2', + repository=repository, storage=is2, + ancestors='/') + + model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, + 'first', first_image.docker_image_id, + 'sha:someshahere1', '{}') + + model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, + 'second', second_image.docker_image_id, + 'sha:someshahere2', '{}') + + assert_not_deleted(repository, 'i1', 'i2') + + # Delete the first tag. 
+ delete_tag(repository, 'first') + assert_deleted(repository, 'i1') + assert_not_deleted(repository, 'i2') + + # Ensure the CAS path still exists. + assert storage.exists({preferred}, storage.blob_path(digest)) + + +def test_images_shared_cas_with_new_blob_table(default_tag_policy, initialized_db): + """ A repository with a tag and image that shares its CAS path with a record in the new Blob + table. Deleting the first tag should delete the first image, and its storage, but not the + file in storage, as it shares its CAS path with the blob row. + """ + with assert_gc_integrity(expect_storage_removed=True): + repository = create_repository() + + # Create two image storage records with the same content checksum. + content = 'hello world' + digest = 'sha256:' + hashlib.sha256(content).hexdigest() + preferred = storage.preferred_locations[0] + storage.put_content({preferred}, storage.blob_path(digest), content) + + media_type = database.MediaType.get(name='text/plain') + + is1 = database.ImageStorage.create(content_checksum=digest, uploading=False) + database.Blob.create(digest=digest, size=0, media_type=media_type) + + location = database.ImageStorageLocation.get(name=preferred) + database.ImageStoragePlacement.create(location=location, storage=is1) + + # Ensure the CAS path exists. + assert storage.exists({preferred}, storage.blob_path(digest)) + + # Create the image in the repository, and the tag. + first_image = Image.create(docker_image_id='i1', + repository=repository, storage=is1, + ancestors='/') + + model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, + 'first', first_image.docker_image_id, + 'sha:someshahere1', '{}') + + assert_not_deleted(repository, 'i1') + + # Delete the tag. + delete_tag(repository, 'first') + assert_deleted(repository, 'i1') + + # Ensure the CAS path still exists, as it is referenced by the Blob table + assert storage.exists({preferred}, storage.blob_path(digest)) diff --git a/test/test_gc.py b/test/test_gc.py deleted file mode 100644 index a7085af04..000000000 --- a/test/test_gc.py +++ /dev/null @@ -1,585 +0,0 @@ -import unittest -import time -import hashlib - -from contextlib import contextmanager -from playhouse.test_utils import assert_query_count - -from app import app, storage -from initdb import setup_database_for_testing, finished_database_for_testing -from data import model, database -from data.database import Image, ImageStorage, DerivedStorageForImage, Label, TagManifestLabel, Blob - - -ADMIN_ACCESS_USER = 'devtable' -PUBLIC_USER = 'public' - -REPO = 'somerepo' - - -class TestGarbageCollection(unittest.TestCase): - @staticmethod - def _set_tag_expiration_policy(namespace, expiration_s): - namespace_user = model.user.get_user(namespace) - model.user.change_user_tag_expiration(namespace_user, expiration_s) - - def setUp(self): - setup_database_for_testing(self) - - self._set_tag_expiration_policy(ADMIN_ACCESS_USER, 0) - self._set_tag_expiration_policy(PUBLIC_USER, 0) - - self.app = app.test_client() - self.ctx = app.test_request_context() - self.ctx.__enter__() - - def tearDown(self): - finished_database_for_testing(self) - self.ctx.__exit__(True, None, None) - - @staticmethod - def createImage(docker_image_id, repository_obj, username): - preferred = storage.preferred_locations[0] - image = model.image.find_create_or_link_image(docker_image_id, repository_obj, username, {}, - preferred) - image.storage.uploading = False - image.storage.save() - - # Create derived images as well. 
- model.image.find_or_create_derived_storage(image, 'squash', preferred) - model.image.find_or_create_derived_storage(image, 'aci', preferred) - - # Add some torrent info. - try: - database.TorrentInfo.get(storage=image.storage) - except database.TorrentInfo.DoesNotExist: - model.storage.save_torrent_info(image.storage, 1, 'helloworld') - - # Add some additional placements to the image. - for location_name in ['local_eu']: - location = database.ImageStorageLocation.get(name=location_name) - - try: - database.ImageStoragePlacement.get(location=location, storage=image.storage) - except: - continue - - database.ImageStoragePlacement.create(location=location, storage=image.storage) - - return image.storage - - def createRepository(self, namespace=ADMIN_ACCESS_USER, name=REPO, **kwargs): - user = model.user.get_user(namespace) - repo = model.repository.create_repository(namespace, name, user) - - # Populate the repository with the tags. - image_map = {} - for tag_name in kwargs: - image_ids = kwargs[tag_name] - parent = None - - for image_id in image_ids: - if not image_id in image_map: - image_map[image_id] = self.createImage(image_id, repo, namespace) - - v1_metadata = { - 'id': image_id, - } - if parent is not None: - v1_metadata['parent'] = parent.docker_image_id - - # Set the ancestors for the image. - parent = model.image.set_image_metadata(image_id, namespace, name, '', '', '', v1_metadata, - parent=parent) - - # Set the tag for the image. - tag_manifest, _ = model.tag.store_tag_manifest(namespace, name, tag_name, image_ids[-1], - 'sha:someshahere', '{}') - - # Add some labels to the tag. - model.label.create_manifest_label(tag_manifest, 'foo', 'bar', 'manifest') - model.label.create_manifest_label(tag_manifest, 'meh', 'grah', 'manifest') - - return repo - - def gcNow(self, repository): - self.assertTrue(model.repository.garbage_collect_repo(repository)) - - def deleteTag(self, repository, tag, perform_gc=True): - model.tag.delete_tag(repository.namespace_user.username, repository.name, tag) - if perform_gc: - self.assertTrue(model.repository.garbage_collect_repo(repository)) - - def moveTag(self, repository, tag, docker_image_id): - model.tag.create_or_update_tag(repository.namespace_user.username, repository.name, tag, - docker_image_id) - self.assertTrue(model.repository.garbage_collect_repo(repository)) - - def assertNotDeleted(self, repository, *args): - for docker_image_id in args: - self.assertTrue(bool(model.image.get_image_by_id(repository.namespace_user.username, - repository.name, docker_image_id))) - - def assertDeleted(self, repository, *args): - for docker_image_id in args: - try: - # Verify the image is missing when accessed by the repository. 
- model.image.get_image_by_id(repository.namespace_user.username, repository.name, - docker_image_id) - except model.DataModelException: - return - - self.fail('Expected image %s to be deleted' % docker_image_id) - - @staticmethod - def _get_dangling_storage_count(): - storage_ids = set([current.id for current in ImageStorage.select()]) - referenced_by_image = set([image.storage_id for image in Image.select()]) - referenced_by_derived = set([derived.derivative_id - for derived in DerivedStorageForImage.select()]) - - return len(storage_ids - referenced_by_image - referenced_by_derived) - - @staticmethod - def _get_dangling_label_count(): - label_ids = set([current.id for current in Label.select()]) - referenced_by_manifest = set([mlabel.label_id for mlabel in TagManifestLabel.select()]) - return len(label_ids - referenced_by_manifest) - - @contextmanager - def assert_gc_integrity(self, expect_storage_removed=True): - """ Specialized assertion for ensuring that GC cleans up all dangling storages - and labels, invokes the callback for images removed and doesn't invoke the - callback for images *not* removed. - """ - # TODO: Consider also asserting the number of DB queries being performed. - - # Add a callback for when images are removed. - removed_image_storages = [] - model.config.register_image_cleanup_callback(removed_image_storages.extend) - - # Store the number of dangling storages and labels. - existing_storage_count = self._get_dangling_storage_count() - existing_label_count = self._get_dangling_label_count() - yield - - # Ensure the number of dangling storages and labels has not changed. - updated_storage_count = self._get_dangling_storage_count() - self.assertEqual(updated_storage_count, existing_storage_count) - - updated_label_count = self._get_dangling_label_count() - self.assertEqual(updated_label_count, existing_label_count) - - # Ensure that for each call to the image+storage cleanup callback, the image and its - # storage is not found *anywhere* in the database. - for removed_image_and_storage in removed_image_storages: - with self.assertRaises(Image.DoesNotExist): - Image.get(id=removed_image_and_storage.id) - - with self.assertRaises(ImageStorage.DoesNotExist): - ImageStorage.get(id=removed_image_and_storage.storage_id) - - with self.assertRaises(ImageStorage.DoesNotExist): - ImageStorage.get(uuid=removed_image_and_storage.storage.uuid) - - self.assertEquals(expect_storage_removed, bool(removed_image_storages)) - - # Ensure all CAS storage is in the storage engine. - preferred = storage.preferred_locations[0] - for storage_row in ImageStorage.select(): - if storage_row.cas_path: - storage.get_content({preferred}, storage.blob_path(storage_row.content_checksum)) - - for blob_row in Blob.select(): - storage.get_content({preferred}, storage.blob_path(blob_row.digest)) - - def test_has_garbage(self): - """ Remove all existing repositories, then add one without garbage, check, then add one with - garbage, and check again. - """ - # Delete all existing repos. - for repo in database.Repository.select().order_by(database.Repository.id): - self.assertTrue(model.repository.purge_repository(repo.namespace_user.username, repo.name)) - - # Change the time machine expiration on the namespace. - (database.User - .update(removed_tag_expiration_s=1000000000) - .where(database.User.username == ADMIN_ACCESS_USER) - .execute()) - - # Create a repository without any garbage. 
- repository = self.createRepository(latest=['i1', 'i2', 'i3']) - - # Ensure that no repositories are returned by the has garbage check. - self.assertIsNone(model.repository.find_repository_with_garbage(1000000000)) - - # Delete a tag. - self.deleteTag(repository, 'latest', perform_gc=False) - - # There should still not be any repositories with garbage, due to time machine. - self.assertIsNone(model.repository.find_repository_with_garbage(1000000000)) - - # Change the time machine expiration on the namespace. - (database.User - .update(removed_tag_expiration_s=0) - .where(database.User.username == ADMIN_ACCESS_USER) - .execute()) - - # Now we should find the repository for GC. - repository = model.repository.find_repository_with_garbage(0) - self.assertIsNotNone(repository) - self.assertEquals(REPO, repository.name) - - # GC the repository. - self.assertTrue(model.repository.garbage_collect_repo(repository)) - - # There should now be no repositories with garbage. - self.assertIsNone(model.repository.find_repository_with_garbage(0)) - - def test_find_garbage_policy_functions(self): - with assert_query_count(1): - one_policy = model.repository.get_random_gc_policy() - all_policies = model.repository._get_gc_expiration_policies() - self.assertIn(one_policy, all_policies) - - def test_one_tag(self): - """ Create a repository with a single tag, then remove that tag and verify that the repository - is now empty. """ - with self.assert_gc_integrity(): - repository = self.createRepository(latest=['i1', 'i2', 'i3']) - self.deleteTag(repository, 'latest') - self.assertDeleted(repository, 'i1', 'i2', 'i3') - - def test_two_tags_unshared_images(self): - """ Repository has two tags with no shared images between them. """ - with self.assert_gc_integrity(): - repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['f1', 'f2']) - self.deleteTag(repository, 'latest') - self.assertDeleted(repository, 'i1', 'i2', 'i3') - self.assertNotDeleted(repository, 'f1', 'f2') - - def test_two_tags_shared_images(self): - """ Repository has two tags with shared images. Deleting the tag should only remove the - unshared images. - """ - with self.assert_gc_integrity(): - repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) - self.deleteTag(repository, 'latest') - self.assertDeleted(repository, 'i2', 'i3') - self.assertNotDeleted(repository, 'i1', 'f1') - - def test_unrelated_repositories(self): - """ Two repositories with different images. Removing the tag from one leaves the other's - images intact. - """ - with self.assert_gc_integrity(): - repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1') - repository2 = self.createRepository(latest=['j1', 'j2', 'j3'], name='repo2') - - self.deleteTag(repository1, 'latest') - - self.assertDeleted(repository1, 'i1', 'i2', 'i3') - self.assertNotDeleted(repository2, 'j1', 'j2', 'j3') - - def test_related_repositories(self): - """ Two repositories with shared images. Removing the tag from one leaves the other's - images intact. 
- """ - with self.assert_gc_integrity(): - repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1') - repository2 = self.createRepository(latest=['i1', 'i2', 'j1'], name='repo2') - - self.deleteTag(repository1, 'latest') - - self.assertDeleted(repository1, 'i3') - self.assertNotDeleted(repository2, 'i1', 'i2', 'j1') - - def test_inaccessible_repositories(self): - """ Two repositories under different namespaces should result in the images being deleted - but not completely removed from the database. - """ - with self.assert_gc_integrity(): - repository1 = self.createRepository(namespace=ADMIN_ACCESS_USER, latest=['i1', 'i2', 'i3']) - repository2 = self.createRepository(namespace=PUBLIC_USER, latest=['i1', 'i2', 'i3']) - - self.deleteTag(repository1, 'latest') - self.assertDeleted(repository1, 'i1', 'i2', 'i3') - self.assertNotDeleted(repository2, 'i1', 'i2', 'i3') - - def test_multiple_shared_images(self): - """ Repository has multiple tags with shared images. Selectively deleting the tags, and - verifying at each step. - """ - with self.assert_gc_integrity(): - repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], - third=['t1', 't2', 't3'], fourth=['i1', 'f1']) - - # Current state: - # latest -> i3->i2->i1 - # other -> f2->f1->i1 - # third -> t3->t2->t1 - # fourth -> f1->i1 - - # Delete tag other. Should delete f2, since it is not shared. - self.deleteTag(repository, 'other') - self.assertDeleted(repository, 'f2') - self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3', 'f1') - - # Current state: - # latest -> i3->i2->i1 - # third -> t3->t2->t1 - # fourth -> f1->i1 - - # Move tag fourth to i3. This should remove f1 since it is no longer referenced. - self.moveTag(repository, 'fourth', 'i3') - self.assertDeleted(repository, 'f1') - self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3') - - # Current state: - # latest -> i3->i2->i1 - # third -> t3->t2->t1 - # fourth -> i3->i2->i1 - - # Delete tag 'latest'. This should do nothing since fourth is on the same branch. - self.deleteTag(repository, 'latest') - self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3') - - # Current state: - # third -> t3->t2->t1 - # fourth -> i3->i2->i1 - - # Delete tag 'third'. This should remove t1->t3. - self.deleteTag(repository, 'third') - self.assertDeleted(repository, 't1', 't2', 't3') - self.assertNotDeleted(repository, 'i1', 'i2', 'i3') - - # Current state: - # fourth -> i3->i2->i1 - - # Add tag to i1. - self.moveTag(repository, 'newtag', 'i1') - self.assertNotDeleted(repository, 'i1', 'i2', 'i3') - - # Current state: - # fourth -> i3->i2->i1 - # newtag -> i1 - - # Delete tag 'fourth'. This should remove i2 and i3. - self.deleteTag(repository, 'fourth') - self.assertDeleted(repository, 'i2', 'i3') - self.assertNotDeleted(repository, 'i1') - - # Current state: - # newtag -> i1 - - # Delete tag 'newtag'. This should remove the remaining image. - self.deleteTag(repository, 'newtag') - self.assertDeleted(repository, 'i1') - - # Current state: - # (Empty) - - def test_empty_gc(self): - with self.assert_gc_integrity(expect_storage_removed=False): - repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], - third=['t1', 't2', 't3'], fourth=['i1', 'f1']) - - self.gcNow(repository) - self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3', 'f1', 'f2') - - def test_time_machine_no_gc(self): - """ Repository has two tags with shared images. 
Deleting the tag should not remove any images - """ - with self.assert_gc_integrity(expect_storage_removed=False): - repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) - self._set_tag_expiration_policy(repository.namespace_user.username, 60*60*24) - - self.deleteTag(repository, 'latest') - self.assertNotDeleted(repository, 'i2', 'i3') - self.assertNotDeleted(repository, 'i1', 'f1') - - def test_time_machine_gc(self): - """ Repository has two tags with shared images. Deleting the second tag should cause the images - for the first deleted tag to gc. - """ - with self.assert_gc_integrity(): - repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) - - self._set_tag_expiration_policy(repository.namespace_user.username, 1) - - self.deleteTag(repository, 'latest') - self.assertNotDeleted(repository, 'i2', 'i3') - self.assertNotDeleted(repository, 'i1', 'f1') - - time.sleep(2) - - self.deleteTag(repository, 'other') # This will cause the images associated with latest to gc - self.assertDeleted(repository, 'i2', 'i3') - self.assertNotDeleted(repository, 'i1', 'f1') - - def test_images_shared_storage(self): - """ Repository with two tags, both with the same shared storage. Deleting the first - tag should delete the first image, but *not* its storage. - """ - with self.assert_gc_integrity(expect_storage_removed=False): - repository = self.createRepository() - - # Add two tags, each with their own image, but with the same storage. - image_storage = model.storage.create_v1_storage(storage.preferred_locations[0]) - - first_image = Image.create(docker_image_id='i1', - repository=repository, storage=image_storage, - ancestors='/') - - second_image = Image.create(docker_image_id='i2', - repository=repository, storage=image_storage, - ancestors='/') - - model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, - 'first', first_image.docker_image_id, - 'sha:someshahere', '{}') - - model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, - 'second', second_image.docker_image_id, - 'sha:someshahere', '{}') - - # Delete the first tag. - self.deleteTag(repository, 'first') - self.assertDeleted(repository, 'i1') - self.assertNotDeleted(repository, 'i2') - - def test_image_with_cas(self): - """ A repository with a tag pointing to an image backed by CAS. Deleting and GCing the tag - should result in the storage and its CAS data being removed. - """ - with self.assert_gc_integrity(expect_storage_removed=True): - repository = self.createRepository() - - # Create an image storage record under CAS. - content = 'hello world' - digest = 'sha256:' + hashlib.sha256(content).hexdigest() - preferred = storage.preferred_locations[0] - storage.put_content({preferred}, storage.blob_path(digest), content) - - image_storage = database.ImageStorage.create(content_checksum=digest, uploading=False) - location = database.ImageStorageLocation.get(name=preferred) - database.ImageStoragePlacement.create(location=location, storage=image_storage) - - # Ensure the CAS path exists. - self.assertTrue(storage.exists({preferred}, storage.blob_path(digest))) - - # Create the image and the tag. - first_image = Image.create(docker_image_id='i1', - repository=repository, storage=image_storage, - ancestors='/') - - model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, - 'first', first_image.docker_image_id, - 'sha:someshahere1', '{}') - - self.assertNotDeleted(repository, 'i1') - - # Delete the tag. 
- self.deleteTag(repository, 'first') - self.assertDeleted(repository, 'i1') - - # Ensure the CAS path is gone. - self.assertFalse(storage.exists({preferred}, storage.blob_path(digest))) - - def test_images_shared_cas(self): - """ A repository, each two tags, pointing to the same image, which has image storage - with the same *CAS path*, but *distinct records*. Deleting the first tag should delete the - first image, and its storage, but not the file in storage, as it shares its CAS path. - """ - with self.assert_gc_integrity(expect_storage_removed=True): - repository = self.createRepository() - - # Create two image storage records with the same content checksum. - content = 'hello world' - digest = 'sha256:' + hashlib.sha256(content).hexdigest() - preferred = storage.preferred_locations[0] - storage.put_content({preferred}, storage.blob_path(digest), content) - - is1 = database.ImageStorage.create(content_checksum=digest, uploading=False) - is2 = database.ImageStorage.create(content_checksum=digest, uploading=False) - - location = database.ImageStorageLocation.get(name=preferred) - - database.ImageStoragePlacement.create(location=location, storage=is1) - database.ImageStoragePlacement.create(location=location, storage=is2) - - # Ensure the CAS path exists. - self.assertTrue(storage.exists({preferred}, storage.blob_path(digest))) - - # Create two images in the repository, and two tags, each pointing to one of the storages. - first_image = Image.create(docker_image_id='i1', - repository=repository, storage=is1, - ancestors='/') - - second_image = Image.create(docker_image_id='i2', - repository=repository, storage=is2, - ancestors='/') - - model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, - 'first', first_image.docker_image_id, - 'sha:someshahere1', '{}') - - model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, - 'second', second_image.docker_image_id, - 'sha:someshahere2', '{}') - - self.assertNotDeleted(repository, 'i1', 'i2') - - # Delete the first tag. - self.deleteTag(repository, 'first') - self.assertDeleted(repository, 'i1') - self.assertNotDeleted(repository, 'i2') - - # Ensure the CAS path still exists. - self.assertTrue(storage.exists({preferred}, storage.blob_path(digest))) - - def test_images_shared_cas_with_new_blob_table(self): - """ A repository with a tag and image that shares its CAS path with a record in the new Blob - table. Deleting the first tag should delete the first image, and its storage, but not the - file in storage, as it shares its CAS path with the blob row. - """ - with self.assert_gc_integrity(expect_storage_removed=True): - repository = self.createRepository() - - # Create two image storage records with the same content checksum. - content = 'hello world' - digest = 'sha256:' + hashlib.sha256(content).hexdigest() - preferred = storage.preferred_locations[0] - storage.put_content({preferred}, storage.blob_path(digest), content) - - media_type = database.MediaType.get(name='text/plain') - - is1 = database.ImageStorage.create(content_checksum=digest, uploading=False) - is2 = database.Blob.create(digest=digest, size=0, media_type=media_type) - - location = database.ImageStorageLocation.get(name=preferred) - database.ImageStoragePlacement.create(location=location, storage=is1) - - # Ensure the CAS path exists. - self.assertTrue(storage.exists({preferred}, storage.blob_path(digest))) - - # Create the image in the repository, and the tag. 
- first_image = Image.create(docker_image_id='i1', - repository=repository, storage=is1, - ancestors='/') - - model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, - 'first', first_image.docker_image_id, - 'sha:someshahere1', '{}') - - self.assertNotDeleted(repository, 'i1') - - # Delete the tag. - self.deleteTag(repository, 'first') - self.assertDeleted(repository, 'i1') - - # Ensure the CAS path still exists, as it is referenced by the Blob table. - self.assertTrue(storage.exists({preferred}, storage.blob_path(digest))) - -if __name__ == '__main__': - unittest.main() From 45c70080785a6fd62e14f1b23656f0dbb219f811 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 22 Jun 2017 18:09:17 -0400 Subject: [PATCH 2/4] Change Repo GC to be iterative This prevents us from creating a massive join when there are a large number of tags in the repository, which can result in locking the entire DB for long periods of time. Instead of the join, we just iteratively lookup any images found to be directly referenced by a tag or found as a parent of another image, both of which should be indexed lookups. Once done, we only remove those images and then iterate until the working set stops changing. --- data/model/repository.py | 90 ++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/data/model/repository.py b/data/model/repository.py index 4684bc8af..6602bc081 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -181,49 +181,50 @@ def garbage_collect_repo(repo, extra_candidate_set=None): logger.debug('No candidate images for GC for repo: %s', repo.id) return True - candidates_orphans = list(candidate_orphan_image_set) + all_images_removed = set() + all_storage_id_whitelist = set() + all_unreferenced_candidates = set() - with db_transaction(): - Candidate = Image.alias() - Tagged = Image.alias() - ancestor_superset = Tagged.ancestors ** db_concat_func(Candidate.ancestors, Candidate.id, '/%') + # Iteratively try to remove images from the database. The only images we can remove are those + # that are not referenced by tags AND not the parents of other images. We continue removing images + # until no changes are found. + iteration = 0 + while candidate_orphan_image_set: + iteration = iteration + 1 + logger.debug('Starting iteration #%s for GC of repository %s with candidates: %s', iteration, + repo.id, candidate_orphan_image_set) + candidates_orphans = list(candidate_orphan_image_set) - # We are going to compute all images which are being referenced in two ways: - # First, we will find all images which have their ancestor paths appear in - # another image. Secondly, we union in all of the candidate images which are - # directly referenced by a tag. This can be used in a subquery to directly - # find which candidates are being referenced without any client side - # computation or extra round trips. - direct_referenced = (RepositoryTag - .select(RepositoryTag.image) - .where(RepositoryTag.repository == repo.id, - RepositoryTag.image << candidates_orphans)) - - cloned = direct_referenced.clone().alias('direct_ref') - directly_referenced_subquery = Image.alias().select(cloned.c.image_id).from_(cloned) - - ancestor_referenced = (Candidate - .select(Candidate.id) - .join(Tagged, on=ancestor_superset) - .join(RepositoryTag, on=(Tagged.id == RepositoryTag.image)) + with db_transaction(): + # Any image directly referenced by a tag that still exists, cannot be GCed. 
+ direct_referenced = (RepositoryTag + .select(RepositoryTag.image) .where(RepositoryTag.repository == repo.id, - Candidate.id << candidates_orphans, - ~(Candidate.id << directly_referenced_subquery))) + RepositoryTag.image << candidates_orphans)) - referenced_candidates = (direct_referenced | ancestor_referenced) + # Any image which is the parent of another image, cannot be GCed. + parent_referenced = (Image + .select(Image.parent) + .where(Image.repository == repo.id, + Image.parent << candidates_orphans)) - # We desire a few pieces of information from the database from the following - # query: all of the image ids which are associated with this repository, - # and the storages which are associated with those images. - unreferenced_candidates = (Image - .select(Image.id, Image.docker_image_id, - ImageStorage.id, ImageStorage.uuid) - .join(ImageStorage) - .where(Image.id << candidates_orphans, - ~(Image.id << referenced_candidates))) + referenced_candidates = (direct_referenced | parent_referenced) + + # We desire a few pieces of information from the database from the following + # query: all of the image ids which are associated with this repository, + # and the storages which are associated with those images. + unreferenced_candidates = (Image + .select(Image.id, Image.docker_image_id, + ImageStorage.id, ImageStorage.uuid) + .join(ImageStorage) + .where(Image.id << candidates_orphans, + ~(Image.id << referenced_candidates))) + + image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates] + if len(image_ids_to_remove) == 0: + # No more candidates to remove. + break - image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates] - if len(image_ids_to_remove) > 0: logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove) storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates]) @@ -253,15 +254,22 @@ def garbage_collect_repo(repo, extra_candidate_set=None): logger.info('Could not GC images %s; will try again soon', image_ids_to_remove) return False + # Add the images to the removed set and remove them from the candidate set. + all_images_removed.update(image_ids_to_remove) + all_storage_id_whitelist.update(storage_id_whitelist) + all_unreferenced_candidates.update(unreferenced_candidates) + + candidate_orphan_image_set.difference_update(image_ids_to_remove) + # If any images were removed, GC any orphaned storages. - if len(image_ids_to_remove) > 0: - logger.info('Garbage collecting storage for images: %s', image_ids_to_remove) - storage_ids_removed = set(storage.garbage_collect_storage(storage_id_whitelist)) + if len(all_images_removed) > 0: + logger.info('Garbage collecting storage for images: %s', all_images_removed) + storage_ids_removed = set(storage.garbage_collect_storage(all_storage_id_whitelist)) # If any storages were removed and cleanup callbacks are registered, call them with # the images+storages removed. 
if storage_ids_removed and config.image_cleanup_callbacks: - image_storages_removed = [candidate for candidate in unreferenced_candidates + image_storages_removed = [candidate for candidate in all_unreferenced_candidates if candidate.storage_id in storage_ids_removed] for callback in config.image_cleanup_callbacks: callback(image_storages_removed) From cdd7cb93211a3b7f667c39e63ecf3e89431f8763 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 22 Jun 2017 18:14:06 -0400 Subject: [PATCH 3/4] Remove directly referenced images from the candidate set before starting GC iteration Makes the lookup query underneath the transaction smaller if there are a lot of images referenced directly by tag. We still must do the direct referenced check within the transaction, but this should reduce the scope of the search space a bit. --- data/model/repository.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/data/model/repository.py b/data/model/repository.py index 6602bc081..2cd2b6976 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -185,11 +185,19 @@ def garbage_collect_repo(repo, extra_candidate_set=None): all_storage_id_whitelist = set() all_unreferenced_candidates = set() + # Remove any images directly referenced by tags, to prune the working set. + direct_referenced = (RepositoryTag + .select(RepositoryTag.image) + .where(RepositoryTag.repository == repo.id, + RepositoryTag.image << list(candidate_orphan_image_set))) + candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced]) + # Iteratively try to remove images from the database. The only images we can remove are those # that are not referenced by tags AND not the parents of other images. We continue removing images # until no changes are found. iteration = 0 - while candidate_orphan_image_set: + making_progress = True + while candidate_orphan_image_set and making_progress: iteration = iteration + 1 logger.debug('Starting iteration #%s for GC of repository %s with candidates: %s', iteration, repo.id, candidate_orphan_image_set) @@ -221,6 +229,7 @@ def garbage_collect_repo(repo, extra_candidate_set=None): ~(Image.id << referenced_candidates))) image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates] + making_progress = bool(len(image_ids_to_remove)) if len(image_ids_to_remove) == 0: # No more candidates to remove. break @@ -254,12 +263,12 @@ def garbage_collect_repo(repo, extra_candidate_set=None): logger.info('Could not GC images %s; will try again soon', image_ids_to_remove) return False - # Add the images to the removed set and remove them from the candidate set. - all_images_removed.update(image_ids_to_remove) - all_storage_id_whitelist.update(storage_id_whitelist) - all_unreferenced_candidates.update(unreferenced_candidates) + # Add the images to the removed set and remove them from the candidate set. + all_images_removed.update(image_ids_to_remove) + all_storage_id_whitelist.update(storage_id_whitelist) + all_unreferenced_candidates.update(unreferenced_candidates) - candidate_orphan_image_set.difference_update(image_ids_to_remove) + candidate_orphan_image_set.difference_update(image_ids_to_remove) # If any images were removed, GC any orphaned storages. 
if len(all_images_removed) > 0: From 8dcea30d5816690af5839e2f873c19440149f8a3 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 27 Jun 2017 18:11:46 +0300 Subject: [PATCH 4/4] Fix build by pre-calling the caches They were being called in a test-dependent order, which caused any tests which relied on query count to fail --- data/model/_basequery.py | 8 ++++++-- endpoints/api/test/test_search.py | 24 ++++++++++++------------ test/fixtures.py | 5 +++++ 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/data/model/_basequery.py b/data/model/_basequery.py index 4a7c44a0d..28b0b1952 100644 --- a/data/model/_basequery.py +++ b/data/model/_basequery.py @@ -54,9 +54,13 @@ def get_public_repo_visibility(): return Visibility.get(name='public') -@lru_cache(maxsize=3) def _lookup_team_role(name): - return TeamRole.get(name=name) + return _lookup_team_roles()[name] + + +@lru_cache(maxsize=1) +def _lookup_team_roles(): + return {role.name:role for role in TeamRole.select()} def filter_to_repos_for_user(query, username=None, namespace=None, repo_kind='image', diff --git a/endpoints/api/test/test_search.py b/endpoints/api/test/test_search.py index 4efba0841..00a9cc88b 100644 --- a/endpoints/api/test/test_search.py +++ b/endpoints/api/test/test_search.py @@ -7,29 +7,29 @@ from endpoints.api.search import ConductRepositorySearch, ConductSearch from endpoints.api.test.shared import client_with_identity, conduct_api_call from test.fixtures import * -@pytest.mark.parametrize('query, expected_query_count', [ - ('simple', 7), - ('public', 6), - ('repository', 6), +@pytest.mark.parametrize('query', [ + ('simple'), + ('public'), + ('repository'), ]) -def test_repository_search(query, expected_query_count, client): +def test_repository_search(query, client): with client_with_identity('devtable', client) as cl: params = {'query': query} - with assert_query_count(expected_query_count): + with assert_query_count(6): result = conduct_api_call(cl, ConductRepositorySearch, 'GET', params, None, 200).json assert result['start_index'] == 0 assert result['page'] == 1 assert len(result['results']) -@pytest.mark.parametrize('query, expected_query_count', [ - ('simple', 8), - ('public', 8), - ('repository', 8), +@pytest.mark.parametrize('query', [ + ('simple'), + ('public'), + ('repository'), ]) -def test_search_query_count(query, expected_query_count, client): +def test_search_query_count(query, client): with client_with_identity('devtable', client) as cl: params = {'query': query} - with assert_query_count(expected_query_count): + with assert_query_count(8): result = conduct_api_call(cl, ConductSearch, 'GET', params, None, 200).json assert len(result['results']) diff --git a/test/fixtures.py b/test/fixtures.py index bee8199e1..0a0f970c8 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -125,6 +125,11 @@ def initialized_db(appconfig): # Configure the database. configure(appconfig) + # Initialize caches. + model._basequery._lookup_team_roles() + model._basequery.get_public_repo_visibility() + model.log.get_log_entry_kinds() + # If under a test *real* database, setup a savepoint. under_test_real_database = bool(os.environ.get('TEST_DATABASE_URI')) if under_test_real_database:
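
A rough, self-contained sketch of the algorithm described in patch 2/4 (plus the pre-pruning step from patch 3/4), assuming plain in-memory sets rather than Quay's data model: iterative_gc, tagged_ids, and parent_of below are hypothetical stand-ins for the RepositoryTag and Image.parent lookups that the real code issues as indexed queries inside a transaction.

def iterative_gc(candidate_ids, tagged_ids, parent_of):
    """Return the subset of candidate_ids that can be garbage collected.

    candidate_ids: image ids that might be orphaned.
    tagged_ids: image ids directly referenced by a tag.
    parent_of: mapping of image id -> parent image id (or None) for every
               image in the repository.
    """
    candidates = set(candidate_ids)

    # Patch 3/4: drop anything directly referenced by a tag before iterating.
    candidates -= set(tagged_ids)

    removed = set()
    while candidates:
        # An image cannot be removed while it is the parent of any surviving image.
        live_parents = {parent_of[image_id]
                        for image_id in parent_of
                        if image_id not in removed and parent_of[image_id] is not None}

        # The real code re-checks direct tag references inside the transaction;
        # here the check is kept for clarity even though candidates were pre-pruned.
        unreferenced = {image_id for image_id in candidates
                        if image_id not in tagged_ids and image_id not in live_parents}
        if not unreferenced:
            # No progress this iteration; everything left is still referenced.
            break

        removed |= unreferenced
        candidates -= unreferenced

    return removed

# Example: a single chain i1 <- i2 <- i3 whose only tag has been deleted. The
# chain is collected over successive iterations: i3 first, then i2, then i1.
parent_of = {'i1': None, 'i2': 'i1', 'i3': 'i2'}
assert iterative_gc({'i1', 'i2', 'i3'}, tagged_ids=set(), parent_of=parent_of) == {'i1', 'i2', 'i3'}

Iterating with small indexed lookups like this, instead of the single ancestor-path join, is what keeps the database from being locked for long periods on repositories with many tags.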