""" Tests for repository garbage collection of images, storages and labels. """

import unittest
import time
import hashlib

from contextlib import contextmanager
from playhouse.test_utils import assert_query_count

from app import app, storage
from initdb import setup_database_for_testing, finished_database_for_testing
from data import model, database
from data.database import Image, ImageStorage, DerivedStorageForImage, Label, TagManifestLabel


ADMIN_ACCESS_USER = 'devtable'
PUBLIC_USER = 'public'

REPO = 'somerepo'


class TestGarbageCollection(unittest.TestCase):
    """ Exercises model.repository.garbage_collect_repo against repositories built from
        small image chains, checking which images survive tag deletion and movement.
    """

    @staticmethod
    def _set_tag_expiration_policy(namespace, expiration_s):
        """ Sets the tag expiration (in seconds) for the user named `namespace`. """
        namespace_user = model.user.get_user(namespace)
        model.user.change_user_tag_expiration(namespace_user, expiration_s)

    def setUp(self):
        setup_database_for_testing(self)

        # Disable tag expiration for both namespaces so deleted tags are GCed immediately.
        self._set_tag_expiration_policy(ADMIN_ACCESS_USER, 0)
        self._set_tag_expiration_policy(PUBLIC_USER, 0)

        self.app = app.test_client()
        self.ctx = app.test_request_context()
        self.ctx.__enter__()

    def tearDown(self):
        # Always leave the request context, even if the database teardown raises, and
        # report a clean exit (no exception) to the context manager.
        try:
            finished_database_for_testing(self)
        finally:
            self.ctx.__exit__(None, None, None)

    @staticmethod
    def createImage(docker_image_id, repository_obj, username):
        """ Creates (or links) the image with the given docker ID in the repository, along
            with derived storages, torrent info and an extra placement. Returns the image's
            storage row.
        """
        preferred = storage.preferred_locations[0]
        image = model.image.find_create_or_link_image(docker_image_id, repository_obj, username,
                                                      {}, preferred)
        image.storage.uploading = False
        image.storage.save()

        # Create derived images as well.
        model.image.find_or_create_derived_storage(image, 'squash', preferred)
        model.image.find_or_create_derived_storage(image, 'aci', preferred)

        # Add some torrent info.
        try:
            database.TorrentInfo.get(storage=image.storage)
        except database.TorrentInfo.DoesNotExist:
            model.storage.save_torrent_info(image.storage, 1, 'helloworld')

        # Add some additional placements to the image, creating each one only if it is
        # not already present.
        for location_name in ['local_eu']:
            location = database.ImageStorageLocation.get(name=location_name)

            try:
                database.ImageStoragePlacement.get(location=location, storage=image.storage)
            except database.ImageStoragePlacement.DoesNotExist:
                database.ImageStoragePlacement.create(location=location, storage=image.storage)

        return image.storage

    def createRepository(self, namespace=ADMIN_ACCESS_USER, name=REPO, **kwargs):
        """ Creates a repository under `namespace`. Each extra keyword argument is a tag name
            mapped to a list of docker image IDs, ordered from root to leaf; the tag is
            placed on the last image in the list. Returns the new repository.
        """
        user = model.user.get_user(namespace)
        repo = model.repository.create_repository(namespace, name, user)

        # Populate the repository with the tags.
        image_map = {}
        for tag_name, image_ids in kwargs.items():
            parent = None

            for image_id in image_ids:
                if image_id not in image_map:
                    image_map[image_id] = self.createImage(image_id, repo, namespace)

                v1_metadata = {
                    'id': image_id,
                }
                if parent is not None:
                    v1_metadata['parent'] = parent.docker_image_id

                # Set the ancestors for the image.
                parent = model.image.set_image_metadata(image_id, namespace, name, '', '', '',
                                                        v1_metadata, parent=parent)

            # Set the tag for the image.
            tag_manifest, _ = model.tag.store_tag_manifest(namespace, name, tag_name,
                                                           image_ids[-1], 'sha:someshahere',
                                                           '{}')

            # Add some labels to the tag.
            model.label.create_manifest_label(tag_manifest, 'foo', 'bar', 'manifest')
            model.label.create_manifest_label(tag_manifest, 'meh', 'grah', 'manifest')

        return repo

    def gcNow(self, repository):
        """ Runs garbage collection on the repository and asserts that it reports success. """
        self.assertTrue(model.repository.garbage_collect_repo(repository))

    def deleteTag(self, repository, tag, perform_gc=True):
        """ Deletes the tag and, unless `perform_gc` is False, garbage collects the repo. """
        model.tag.delete_tag(repository.namespace_user.username, repository.name, tag)
        if perform_gc:
            self.gcNow(repository)

    def moveTag(self, repository, tag, docker_image_id):
        """ Points the tag (creating it if needed) at the image, then garbage collects. """
        model.tag.create_or_update_tag(repository.namespace_user.username, repository.name, tag,
                                       docker_image_id)
        self.gcNow(repository)

    def assertNotDeleted(self, repository, *args):
        """ Asserts that every given docker image ID is still found in the repository. """
        for docker_image_id in args:
            found = model.image.get_image_by_id(repository.namespace_user.username,
                                                repository.name, docker_image_id)
            self.assertTrue(bool(found))

    def assertDeleted(self, repository, *args):
        """ Asserts that every given docker image ID is missing from the repository. """
        for docker_image_id in args:
            try:
                # Verify the image is missing when accessed by the repository.
                model.image.get_image_by_id(repository.namespace_user.username, repository.name,
                                            docker_image_id)
            except model.DataModelException:
                # This image is gone; keep checking the remaining IDs.
                continue

            self.fail('Expected image %s to be deleted' % docker_image_id)

    @staticmethod
    def _get_dangling_storage_count():
        """ Returns the number of storage rows referenced by neither an image nor a derived
            storage row.
        """
        storage_ids = {current.id for current in ImageStorage.select()}
        referenced_by_image = {image.storage_id for image in Image.select()}
        referenced_by_derived = {derived.derivative_id
                                 for derived in DerivedStorageForImage.select()}

        return len(storage_ids - referenced_by_image - referenced_by_derived)

    @staticmethod
    def _get_dangling_label_count():
        """ Returns the number of label rows not referenced by any tag manifest label. """
        label_ids = {current.id for current in Label.select()}
        referenced_by_manifest = {mlabel.label_id for mlabel in TagManifestLabel.select()}
        return len(label_ids - referenced_by_manifest)

    @contextmanager
    def assert_gc_integrity(self, expect_storage_removed=True):
        """ Specialized assertion for ensuring that GC cleans up all dangling storages
            and labels, invokes the callback for images removed and doesn't invoke the
            callback for images *not* removed.

            The checks after the `yield` only run if the wrapped block did not raise.
        """
        # TODO: Consider also asserting the number of DB queries being performed.

        # Add a callback for when images are removed.
        # NOTE(review): the callback is never unregistered here, so callbacks may accumulate
        # across tests - confirm whether model.config offers a way to remove them.
        removed_image_storages = []
        model.config.register_image_cleanup_callback(removed_image_storages.extend)

        # Store the number of dangling storages and labels.
        existing_storage_count = self._get_dangling_storage_count()
        existing_label_count = self._get_dangling_label_count()
        yield

        # Ensure the number of dangling storages and labels has not changed.
        updated_storage_count = self._get_dangling_storage_count()
        self.assertEqual(updated_storage_count, existing_storage_count)

        updated_label_count = self._get_dangling_label_count()
        self.assertEqual(updated_label_count, existing_label_count)

        # Ensure that for each call to the image+storage cleanup callback, the image and its
        # storage is not found *anywhere* in the database.
        for removed_image_and_storage in removed_image_storages:
            with self.assertRaises(Image.DoesNotExist):
                Image.get(id=removed_image_and_storage.id)

            with self.assertRaises(ImageStorage.DoesNotExist):
                ImageStorage.get(id=removed_image_and_storage.storage_id)

            with self.assertRaises(ImageStorage.DoesNotExist):
                ImageStorage.get(uuid=removed_image_and_storage.storage.uuid)

        self.assertEqual(expect_storage_removed, bool(removed_image_storages))

        # Ensure all CAS storage is in the storage engine.
        preferred = storage.preferred_locations[0]
        for storage_row in ImageStorage.select():
            if storage_row.cas_path:
                storage.get_content({preferred},
                                    storage.blob_path(storage_row.content_checksum))

    def test_has_garbage(self):
        """ Remove all existing repositories, then add one without garbage, check, then add
            one with garbage, and check again.
        """
        # Delete all existing repos.
        for repo in database.Repository.select().order_by(database.Repository.id):
            self.assertTrue(model.repository.purge_repository(repo.namespace_user.username,
                                                              repo.name))

        # Change the time machine expiration on the namespace.
        (database.User
         .update(removed_tag_expiration_s=1000000000)
         .where(database.User.username == ADMIN_ACCESS_USER)
         .execute())

        # Create a repository without any garbage.
        repository = self.createRepository(latest=['i1', 'i2', 'i3'])

        # Ensure that no repositories are returned by the has garbage check.
        self.assertIsNone(model.repository.find_repository_with_garbage(1000000000))

        # Delete a tag.
        self.deleteTag(repository, 'latest', perform_gc=False)

        # There should still not be any repositories with garbage, due to time machine.
        self.assertIsNone(model.repository.find_repository_with_garbage(1000000000))

        # Change the time machine expiration on the namespace.
        (database.User
         .update(removed_tag_expiration_s=0)
         .where(database.User.username == ADMIN_ACCESS_USER)
         .execute())

        # Now we should find the repository for GC.
        repository = model.repository.find_repository_with_garbage(0)
        self.assertIsNotNone(repository)
        self.assertEqual(REPO, repository.name)

        # GC the repository.
        self.assertTrue(model.repository.garbage_collect_repo(repository))

        # There should now be no repositories with garbage.
        self.assertIsNone(model.repository.find_repository_with_garbage(0))

    def test_find_garbage_policy_functions(self):
        """ A randomly chosen GC policy must be one of the known expiration policies. """
        with assert_query_count(1):
            one_policy = model.repository.get_random_gc_policy()
            all_policies = model.repository._get_gc_expiration_policies()
            self.assertIn(one_policy, all_policies)

    def test_one_tag(self):
        """ Create a repository with a single tag, then remove that tag and verify that the
            repository is now empty.
        """
        with self.assert_gc_integrity():
            repository = self.createRepository(latest=['i1', 'i2', 'i3'])
            self.deleteTag(repository, 'latest')
            self.assertDeleted(repository, 'i1', 'i2', 'i3')

    def test_two_tags_unshared_images(self):
        """ Repository has two tags with no shared images between them. """
        with self.assert_gc_integrity():
            repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['f1', 'f2'])
            self.deleteTag(repository, 'latest')
            self.assertDeleted(repository, 'i1', 'i2', 'i3')
            self.assertNotDeleted(repository, 'f1', 'f2')

    def test_two_tags_shared_images(self):
        """ Repository has two tags with shared images. Deleting the tag should only remove
            the unshared images.
        """
        with self.assert_gc_integrity():
            repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])
            self.deleteTag(repository, 'latest')
            self.assertDeleted(repository, 'i2', 'i3')
            self.assertNotDeleted(repository, 'i1', 'f1')

    def test_unrelated_repositories(self):
        """ Two repositories with different images. Removing the tag from one leaves the
            other's images intact.
        """
        with self.assert_gc_integrity():
            repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1')
            repository2 = self.createRepository(latest=['j1', 'j2', 'j3'], name='repo2')

            self.deleteTag(repository1, 'latest')

            self.assertDeleted(repository1, 'i1', 'i2', 'i3')
            self.assertNotDeleted(repository2, 'j1', 'j2', 'j3')

    def test_related_repositories(self):
        """ Two repositories with shared images. Removing the tag from one leaves the
            other's images intact.
        """
        with self.assert_gc_integrity():
            repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1')
            repository2 = self.createRepository(latest=['i1', 'i2', 'j1'], name='repo2')

            self.deleteTag(repository1, 'latest')

            self.assertDeleted(repository1, 'i3')
            self.assertNotDeleted(repository2, 'i1', 'i2', 'j1')

    def test_inaccessible_repositories(self):
        """ Two repositories under different namespaces should result in the images being
            deleted but not completely removed from the database.
        """
        with self.assert_gc_integrity():
            repository1 = self.createRepository(namespace=ADMIN_ACCESS_USER,
                                                latest=['i1', 'i2', 'i3'])
            repository2 = self.createRepository(namespace=PUBLIC_USER,
                                                latest=['i1', 'i2', 'i3'])

            self.deleteTag(repository1, 'latest')
            self.assertDeleted(repository1, 'i1', 'i2', 'i3')
            self.assertNotDeleted(repository2, 'i1', 'i2', 'i3')

    def test_multiple_shared_images(self):
        """ Repository has multiple tags with shared images. Selectively deleting the tags,
            and verifying at each step.
        """
        with self.assert_gc_integrity():
            repository = self.createRepository(latest=['i1', 'i2', 'i3'],
                                               other=['i1', 'f1', 'f2'],
                                               third=['t1', 't2', 't3'],
                                               fourth=['i1', 'f1'])

            # Delete tag other. Should delete f2, since it is not shared.
            self.deleteTag(repository, 'other')
            self.assertDeleted(repository, 'f2')
            self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3', 'f1')

            # Move tag fourth to i3. This should remove f1 since it is no longer referenced.
            self.moveTag(repository, 'fourth', 'i3')
            self.assertDeleted(repository, 'f1')
            self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3')

            # Delete tag 'latest'. This should do nothing since fourth is on the same branch.
            self.deleteTag(repository, 'latest')
            self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3')

            # Delete tag 'third'. This should remove t1->t3.
            self.deleteTag(repository, 'third')
            self.assertDeleted(repository, 't1', 't2', 't3')
            self.assertNotDeleted(repository, 'i1', 'i2', 'i3')

            # Add tag to i1.
            self.moveTag(repository, 'newtag', 'i1')
            self.assertNotDeleted(repository, 'i1', 'i2', 'i3')

            # Delete tag 'fourth'. This should remove i2 and i3.
            self.deleteTag(repository, 'fourth')
            self.assertDeleted(repository, 'i2', 'i3')
            self.assertNotDeleted(repository, 'i1')

            # Delete tag 'newtag'. This should remove the remaining image.
            self.deleteTag(repository, 'newtag')
            self.assertDeleted(repository, 'i1')

    def test_empty_gc(self):
        """ Running GC when no tag has been deleted should not remove any images. """
        with self.assert_gc_integrity(expect_storage_removed=False):
            repository = self.createRepository(latest=['i1', 'i2', 'i3'],
                                               other=['i1', 'f1', 'f2'],
                                               third=['t1', 't2', 't3'],
                                               fourth=['i1', 'f1'])

            self.gcNow(repository)
            self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3', 'f1', 'f2')

    def test_time_machine_no_gc(self):
        """ Repository has two tags with shared images. Deleting the tag should not remove
            any images.
        """
        with self.assert_gc_integrity(expect_storage_removed=False):
            repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])
            self._set_tag_expiration_policy(repository.namespace_user.username, 60 * 60 * 24)

            self.deleteTag(repository, 'latest')
            self.assertNotDeleted(repository, 'i2', 'i3')
            self.assertNotDeleted(repository, 'i1', 'f1')

    def test_time_machine_gc(self):
        """ Repository has two tags with shared images. Deleting the second tag should cause
            the images for the first deleted tag to gc.
        """
        with self.assert_gc_integrity():
            repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])

            self._set_tag_expiration_policy(repository.namespace_user.username, 1)

            self.deleteTag(repository, 'latest')
            self.assertNotDeleted(repository, 'i2', 'i3')
            self.assertNotDeleted(repository, 'i1', 'f1')

            # Wait for longer than the 1 second expiration set above.
            time.sleep(2)

            # This will cause the images associated with latest to gc.
            self.deleteTag(repository, 'other')
            self.assertDeleted(repository, 'i2', 'i3')
            self.assertNotDeleted(repository, 'i1', 'f1')

    def test_images_shared_storage(self):
        """ Repository with two tags, both with the same shared storage. Deleting the first
            tag should delete the first image, but *not* its storage.
        """
        with self.assert_gc_integrity(expect_storage_removed=False):
            repository = self.createRepository()

            # Add two tags, each with their own image, but with the same storage.
            image_storage = model.storage.create_v1_storage(storage.preferred_locations[0])

            first_image = Image.create(docker_image_id='i1',
                                       repository=repository, storage=image_storage,
                                       ancestors='/')

            second_image = Image.create(docker_image_id='i2',
                                        repository=repository, storage=image_storage,
                                        ancestors='/')

            model.tag.store_tag_manifest(repository.namespace_user.username, repository.name,
                                         'first', first_image.docker_image_id,
                                         'sha:someshahere', '{}')

            model.tag.store_tag_manifest(repository.namespace_user.username, repository.name,
                                         'second', second_image.docker_image_id,
                                         'sha:someshahere', '{}')

            # Delete the first tag.
            self.deleteTag(repository, 'first')
            self.assertDeleted(repository, 'i1')
            self.assertNotDeleted(repository, 'i2')

    def test_image_with_cas(self):
        """ A repository with a tag pointing to an image backed by CAS. Deleting and GCing
            the tag should result in the storage and its CAS data being removed.
        """
        with self.assert_gc_integrity(expect_storage_removed=True):
            repository = self.createRepository()

            # Create an image storage record under CAS.
            content = 'hello world'
            digest = 'sha256:' + hashlib.sha256(content).hexdigest()
            preferred = storage.preferred_locations[0]
            storage.put_content({preferred}, storage.blob_path(digest), content)

            image_storage = database.ImageStorage.create(content_checksum=digest,
                                                         uploading=False)
            location = database.ImageStorageLocation.get(name=preferred)
            database.ImageStoragePlacement.create(location=location, storage=image_storage)

            # Ensure the CAS path exists.
            self.assertTrue(storage.exists({preferred}, storage.blob_path(digest)))

            # Create the image and the tag.
            first_image = Image.create(docker_image_id='i1',
                                       repository=repository, storage=image_storage,
                                       ancestors='/')

            model.tag.store_tag_manifest(repository.namespace_user.username, repository.name,
                                         'first', first_image.docker_image_id,
                                         'sha:someshahere1', '{}')

            self.assertNotDeleted(repository, 'i1')

            # Delete the tag.
            self.deleteTag(repository, 'first')
            self.assertDeleted(repository, 'i1')

            # Ensure the CAS path is gone.
            self.assertFalse(storage.exists({preferred}, storage.blob_path(digest)))

    def test_images_shared_cas(self):
        """ A repository with two tags, each pointing to its own image, whose image storages
            are *distinct records* sharing the same *CAS path*. Deleting the first tag should
            delete the first image, and its storage, but not the file in storage, as it
            shares its CAS path.
        """
        with self.assert_gc_integrity(expect_storage_removed=True):
            repository = self.createRepository()

            # Create two image storage records with the same content checksum.
            content = 'hello world'
            digest = 'sha256:' + hashlib.sha256(content).hexdigest()
            preferred = storage.preferred_locations[0]
            storage.put_content({preferred}, storage.blob_path(digest), content)

            is1 = database.ImageStorage.create(content_checksum=digest, uploading=False)
            is2 = database.ImageStorage.create(content_checksum=digest, uploading=False)

            location = database.ImageStorageLocation.get(name=preferred)

            database.ImageStoragePlacement.create(location=location, storage=is1)
            database.ImageStoragePlacement.create(location=location, storage=is2)

            # Ensure the CAS path exists.
            self.assertTrue(storage.exists({preferred}, storage.blob_path(digest)))

            # Create two images in the repository, and two tags, each pointing to one of the
            # storages.
            first_image = Image.create(docker_image_id='i1',
                                       repository=repository, storage=is1,
                                       ancestors='/')

            second_image = Image.create(docker_image_id='i2',
                                        repository=repository, storage=is2,
                                        ancestors='/')

            model.tag.store_tag_manifest(repository.namespace_user.username, repository.name,
                                         'first', first_image.docker_image_id,
                                         'sha:someshahere1', '{}')

            model.tag.store_tag_manifest(repository.namespace_user.username, repository.name,
                                         'second', second_image.docker_image_id,
                                         'sha:someshahere2', '{}')

            self.assertNotDeleted(repository, 'i1', 'i2')

            # Delete the first tag.
            self.deleteTag(repository, 'first')
            self.assertDeleted(repository, 'i1')
            self.assertNotDeleted(repository, 'i2')

            # Ensure the CAS path still exists.
            self.assertTrue(storage.exists({preferred}, storage.blob_path(digest)))


if __name__ == '__main__':
    unittest.main()