diff --git a/data/model/__init__.py b/data/model/__init__.py
index 23a1a647c..331b2f0b3 100644
--- a/data/model/__init__.py
+++ b/data/model/__init__.py
@@ -107,6 +107,10 @@ class Config(object):
   def __init__(self):
     self.app_config = None
     self.store = None
+    self.image_cleanup_callbacks = []
+
+  def register_image_cleanup_callback(self, callback):
+    self.image_cleanup_callbacks.append(callback)
 
 
 config = Config()
diff --git a/data/model/repository.py b/data/model/repository.py
index 0281bc592..11bcafcf1 100644
--- a/data/model/repository.py
+++ b/data/model/repository.py
@@ -5,9 +5,9 @@ from datetime import timedelta, datetime
 from peewee import JOIN_LEFT_OUTER, fn, SQL, IntegrityError
 from cachetools import ttl_cache
 
-from data.model import (DataModelException, tag, db_transaction, storage, permission,
+from data.model import (config, DataModelException, tag, db_transaction, storage, permission,
                         _basequery)
-from data.database import (Repository, Namespace, RepositoryTag, Star, Image, User,
+from data.database import (Repository, Namespace, RepositoryTag, Star, Image, ImageStorage, User,
                            Visibility, RepositoryPermission, RepositoryActionCount,
                            Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage,
                            Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
@@ -173,29 +173,24 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
 
     referenced_candidates = (direct_referenced | ancestor_referenced)
 
-    # We desire two pieces of information from the database from the following
+    # We desire a few pieces of information from the database from the following
     # query: all of the image ids which are associated with this repository,
-    # and the storages which are associated with those images. In order to
-    # fetch just this information, and bypass all of the peewee model parsing
-    # code, which is overkill for just two fields, we use a tuple query, and
-    # feed that directly to the dictionary tuple constructor which takes an
-    # iterable of tuples containing [(k, v), (k, v), ...]
+    # and the storages which are associated with those images.
     unreferenced_candidates = (Image
-                               .select(Image.id, Image.storage)
+                               .select(Image.id, Image.docker_image_id,
+                                       ImageStorage.id, ImageStorage.uuid)
+                               .join(ImageStorage)
                                .where(Image.id << candidates_orphans,
-                                      ~(Image.id << referenced_candidates))
-                               .tuples())
+                                      ~(Image.id << referenced_candidates)))
 
-    unreferecend_images_to_storages = dict(unreferenced_candidates)
-    to_remove = unreferecend_images_to_storages.keys()
-
-    if len(to_remove) > 0:
-      logger.info('Cleaning up unreferenced images: %s', to_remove)
-      storage_id_whitelist = set(unreferecend_images_to_storages.values())
+    image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates]
+    if len(image_ids_to_remove) > 0:
+      logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove)
+      storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates])
 
       # Lookup any derived images for the images to remove.
       derived = DerivedStorageForImage.select().where(
-          DerivedStorageForImage.source_image << to_remove)
+          DerivedStorageForImage.source_image << image_ids_to_remove)
 
       has_derived = False
       for derived_image in derived:
@@ -207,21 +202,30 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
       try:
         (DerivedStorageForImage
          .delete()
-         .where(DerivedStorageForImage.source_image << to_remove)
+         .where(DerivedStorageForImage.source_image << image_ids_to_remove)
          .execute())
       except IntegrityError:
-        logger.info('Could not GC derived images %s; will try again soon', to_remove)
+        logger.info('Could not GC derived images %s; will try again soon', image_ids_to_remove)
         return False
 
       try:
-        Image.delete().where(Image.id << to_remove).execute()
+        Image.delete().where(Image.id << image_ids_to_remove).execute()
       except IntegrityError:
-        logger.info('Could not GC images %s; will try again soon', to_remove)
+        logger.info('Could not GC images %s; will try again soon', image_ids_to_remove)
         return False
 
-  if len(to_remove) > 0:
-    logger.info('Garbage collecting storage for images: %s', to_remove)
-    storage.garbage_collect_storage(storage_id_whitelist)
+  # If any images were removed, GC any orphaned storages.
+  if len(image_ids_to_remove) > 0:
+    logger.info('Garbage collecting storage for images: %s', image_ids_to_remove)
+    storage_ids_removed = set(storage.garbage_collect_storage(storage_id_whitelist))
+
+    # If any storages were removed and cleanup callbacks are registered, call them with
+    # the images+storages removed.
+    if storage_ids_removed and config.image_cleanup_callbacks:
+      image_storages_removed = [candidate for candidate in unreferenced_candidates
+                                if candidate.storage_id in storage_ids_removed]
+      for callback in config.image_cleanup_callbacks:
+        callback(image_storages_removed)
 
   return True
diff --git a/data/model/storage.py b/data/model/storage.py
index 0c7bf80ba..61153714e 100644
--- a/data/model/storage.py
+++ b/data/model/storage.py
@@ -72,8 +72,12 @@ def _orphaned_storage_query(candidate_ids):
 
 
 def garbage_collect_storage(storage_id_whitelist):
+  """ Performs GC on a possible subset of the storages with the IDs found in the
+      whitelist. The storages in the whitelist will be checked, and any orphaned ones
+      will be removed, with their IDs being returned.
+  """
   if len(storage_id_whitelist) == 0:
-    return
+    return []
 
   def placements_query_to_paths_set(placements_query):
     return {(get_image_location_for_id(placement.location_id).name,
@@ -89,7 +93,7 @@ def garbage_collect_storage(storage_id_whitelist):
     orphaned_storage_ids = _orphaned_storage_query(storage_id_whitelist)
     if len(orphaned_storage_ids) == 0:
       # Nothing to GC.
-      return
+      return []
 
     placements_to_remove = list(ImageStoragePlacement
                                 .select()
@@ -133,6 +137,8 @@ def garbage_collect_storage(storage_id_whitelist):
       logger.debug('Removing %s from %s', image_path, location_name)
       config.store.remove({location_name}, image_path)
 
+  return orphaned_storage_ids
+
 
 def create_v1_storage(location_name):
   storage = ImageStorage.create(cas_path=False)
diff --git a/test/test_gc.py b/test/test_gc.py
index c366a5a83..5155d354f 100644
--- a/test/test_gc.py
+++ b/test/test_gc.py
@@ -146,20 +146,43 @@ class TestGarbageCollection(unittest.TestCase):
     return len(label_ids - referenced_by_manifest)
 
   @contextmanager
-  def assert_no_new_dangling_storages_or_labels(self):
+  def assert_gc_integrity(self, expect_storage_removed=True):
     """ Specialized assertion for ensuring that GC cleans up all dangling storages
-        and labels.
+        and labels, invokes the callback for images removed and doesn't invoke the
+        callback for images *not* removed.
     """
     # TODO: Consider also asserting the number of DB queries being performed.
+
+    # Add a callback for when images are removed.
+    removed_image_storages = []
+    model.config.register_image_cleanup_callback(removed_image_storages.extend)
+
+    # Store the number of dangling storages and labels.
     existing_storage_count = self._get_dangling_storage_count()
     existing_label_count = self._get_dangling_label_count()
 
     yield
+
+    # Ensure the number of dangling storages and labels has not changed.
     updated_storage_count = self._get_dangling_storage_count()
     self.assertEqual(updated_storage_count, existing_storage_count)
 
     updated_label_count = self._get_dangling_label_count()
     self.assertEqual(updated_label_count, existing_label_count)
 
+    # Ensure that for each call to the image+storage cleanup callback, the image and its
+    # storage is not found *anywhere* in the database.
+    for removed_image_and_storage in removed_image_storages:
+      with self.assertRaises(Image.DoesNotExist):
+        Image.get(id=removed_image_and_storage.id)
+
+      with self.assertRaises(ImageStorage.DoesNotExist):
+        ImageStorage.get(id=removed_image_and_storage.storage_id)
+
+      with self.assertRaises(ImageStorage.DoesNotExist):
+        ImageStorage.get(uuid=removed_image_and_storage.storage.uuid)
+
+    self.assertEquals(expect_storage_removed, bool(removed_image_storages))
+
   def test_has_garbage(self):
     """ Remove all existing repositories, then add one without garbage, check, then add one
         with garbage, and check again.
@@ -212,14 +235,14 @@ class TestGarbageCollection(unittest.TestCase):
   def test_one_tag(self):
     """ Create a repository with a single tag, then remove that tag and verify that the
         repository is now empty. """
-    with self.assert_no_new_dangling_storages_or_labels():
+    with self.assert_gc_integrity():
       repository = self.createRepository(latest=['i1', 'i2', 'i3'])
       self.deleteTag(repository, 'latest')
       self.assertDeleted(repository, 'i1', 'i2', 'i3')
 
   def test_two_tags_unshared_images(self):
     """ Repository has two tags with no shared images between them. """
-    with self.assert_no_new_dangling_storages_or_labels():
+    with self.assert_gc_integrity():
       repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['f1', 'f2'])
       self.deleteTag(repository, 'latest')
       self.assertDeleted(repository, 'i1', 'i2', 'i3')
@@ -229,7 +252,7 @@ class TestGarbageCollection(unittest.TestCase):
     """ Repository has two tags with shared images. Deleting the tag should only remove the
         unshared images.
     """
-    with self.assert_no_new_dangling_storages_or_labels():
+    with self.assert_gc_integrity():
       repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])
       self.deleteTag(repository, 'latest')
       self.assertDeleted(repository, 'i2', 'i3')
@@ -239,7 +262,7 @@ class TestGarbageCollection(unittest.TestCase):
     """ Two repositories with different images. Removing the tag from one leaves the other's
         images intact.
     """
-    with self.assert_no_new_dangling_storages_or_labels():
+    with self.assert_gc_integrity():
       repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1')
       repository2 = self.createRepository(latest=['j1', 'j2', 'j3'], name='repo2')
 
@@ -252,7 +275,7 @@ class TestGarbageCollection(unittest.TestCase):
     """ Two repositories with shared images. Removing the tag from one leaves the other's
         images intact.
""" - with self.assert_no_new_dangling_storages_or_labels(): + with self.assert_gc_integrity(): repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1') repository2 = self.createRepository(latest=['i1', 'i2', 'j1'], name='repo2') @@ -265,7 +288,7 @@ class TestGarbageCollection(unittest.TestCase): """ Two repositories under different namespaces should result in the images being deleted but not completely removed from the database. """ - with self.assert_no_new_dangling_storages_or_labels(): + with self.assert_gc_integrity(): repository1 = self.createRepository(namespace=ADMIN_ACCESS_USER, latest=['i1', 'i2', 'i3']) repository2 = self.createRepository(namespace=PUBLIC_USER, latest=['i1', 'i2', 'i3']) @@ -277,7 +300,7 @@ class TestGarbageCollection(unittest.TestCase): """ Repository has multiple tags with shared images. Selectively deleting the tags, and verifying at each step. """ - with self.assert_no_new_dangling_storages_or_labels(): + with self.assert_gc_integrity(): repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], third=['t1', 't2', 't3'], fourth=['i1', 'f1']) @@ -314,7 +337,7 @@ class TestGarbageCollection(unittest.TestCase): self.assertDeleted(repository, 'i1') def test_empty_gc(self): - with self.assert_no_new_dangling_storages_or_labels(): + with self.assert_gc_integrity(expect_storage_removed=False): repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], third=['t1', 't2', 't3'], fourth=['i1', 'f1']) @@ -324,7 +347,7 @@ class TestGarbageCollection(unittest.TestCase): def test_time_machine_no_gc(self): """ Repository has two tags with shared images. Deleting the tag should not remove any images """ - with self.assert_no_new_dangling_storages_or_labels(): + with self.assert_gc_integrity(expect_storage_removed=False): repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) self._set_tag_expiration_policy(repository.namespace_user.username, 60*60*24) @@ -336,7 +359,7 @@ class TestGarbageCollection(unittest.TestCase): """ Repository has two tags with shared images. Deleting the second tag should cause the images for the first deleted tag to gc. """ - with self.assert_no_new_dangling_storages_or_labels(): + with self.assert_gc_integrity(): repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) self._set_tag_expiration_policy(repository.namespace_user.username, 1) @@ -351,6 +374,37 @@ class TestGarbageCollection(unittest.TestCase): self.assertDeleted(repository, 'i2', 'i3') self.assertNotDeleted(repository, 'i1', 'f1') + def test_images_shared_storage(self): + """ Repository with two tags, both with the same shared storage. Deleting the first + tag should delete the first image, but *not* its storage. + """ + with self.assert_gc_integrity(expect_storage_removed=False): + repository = self.createRepository() + + # Add two tags, each with their own image, but with the same storage. 
+      image_storage = model.storage.create_v1_storage(storage.preferred_locations[0])
+
+      first_image = Image.create(docker_image_id='i1',
+                                 repository=repository, storage=image_storage,
+                                 ancestors='/')
+
+      second_image = Image.create(docker_image_id='i2',
+                                  repository=repository, storage=image_storage,
+                                  ancestors='/')
+
+      model.tag.store_tag_manifest(repository.namespace_user.username, repository.name,
+                                   'first', first_image.docker_image_id,
+                                   'sha:someshahere', '{}')
+
+      model.tag.store_tag_manifest(repository.namespace_user.username, repository.name,
+                                   'second', second_image.docker_image_id,
+                                   'sha:someshahere', '{}')
+
+      # Delete the first tag.
+      self.deleteTag(repository, 'first')
+      self.assertDeleted(repository, 'i1')
+      self.assertNotDeleted(repository, 'i2')
+
 
 if __name__ == '__main__':
   unittest.main()
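
Reviewer note: the sketch below is not part of the patch; it only illustrates how a consumer might use the new cleanup hook. The registration call and the callback payload (the Image rows, joined with ImageStorage, that GC fully removed) come from the change above; the log_removed_images helper, the logging setup, and the import style are assumptions for illustration.

    import logging

    from data import model

    logger = logging.getLogger(__name__)

    def log_removed_images(image_storages_removed):
      # Each entry is an Image row joined with its ImageStorage (as selected in
      # garbage_collect_repo), so docker_image_id and storage.uuid are available.
      for image in image_storages_removed:
        logger.debug('GC removed image %s (storage uuid %s)',
                     image.docker_image_id, image.storage.uuid)

    # Register once at startup; garbage_collect_repo invokes every registered
    # callback after images and their orphaned storages have been deleted.
    model.config.register_image_cleanup_callback(log_removed_images)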