Garbage collection image+storage callback support

Add support to GC to invoke a callback with the image+storages removed. Only images whose storage was also removed will be sent to the callback. This will be used by security scanning for its own GC in the followup change.
This commit is contained in:
Joseph Schorr 2016-12-22 14:27:42 -05:00
parent 35244d839d
commit 5225642850
4 changed files with 107 additions and 39 deletions

View file

@ -107,6 +107,10 @@ class Config(object):
def __init__(self): def __init__(self):
self.app_config = None self.app_config = None
self.store = None self.store = None
self.image_cleanup_callbacks = []
def register_image_cleanup_callback(self, callback):
self.image_cleanup_callbacks.append(callback)
config = Config() config = Config()

View file

@ -5,9 +5,9 @@ from datetime import timedelta, datetime
from peewee import JOIN_LEFT_OUTER, fn, SQL, IntegrityError from peewee import JOIN_LEFT_OUTER, fn, SQL, IntegrityError
from cachetools import ttl_cache from cachetools import ttl_cache
from data.model import (DataModelException, tag, db_transaction, storage, permission, from data.model import (config, DataModelException, tag, db_transaction, storage, permission,
_basequery) _basequery)
from data.database import (Repository, Namespace, RepositoryTag, Star, Image, User, from data.database import (Repository, Namespace, RepositoryTag, Star, Image, ImageStorage, User,
Visibility, RepositoryPermission, RepositoryActionCount, Visibility, RepositoryPermission, RepositoryActionCount,
Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage, Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage,
Label, TagManifestLabel, db_for_update, get_epoch_timestamp, Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
@ -173,29 +173,24 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
referenced_candidates = (direct_referenced | ancestor_referenced) referenced_candidates = (direct_referenced | ancestor_referenced)
# We desire two pieces of information from the database from the following # We desire a few pieces of information from the database from the following
# query: all of the image ids which are associated with this repository, # query: all of the image ids which are associated with this repository,
# and the storages which are associated with those images. In order to # and the storages which are associated with those images.
# fetch just this information, and bypass all of the peewee model parsing
# code, which is overkill for just two fields, we use a tuple query, and
# feed that directly to the dictionary tuple constructor which takes an
# iterable of tuples containing [(k, v), (k, v), ...]
unreferenced_candidates = (Image unreferenced_candidates = (Image
.select(Image.id, Image.storage) .select(Image.id, Image.docker_image_id,
ImageStorage.id, ImageStorage.uuid)
.join(ImageStorage)
.where(Image.id << candidates_orphans, .where(Image.id << candidates_orphans,
~(Image.id << referenced_candidates)) ~(Image.id << referenced_candidates)))
.tuples())
unreferecend_images_to_storages = dict(unreferenced_candidates) image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates]
to_remove = unreferecend_images_to_storages.keys() if len(image_ids_to_remove) > 0:
logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove)
if len(to_remove) > 0: storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates])
logger.info('Cleaning up unreferenced images: %s', to_remove)
storage_id_whitelist = set(unreferecend_images_to_storages.values())
# Lookup any derived images for the images to remove. # Lookup any derived images for the images to remove.
derived = DerivedStorageForImage.select().where( derived = DerivedStorageForImage.select().where(
DerivedStorageForImage.source_image << to_remove) DerivedStorageForImage.source_image << image_ids_to_remove)
has_derived = False has_derived = False
for derived_image in derived: for derived_image in derived:
@ -207,21 +202,30 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
try: try:
(DerivedStorageForImage (DerivedStorageForImage
.delete() .delete()
.where(DerivedStorageForImage.source_image << to_remove) .where(DerivedStorageForImage.source_image << image_ids_to_remove)
.execute()) .execute())
except IntegrityError: except IntegrityError:
logger.info('Could not GC derived images %s; will try again soon', to_remove) logger.info('Could not GC derived images %s; will try again soon', image_ids_to_remove)
return False return False
try: try:
Image.delete().where(Image.id << to_remove).execute() Image.delete().where(Image.id << image_ids_to_remove).execute()
except IntegrityError: except IntegrityError:
logger.info('Could not GC images %s; will try again soon', to_remove) logger.info('Could not GC images %s; will try again soon', image_ids_to_remove)
return False return False
if len(to_remove) > 0: # If any images were removed, GC any orphaned storages.
logger.info('Garbage collecting storage for images: %s', to_remove) if len(image_ids_to_remove) > 0:
storage.garbage_collect_storage(storage_id_whitelist) logger.info('Garbage collecting storage for images: %s', image_ids_to_remove)
storage_ids_removed = set(storage.garbage_collect_storage(storage_id_whitelist))
# If any storages were removed and cleanup callbacks are registered, call them with
# the images+storages removed.
if storage_ids_removed and config.image_cleanup_callbacks:
image_storages_removed = [candidate for candidate in unreferenced_candidates
if candidate.storage_id in storage_ids_removed]
for callback in config.image_cleanup_callbacks:
callback(image_storages_removed)
return True return True

View file

@ -72,8 +72,12 @@ def _orphaned_storage_query(candidate_ids):
def garbage_collect_storage(storage_id_whitelist): def garbage_collect_storage(storage_id_whitelist):
""" Performs GC on a possible subset of the storage's with the IDs found in the
whitelist. The storages in the whitelist will be checked, and any orphaned will
be removed, with those IDs being returned.
"""
if len(storage_id_whitelist) == 0: if len(storage_id_whitelist) == 0:
return return []
def placements_query_to_paths_set(placements_query): def placements_query_to_paths_set(placements_query):
return {(get_image_location_for_id(placement.location_id).name, return {(get_image_location_for_id(placement.location_id).name,
@ -89,7 +93,7 @@ def garbage_collect_storage(storage_id_whitelist):
orphaned_storage_ids = _orphaned_storage_query(storage_id_whitelist) orphaned_storage_ids = _orphaned_storage_query(storage_id_whitelist)
if len(orphaned_storage_ids) == 0: if len(orphaned_storage_ids) == 0:
# Nothing to GC. # Nothing to GC.
return return []
placements_to_remove = list(ImageStoragePlacement placements_to_remove = list(ImageStoragePlacement
.select() .select()
@ -133,6 +137,8 @@ def garbage_collect_storage(storage_id_whitelist):
logger.debug('Removing %s from %s', image_path, location_name) logger.debug('Removing %s from %s', image_path, location_name)
config.store.remove({location_name}, image_path) config.store.remove({location_name}, image_path)
return orphaned_storage_ids
def create_v1_storage(location_name): def create_v1_storage(location_name):
storage = ImageStorage.create(cas_path=False) storage = ImageStorage.create(cas_path=False)

View file

@ -146,20 +146,43 @@ class TestGarbageCollection(unittest.TestCase):
return len(label_ids - referenced_by_manifest) return len(label_ids - referenced_by_manifest)
@contextmanager @contextmanager
def assert_no_new_dangling_storages_or_labels(self): def assert_gc_integrity(self, expect_storage_removed=True):
""" Specialized assertion for ensuring that GC cleans up all dangling storages """ Specialized assertion for ensuring that GC cleans up all dangling storages
and labels. and labels, invokes the callback for images removed and doesn't invoke the
callback for images *not* removed.
""" """
# TODO: Consider also asserting the number of DB queries being performed. # TODO: Consider also asserting the number of DB queries being performed.
# Add a callback for when images are removed.
removed_image_storages = []
model.config.register_image_cleanup_callback(removed_image_storages.extend)
# Store the number of dangling storages and labels.
existing_storage_count = self._get_dangling_storage_count() existing_storage_count = self._get_dangling_storage_count()
existing_label_count = self._get_dangling_label_count() existing_label_count = self._get_dangling_label_count()
yield yield
# Ensure the number of dangling storages and labels has not changed.
updated_storage_count = self._get_dangling_storage_count() updated_storage_count = self._get_dangling_storage_count()
self.assertEqual(updated_storage_count, existing_storage_count) self.assertEqual(updated_storage_count, existing_storage_count)
updated_label_count = self._get_dangling_label_count() updated_label_count = self._get_dangling_label_count()
self.assertEqual(updated_label_count, existing_label_count) self.assertEqual(updated_label_count, existing_label_count)
# Ensure that for each call to the image+storage cleanup callback, the image and its
# storage is not found *anywhere* in the database.
for removed_image_and_storage in removed_image_storages:
with self.assertRaises(Image.DoesNotExist):
Image.get(id=removed_image_and_storage.id)
with self.assertRaises(ImageStorage.DoesNotExist):
ImageStorage.get(id=removed_image_and_storage.storage_id)
with self.assertRaises(ImageStorage.DoesNotExist):
ImageStorage.get(uuid=removed_image_and_storage.storage.uuid)
self.assertEquals(expect_storage_removed, bool(removed_image_storages))
def test_has_garbage(self): def test_has_garbage(self):
""" Remove all existing repositories, then add one without garbage, check, then add one with """ Remove all existing repositories, then add one without garbage, check, then add one with
garbage, and check again. garbage, and check again.
@ -212,14 +235,14 @@ class TestGarbageCollection(unittest.TestCase):
def test_one_tag(self): def test_one_tag(self):
""" Create a repository with a single tag, then remove that tag and verify that the repository """ Create a repository with a single tag, then remove that tag and verify that the repository
is now empty. """ is now empty. """
with self.assert_no_new_dangling_storages_or_labels(): with self.assert_gc_integrity():
repository = self.createRepository(latest=['i1', 'i2', 'i3']) repository = self.createRepository(latest=['i1', 'i2', 'i3'])
self.deleteTag(repository, 'latest') self.deleteTag(repository, 'latest')
self.assertDeleted(repository, 'i1', 'i2', 'i3') self.assertDeleted(repository, 'i1', 'i2', 'i3')
def test_two_tags_unshared_images(self): def test_two_tags_unshared_images(self):
""" Repository has two tags with no shared images between them. """ """ Repository has two tags with no shared images between them. """
with self.assert_no_new_dangling_storages_or_labels(): with self.assert_gc_integrity():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['f1', 'f2']) repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['f1', 'f2'])
self.deleteTag(repository, 'latest') self.deleteTag(repository, 'latest')
self.assertDeleted(repository, 'i1', 'i2', 'i3') self.assertDeleted(repository, 'i1', 'i2', 'i3')
@ -229,7 +252,7 @@ class TestGarbageCollection(unittest.TestCase):
""" Repository has two tags with shared images. Deleting the tag should only remove the """ Repository has two tags with shared images. Deleting the tag should only remove the
unshared images. unshared images.
""" """
with self.assert_no_new_dangling_storages_or_labels(): with self.assert_gc_integrity():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])
self.deleteTag(repository, 'latest') self.deleteTag(repository, 'latest')
self.assertDeleted(repository, 'i2', 'i3') self.assertDeleted(repository, 'i2', 'i3')
@ -239,7 +262,7 @@ class TestGarbageCollection(unittest.TestCase):
""" Two repositories with different images. Removing the tag from one leaves the other's """ Two repositories with different images. Removing the tag from one leaves the other's
images intact. images intact.
""" """
with self.assert_no_new_dangling_storages_or_labels(): with self.assert_gc_integrity():
repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1') repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1')
repository2 = self.createRepository(latest=['j1', 'j2', 'j3'], name='repo2') repository2 = self.createRepository(latest=['j1', 'j2', 'j3'], name='repo2')
@ -252,7 +275,7 @@ class TestGarbageCollection(unittest.TestCase):
""" Two repositories with shared images. Removing the tag from one leaves the other's """ Two repositories with shared images. Removing the tag from one leaves the other's
images intact. images intact.
""" """
with self.assert_no_new_dangling_storages_or_labels(): with self.assert_gc_integrity():
repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1') repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1')
repository2 = self.createRepository(latest=['i1', 'i2', 'j1'], name='repo2') repository2 = self.createRepository(latest=['i1', 'i2', 'j1'], name='repo2')
@ -265,7 +288,7 @@ class TestGarbageCollection(unittest.TestCase):
""" Two repositories under different namespaces should result in the images being deleted """ Two repositories under different namespaces should result in the images being deleted
but not completely removed from the database. but not completely removed from the database.
""" """
with self.assert_no_new_dangling_storages_or_labels(): with self.assert_gc_integrity():
repository1 = self.createRepository(namespace=ADMIN_ACCESS_USER, latest=['i1', 'i2', 'i3']) repository1 = self.createRepository(namespace=ADMIN_ACCESS_USER, latest=['i1', 'i2', 'i3'])
repository2 = self.createRepository(namespace=PUBLIC_USER, latest=['i1', 'i2', 'i3']) repository2 = self.createRepository(namespace=PUBLIC_USER, latest=['i1', 'i2', 'i3'])
@ -277,7 +300,7 @@ class TestGarbageCollection(unittest.TestCase):
""" Repository has multiple tags with shared images. Selectively deleting the tags, and """ Repository has multiple tags with shared images. Selectively deleting the tags, and
verifying at each step. verifying at each step.
""" """
with self.assert_no_new_dangling_storages_or_labels(): with self.assert_gc_integrity():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'],
third=['t1', 't2', 't3'], fourth=['i1', 'f1']) third=['t1', 't2', 't3'], fourth=['i1', 'f1'])
@ -314,7 +337,7 @@ class TestGarbageCollection(unittest.TestCase):
self.assertDeleted(repository, 'i1') self.assertDeleted(repository, 'i1')
def test_empty_gc(self): def test_empty_gc(self):
with self.assert_no_new_dangling_storages_or_labels(): with self.assert_gc_integrity(expect_storage_removed=False):
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'],
third=['t1', 't2', 't3'], fourth=['i1', 'f1']) third=['t1', 't2', 't3'], fourth=['i1', 'f1'])
@ -324,7 +347,7 @@ class TestGarbageCollection(unittest.TestCase):
def test_time_machine_no_gc(self): def test_time_machine_no_gc(self):
""" Repository has two tags with shared images. Deleting the tag should not remove any images """ Repository has two tags with shared images. Deleting the tag should not remove any images
""" """
with self.assert_no_new_dangling_storages_or_labels(): with self.assert_gc_integrity(expect_storage_removed=False):
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])
self._set_tag_expiration_policy(repository.namespace_user.username, 60*60*24) self._set_tag_expiration_policy(repository.namespace_user.username, 60*60*24)
@ -336,7 +359,7 @@ class TestGarbageCollection(unittest.TestCase):
""" Repository has two tags with shared images. Deleting the second tag should cause the images """ Repository has two tags with shared images. Deleting the second tag should cause the images
for the first deleted tag to gc. for the first deleted tag to gc.
""" """
with self.assert_no_new_dangling_storages_or_labels(): with self.assert_gc_integrity():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])
self._set_tag_expiration_policy(repository.namespace_user.username, 1) self._set_tag_expiration_policy(repository.namespace_user.username, 1)
@ -351,6 +374,37 @@ class TestGarbageCollection(unittest.TestCase):
self.assertDeleted(repository, 'i2', 'i3') self.assertDeleted(repository, 'i2', 'i3')
self.assertNotDeleted(repository, 'i1', 'f1') self.assertNotDeleted(repository, 'i1', 'f1')
def test_images_shared_storage(self):
""" Repository with two tags, both with the same shared storage. Deleting the first
tag should delete the first image, but *not* its storage.
"""
with self.assert_gc_integrity(expect_storage_removed=False):
repository = self.createRepository()
# Add two tags, each with their own image, but with the same storage.
image_storage = model.storage.create_v1_storage(storage.preferred_locations[0])
first_image = Image.create(docker_image_id='i1',
repository=repository, storage=image_storage,
ancestors='/')
second_image = Image.create(docker_image_id='i2',
repository=repository, storage=image_storage,
ancestors='/')
model.tag.store_tag_manifest(repository.namespace_user.username, repository.name,
'first', first_image.docker_image_id,
'sha:someshahere', '{}')
model.tag.store_tag_manifest(repository.namespace_user.username, repository.name,
'second', second_image.docker_image_id,
'sha:someshahere', '{}')
# Delete the first tag.
self.deleteTag(repository, 'first')
self.assertDeleted(repository, 'i1')
self.assertNotDeleted(repository, 'i2')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()