Garbage collection image+storage callback support

Add support for GC to invoke a callback with the images and storages that were removed. Only images whose storage was also removed will be sent to the callback. This will be used by security scanning for its own GC in a follow-up change.
This commit is contained in:
Joseph Schorr 2016-12-22 14:27:42 -05:00
parent 35244d839d
commit 5225642850
4 changed files with 107 additions and 39 deletions

View file

@ -5,9 +5,9 @@ from datetime import timedelta, datetime
from peewee import JOIN_LEFT_OUTER, fn, SQL, IntegrityError
from cachetools import ttl_cache
from data.model import (DataModelException, tag, db_transaction, storage, permission,
from data.model import (config, DataModelException, tag, db_transaction, storage, permission,
_basequery)
from data.database import (Repository, Namespace, RepositoryTag, Star, Image, User,
from data.database import (Repository, Namespace, RepositoryTag, Star, Image, ImageStorage, User,
Visibility, RepositoryPermission, RepositoryActionCount,
Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage,
Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
@ -173,29 +173,24 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
referenced_candidates = (direct_referenced | ancestor_referenced)
# We desire two pieces of information from the database from the following
# We desire a few pieces of information from the database from the following
# query: all of the image ids which are associated with this repository,
# and the storages which are associated with those images. In order to
# fetch just this information, and bypass all of the peewee model parsing
# code, which is overkill for just two fields, we use a tuple query, and
# feed that directly to the dictionary tuple constructor which takes an
# iterable of tuples containing [(k, v), (k, v), ...]
# and the storages which are associated with those images.
unreferenced_candidates = (Image
.select(Image.id, Image.storage)
.select(Image.id, Image.docker_image_id,
ImageStorage.id, ImageStorage.uuid)
.join(ImageStorage)
.where(Image.id << candidates_orphans,
~(Image.id << referenced_candidates))
.tuples())
~(Image.id << referenced_candidates)))
unreferecend_images_to_storages = dict(unreferenced_candidates)
to_remove = unreferecend_images_to_storages.keys()
if len(to_remove) > 0:
logger.info('Cleaning up unreferenced images: %s', to_remove)
storage_id_whitelist = set(unreferecend_images_to_storages.values())
image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates]
if len(image_ids_to_remove) > 0:
logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove)
storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates])
# Lookup any derived images for the images to remove.
derived = DerivedStorageForImage.select().where(
DerivedStorageForImage.source_image << to_remove)
DerivedStorageForImage.source_image << image_ids_to_remove)
has_derived = False
for derived_image in derived:
@ -207,21 +202,30 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
try:
(DerivedStorageForImage
.delete()
.where(DerivedStorageForImage.source_image << to_remove)
.where(DerivedStorageForImage.source_image << image_ids_to_remove)
.execute())
except IntegrityError:
logger.info('Could not GC derived images %s; will try again soon', to_remove)
logger.info('Could not GC derived images %s; will try again soon', image_ids_to_remove)
return False
try:
Image.delete().where(Image.id << to_remove).execute()
Image.delete().where(Image.id << image_ids_to_remove).execute()
except IntegrityError:
logger.info('Could not GC images %s; will try again soon', to_remove)
logger.info('Could not GC images %s; will try again soon', image_ids_to_remove)
return False
if len(to_remove) > 0:
logger.info('Garbage collecting storage for images: %s', to_remove)
storage.garbage_collect_storage(storage_id_whitelist)
# If any images were removed, GC any orphaned storages.
if len(image_ids_to_remove) > 0:
logger.info('Garbage collecting storage for images: %s', image_ids_to_remove)
storage_ids_removed = set(storage.garbage_collect_storage(storage_id_whitelist))
# If any storages were removed and cleanup callbacks are registered, call them with
# the images+storages removed.
if storage_ids_removed and config.image_cleanup_callbacks:
image_storages_removed = [candidate for candidate in unreferenced_candidates
if candidate.storage_id in storage_ids_removed]
for callback in config.image_cleanup_callbacks:
callback(image_storages_removed)
return True