Merge pull request #2257 from coreos-inc/clair-gc-take2

feat(gc): Garbage collection for security scanning
Authored by josephschorr on 2017-01-17 14:49:36 -05:00, committed by GitHub
commit aafcb592a6
8 changed files with 194 additions and 62 deletions


@@ -107,6 +107,10 @@ class Config(object):
   def __init__(self):
     self.app_config = None
     self.store = None
+    self.image_cleanup_callbacks = []
+
+  def register_image_cleanup_callback(self, callback):
+    self.image_cleanup_callbacks.append(callback)


 config = Config()
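
For illustration, a minimal sketch (not part of this commit) of how a consumer might use the new registry. Only config and register_image_cleanup_callback come from the change above; the callback name and the printed fields are assumptions:

from data.model import config

def log_removed_images(images_removed):
  # Hypothetical callback: receives the Image rows (with their joined
  # ImageStorage) whose storages were garbage collected; see the
  # garbage_collect_repo changes below for the exact call site.
  for image in images_removed:
    print('Removed image %s (storage %s)' % (image.docker_image_id, image.storage.uuid))

config.register_image_cleanup_callback(log_removed_images)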


@@ -5,9 +5,9 @@ from datetime import timedelta, datetime
 from peewee import JOIN_LEFT_OUTER, fn, SQL, IntegrityError
 from cachetools import ttl_cache

-from data.model import (DataModelException, tag, db_transaction, storage, permission,
+from data.model import (config, DataModelException, tag, db_transaction, storage, permission,
                         _basequery)
-from data.database import (Repository, Namespace, RepositoryTag, Star, Image, User,
+from data.database import (Repository, Namespace, RepositoryTag, Star, Image, ImageStorage, User,
                            Visibility, RepositoryPermission, RepositoryActionCount,
                            Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage,
                            Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
@@ -173,29 +173,24 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
   referenced_candidates = (direct_referenced | ancestor_referenced)

-  # We desire two pieces of information from the database from the following
+  # We desire a few pieces of information from the database from the following
   # query: all of the image ids which are associated with this repository,
-  # and the storages which are associated with those images. In order to
-  # fetch just this information, and bypass all of the peewee model parsing
-  # code, which is overkill for just two fields, we use a tuple query, and
-  # feed that directly to the dictionary tuple constructor which takes an
-  # iterable of tuples containing [(k, v), (k, v), ...]
+  # and the storages which are associated with those images.
   unreferenced_candidates = (Image
-                             .select(Image.id, Image.storage)
+                             .select(Image.id, Image.docker_image_id,
+                                     ImageStorage.id, ImageStorage.uuid)
+                             .join(ImageStorage)
                              .where(Image.id << candidates_orphans,
-                                    ~(Image.id << referenced_candidates))
-                             .tuples())
+                                    ~(Image.id << referenced_candidates)))

-  unreferecend_images_to_storages = dict(unreferenced_candidates)
-  to_remove = unreferecend_images_to_storages.keys()
-  if len(to_remove) > 0:
-    logger.info('Cleaning up unreferenced images: %s', to_remove)
-    storage_id_whitelist = set(unreferecend_images_to_storages.values())
+  image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates]
+  if len(image_ids_to_remove) > 0:
+    logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove)
+    storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates])

     # Lookup any derived images for the images to remove.
     derived = DerivedStorageForImage.select().where(
-      DerivedStorageForImage.source_image << to_remove)
+      DerivedStorageForImage.source_image << image_ids_to_remove)

     has_derived = False
     for derived_image in derived:
@@ -207,21 +202,30 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
       try:
         (DerivedStorageForImage
          .delete()
-         .where(DerivedStorageForImage.source_image << to_remove)
+         .where(DerivedStorageForImage.source_image << image_ids_to_remove)
          .execute())
       except IntegrityError:
-        logger.info('Could not GC derived images %s; will try again soon', to_remove)
+        logger.info('Could not GC derived images %s; will try again soon', image_ids_to_remove)
         return False

     try:
-      Image.delete().where(Image.id << to_remove).execute()
+      Image.delete().where(Image.id << image_ids_to_remove).execute()
     except IntegrityError:
-      logger.info('Could not GC images %s; will try again soon', to_remove)
+      logger.info('Could not GC images %s; will try again soon', image_ids_to_remove)
       return False

-  if len(to_remove) > 0:
-    logger.info('Garbage collecting storage for images: %s', to_remove)
-    storage.garbage_collect_storage(storage_id_whitelist)
+  # If any images were removed, GC any orphaned storages.
+  if len(image_ids_to_remove) > 0:
+    logger.info('Garbage collecting storage for images: %s', image_ids_to_remove)
+    storage_ids_removed = set(storage.garbage_collect_storage(storage_id_whitelist))
+
+    # If any storages were removed and cleanup callbacks are registered, call them with
+    # the images+storages removed.
+    if storage_ids_removed and config.image_cleanup_callbacks:
+      image_storages_removed = [candidate for candidate in unreferenced_candidates
+                                if candidate.storage_id in storage_ids_removed]
+      for callback in config.image_cleanup_callbacks:
+        callback(image_storages_removed)

   return True
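
The callbacks only receive candidates whose storage was actually deleted, which is what a security-scanning consumer needs in order to drop its per-layer data. A hedged sketch of such a callback follows; the layer-identifier format and the logging-only behavior are assumptions for illustration, not part of this diff:

import logging

logger = logging.getLogger(__name__)

def cleanup_security_scan_data(images_removed):
  # Assumed layer naming for illustration: docker_image_id paired with the storage uuid.
  layer_ids = ['%s.%s' % (image.docker_image_id, image.storage.uuid)
               for image in images_removed]
  logger.info('Security scan data can be dropped for layers: %s', layer_ids)

Such a callback would be registered via config.register_image_cleanup_callback(cleanup_security_scan_data), as in the earlier sketch.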


@@ -72,8 +72,12 @@ def _orphaned_storage_query(candidate_ids):

 def garbage_collect_storage(storage_id_whitelist):
+  """ Performs GC on a possible subset of the storage's with the IDs found in the
+      whitelist. The storages in the whitelist will be checked, and any orphaned will
+      be removed, with those IDs being returned.
+  """
   if len(storage_id_whitelist) == 0:
-    return
+    return []

   def placements_query_to_paths_set(placements_query):
     return {(get_image_location_for_id(placement.location_id).name,
@@ -89,7 +93,7 @@ def garbage_collect_storage(storage_id_whitelist):
   orphaned_storage_ids = _orphaned_storage_query(storage_id_whitelist)
   if len(orphaned_storage_ids) == 0:
     # Nothing to GC.
-    return
+    return []

   placements_to_remove = list(ImageStoragePlacement
                               .select()
@@ -133,6 +137,8 @@ def garbage_collect_storage(storage_id_whitelist):
     logger.debug('Removing %s from %s', image_path, location_name)
     config.store.remove({location_name}, image_path)

+  return orphaned_storage_ids
+

 def create_v1_storage(location_name):
   storage = ImageStorage.create(cas_path=False)
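
Because garbage_collect_storage now returns the orphaned storage IDs (an empty list when nothing qualifies), callers can branch on what was actually removed. A small usage sketch under that assumption; the wrapper name is illustrative only:

from data.model import storage

def collect_and_report(candidate_storage_ids):
  # Returns the IDs of storages that were actually garbage collected.
  removed_ids = storage.garbage_collect_storage(candidate_storage_ids)
  if removed_ids:
    print('Removed orphaned storages: %s' % sorted(removed_ids))
  return removed_ids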