Update the GC code to do everything with subqueries, making each GC run a bounded finite number of queries with a fixed length.

This commit is contained in:
Jake Moshenko 2014-10-17 14:33:04 -04:00
parent 380eb49e58
commit d8149295ab

View file

@ -1390,50 +1390,52 @@ def garbage_collect_repository(namespace_name, repository_name):
to_remove = set(all_images.keys()).difference(referenced_anscestors) to_remove = set(all_images.keys()).difference(referenced_anscestors)
logger.info('Cleaning up unreferenced images: %s', to_remove) logger.info('Cleaning up unreferenced images: %s', to_remove)
uuids_to_check_for_gc = {all_images[id_to_remove].storage.uuid for id_to_remove in to_remove}
Image.delete().where(Image.id << list(to_remove)).execute() Image.delete().where(Image.id << list(to_remove)).execute()
logger.debug('Checking image storages for being orphaned: %s', uuids_to_check_for_gc)
# We are going to make the conscious decision to not delete image storage inside the transaction # We are going to make the conscious decision to not delete image storage inside the transaction
# This may end up producing garbage in s3, trading off for higher availability in the database # This may end up producing garbage in s3, trading off for higher availability in the database
def placements_query_to_paths_set(placements_query): def placements_query_to_paths_set(placements_query):
return {(placement.location.name, config.store.image_path(placement.storage.uuid)) return {(placement.location.name, config.store.image_path(placement.storage.uuid))
for placement in placements_query} for placement in placements_query}
def remove_storages(placements_query): def orphaned_storage_query(select_base_query):
storages_to_remove = {placement.storage.uuid: placement.storage return (select_base_query
for placement in placements_query} .switch(ImageStorage)
for storage in storages_to_remove.values(): .join(Image, JOIN_LEFT_OUTER)
# Deletes all placements as well, but leaves derived storages .switch(ImageStorage)
storage.delete_instance(recursive=True) .join(DerivedImageStorage, JOIN_LEFT_OUTER, on=(ImageStorage.id ==
DerivedImageStorage.derivative))
.group_by(ImageStorage)
.having((fn.Count(Image.id) == 0) & (fn.Count(DerivedImageStorage.id) == 0)))
paths_to_remove = set() paths_to_remove = set()
with config.app_config['DB_TRANSACTION_FACTORY'](db): with config.app_config['DB_TRANSACTION_FACTORY'](db):
placements_to_remove = (ImageStoragePlacement # Remove the derived image storages with sources of orphaned storages
.select(ImageStoragePlacement, ImageStorage, ImageStorageLocation) DerivedImageStorage.delete().where(DerivedImageStorage.source <<
.join(ImageStorage) orphaned_storage_query(ImageStorage.select())).execute()
.join(Image, JOIN_LEFT_OUTER)
.switch(ImageStoragePlacement)
.join(ImageStorageLocation)
.group_by(ImageStorage)
.where(ImageStorage.uuid << list(uuids_to_check_for_gc))
.having(fn.Count(Image.id) == 0))
paths_to_remove.update(placements_query_to_paths_set(placements_to_remove))
remove_storages(placements_to_remove)
with config.app_config['DB_TRANSACTION_FACTORY'](db): # Track all of the data that should be removed from blob storage
derived_to_remove = (ImageStoragePlacement placements_to_remove = orphaned_storage_query(ImageStoragePlacement
.select(ImageStoragePlacement, ImageStorage, ImageStorageLocation) .select(ImageStoragePlacement,
.join(ImageStorage) ImageStorage,
.join(DerivedImageStorage, on=(ImageStorage.id == ImageStorageLocation)
DerivedImageStorage.derivative)) .join(ImageStorageLocation)
.switch(ImageStoragePlacement) .switch(ImageStoragePlacement)
.join(ImageStorageLocation) .join(ImageStorage))
.where(DerivedImageStorage.source >> None)) paths_to_remove.update(placements_query_to_paths_set(placements_to_remove.clone()))
paths_to_remove.update(placements_query_to_paths_set(derived_to_remove))
remove_storages(derived_to_remove) # Remove the placements for orphaned storages
placements_subquery = placements_to_remove.clone().select(ImageStoragePlacement.id)
(ImageStoragePlacement
.delete()
.where(ImageStoragePlacement.id << placements_subquery)
.execute())
# Remove all of the orphaned storages
(ImageStorage
.delete()
.where(ImageStorage.id << orphaned_storage_query(ImageStorage.select(ImageStorage.id)))
.execute())
# Delete the actual blob storage # Delete the actual blob storage
for location_name, image_path in paths_to_remove: for location_name, image_path in paths_to_remove: