Optimize GC query for looking up deletable storages

This commit is contained in:
Joseph Schorr 2016-07-25 14:02:00 -07:00
parent 640012103c
commit 9e4f8cac03
2 changed files with 171 additions and 110 deletions

View file

@ -46,6 +46,31 @@ def add_storage_placement(storage, location_name):
pass
def _orphaned_storage_query(candidate_ids):
""" Returns the subset of the candidate ImageStorage IDs representing storages that are no
longer referenced by images.
"""
# Issue a union query to find all storages that are still referenced by a candidate storage. This
# is much faster than the group_by and having call we used to use here.
nonorphaned_queries = []
for counter, candidate_id in enumerate(candidate_ids):
query_alias = 'q{0}'.format(counter)
storage_subq = (ImageStorage
.select(ImageStorage.id)
.join(Image)
.where(ImageStorage.id == candidate_id)
.limit(1)
.alias(query_alias))
nonorphaned_queries.append(ImageStorage
.select(SQL('*'))
.from_(storage_subq))
# Build the set of storages that are missing. These storages are orphaned.
nonorphaned_storage_ids = {storage.id for storage in _reduce_as_tree(nonorphaned_queries)}
return list(candidate_ids - nonorphaned_storage_ids)
def garbage_collect_storage(storage_id_whitelist):
if len(storage_id_whitelist) == 0:
return
@ -55,27 +80,21 @@ def garbage_collect_storage(storage_id_whitelist):
get_layer_path(placement.storage))
for placement in placements_query}
def orphaned_storage_query(select_base_query, candidates, group_by):
    """ Narrows the given base query to the candidate storages that have no joined images.

        The query is switched to ImageStorage, left-outer-joined to Image, restricted to the
        candidate IDs, grouped by the `group_by` fields, and filtered to the groups whose Image
        count is zero.
    """
    # Left outer join so that storages without any image still produce a row to count.
    joined = select_base_query.switch(ImageStorage).join(Image, JOIN_LEFT_OUTER)
    restricted = joined.where(ImageStorage.id << list(candidates))
    grouped = restricted.group_by(*group_by)
    return grouped.having(fn.Count(Image.id) == 0)
# Note: Both of these deletes must occur in the same transaction (unfortunately) because a
# storage without any placement is invalid, and a placement cannot exist without a storage.
# TODO(jake): We might want to allow for null storages on placements, which would allow us to
# delete the storages, then delete the placements in a non-transaction.
logger.debug('Garbage collecting storages from candidates: %s', storage_id_whitelist)
with db_transaction():
# Track all of the data that should be removed from blob storage
placements_to_remove = list(orphaned_storage_query(ImageStoragePlacement
.select(ImageStoragePlacement,
ImageStorage)
.join(ImageStorage),
storage_id_whitelist,
(ImageStorage.id, ImageStoragePlacement.id)))
orphaned_storage_ids = _orphaned_storage_query(storage_id_whitelist)
if len(orphaned_storage_ids) == 0:
# Nothing to GC.
return
placements_to_remove = list(ImageStoragePlacement
.select()
.join(ImageStorage)
.where(ImageStorage.id << orphaned_storage_ids))
paths_to_remove = placements_query_to_paths_set(placements_to_remove)
@ -89,28 +108,23 @@ def garbage_collect_storage(storage_id_whitelist):
logger.debug('Removed %s image storage placements', placements_removed)
# Remove all orphaned storages
# The comma after ImageStorage.id is VERY important: it makes the argument a tuple, which is a sequence.
orphaned_storages = list(orphaned_storage_query(ImageStorage.select(ImageStorage.id),
storage_id_whitelist,
(ImageStorage.id,)).alias('osq'))
if len(orphaned_storages) > 0:
torrents_removed = (TorrentInfo
.delete()
.where(TorrentInfo.storage << orphaned_storages)
.execute())
logger.debug('Removed %s torrent info records', torrents_removed)
torrents_removed = (TorrentInfo
.delete()
.where(TorrentInfo.storage << orphaned_storage_ids)
.execute())
logger.debug('Removed %s torrent info records', torrents_removed)
signatures_removed = (ImageStorageSignature
.delete()
.where(ImageStorageSignature.storage << orphaned_storages)
.execute())
logger.debug('Removed %s image storage signatures', signatures_removed)
storages_removed = (ImageStorage
signatures_removed = (ImageStorageSignature
.delete()
.where(ImageStorage.id << orphaned_storages)
.where(ImageStorageSignature.storage << orphaned_storage_ids)
.execute())
logger.debug('Removed %s image storage records', storages_removed)
logger.debug('Removed %s image storage signatures', signatures_removed)
storages_removed = (ImageStorage
.delete()
.where(ImageStorage.id << orphaned_storage_ids)
.execute())
logger.debug('Removed %s image storage records', storages_removed)
# We are going to make the conscious decision to not delete image storage blobs inside
# transactions.