Try not to throw any sets of data away when computing images to garbage collect.

This commit is contained in:
Jake Moshenko 2015-07-31 15:03:40 -04:00
parent c7e464ddf2
commit e133ea0962

View file

@@ -119,21 +119,30 @@ def garbage_collect_repo(repo):
.join(RepositoryTag)
.where(Image.repository == repo))
referenced_ancestors = set()
for tagged_image in tagged_images:
# The ancestor list is in the format '/1/2/3/', extract just the ids
ancestor_id_strings = tagged_image.ancestors.split('/')[1:-1]
ancestor_list = [int(img_id_str) for img_id_str in ancestor_id_strings]
referenced_ancestors = referenced_ancestors.union(set(ancestor_list))
referenced_ancestors.add(tagged_image.id)
def gen_referenced_ancestors():
    """Yield the id of every image referenced by a tagged image.

    For each entry in ``tagged_images`` (taken from the enclosing scope),
    yields each id parsed from its ``ancestors`` path as an ``int``, followed
    by the tagged image's own ``id``. Ids may be yielded more than once;
    callers that need uniqueness should collect the output into a set.
    """
    for tagged in tagged_images:
        # The ancestor path looks like '/1/2/3/'. Splitting on '/' leaves an
        # empty string at each end, so only the interior pieces are ids.
        path_pieces = tagged.ancestors.split('/')
        for piece in path_pieces[1:-1]:
            yield int(piece)
        # The tagged image itself is referenced too, not just its ancestors.
        yield tagged.id
all_repo_images = Image.select(Image.id, Image.storage).where(Image.repository == repo)
all_images = {int(img.id): img for img in all_repo_images}
to_remove = set(all_images.keys()).difference(referenced_ancestors)
referenced_ancestors = set(gen_referenced_ancestors())
# We desire two pieces of information from the database from the following
# query: all of the image ids which are associated with this repository,
# and the storages which are associated with those images. In order to
# fetch just this information, and bypass all of the peewee model parsing
# code, which is overkill for just two fields, we use a tuple query and
# feed it directly to the dict constructor, which accepts an iterable of
# (key, value) tuples: [(k, v), (k, v), ...]
all_repo_images = Image.select(Image.id, Image.storage).where(Image.repository == repo).tuples()
images_to_storages = dict(all_repo_images)
to_remove = set(images_to_storages.keys()).difference(referenced_ancestors)
if len(to_remove) > 0:
logger.info('Cleaning up unreferenced images: %s', to_remove)
storage_id_whitelist = {all_images[to_remove_id].storage_id for to_remove_id in to_remove}
storage_id_whitelist = {images_to_storages[to_remove_id] for to_remove_id in to_remove}
Image.delete().where(Image.id << list(to_remove)).execute()
if len(to_remove) > 0: