diff --git a/data/model/repository.py b/data/model/repository.py index 24199c00b..c518fc5f7 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -119,21 +119,30 @@ def garbage_collect_repo(repo): .join(RepositoryTag) .where(Image.repository == repo)) - referenced_ancestors = set() - for tagged_image in tagged_images: - # The ancestor list is in the format '/1/2/3/', extract just the ids - ancestor_id_strings = tagged_image.ancestors.split('/')[1:-1] - ancestor_list = [int(img_id_str) for img_id_str in ancestor_id_strings] - referenced_ancestors = referenced_ancestors.union(set(ancestor_list)) - referenced_ancestors.add(tagged_image.id) + def gen_referenced_ancestors(): + for tagged_image in tagged_images: + # The ancestor list is in the format '/1/2/3/', extract just the ids + ancestor_id_strings = tagged_image.ancestors.split('/')[1:-1] + for img_id_str in ancestor_id_strings: + yield int(img_id_str) + yield tagged_image.id - all_repo_images = Image.select(Image.id, Image.storage).where(Image.repository == repo) - all_images = {int(img.id): img for img in all_repo_images} - to_remove = set(all_images.keys()).difference(referenced_ancestors) + referenced_ancestors = set(gen_referenced_ancestors()) + + # We desire two pieces of information from the database from the following + # query: all of the image ids which are associated with this repository, + # and the storages which are associated with those images. In order to + # fetch just this information, and bypass all of the peewee model parsing + # code, which is overkill for just two fields, we use a tuple query, and + # feed that directly to the dictionary tuple constructor which takes an + # iterable of tuples containing [(k, v), (k, v), ...] + all_repo_images = Image.select(Image.id, Image.storage).where(Image.repository == repo).tuples() + images_to_storages = dict(all_repo_images) + to_remove = set(images_to_storages.keys()).difference(referenced_ancestors) if len(to_remove) > 0: logger.info('Cleaning up unreferenced images: %s', to_remove) - storage_id_whitelist = {all_images[to_remove_id].storage_id for to_remove_id in to_remove} + storage_id_whitelist = {images_to_storages[to_remove_id] for to_remove_id in to_remove} Image.delete().where(Image.id << list(to_remove)).execute() if len(to_remove) > 0: