From acd86008c889e7d2b4924135bdcb59f1c7046005 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Fri, 19 Jun 2015 14:55:30 -0400 Subject: [PATCH] Switch tag deletion to use a single query --- data/model/repository.py | 78 +++++++++++++++++++++++++++------------- data/model/tag.py | 21 ++++------- 2 files changed, 60 insertions(+), 39 deletions(-) diff --git a/data/model/repository.py b/data/model/repository.py index 90857f93c..99115847b 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -7,7 +7,8 @@ from data.model import (DataModelException, tag, db_transaction, storage, image, _basequery) from data.database import (Repository, Namespace, RepositoryTag, Star, Image, ImageStorage, User, Visibility, RepositoryPermission, TupleSelector, RepositoryActionCount, - Role, RepositoryAuthorizedEmail, db_for_update) + Role, RepositoryAuthorizedEmail, db_for_update, get_epoch_timestamp, + db_random_func) logger = logging.getLogger(__name__) @@ -57,40 +58,67 @@ def purge_repository(namespace_name, repository_name): fetched.delete_instance(recursive=True, delete_nullable=False) -def garbage_collect_repository(namespace_name, repository_name): - storage_id_whitelist = {} +def find_repository_with_garbage(): + epoch_timestamp = get_epoch_timestamp() - tag.garbage_collect_tags(namespace_name, repository_name) + try: + candidates = (RepositoryTag + .select(RepositoryTag.repository) + .join(Repository) + .join(Namespace, on=(Repository.namespace_user == Namespace.id)) + .where(~(RepositoryTag.lifetime_end_ts >> None), + (RepositoryTag.lifetime_end_ts <= + (epoch_timestamp - Namespace.removed_tag_expiration_s))) + .limit(500) + .alias('candidates')) + + found = (RepositoryTag + .select(candidates.c.repository) + .from_(candidates) + .order_by(db_random_func()) + .get()) + if not found: + return + + return Repository.get(Repository.id == found) + + except RepositoryTag.DoesNotExist: + return None + except Repository.DoesNotExist: + return None + + +def garbage_collect_repository(namespace_name, repository_name): + repo = get_repository(namespace_name, repository_name) + garbage_collect_repo(repo) + + +def garbage_collect_repo(repo): + storage_id_whitelist = {} + tag.garbage_collect_tags(repo) with db_transaction(): - # TODO (jake): We could probably select this and all the images in a single query using - # a different kind of join. - # Get a list of all images used by tags in the repository - tag_query = (RepositoryTag - .select(RepositoryTag, Image, ImageStorage) - .join(Image) - .join(ImageStorage, JOIN_LEFT_OUTER) - .switch(RepositoryTag) - .join(Repository) - .join(Namespace, on=(Repository.namespace_user == Namespace.id)) - .where(Repository.name == repository_name, Namespace.username == namespace_name)) + tagged_images = (Image + .select(Image.id, Image.ancestors) + .join(RepositoryTag) + .where(Image.repository == repo)) - referenced_ancestors = set() - for one_tag in tag_query: - # The ancestor list is in the format '/1/2/3/', extract just the ids - ancestor_id_strings = one_tag.image.ancestors.split('/')[1:-1] - ancestor_list = [int(img_id_str) for img_id_str in ancestor_id_strings] - referenced_ancestors = referenced_ancestors.union(set(ancestor_list)) - referenced_ancestors.add(one_tag.image.id) + referenced_anscestors = set() + for tagged_image in tagged_images: + # The anscestor list is in the format '/1/2/3/', extract just the ids + anscestor_id_strings = tagged_image.ancestors.split('/')[1:-1] + ancestor_list = [int(img_id_str) for img_id_str in anscestor_id_strings] + referenced_anscestors = referenced_anscestors.union(set(ancestor_list)) + referenced_anscestors.add(tagged_image.id) - all_repo_images = image.get_repository_images(namespace_name, repository_name) + all_repo_images = Image.select(Image.id, Image.storage).where(Image.repository == repo) all_images = {int(img.id): img for img in all_repo_images} - to_remove = set(all_images.keys()).difference(referenced_ancestors) + to_remove = set(all_images.keys()).difference(referenced_anscestors) if len(to_remove) > 0: logger.info('Cleaning up unreferenced images: %s', to_remove) - storage_id_whitelist = {all_images[to_remove_id].storage.id for to_remove_id in to_remove} + storage_id_whitelist = {all_images[to_remove_id].storage_id for to_remove_id in to_remove} Image.delete().where(Image.id << list(to_remove)).execute() if len(to_remove) > 0: diff --git a/data/model/tag.py b/data/model/tag.py index 7f4dc93e4..9ac1ea897 100644 --- a/data/model/tag.py +++ b/data/model/tag.py @@ -97,22 +97,15 @@ def delete_tag(namespace_name, repository_name, tag_name): found.save() -def garbage_collect_tags(namespace_name, repository_name): - # We do this without using a join to prevent holding read locks on the repository table - repo = _basequery.get_existing_repository(namespace_name, repository_name) +def garbage_collect_tags(repo): expired_time = get_epoch_timestamp() - repo.namespace_user.removed_tag_expiration_s - tags_to_delete = list(RepositoryTag - .select(RepositoryTag.id) - .where(RepositoryTag.repository == repo, - ~(RepositoryTag.lifetime_end_ts >> None), - (RepositoryTag.lifetime_end_ts <= expired_time)) - .order_by(RepositoryTag.id)) - if len(tags_to_delete) > 0: - (RepositoryTag - .delete() - .where(RepositoryTag.id << tags_to_delete) - .execute()) + (RepositoryTag + .delete() + .where(RepositoryTag.repository == repo, + ~(RepositoryTag.lifetime_end_ts >> None), + (RepositoryTag.lifetime_end_ts <= expired_time)) + .execute()) def get_tag_image(namespace_name, repository_name, tag_name):