Switch tag deletion to use a single query

This commit is contained in:
Joseph Schorr 2015-06-19 14:55:30 -04:00 committed by Joseph Schorr
parent 79101c1055
commit acd86008c8
2 changed files with 60 additions and 39 deletions

View file

@ -7,7 +7,8 @@ from data.model import (DataModelException, tag, db_transaction, storage, image,
_basequery) _basequery)
from data.database import (Repository, Namespace, RepositoryTag, Star, Image, ImageStorage, User, from data.database import (Repository, Namespace, RepositoryTag, Star, Image, ImageStorage, User,
Visibility, RepositoryPermission, TupleSelector, RepositoryActionCount, Visibility, RepositoryPermission, TupleSelector, RepositoryActionCount,
Role, RepositoryAuthorizedEmail, db_for_update) Role, RepositoryAuthorizedEmail, db_for_update, get_epoch_timestamp,
db_random_func)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -57,40 +58,67 @@ def purge_repository(namespace_name, repository_name):
fetched.delete_instance(recursive=True, delete_nullable=False) fetched.delete_instance(recursive=True, delete_nullable=False)
def garbage_collect_repository(namespace_name, repository_name): def find_repository_with_garbage():
storage_id_whitelist = {} epoch_timestamp = get_epoch_timestamp()
tag.garbage_collect_tags(namespace_name, repository_name) try:
candidates = (RepositoryTag
with db_transaction(): .select(RepositoryTag.repository)
# TODO (jake): We could probably select this and all the images in a single query using
# a different kind of join.
# Get a list of all images used by tags in the repository
tag_query = (RepositoryTag
.select(RepositoryTag, Image, ImageStorage)
.join(Image)
.join(ImageStorage, JOIN_LEFT_OUTER)
.switch(RepositoryTag)
.join(Repository) .join(Repository)
.join(Namespace, on=(Repository.namespace_user == Namespace.id)) .join(Namespace, on=(Repository.namespace_user == Namespace.id))
.where(Repository.name == repository_name, Namespace.username == namespace_name)) .where(~(RepositoryTag.lifetime_end_ts >> None),
(RepositoryTag.lifetime_end_ts <=
(epoch_timestamp - Namespace.removed_tag_expiration_s)))
.limit(500)
.alias('candidates'))
referenced_ancestors = set() found = (RepositoryTag
for one_tag in tag_query: .select(candidates.c.repository)
# The ancestor list is in the format '/1/2/3/', extract just the ids .from_(candidates)
ancestor_id_strings = one_tag.image.ancestors.split('/')[1:-1] .order_by(db_random_func())
ancestor_list = [int(img_id_str) for img_id_str in ancestor_id_strings] .get())
referenced_ancestors = referenced_ancestors.union(set(ancestor_list)) if not found:
referenced_ancestors.add(one_tag.image.id) return
all_repo_images = image.get_repository_images(namespace_name, repository_name) return Repository.get(Repository.id == found)
except RepositoryTag.DoesNotExist:
return None
except Repository.DoesNotExist:
return None
def garbage_collect_repository(namespace_name, repository_name):
    """Garbage collect the repository identified by namespace and name.

    Looks the repository up with ``get_repository`` and passes the result
    straight to ``garbage_collect_repo``, which does the actual work.
    Nothing is returned.
    """
    garbage_collect_repo(get_repository(namespace_name, repository_name))
def garbage_collect_repo(repo):
storage_id_whitelist = {}
tag.garbage_collect_tags(repo)
with db_transaction():
# Get a list of all images used by tags in the repository
tagged_images = (Image
.select(Image.id, Image.ancestors)
.join(RepositoryTag)
.where(Image.repository == repo))
referenced_anscestors = set()
for tagged_image in tagged_images:
# The ancestor list is in the format '/1/2/3/', extract just the ids
anscestor_id_strings = tagged_image.ancestors.split('/')[1:-1]
ancestor_list = [int(img_id_str) for img_id_str in anscestor_id_strings]
referenced_anscestors = referenced_anscestors.union(set(ancestor_list))
referenced_anscestors.add(tagged_image.id)
all_repo_images = Image.select(Image.id, Image.storage).where(Image.repository == repo)
all_images = {int(img.id): img for img in all_repo_images} all_images = {int(img.id): img for img in all_repo_images}
to_remove = set(all_images.keys()).difference(referenced_ancestors) to_remove = set(all_images.keys()).difference(referenced_anscestors)
if len(to_remove) > 0: if len(to_remove) > 0:
logger.info('Cleaning up unreferenced images: %s', to_remove) logger.info('Cleaning up unreferenced images: %s', to_remove)
storage_id_whitelist = {all_images[to_remove_id].storage.id for to_remove_id in to_remove} storage_id_whitelist = {all_images[to_remove_id].storage_id for to_remove_id in to_remove}
Image.delete().where(Image.id << list(to_remove)).execute() Image.delete().where(Image.id << list(to_remove)).execute()
if len(to_remove) > 0: if len(to_remove) > 0:

View file

@ -97,21 +97,14 @@ def delete_tag(namespace_name, repository_name, tag_name):
found.save() found.save()
def garbage_collect_tags(namespace_name, repository_name): def garbage_collect_tags(repo):
# We do this without using a join to prevent holding read locks on the repository table
repo = _basequery.get_existing_repository(namespace_name, repository_name)
expired_time = get_epoch_timestamp() - repo.namespace_user.removed_tag_expiration_s expired_time = get_epoch_timestamp() - repo.namespace_user.removed_tag_expiration_s
tags_to_delete = list(RepositoryTag (RepositoryTag
.select(RepositoryTag.id) .delete()
.where(RepositoryTag.repository == repo, .where(RepositoryTag.repository == repo,
~(RepositoryTag.lifetime_end_ts >> None), ~(RepositoryTag.lifetime_end_ts >> None),
(RepositoryTag.lifetime_end_ts <= expired_time)) (RepositoryTag.lifetime_end_ts <= expired_time))
.order_by(RepositoryTag.id))
if len(tags_to_delete) > 0:
(RepositoryTag
.delete()
.where(RepositoryTag.id << tags_to_delete)
.execute()) .execute())