Reduce database bandwidth by tracking gc candidate images.
This commit is contained in:
parent 0815f6b6c4
commit 584a5a7ddd

5 changed files with 161 additions and 107 deletions
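The change in a nutshell: instead of loading every Image row in the repository and diffing against tag references on the client, tag deletion now hands garbage collection an explicit set of candidate image ids, and the referenced/unreferenced split is pushed into SQL. A minimal, database-free sketch of that idea follows; all names in it are illustrative, not Quay's.

def collect_garbage(ancestors_by_image, tagged_ids, candidate_ids):
  """Return candidate image ids that nothing reachable from a tag still uses.

  ancestors_by_image -- image id -> list of ancestor image ids
  tagged_ids         -- image ids currently pointed at by a tag
  candidate_ids      -- image ids that *might* have become unreferenced,
                        e.g. the images deleted tags used to point to
  """
  # Everything reachable from a tag (the tagged image plus its ancestors)
  # must be kept; only candidates outside that set are garbage.
  referenced = set()
  for image_id in tagged_ids:
    referenced.add(image_id)
    referenced.update(ancestors_by_image[image_id])
  return {image_id for image_id in candidate_ids if image_id not in referenced}


# Example: image 4 lost its last tag while image 3 (and its ancestors) is
# still tagged, so only image 4 is removable.
ancestors = {1: [], 2: [1], 3: [1, 2], 4: [1]}
print(collect_garbage(ancestors, tagged_ids={3}, candidate_ids={4}))  # {4}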
@@ -11,7 +11,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Us
                            Visibility, RepositoryPermission, RepositoryActionCount,
                            Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage,
                            Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
-                           db_random_func)
+                           db_random_func, db_concat_func)
 
 
 logger = logging.getLogger(__name__)
@@ -43,45 +43,21 @@ def get_repository(namespace_name, repository_name):
     return None
 
 
-def _purge_all_repository_tags(namespace_name, repository_name):
-  """ Immediately purge all repository tags without respecting the lifeline procedure """
-  try:
-    repo = _basequery.get_existing_repository(namespace_name, repository_name)
-  except Repository.DoesNotExist:
-    raise DataModelException('Invalid repository \'%s/%s\'' %
-                             (namespace_name, repository_name))
-
-  # Finds all the tags to delete.
-  repo_tags = list(RepositoryTag.select().where(RepositoryTag.repository == repo.id))
-  if not repo_tags:
-    return
-
-  # Find all labels to delete.
-  manifest_labels_query = (TagManifestLabel
-                           .select()
-                           .where(TagManifestLabel.repository == repo))
-
-  label_ids = [manifest_label.label_id for manifest_label in manifest_labels_query]
-  if label_ids:
-    # Delete all the mapping entries.
-    TagManifestLabel.delete().where(TagManifestLabel.repository == repo).execute()
-
-    # Delete all the matching labels.
-    Label.delete().where(Label.id << label_ids).execute()
-
-  # Delete all the manifests.
-  TagManifest.delete().where(TagManifest.tag << repo_tags).execute()
-
-  # Delete all tags.
-  RepositoryTag.delete().where(RepositoryTag.repository == repo.id).execute()
-
-
 def purge_repository(namespace_name, repository_name):
   repo = _basequery.get_existing_repository(namespace_name, repository_name)
 
   # Delete all tags to allow gc to reclaim storage
-  _purge_all_repository_tags(namespace_name, repository_name)
+  previously_referenced = tag.purge_all_tags(repo)
+  unreferenced_image_q = Image.select(Image.id).where(Image.repository == repo)
+
+  if len(previously_referenced) > 0:
+    unreferenced_image_q = (unreferenced_image_q
+                            .where(~(Image.id << list(previously_referenced))))
+
+  unreferenced_candidates = set(img[0] for img in unreferenced_image_q.tuples())
 
   # Gc to remove the images and storage
-  garbage_collect_repository(namespace_name, repository_name)
+  garbage_collect_repo(repo, previously_referenced | unreferenced_candidates)
 
   # Delete the rest of the repository metadata
   fetched = _basequery.get_existing_repository(namespace_name, repository_name)
@@ -135,34 +111,46 @@ def find_repository_with_garbage(limit_to_gc_policy_s):
     return None
 
 
-def garbage_collect_repository(namespace_name, repository_name):
-  repo = get_repository(namespace_name, repository_name)
-  if repo is not None:
-    garbage_collect_repo(repo)
-
-
-def garbage_collect_repo(repo):
+def garbage_collect_repo(repo, extra_candidate_set=None):
   logger.debug('Garbage collecting repository %s', repo.id)
 
-  storage_id_whitelist = set()
-  tag.garbage_collect_tags(repo)
+  candidate_orphan_image_set = tag.garbage_collect_tags(repo)
+
+  if extra_candidate_set:
+    candidate_orphan_image_set.update(extra_candidate_set)
+
+  if not len(candidate_orphan_image_set):
+    logger.debug('No candidate images for GC for repo: %s', repo.id)
+    return
+
+  candidates_orphans = list(candidate_orphan_image_set)
 
   with db_transaction():
-    # Get a list of all images used by tags in the repository
-    tagged_images = (Image
-                     .select(Image.id, Image.ancestors)
-                     .join(RepositoryTag)
-                     .where(Image.repository == repo))
+    Candidate = Image.alias()
+    Tagged = Image.alias()
+    ancestor_superset = Tagged.ancestors ** db_concat_func(Candidate.ancestors, Candidate.id, '/%')
 
-    def gen_referenced_ancestors():
-      for tagged_image in tagged_images:
-        # The ancestor list is in the format '/1/2/3/', extract just the ids
-        ancestor_id_strings = tagged_image.ancestor_list()
-        for img_id_str in ancestor_id_strings:
-          yield int(img_id_str)
-        yield tagged_image.id
+    # We are going to compute all images which are being referenced in two ways:
+    # First, we will find all images which have their ancestor paths appear in
+    # another image. Secondly, we union in all of the candidate images which are
+    # directly referenced by a tag. This can be used in a subquery to directly
+    # find which candidates are being referenced without any client side
+    # computation or extra round trips.
+    ancestor_referenced = (Candidate
+                           .select(Candidate.id)
+                           .join(Tagged, on=ancestor_superset)
+                           .join(RepositoryTag, on=(Tagged.id == RepositoryTag.image))
+                           .where(RepositoryTag.repository == repo.id,
+                                  Candidate.id << candidates_orphans))
 
-    referenced_ancestors = set(gen_referenced_ancestors())
+    direct_referenced = (Candidate
+                         .select(Candidate.id)
+                         .join(RepositoryTag)
+                         .where(RepositoryTag.repository == repo.id,
+                                Candidate.id << candidates_orphans))
+
+    referenced_candidates = (direct_referenced | ancestor_referenced)
 
     # We desire two pieces of information from the database from the following
     # query: all of the image ids which are associated with this repository,
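For the ancestor check in the hunk above, peewee's `**` operator appears to act as a case-insensitive LIKE, with `db_concat_func` building the pattern on the database side. The predicate it encodes can be illustrated in plain Python; the helper below is hypothetical and not part of the commit.

def ancestor_path_matches(candidate_id, candidate_ancestors, tagged_ancestors):
  """Plain-Python version of: Tagged.ancestors LIKE Candidate.ancestors || Candidate.id || '/%'.

  Ancestor lists are strings such as '/1/2/3/', so a candidate image is an
  ancestor of a tagged image exactly when the tagged image's ancestor string
  starts with the candidate's ancestor string followed by the candidate's id.
  """
  prefix = '%s%s/' % (candidate_ancestors, candidate_id)
  return tagged_ancestors.startswith(prefix)


# Image 7 (ancestors '/1/4/') is an ancestor of an image whose ancestor
# string is '/1/4/7/9/', but not of one on a different branch.
assert ancestor_path_matches(7, '/1/4/', '/1/4/7/9/')
assert not ancestor_path_matches(7, '/1/4/', '/1/5/7/9/')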
@@ -171,13 +159,18 @@ def garbage_collect_repo(repo):
     # code, which is overkill for just two fields, we use a tuple query, and
     # feed that directly to the dictionary tuple constructor which takes an
     # iterable of tuples containing [(k, v), (k, v), ...]
-    all_repo_images = Image.select(Image.id, Image.storage).where(Image.repository == repo).tuples()
-    images_to_storages = dict(all_repo_images)
-    to_remove = list(set(images_to_storages.keys()).difference(referenced_ancestors))
+    unreferenced_candidates = (Image
+                               .select(Image.id, Image.storage)
+                               .where(Image.id << candidates_orphans,
+                                      ~(Image.id << referenced_candidates))
+                               .tuples())
+
+    unreferecend_images_to_storages = dict(unreferenced_candidates)
+    to_remove = unreferecend_images_to_storages.keys()
 
     if len(to_remove) > 0:
       logger.info('Cleaning up unreferenced images: %s', to_remove)
-      storage_id_whitelist = {images_to_storages[to_remove_id] for to_remove_id in to_remove}
+      storage_id_whitelist = set(unreferecend_images_to_storages.values())
 
       # Lookup any derived images for the images to remove.
       derived = DerivedStorageForImage.select().where(
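The tuples-into-dict idiom described in the comment in the hunk above, shown in isolation: `.tuples()` yields plain (image id, storage id) pairs, and `dict()` accepts exactly that shape, so the mapping is built without instantiating model objects. The sample values below are made up for illustration.

rows = [(10, 101), (11, 102), (12, 102)]       # stand-in for the .tuples() output
unreferenced_images_to_storages = dict(rows)   # {10: 101, 11: 102, 12: 102}
to_remove = list(unreferenced_images_to_storages.keys())              # image ids to delete
storage_id_whitelist = set(unreferenced_images_to_storages.values())  # storages that may now be unreferenced
print(to_remove, storage_id_whitelist)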