Optimize GC query join a bit by reducing the surface
We remove the directly referenced images from the join across ancestors, as they will be covered by the first part of the union clause. For some large repositories, this will result in a significantly reduced set of images that have to be joined NxM.
This commit is contained in:
parent
f0dd2e348b
commit
d68b65d90c
2 changed files with 43 additions and 6 deletions
|
@@ -194,17 +194,21 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
|
|||
# directly referenced by a tag. This can be used in a subquery to directly
|
||||
# find which candidates are being referenced without any client side
|
||||
# computation or extra round trips.
|
||||
direct_referenced = (RepositoryTag
|
||||
.select(RepositoryTag.image)
|
||||
.where(RepositoryTag.repository == repo.id,
|
||||
RepositoryTag.image << candidates_orphans))
|
||||
|
||||
cloned = direct_referenced.clone().alias('direct_ref')
|
||||
directly_referenced_subquery = Image.alias().select(cloned.c.image_id).from_(cloned)
|
||||
|
||||
ancestor_referenced = (Candidate
|
||||
.select(Candidate.id)
|
||||
.join(Tagged, on=ancestor_superset)
|
||||
.join(RepositoryTag, on=(Tagged.id == RepositoryTag.image))
|
||||
.where(RepositoryTag.repository == repo.id,
|
||||
Candidate.id << candidates_orphans))
|
||||
|
||||
direct_referenced = (RepositoryTag
|
||||
.select(RepositoryTag.image)
|
||||
.where(RepositoryTag.repository == repo.id,
|
||||
RepositoryTag.image << candidates_orphans))
|
||||
Candidate.id << candidates_orphans,
|
||||
~(Candidate.id << directly_referenced_subquery)))
|
||||
|
||||
referenced_candidates = (direct_referenced | ancestor_referenced)
|
||||
|
||||
|
|
Reference in a new issue