From d68b65d90c1ae4df167c4b2ef3dcdc7216542896 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Fri, 9 Jun 2017 12:16:34 -0400 Subject: [PATCH] Optimize GC query join a bit by reducing the surface We remove the directly referenced images from the join across ancestors, as they will be covered by the first part of the union clause. For some large repositories, this will result in a significantly reduced set of images that have to be joined NxM. --- data/model/repository.py | 16 ++++++++++------ test/test_gc.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/data/model/repository.py b/data/model/repository.py index e94b8b4fa..0b0d4ac6d 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -194,17 +194,21 @@ def garbage_collect_repo(repo, extra_candidate_set=None): # directly referenced by a tag. This can be used in a subquery to directly # find which candidates are being referenced without any client side # computation or extra round trips. + direct_referenced = (RepositoryTag + .select(RepositoryTag.image) + .where(RepositoryTag.repository == repo.id, + RepositoryTag.image << candidates_orphans)) + + cloned = direct_referenced.clone().alias('direct_ref') + directly_referenced_subquery = Image.alias().select(cloned.c.image_id).from_(cloned) + ancestor_referenced = (Candidate .select(Candidate.id) .join(Tagged, on=ancestor_superset) .join(RepositoryTag, on=(Tagged.id == RepositoryTag.image)) .where(RepositoryTag.repository == repo.id, - Candidate.id << candidates_orphans)) - - direct_referenced = (RepositoryTag - .select(RepositoryTag.image) - .where(RepositoryTag.repository == repo.id, - RepositoryTag.image << candidates_orphans)) + Candidate.id << candidates_orphans, + ~(Candidate.id << directly_referenced_subquery))) referenced_candidates = (direct_referenced | ancestor_referenced) diff --git a/test/test_gc.py b/test/test_gc.py index 1e5369675..a7085af04 100644 --- a/test/test_gc.py +++ b/test/test_gc.py @@ -314,38 +314,71 @@ class TestGarbageCollection(unittest.TestCase): repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], third=['t1', 't2', 't3'], fourth=['i1', 'f1']) + # Current state: + # latest -> i3->i2->i1 + # other -> f2->f1->i1 + # third -> t3->t2->t1 + # fourth -> f1->i1 + # Delete tag other. Should delete f2, since it is not shared. self.deleteTag(repository, 'other') self.assertDeleted(repository, 'f2') self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3', 'f1') + # Current state: + # latest -> i3->i2->i1 + # third -> t3->t2->t1 + # fourth -> f1->i1 + # Move tag fourth to i3. This should remove f1 since it is no longer referenced. self.moveTag(repository, 'fourth', 'i3') self.assertDeleted(repository, 'f1') self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3') + # Current state: + # latest -> i3->i2->i1 + # third -> t3->t2->t1 + # fourth -> i3->i2->i1 + # Delete tag 'latest'. This should do nothing since fourth is on the same branch. self.deleteTag(repository, 'latest') self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3') + # Current state: + # third -> t3->t2->t1 + # fourth -> i3->i2->i1 + # Delete tag 'third'. This should remove t1->t3. self.deleteTag(repository, 'third') self.assertDeleted(repository, 't1', 't2', 't3') self.assertNotDeleted(repository, 'i1', 'i2', 'i3') + # Current state: + # fourth -> i3->i2->i1 + # Add tag to i1. self.moveTag(repository, 'newtag', 'i1') self.assertNotDeleted(repository, 'i1', 'i2', 'i3') + # Current state: + # fourth -> i3->i2->i1 + # newtag -> i1 + # Delete tag 'fourth'. This should remove i2 and i3. self.deleteTag(repository, 'fourth') self.assertDeleted(repository, 'i2', 'i3') self.assertNotDeleted(repository, 'i1') + # Current state: + # newtag -> i1 + # Delete tag 'newtag'. This should remove the remaining image. self.deleteTag(repository, 'newtag') self.assertDeleted(repository, 'i1') + # Current state: + # (Empty) + def test_empty_gc(self): with self.assert_gc_integrity(expect_storage_removed=False): repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'],