Merge pull request #1645 from coreos-inc/gc-query-optimize

Optimize GC query for looking up deletable storages
This commit is contained in:
josephschorr 2016-07-26 16:00:17 -07:00 committed by GitHub
commit 0162d3da30
2 changed files with 171 additions and 110 deletions

View file

@ -46,6 +46,31 @@ def add_storage_placement(storage, location_name):
pass pass
def _orphaned_storage_query(candidate_ids):
""" Returns the subset of the candidate ImageStorage IDs representing storages that are no
longer referenced by images.
"""
# Issue a union query to find all storages that are still referenced by a candidate storage. This
# is much faster than the group_by and having call we used to use here.
nonorphaned_queries = []
for counter, candidate_id in enumerate(candidate_ids):
query_alias = 'q{0}'.format(counter)
storage_subq = (ImageStorage
.select(ImageStorage.id)
.join(Image)
.where(ImageStorage.id == candidate_id)
.limit(1)
.alias(query_alias))
nonorphaned_queries.append(ImageStorage
.select(SQL('*'))
.from_(storage_subq))
# Build the set of storages that are missing. These storages are orphaned.
nonorphaned_storage_ids = {storage.id for storage in _reduce_as_tree(nonorphaned_queries)}
return list(candidate_ids - nonorphaned_storage_ids)
def garbage_collect_storage(storage_id_whitelist): def garbage_collect_storage(storage_id_whitelist):
if len(storage_id_whitelist) == 0: if len(storage_id_whitelist) == 0:
return return
@ -55,27 +80,21 @@ def garbage_collect_storage(storage_id_whitelist):
get_layer_path(placement.storage)) get_layer_path(placement.storage))
for placement in placements_query} for placement in placements_query}
def orphaned_storage_query(select_base_query, candidates, group_by):
return (select_base_query
.switch(ImageStorage)
.join(Image, JOIN_LEFT_OUTER)
.where(ImageStorage.id << list(candidates))
.group_by(*group_by)
.having(fn.Count(Image.id) == 0))
# Note: Both of these deletes must occur in the same transaction (unfortunately) because a # Note: Both of these deletes must occur in the same transaction (unfortunately) because a
# storage without any placement is invalid, and a placement cannot exist without a storage. # storage without any placement is invalid, and a placement cannot exist without a storage.
# TODO(jake): We might want to allow for null storages on placements, which would allow us to # TODO(jake): We might want to allow for null storages on placements, which would allow us to
# delete the storages, then delete the placements in a non-transaction. # delete the storages, then delete the placements in a non-transaction.
logger.debug('Garbage collecting storages from candidates: %s', storage_id_whitelist) logger.debug('Garbage collecting storages from candidates: %s', storage_id_whitelist)
with db_transaction(): with db_transaction():
# Track all of the data that should be removed from blob storage orphaned_storage_ids = _orphaned_storage_query(storage_id_whitelist)
placements_to_remove = list(orphaned_storage_query(ImageStoragePlacement if len(orphaned_storage_ids) == 0:
.select(ImageStoragePlacement, # Nothing to GC.
ImageStorage) return
.join(ImageStorage),
storage_id_whitelist, placements_to_remove = list(ImageStoragePlacement
(ImageStorage.id, ImageStoragePlacement.id))) .select()
.join(ImageStorage)
.where(ImageStorage.id << orphaned_storage_ids))
paths_to_remove = placements_query_to_paths_set(placements_to_remove) paths_to_remove = placements_query_to_paths_set(placements_to_remove)
@ -89,26 +108,21 @@ def garbage_collect_storage(storage_id_whitelist):
logger.debug('Removed %s image storage placements', placements_removed) logger.debug('Removed %s image storage placements', placements_removed)
# Remove all orphaned storages # Remove all orphaned storages
# The comma after ImageStorage.id is VERY important, it makes it a tuple, which is a sequence
orphaned_storages = list(orphaned_storage_query(ImageStorage.select(ImageStorage.id),
storage_id_whitelist,
(ImageStorage.id,)).alias('osq'))
if len(orphaned_storages) > 0:
torrents_removed = (TorrentInfo torrents_removed = (TorrentInfo
.delete() .delete()
.where(TorrentInfo.storage << orphaned_storages) .where(TorrentInfo.storage << orphaned_storage_ids)
.execute()) .execute())
logger.debug('Removed %s torrent info records', torrents_removed) logger.debug('Removed %s torrent info records', torrents_removed)
signatures_removed = (ImageStorageSignature signatures_removed = (ImageStorageSignature
.delete() .delete()
.where(ImageStorageSignature.storage << orphaned_storages) .where(ImageStorageSignature.storage << orphaned_storage_ids)
.execute()) .execute())
logger.debug('Removed %s image storage signatures', signatures_removed) logger.debug('Removed %s image storage signatures', signatures_removed)
storages_removed = (ImageStorage storages_removed = (ImageStorage
.delete() .delete()
.where(ImageStorage.id << orphaned_storages) .where(ImageStorage.id << orphaned_storage_ids)
.execute()) .execute())
logger.debug('Removed %s image storage records', storages_removed) logger.debug('Removed %s image storage records', storages_removed)

View file

@ -1,9 +1,12 @@
import unittest import unittest
import time import time
from peewee import fn, JOIN_LEFT_OUTER
from app import app, storage from app import app, storage
from initdb import setup_database_for_testing, finished_database_for_testing from initdb import setup_database_for_testing, finished_database_for_testing
from data import model, database from data import model, database
from data.database import Image, ImageStorage, DerivedStorageForImage
from endpoints.v2.manifest import _generate_and_store_manifest from endpoints.v2.manifest import _generate_and_store_manifest
@ -12,6 +15,29 @@ PUBLIC_USER = 'public'
REPO = 'somerepo' REPO = 'somerepo'
class assert_no_new_dangling_storages(object):
""" Specialized assertion for ensuring that GC cleans up all dangling storages.
"""
def __init__(self):
self.existing_count = 0
def _get_dangling_count(self):
storage_ids = set([current.id for current in ImageStorage.select()])
referneced_by_image = set([image.storage_id for image in Image.select()])
referenced_by_derived = set([derived.derivative_id for derived in DerivedStorageForImage.select()])
return len(storage_ids - referneced_by_image - referenced_by_derived)
def __enter__(self):
self.existing_count = self._get_dangling_count()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
updated_count = self._get_dangling_count()
assert updated_count == self.existing_count
class TestGarbageCollection(unittest.TestCase): class TestGarbageCollection(unittest.TestCase):
@staticmethod @staticmethod
def _set_tag_expiration_policy(namespace, expiration_s): def _set_tag_expiration_policy(namespace, expiration_s):
@ -122,6 +148,7 @@ class TestGarbageCollection(unittest.TestCase):
self.fail('Expected image %s to be deleted' % docker_image_id) self.fail('Expected image %s to be deleted' % docker_image_id)
def test_has_garbage(self): def test_has_garbage(self):
""" Remove all existing repositories, then add one without garbage, check, then add one with """ Remove all existing repositories, then add one without garbage, check, then add one with
garbage, and check again. garbage, and check again.
@ -167,30 +194,37 @@ class TestGarbageCollection(unittest.TestCase):
def test_one_tag(self): def test_one_tag(self):
""" Create a repository with a single tag, then remove that tag and verify that the repository """ Create a repository with a single tag, then remove that tag and verify that the repository
is now empty. """ is now empty. """
with assert_no_new_dangling_storages():
repository = self.createRepository(latest=['i1', 'i2', 'i3']) repository = self.createRepository(latest=['i1', 'i2', 'i3'])
self.deleteTag(repository, 'latest') self.deleteTag(repository, 'latest')
self.assertDeleted(repository, 'i1', 'i2', 'i3') self.assertDeleted(repository, 'i1', 'i2', 'i3')
def test_two_tags_unshared_images(self): def test_two_tags_unshared_images(self):
""" Repository has two tags with no shared images between them. """ """ Repository has two tags with no shared images between them. """
with assert_no_new_dangling_storages():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['f1', 'f2']) repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['f1', 'f2'])
self.deleteTag(repository, 'latest') self.deleteTag(repository, 'latest')
self.assertDeleted(repository, 'i1', 'i2', 'i3') self.assertDeleted(repository, 'i1', 'i2', 'i3')
self.assertNotDeleted(repository, 'f1', 'f2') self.assertNotDeleted(repository, 'f1', 'f2')
def test_two_tags_shared_images(self): def test_two_tags_shared_images(self):
""" Repository has two tags with shared images. Deleting the tag should only remove the """ Repository has two tags with shared images. Deleting the tag should only remove the
unshared images. unshared images.
""" """
with assert_no_new_dangling_storages():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])
self.deleteTag(repository, 'latest') self.deleteTag(repository, 'latest')
self.assertDeleted(repository, 'i2', 'i3') self.assertDeleted(repository, 'i2', 'i3')
self.assertNotDeleted(repository, 'i1', 'f1') self.assertNotDeleted(repository, 'i1', 'f1')
def test_unrelated_repositories(self): def test_unrelated_repositories(self):
""" Two repositories with different images. Removing the tag from one leaves the other's """ Two repositories with different images. Removing the tag from one leaves the other's
images intact. images intact.
""" """
with assert_no_new_dangling_storages():
repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1') repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1')
repository2 = self.createRepository(latest=['j1', 'j2', 'j3'], name='repo2') repository2 = self.createRepository(latest=['j1', 'j2', 'j3'], name='repo2')
@ -199,10 +233,12 @@ class TestGarbageCollection(unittest.TestCase):
self.assertDeleted(repository1, 'i1', 'i2', 'i3') self.assertDeleted(repository1, 'i1', 'i2', 'i3')
self.assertNotDeleted(repository2, 'j1', 'j2', 'j3') self.assertNotDeleted(repository2, 'j1', 'j2', 'j3')
def test_related_repositories(self): def test_related_repositories(self):
""" Two repositories with shared images. Removing the tag from one leaves the other's """ Two repositories with shared images. Removing the tag from one leaves the other's
images intact. images intact.
""" """
with assert_no_new_dangling_storages():
repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1') repository1 = self.createRepository(latest=['i1', 'i2', 'i3'], name='repo1')
repository2 = self.createRepository(latest=['i1', 'i2', 'j1'], name='repo2') repository2 = self.createRepository(latest=['i1', 'i2', 'j1'], name='repo2')
@ -211,10 +247,12 @@ class TestGarbageCollection(unittest.TestCase):
self.assertDeleted(repository1, 'i3') self.assertDeleted(repository1, 'i3')
self.assertNotDeleted(repository2, 'i1', 'i2', 'j1') self.assertNotDeleted(repository2, 'i1', 'i2', 'j1')
def test_inaccessible_repositories(self): def test_inaccessible_repositories(self):
""" Two repositories under different namespaces should result in the images being deleted """ Two repositories under different namespaces should result in the images being deleted
but not completely removed from the database. but not completely removed from the database.
""" """
with assert_no_new_dangling_storages():
repository1 = self.createRepository(namespace=ADMIN_ACCESS_USER, latest=['i1', 'i2', 'i3']) repository1 = self.createRepository(namespace=ADMIN_ACCESS_USER, latest=['i1', 'i2', 'i3'])
repository2 = self.createRepository(namespace=PUBLIC_USER, latest=['i1', 'i2', 'i3']) repository2 = self.createRepository(namespace=PUBLIC_USER, latest=['i1', 'i2', 'i3'])
@ -227,6 +265,7 @@ class TestGarbageCollection(unittest.TestCase):
""" Repository has multiple tags with shared images. Selectively deleting the tags, and """ Repository has multiple tags with shared images. Selectively deleting the tags, and
verifying at each step. verifying at each step.
""" """
with assert_no_new_dangling_storages():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'],
third=['t1', 't2', 't3'], fourth=['i1', 'f1']) third=['t1', 't2', 't3'], fourth=['i1', 'f1'])
@ -262,16 +301,20 @@ class TestGarbageCollection(unittest.TestCase):
self.deleteTag(repository, 'newtag') self.deleteTag(repository, 'newtag')
self.assertDeleted(repository, 'i1') self.assertDeleted(repository, 'i1')
def test_empty_gc(self): def test_empty_gc(self):
with assert_no_new_dangling_storages():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'], repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1', 'f2'],
third=['t1', 't2', 't3'], fourth=['i1', 'f1']) third=['t1', 't2', 't3'], fourth=['i1', 'f1'])
self.gcNow(repository) self.gcNow(repository)
self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3', 'f1', 'f2') self.assertNotDeleted(repository, 'i1', 'i2', 'i3', 't1', 't2', 't3', 'f1', 'f2')
def test_time_machine_no_gc(self): def test_time_machine_no_gc(self):
""" Repository has two tags with shared images. Deleting the tag should not remove any images """ Repository has two tags with shared images. Deleting the tag should not remove any images
""" """
with assert_no_new_dangling_storages():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])
self._set_tag_expiration_policy(repository.namespace_user.username, 60*60*24) self._set_tag_expiration_policy(repository.namespace_user.username, 60*60*24)
@ -279,10 +322,12 @@ class TestGarbageCollection(unittest.TestCase):
self.assertNotDeleted(repository, 'i2', 'i3') self.assertNotDeleted(repository, 'i2', 'i3')
self.assertNotDeleted(repository, 'i1', 'f1') self.assertNotDeleted(repository, 'i1', 'f1')
def test_time_machine_gc(self): def test_time_machine_gc(self):
""" Repository has two tags with shared images. Deleting the second tag should cause the images """ Repository has two tags with shared images. Deleting the second tag should cause the images
for the first deleted tag to gc. for the first deleted tag to gc.
""" """
with assert_no_new_dangling_storages():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])
self._set_tag_expiration_policy(repository.namespace_user.username, 1) self._set_tag_expiration_policy(repository.namespace_user.username, 1)
@ -297,7 +342,9 @@ class TestGarbageCollection(unittest.TestCase):
self.assertDeleted(repository, 'i2', 'i3') self.assertDeleted(repository, 'i2', 'i3')
self.assertNotDeleted(repository, 'i1', 'f1') self.assertNotDeleted(repository, 'i1', 'f1')
def test_manifest_gc(self): def test_manifest_gc(self):
with assert_no_new_dangling_storages():
repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1']) repository = self.createRepository(latest=['i1', 'i2', 'i3'], other=['i1', 'f1'])
_generate_and_store_manifest(ADMIN_ACCESS_USER, REPO, 'latest') _generate_and_store_manifest(ADMIN_ACCESS_USER, REPO, 'latest')