Get rid of remaining slow query for garbage collection.

This commit is contained in:
Jake Moshenko 2016-08-01 18:22:38 -04:00
parent b0bffe56ca
commit 05e2773fa7
3 changed files with 33 additions and 16 deletions

View file

@ -1,14 +1,16 @@
import logging import logging
import random
from peewee import JOIN_LEFT_OUTER, fn
from datetime import timedelta, datetime from datetime import timedelta, datetime
from peewee import JOIN_LEFT_OUTER, fn
from cachetools import ttl_cache
from data.model import (DataModelException, tag, db_transaction, storage, permission, from data.model import (DataModelException, tag, db_transaction, storage, permission,
_basequery, config) _basequery)
from data.database import (Repository, Namespace, RepositoryTag, Star, Image, User, from data.database import (Repository, Namespace, RepositoryTag, Star, Image, User,
Visibility, RepositoryPermission, TupleSelector, RepositoryActionCount, Visibility, RepositoryPermission, RepositoryActionCount,
Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage, Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage,
db_for_update, get_epoch_timestamp, db_random_func) get_epoch_timestamp, db_random_func)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -71,8 +73,24 @@ def purge_repository(namespace_name, repository_name):
fetched.delete_instance(recursive=True, delete_nullable=False) fetched.delete_instance(recursive=True, delete_nullable=False)
def find_repository_with_garbage(): @ttl_cache(maxsize=1, ttl=600)
epoch_timestamp = get_epoch_timestamp() def _get_gc_expiration_policies():
policy_tuples_query = (Namespace
.select(Namespace.removed_tag_expiration_s)
.distinct()
.limit(100) # This sucks but it's the only way to limit memory
.tuples())
return [policy[0] for policy in policy_tuples_query]
def get_random_gc_policy():
  """ Pick one garbage collection expiration policy at random.

      The candidates are whatever _get_gc_expiration_policies() returns, i.e. the distinct
      Namespace.removed_tag_expiration_s values it loaded. Note that random.choice raises
      IndexError if that list is empty.
  """
  available_policies = _get_gc_expiration_policies()
  return random.choice(available_policies)
def find_repository_with_garbage(limit_to_gc_policy_s):
expiration_timestamp = get_epoch_timestamp() - limit_to_gc_policy_s
try: try:
candidates = (RepositoryTag candidates = (RepositoryTag
@ -80,8 +98,8 @@ def find_repository_with_garbage():
.join(Repository) .join(Repository)
.join(Namespace, on=(Repository.namespace_user == Namespace.id)) .join(Namespace, on=(Repository.namespace_user == Namespace.id))
.where(~(RepositoryTag.lifetime_end_ts >> None), .where(~(RepositoryTag.lifetime_end_ts >> None),
(RepositoryTag.lifetime_end_ts <= (RepositoryTag.lifetime_end_ts <= expiration_timestamp),
(epoch_timestamp - Namespace.removed_tag_expiration_s))) (Namespace.removed_tag_expiration_s == limit_to_gc_policy_s))
.limit(500) .limit(500)
.distinct() .distinct()
.alias('candidates')) .alias('candidates'))

View file

@ -1,8 +1,6 @@
import unittest import unittest
import time import time
from peewee import fn, JOIN_LEFT_OUTER
from app import app, storage from app import app, storage
from initdb import setup_database_for_testing, finished_database_for_testing from initdb import setup_database_for_testing, finished_database_for_testing
from data import model, database from data import model, database
@ -166,13 +164,13 @@ class TestGarbageCollection(unittest.TestCase):
repository = self.createRepository(latest=['i1', 'i2', 'i3']) repository = self.createRepository(latest=['i1', 'i2', 'i3'])
# Ensure that no repositories are returned by the has garbage check. # Ensure that no repositories are returned by the has garbage check.
self.assertIsNone(model.repository.find_repository_with_garbage()) self.assertIsNone(model.repository.find_repository_with_garbage(1000000000))
# Delete a tag. # Delete a tag.
self.deleteTag(repository, 'latest', perform_gc=False) self.deleteTag(repository, 'latest', perform_gc=False)
# There should still not be any repositories with garbage, due to time machine. # There should still not be any repositories with garbage, due to time machine.
self.assertIsNone(model.repository.find_repository_with_garbage()) self.assertIsNone(model.repository.find_repository_with_garbage(1000000000))
# Change the time machine expiration on the namespace. # Change the time machine expiration on the namespace.
(database.User.update(removed_tag_expiration_s=0) (database.User.update(removed_tag_expiration_s=0)
@ -180,7 +178,7 @@ class TestGarbageCollection(unittest.TestCase):
.execute()) .execute())
# Now we should find the repository for GC. # Now we should find the repository for GC.
repository = model.repository.find_repository_with_garbage() repository = model.repository.find_repository_with_garbage(0)
self.assertIsNotNone(repository) self.assertIsNotNone(repository)
self.assertEquals(REPO, repository.name) self.assertEquals(REPO, repository.name)
@ -188,7 +186,7 @@ class TestGarbageCollection(unittest.TestCase):
model.repository.garbage_collect_repository(repository.namespace_user.username, repository.name) model.repository.garbage_collect_repository(repository.namespace_user.username, repository.name)
# There should now be no repositories with garbage. # There should now be no repositories with garbage.
self.assertIsNone(model.repository.find_repository_with_garbage()) self.assertIsNone(model.repository.find_repository_with_garbage(0))
def test_one_tag(self): def test_one_tag(self):

View file

@ -1,7 +1,8 @@
import logging import logging
from app import app from app import app
from data.model.repository import find_repository_with_garbage, garbage_collect_repo from data.model.repository import (find_repository_with_garbage, garbage_collect_repo,
from workers.worker import Worker from workers.worker import Worker
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -14,7 +15,7 @@ class GarbageCollectionWorker(Worker):
def _garbage_collection_repos(self): def _garbage_collection_repos(self):
""" Performs garbage collection on repositories. """ """ Performs garbage collection on repositories. """
repository = find_repository_with_garbage() repository = find_repository_with_garbage(get_random_gc_policy())
if repository is None: if repository is None:
logger.debug('No repository with garbage found') logger.debug('No repository with garbage found')
return return