Get rid of remaining slow query for garbage collection.

This commit is contained in:
Jake Moshenko 2016-08-01 18:22:38 -04:00
parent b0bffe56ca
commit 05e2773fa7
3 changed files with 33 additions and 16 deletions

View file

@ -1,14 +1,16 @@
import logging
import random
from peewee import JOIN_LEFT_OUTER, fn
from datetime import timedelta, datetime
from peewee import JOIN_LEFT_OUTER, fn
from cachetools import ttl_cache
from data.model import (DataModelException, tag, db_transaction, storage, permission,
_basequery, config)
_basequery)
from data.database import (Repository, Namespace, RepositoryTag, Star, Image, User,
Visibility, RepositoryPermission, TupleSelector, RepositoryActionCount,
Visibility, RepositoryPermission, RepositoryActionCount,
Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage,
db_for_update, get_epoch_timestamp, db_random_func)
get_epoch_timestamp, db_random_func)
logger = logging.getLogger(__name__)
@ -71,8 +73,24 @@ def purge_repository(namespace_name, repository_name):
fetched.delete_instance(recursive=True, delete_nullable=False)
def find_repository_with_garbage():
epoch_timestamp = get_epoch_timestamp()
@ttl_cache(maxsize=1, ttl=600)
def _get_gc_expiration_policies():
    """ Return the distinct `removed_tag_expiration_s` values found on namespaces.

    The result is memoized by `ttl_cache` (one entry, ttl=600), so the database
    is only consulted again after the cached entry expires.
    """
    distinct_policies = (Namespace
                         .select(Namespace.removed_tag_expiration_s)
                         .distinct()
                         # Cap how many distinct policies are loaded; this is the
                         # only way to bound memory use for the cached list.
                         .limit(100)
                         .tuples())

    # Each row is a one-element tuple holding the selected column.
    return [expiration_s for (expiration_s,) in distinct_policies]
def get_random_gc_policy():
    """ Pick one garbage collection expiration policy at random.

    The candidates come from `_get_gc_expiration_policies()`. Note that
    `random.choice` raises IndexError when that list is empty.
    """
    available_policies = _get_gc_expiration_policies()
    return random.choice(available_policies)
def find_repository_with_garbage(limit_to_gc_policy_s):
expiration_timestamp = get_epoch_timestamp() - limit_to_gc_policy_s
try:
candidates = (RepositoryTag
@ -80,8 +98,8 @@ def find_repository_with_garbage():
.join(Repository)
.join(Namespace, on=(Repository.namespace_user == Namespace.id))
.where(~(RepositoryTag.lifetime_end_ts >> None),
(RepositoryTag.lifetime_end_ts <=
(epoch_timestamp - Namespace.removed_tag_expiration_s)))
(RepositoryTag.lifetime_end_ts <= expiration_timestamp),
(Namespace.removed_tag_expiration_s == limit_to_gc_policy_s))
.limit(500)
.distinct()
.alias('candidates'))

View file

@ -1,8 +1,6 @@
import unittest
import time
from peewee import fn, JOIN_LEFT_OUTER
from app import app, storage
from initdb import setup_database_for_testing, finished_database_for_testing
from data import model, database
@ -166,13 +164,13 @@ class TestGarbageCollection(unittest.TestCase):
repository = self.createRepository(latest=['i1', 'i2', 'i3'])
# Ensure that no repositories are returned by the has garbage check.
self.assertIsNone(model.repository.find_repository_with_garbage())
self.assertIsNone(model.repository.find_repository_with_garbage(1000000000))
# Delete a tag.
self.deleteTag(repository, 'latest', perform_gc=False)
# There should still not be any repositories with garbage, due to time machine.
self.assertIsNone(model.repository.find_repository_with_garbage())
self.assertIsNone(model.repository.find_repository_with_garbage(1000000000))
# Change the time machine expiration on the namespace.
(database.User.update(removed_tag_expiration_s=0)
@ -180,7 +178,7 @@ class TestGarbageCollection(unittest.TestCase):
.execute())
# Now we should find the repository for GC.
repository = model.repository.find_repository_with_garbage()
repository = model.repository.find_repository_with_garbage(0)
self.assertIsNotNone(repository)
self.assertEquals(REPO, repository.name)
@ -188,7 +186,7 @@ class TestGarbageCollection(unittest.TestCase):
model.repository.garbage_collect_repository(repository.namespace_user.username, repository.name)
# There should now be no repositories with garbage.
self.assertIsNone(model.repository.find_repository_with_garbage())
self.assertIsNone(model.repository.find_repository_with_garbage(0))
def test_one_tag(self):

View file

@ -1,7 +1,8 @@
import logging
from app import app
from data.model.repository import find_repository_with_garbage, garbage_collect_repo
from data.model.repository import (find_repository_with_garbage, garbage_collect_repo,
get_random_gc_policy)
from workers.worker import Worker
logger = logging.getLogger(__name__)
@ -14,7 +15,7 @@ class GarbageCollectionWorker(Worker):
def _garbage_collection_repos(self):
""" Performs garbage collection on repositories. """
repository = find_repository_with_garbage()
repository = find_repository_with_garbage(get_random_gc_policy())
if repository is None:
logger.debug('No repository with garbage found')
return