diff --git a/data/model/repository.py b/data/model/repository.py index 9ddf4e6d8..a27234314 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -1,14 +1,16 @@ import logging +import random -from peewee import JOIN_LEFT_OUTER, fn from datetime import timedelta, datetime +from peewee import JOIN_LEFT_OUTER, fn +from cachetools import ttl_cache from data.model import (DataModelException, tag, db_transaction, storage, permission, - _basequery, config) + _basequery) from data.database import (Repository, Namespace, RepositoryTag, Star, Image, User, - Visibility, RepositoryPermission, TupleSelector, RepositoryActionCount, + Visibility, RepositoryPermission, RepositoryActionCount, Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage, - db_for_update, get_epoch_timestamp, db_random_func) + get_epoch_timestamp, db_random_func) logger = logging.getLogger(__name__) @@ -71,8 +73,24 @@ def purge_repository(namespace_name, repository_name): fetched.delete_instance(recursive=True, delete_nullable=False) -def find_repository_with_garbage(): - epoch_timestamp = get_epoch_timestamp() +@ttl_cache(maxsize=1, ttl=600) +def _get_gc_expiration_policies(): + policy_tuples_query = (Namespace + .select(Namespace.removed_tag_expiration_s) + .distinct() + .limit(100) # This sucks but it's the only way to limit memory + .tuples()) + return [policy[0] for policy in policy_tuples_query] + + +def get_random_gc_policy(): + """ Return a single random policy from the database to use when garbage collecting. 
+ """ + return random.choice(_get_gc_expiration_policies()) + + +def find_repository_with_garbage(limit_to_gc_policy_s): + expiration_timestamp = get_epoch_timestamp() - limit_to_gc_policy_s try: candidates = (RepositoryTag @@ -80,8 +98,8 @@ def find_repository_with_garbage(): .join(Repository) .join(Namespace, on=(Repository.namespace_user == Namespace.id)) .where(~(RepositoryTag.lifetime_end_ts >> None), - (RepositoryTag.lifetime_end_ts <= - (epoch_timestamp - Namespace.removed_tag_expiration_s))) + (RepositoryTag.lifetime_end_ts <= expiration_timestamp), + (Namespace.removed_tag_expiration_s == limit_to_gc_policy_s)) .limit(500) .distinct() .alias('candidates')) diff --git a/test/test_gc.py b/test/test_gc.py index 2c438c1bc..e7646dc99 100644 --- a/test/test_gc.py +++ b/test/test_gc.py @@ -1,8 +1,6 @@ import unittest import time -from peewee import fn, JOIN_LEFT_OUTER - from app import app, storage from initdb import setup_database_for_testing, finished_database_for_testing from data import model, database @@ -166,13 +164,13 @@ class TestGarbageCollection(unittest.TestCase): repository = self.createRepository(latest=['i1', 'i2', 'i3']) # Ensure that no repositories are returned by the has garbage check. - self.assertIsNone(model.repository.find_repository_with_garbage()) + self.assertIsNone(model.repository.find_repository_with_garbage(1000000000)) # Delete a tag. self.deleteTag(repository, 'latest', perform_gc=False) # There should still not be any repositories with garbage, due to time machine. - self.assertIsNone(model.repository.find_repository_with_garbage()) + self.assertIsNone(model.repository.find_repository_with_garbage(1000000000)) # Change the time machine expiration on the namespace. (database.User.update(removed_tag_expiration_s=0) @@ -180,7 +178,7 @@ class TestGarbageCollection(unittest.TestCase): .execute()) # Now we should find the repository for GC. 
- repository = model.repository.find_repository_with_garbage() + repository = model.repository.find_repository_with_garbage(0) self.assertIsNotNone(repository) self.assertEquals(REPO, repository.name) @@ -188,7 +186,7 @@ class TestGarbageCollection(unittest.TestCase): model.repository.garbage_collect_repository(repository.namespace_user.username, repository.name) # There should now be no repositories with garbage. - self.assertIsNone(model.repository.find_repository_with_garbage()) + self.assertIsNone(model.repository.find_repository_with_garbage(0)) def test_one_tag(self): diff --git a/workers/gcworker.py b/workers/gcworker.py index 0c4e17465..d7ba3fcf8 100644 --- a/workers/gcworker.py +++ b/workers/gcworker.py @@ -1,7 +1,8 @@ import logging from app import app -from data.model.repository import find_repository_with_garbage, garbage_collect_repo +from data.model.repository import (find_repository_with_garbage, garbage_collect_repo, + get_random_gc_policy) from workers.worker import Worker logger = logging.getLogger(__name__) @@ -14,7 +15,7 @@ class GarbageCollectionWorker(Worker): def _garbage_collection_repos(self): """ Performs garbage collection on repositories. """ - repository = find_repository_with_garbage() + repository = find_repository_with_garbage(get_random_gc_policy()) if repository is None: logger.debug('No repository with garbage found') return