From bdda74d6dfa641d82e96b31a4db4162af95e62b7 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 22 Mar 2017 23:45:46 -0400 Subject: [PATCH] Make sure GC checks new Blob table as well before deleting CAS storage --- data/model/storage.py | 21 ++++++++++++++------ test/test_gc.py | 46 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/data/model/storage.py b/data/model/storage.py index 79ab529bb..112b9cc5f 100644 --- a/data/model/storage.py +++ b/data/model/storage.py @@ -8,7 +8,7 @@ from data.model import (config, db_transaction, InvalidImageException, TorrentIn DataModelException, _basequery) from data.database import (ImageStorage, Image, ImageStoragePlacement, ImageStorageLocation, ImageStorageTransformation, ImageStorageSignature, - ImageStorageSignatureKind, Repository, Namespace, TorrentInfo, + ImageStorageSignatureKind, Repository, Namespace, TorrentInfo, Blob, ensure_under_transaction) @@ -95,15 +95,24 @@ def garbage_collect_storage(storage_id_whitelist): unreferenced_checksums = set() if content_checksums: + # Check the current image storage. query = (ImageStorage .select(ImageStorage.content_checksum) .where(ImageStorage.content_checksum << list(content_checksums))) - referenced_checksums = set([image_storage.content_checksum for image_storage in query]) - if referenced_checksums: - logger.warning('GC attempted to remove CAS checksums %s, which are still referenced', - referenced_checksums) + is_referenced_checksums = set([image_storage.content_checksum for image_storage in query]) + if is_referenced_checksums: + logger.warning('GC attempted to remove CAS checksums %s, which are still IS referenced', + is_referenced_checksums) - unreferenced_checksums = content_checksums - referenced_checksums + # Check the new Blob table as well. + query = Blob.select(Blob.digest).where(Blob.digest << list(content_checksums)) + blob_referenced_checksums = set([blob.digest for blob in query]) + if blob_referenced_checksums: + logger.warning('GC attempted to remove CAS checksums %s, which are still Blob referenced', + blob_referenced_checksums) + + unreferenced_checksums = (content_checksums - blob_referenced_checksums - + is_referenced_checksums) # Return all placements for all image storages found not at a CAS path or with a content # checksum that is referenced. diff --git a/test/test_gc.py b/test/test_gc.py index c8ed5ecaa..1e5369675 100644 --- a/test/test_gc.py +++ b/test/test_gc.py @@ -8,7 +8,7 @@ from playhouse.test_utils import assert_query_count from app import app, storage from initdb import setup_database_for_testing, finished_database_for_testing from data import model, database -from data.database import Image, ImageStorage, DerivedStorageForImage, Label, TagManifestLabel +from data.database import Image, ImageStorage, DerivedStorageForImage, Label, TagManifestLabel, Blob ADMIN_ACCESS_USER = 'devtable' @@ -190,6 +190,9 @@ class TestGarbageCollection(unittest.TestCase): if storage_row.cas_path: storage.get_content({preferred}, storage.blob_path(storage_row.content_checksum)) + for blob_row in Blob.select(): + storage.get_content({preferred}, storage.blob_path(blob_row.digest)) + def test_has_garbage(self): """ Remove all existing repositories, then add one without garbage, check, then add one with garbage, and check again. @@ -502,7 +505,48 @@ class TestGarbageCollection(unittest.TestCase): # Ensure the CAS path still exists. self.assertTrue(storage.exists({preferred}, storage.blob_path(digest))) + def test_images_shared_cas_with_new_blob_table(self): + """ A repository with a tag and image that shares its CAS path with a record in the new Blob + table. Deleting the first tag should delete the first image, and its storage, but not the + file in storage, as it shares its CAS path with the blob row. + """ + with self.assert_gc_integrity(expect_storage_removed=True): + repository = self.createRepository() + # Create two image storage records with the same content checksum. + content = 'hello world' + digest = 'sha256:' + hashlib.sha256(content).hexdigest() + preferred = storage.preferred_locations[0] + storage.put_content({preferred}, storage.blob_path(digest), content) + + media_type = database.MediaType.get(name='text/plain') + + is1 = database.ImageStorage.create(content_checksum=digest, uploading=False) + is2 = database.Blob.create(digest=digest, size=0, media_type=media_type) + + location = database.ImageStorageLocation.get(name=preferred) + database.ImageStoragePlacement.create(location=location, storage=is1) + + # Ensure the CAS path exists. + self.assertTrue(storage.exists({preferred}, storage.blob_path(digest))) + + # Create the image in the repository, and the tag. + first_image = Image.create(docker_image_id='i1', + repository=repository, storage=is1, + ancestors='/') + + model.tag.store_tag_manifest(repository.namespace_user.username, repository.name, + 'first', first_image.docker_image_id, + 'sha:someshahere1', '{}') + + self.assertNotDeleted(repository, 'i1') + + # Delete the tag. + self.deleteTag(repository, 'first') + self.assertDeleted(repository, 'i1') + + # Ensure the CAS path still exists, as it is referenced by the Blob table. + self.assertTrue(storage.exists({preferred}, storage.blob_path(digest))) if __name__ == '__main__': unittest.main()