Make sure GC checks new Blob table as well before deleting CAS storage

This commit is contained in:
Joseph Schorr 2017-03-22 23:45:46 -04:00
parent 3d0e63d8e5
commit bdda74d6df
2 changed files with 60 additions and 7 deletions

View file

@ -8,7 +8,7 @@ from data.model import (config, db_transaction, InvalidImageException, TorrentIn
DataModelException, _basequery)
from data.database import (ImageStorage, Image, ImageStoragePlacement, ImageStorageLocation,
ImageStorageTransformation, ImageStorageSignature,
ImageStorageSignatureKind, Repository, Namespace, TorrentInfo,
ImageStorageSignatureKind, Repository, Namespace, TorrentInfo, Blob,
ensure_under_transaction)
@ -95,15 +95,24 @@ def garbage_collect_storage(storage_id_whitelist):
unreferenced_checksums = set()
if content_checksums:
# Check the current image storage.
query = (ImageStorage
.select(ImageStorage.content_checksum)
.where(ImageStorage.content_checksum << list(content_checksums)))
referenced_checksums = set([image_storage.content_checksum for image_storage in query])
if referenced_checksums:
logger.warning('GC attempted to remove CAS checksums %s, which are still referenced',
referenced_checksums)
is_referenced_checksums = set([image_storage.content_checksum for image_storage in query])
if is_referenced_checksums:
logger.warning('GC attempted to remove CAS checksums %s, which are still IS referenced',
is_referenced_checksums)
unreferenced_checksums = content_checksums - referenced_checksums
# Check the new Blob table as well.
query = Blob.select(Blob.digest).where(Blob.digest << list(content_checksums))
blob_referenced_checksums = set([blob.digest for blob in query])
if blob_referenced_checksums:
logger.warning('GC attempted to remove CAS checksums %s, which are still Blob referenced',
blob_referenced_checksums)
unreferenced_checksums = (content_checksums - blob_referenced_checksums -
is_referenced_checksums)
# Return all placements for all image storages found not at a CAS path or with a content
# checksum that is referenced.

View file

@ -8,7 +8,7 @@ from playhouse.test_utils import assert_query_count
from app import app, storage
from initdb import setup_database_for_testing, finished_database_for_testing
from data import model, database
from data.database import Image, ImageStorage, DerivedStorageForImage, Label, TagManifestLabel
from data.database import Image, ImageStorage, DerivedStorageForImage, Label, TagManifestLabel, Blob
ADMIN_ACCESS_USER = 'devtable'
@ -190,6 +190,9 @@ class TestGarbageCollection(unittest.TestCase):
if storage_row.cas_path:
storage.get_content({preferred}, storage.blob_path(storage_row.content_checksum))
for blob_row in Blob.select():
storage.get_content({preferred}, storage.blob_path(blob_row.digest))
def test_has_garbage(self):
""" Remove all existing repositories, then add one without garbage, check, then add one with
garbage, and check again.
@ -502,7 +505,48 @@ class TestGarbageCollection(unittest.TestCase):
# Ensure the CAS path still exists.
self.assertTrue(storage.exists({preferred}, storage.blob_path(digest)))
def test_images_shared_cas_with_new_blob_table(self):
""" A repository with a tag and image that shares its CAS path with a record in the new Blob
table. Deleting the first tag should delete the first image, and its storage, but not the
file in storage, as it shares its CAS path with the blob row.
"""
with self.assert_gc_integrity(expect_storage_removed=True):
repository = self.createRepository()
# Create two image storage records with the same content checksum.
content = 'hello world'
digest = 'sha256:' + hashlib.sha256(content).hexdigest()
preferred = storage.preferred_locations[0]
storage.put_content({preferred}, storage.blob_path(digest), content)
media_type = database.MediaType.get(name='text/plain')
is1 = database.ImageStorage.create(content_checksum=digest, uploading=False)
is2 = database.Blob.create(digest=digest, size=0, media_type=media_type)
location = database.ImageStorageLocation.get(name=preferred)
database.ImageStoragePlacement.create(location=location, storage=is1)
# Ensure the CAS path exists.
self.assertTrue(storage.exists({preferred}, storage.blob_path(digest)))
# Create the image in the repository, and the tag.
first_image = Image.create(docker_image_id='i1',
repository=repository, storage=is1,
ancestors='/')
model.tag.store_tag_manifest(repository.namespace_user.username, repository.name,
'first', first_image.docker_image_id,
'sha:someshahere1', '{}')
self.assertNotDeleted(repository, 'i1')
# Delete the tag.
self.deleteTag(repository, 'first')
self.assertDeleted(repository, 'i1')
# Ensure the CAS path still exists, as it is referenced by the Blob table.
self.assertTrue(storage.exists({preferred}, storage.blob_path(digest)))
if __name__ == '__main__':
unittest.main()