import logging from peewee import (JOIN_LEFT_OUTER, CharField, BigIntegerField, BooleanField, ForeignKeyField, IntegerField, IntegrityError, fn) from data.database import BaseModel, CloseForLongOperation from data.fields import Base64BinaryField from app import app, storage from digest import checksums from util.migrate.allocator import yield_random_entries from util.registry.torrent import PieceHasher from util.registry.filelike import wrap_with_handler BATCH_SIZE = 1000 logger = logging.getLogger(__name__) # Vendor the information from tables we will be writing to at the time of this migration class ImageStorage(BaseModel): uuid = CharField(index=True, unique=True) checksum = CharField(null=True) image_size = BigIntegerField(null=True) uncompressed_size = BigIntegerField(null=True) uploading = BooleanField(default=True, null=True) cas_path = BooleanField(default=True) content_checksum = CharField(null=True, index=True) class ImageStorageLocation(BaseModel): name = CharField(unique=True, index=True) class ImageStoragePlacement(BaseModel): storage = ForeignKeyField(ImageStorage) location = ForeignKeyField(ImageStorageLocation) class TorrentInfo(BaseModel): storage = ForeignKeyField(ImageStorage) piece_length = IntegerField() pieces = Base64BinaryField() def _get_image_storage_locations(storage_id): placements_query = (ImageStoragePlacement .select(ImageStoragePlacement, ImageStorageLocation) .join(ImageStorageLocation) .switch(ImageStoragePlacement) .join(ImageStorage, JOIN_LEFT_OUTER) .where(ImageStorage.id == storage_id)) locations = set() for placement in placements_query: locations.add(placement.location.name) return locations def _get_layer_path(storage_record): """ Returns the path in the storage engine to the layer data referenced by the storage row. """ if not storage_record.cas_path: logger.debug('Serving layer from legacy v1 path: %s', storage_record.uuid) return storage.v1_image_layer_path(storage_record.uuid) return storage.blob_path(storage_record.content_checksum) def backfill_content_checksums_and_torrent_pieces(piece_length): """ Hashes the entire file for the content associated with an imagestorage. """ logger.debug('Began execution') logger.debug('This may be a long operation!') def batch_query(): return (ImageStorage .select(ImageStorage.id, ImageStorage.uuid, ImageStorage.content_checksum, ImageStorage.cas_path) .join(TorrentInfo, JOIN_LEFT_OUTER, on=((TorrentInfo.storage == ImageStorage.id) & (TorrentInfo.piece_length == piece_length))) .where((TorrentInfo.id >> None) | (ImageStorage.content_checksum >> None))) max_id = ImageStorage.select(fn.Max(ImageStorage.id)).scalar() checksums_written = 0 pieces_written = 0 for candidate_storage, abort in yield_random_entries(batch_query, ImageStorage.id, BATCH_SIZE, max_id): locations = _get_image_storage_locations(candidate_storage.id) checksum = candidate_storage.content_checksum torrent_pieces = '' with CloseForLongOperation(app.config): try: # Compute the checksum layer_path = _get_layer_path(candidate_storage) with storage.stream_read_file(locations, layer_path) as layer_data_handle: hasher = PieceHasher(piece_length) wrapped = wrap_with_handler(layer_data_handle, hasher.update) checksum = 'sha256:{0}'.format(checksums.sha256_file(wrapped)) torrent_pieces = hasher.final_piece_hashes() except Exception as exc: logger.exception('Unable to compute hashes for storage: %s', candidate_storage.uuid) # Create a fallback value for the checksum if checksum is None: checksum = 'unknown:{0}'.format(exc.__class__.__name__) torrent_collision = False checksum_collision = False # Now update the ImageStorage with the checksum num_updated = (ImageStorage .update(content_checksum=checksum) .where(ImageStorage.id == candidate_storage.id, ImageStorage.content_checksum >> None)).execute() checksums_written += num_updated if num_updated == 0: checksum_collision = True try: TorrentInfo.create(storage=candidate_storage.id, piece_length=piece_length, pieces=torrent_pieces) pieces_written += 1 except IntegrityError: torrent_collision = True if torrent_collision and checksum_collision: logger.info('Another worker pre-empted us for storage: %s', candidate_storage.uuid) abort.set() if (pieces_written % BATCH_SIZE) == 0 or (checksums_written % BATCH_SIZE) == 0: logger.debug('%s checksums written, %s torrent pieces written', checksums_written, pieces_written) logger.debug('Completed, %s checksums written, %s torrent pieces written', checksums_written, pieces_written) if __name__ == '__main__': logging.basicConfig(level=logging.DEBUG) #logging.getLogger('peewee').setLevel(logging.WARNING) logging.getLogger('boto').setLevel(logging.WARNING) logging.getLogger('data.database').setLevel(logging.WARNING) backfill_content_checksums_and_torrent_pieces(app.config['BITTORRENT_PIECE_SIZE'])