From 45208983bf3d3e62be33e7f35a90f95405790d18 Mon Sep 17 00:00:00 2001
From: Joseph Schorr
Date: Fri, 3 Oct 2014 15:07:50 -0400
Subject: [PATCH] Update the backfill script to always read the size from the layer data

---
 tools/uncompressedsize.py | 106 +++++++++++---------------------------
 1 file changed, 30 insertions(+), 76 deletions(-)

diff --git a/tools/uncompressedsize.py b/tools/uncompressedsize.py
index 56a22756b..2e8955761 100644
--- a/tools/uncompressedsize.py
+++ b/tools/uncompressedsize.py
@@ -1,92 +1,47 @@
 import json
 import logging
+import zlib
 
 from data import model
 from data.database import ImageStorage
 from app import app, storage as store
 from data.database import db
-from gzip import GzipFile
-from tempfile import SpooledTemporaryFile
+from util.gzipstream import ZLIB_GZIP_WINDOW
 
 
 logger = logging.getLogger(__name__)
 
-
-def backfill_sizes_from_json():
-  query = (ImageStorage
-           .select()
-           .where(ImageStorage.uncompressed_size == None, ImageStorage.uploading == False)
-           .limit(100))
-
-  total = 0
-  missing = 0
-  batch_processed = 1
-
-  while batch_processed > 0:
-    batch_processed = 0
-    with app.config['DB_TRANSACTION_FACTORY'](db):
-      for image_storage in query.clone():
-        total += 1
-        batch_processed += 1
-
-        if (total - 1) % 100 == 0:
-          logger.debug('Storing entry: %s', total)
-
-        # Lookup the JSON for the image.
-        uuid = image_storage.uuid
-        with_locations = model.get_storage_by_uuid(uuid)
-
-        try:
-          json_string = store.get_content(with_locations.locations, store.image_json_path(uuid))
-          json_data = json.loads(json_string)
-          size = json_data.get('Size', json_data.get('size', -1))
-        except IOError:
-          logger.debug('Image storage with no json %s', uuid)
-          size = -1
-
-        if size == -1:
-          missing += 1
-
-        logger.debug('Missing entry %s (%s/%s)', uuid, missing, total)
-
-        image_storage.uncompressed_size = size
-        image_storage.save()
-
-
 def backfill_sizes_from_data():
-  storage_ids = list(ImageStorage
-                     .select(ImageStorage.uuid)
-                     .where(ImageStorage.uncompressed_size == -1, ImageStorage.uploading == False))
-
-  counter = 0
-  for uuid in [s.uuid for s in storage_ids]:
-    counter += 1
-
-    # Load the storage with locations.
-    logger.debug('Loading entry: %s (%s/%s)', uuid, counter, len(storage_ids))
-    with_locations = model.get_storage_by_uuid(uuid)
-    layer_size = -2
-
-    # Read the layer from backing storage and calculate the uncompressed size.
+  while True:
+    # Load the record from the DB.
     try:
-      logger.debug('Loading data: %s (%s bytes)', uuid, with_locations.image_size)
-      CHUNK_SIZE = 512 * 1024
-      with SpooledTemporaryFile(CHUNK_SIZE) as tarball:
-        layer_data = store.get_content(with_locations.locations, store.image_layer_path(uuid))
-        tarball.write(layer_data)
-        tarball.seek(0)
+      record = (ImageStorage
+                .select(ImageStorage.uuid)
+                .where(ImageStorage.uncompressed_size == None, ImageStorage.uploading == False)
+                .get())
+    except ImageStorage.DoesNotExist:
+      # We're done!
+      return
 
-        with GzipFile(fileobj=tarball, mode='rb') as gzip_file:
-          gzip_file.read()
-          layer_size = gzip_file.size
+    uuid = record.uuid
+
+    # Read the layer from backing storage and calculate the uncompressed size.
+    logger.debug('Loading data: %s (%s bytes)', uuid, with_locations.image_size)
+    decompressor = zlib.decompressobj(ZLIB_GZIP_WINDOW)
+    stream = store.read_stream(with_locations.locations, store.image_layer_path(uuid))
+
+    uncompressed_size = 0
+    CHUNK_SIZE = 512 * 1024 * 1024
+    while True:
+      current_data = stream.read(CHUNK_SIZE)
+      if len(current_data) == 0:
+        break
+
+      uncompressed_size += len(decompressor.decompress(current_data))
 
-    except Exception as ex:
-      logger.debug('Could not gunzip entry: %s. Reason: %s', uuid, ex)
-      continue
-
     # Write the size to the image storage. We do so under a transaction AFTER checking to
     # make sure the image storage still exists and has not changed.
-    logger.debug('Writing entry: %s. Size: %s', uuid, layer_size)
+    logger.debug('Writing entry: %s. Size: %s', uuid, uncompressed_size)
     with app.config['DB_TRANSACTION_FACTORY'](db):
       try:
        current_record = model.get_storage_by_uuid(uuid)
@@ -94,14 +49,13 @@ def backfill_sizes_from_data():
         # Record no longer exists.
         continue
 
-      if not current_record.uploading and current_record.uncompressed_size == -1:
-        current_record.uncompressed_size = layer_size
-        current_record.save()
+      if not current_record.uploading and current_record.uncompressed_size == None:
+        current_record.uncompressed_size = uncompressed_size
+        #current_record.save()
 
 
 if __name__ == "__main__":
   logging.basicConfig(level=logging.DEBUG)
   logging.getLogger('boto').setLevel(logging.CRITICAL)
 
-  backfill_sizes_from_json()
   backfill_sizes_from_data()
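
For reference, below is a minimal standalone sketch of the streaming size calculation the new backfill loop performs. It assumes ZLIB_GZIP_WINDOW from util/gzipstream.py is the usual "auto-detect the gzip header" window value (zlib.MAX_WBITS | 32), and it reads from an ordinary local file instead of store.read_stream() so it can run on its own. Unlike the old GzipFile/SpooledTemporaryFile approach, nothing is spooled to a temporary file and only one compressed chunk is held in memory at a time.

import zlib

# Assumed stand-in for util.gzipstream.ZLIB_GZIP_WINDOW: MAX_WBITS | 32 tells
# zlib to detect and skip the gzip header automatically.
ZLIB_GZIP_WINDOW = zlib.MAX_WBITS | 32

CHUNK_SIZE = 512 * 1024  # read the compressed stream 512KB at a time


def uncompressed_size_of(stream):
  # Feed compressed chunks through a streaming decompressor and count the
  # decompressed bytes; the full layer is never buffered.
  decompressor = zlib.decompressobj(ZLIB_GZIP_WINDOW)
  size = 0
  while True:
    chunk = stream.read(CHUNK_SIZE)
    if not chunk:
      break
    size += len(decompressor.decompress(chunk))
  return size


if __name__ == "__main__":
  # Example usage against a local layer tarball (hypothetical filename).
  with open('layer.tar.gz', 'rb') as layer:
    print(uncompressed_size_of(layer))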