Select random records for the uncompressed-size backfill script so that it can now be run in parallel.

2014-10-06 17:15:45 -04:00 · 2014-10-06 17:15:45 -04:00 · 153dbc3f92
commit 153dbc3f92
parent c4266140e2
1 changed files with 39 additions and 34 deletions
--- a/tools/uncompressedsize.py
+++ b/tools/uncompressedsize.py
@@ -4,7 +4,7 @@ import zlib
 from data import model
 from data.database import ImageStorage
 from app import app, storage as store
-from data.database import db
+from data.database import db, db_random_func
 from util.gzipstream import ZLIB_GZIP_WINDOW


@@ -17,18 +17,23 @@ CHUNK_SIZE = 5 * 1024 * 1024
 def backfill_sizes_from_data():
  while True:
    # Load the record from the DB.
-    try:
-      record = (ImageStorage
+    batch_ids = list(ImageStorage
                     .select(ImageStorage.uuid)
-                .where(ImageStorage.uncompressed_size >> None, ImageStorage.uploading == False)
-                .get())
-    except ImageStorage.DoesNotExist:
+                     .where(ImageStorage.uncompressed_size >> None,
+                            ImageStorage.uploading == False)
+                     .limit(100)
+                     .order_by(db_random_func()))
+    if len(batch_ids) == 0:
      # We're done!
      return

+    for record in batch_ids:
      uuid = record.uuid

      with_locations = model.get_storage_by_uuid(uuid)
+      if with_locations.uncompressed_size is not None:
+        logger.debug('Somebody else already filled this in for us: %s', uuid)
+        continue

      # Read the layer from backing storage and calculate the uncompressed size.
      logger.debug('Loading data: %s (%s bytes)', uuid, with_locations.image_size)