Optimistically update backfill items, reducing database round trips (RTs)

This commit is contained in:
Jake Moshenko 2015-11-10 11:10:09 -05:00
parent 493d077f62
commit a33077b978
3 changed files with 59 additions and 43 deletions

View file

@ -2,11 +2,14 @@ import logging
from peewee import (CharField, BigIntegerField, BooleanField, ForeignKeyField, DateTimeField,
TextField, fn)
from data.database import BaseModel, db, db_for_update
from data.database import BaseModel
from util.migrate.allocator import yield_random_entries
from app import app
# Batch size handed to yield_random_entries by backfill_checksums; the same
# value is used there as the interval (in written rows) between progress logs.
BATCH_SIZE = 1000
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
@ -40,30 +43,30 @@ class Image(BaseModel):
def backfill_checksums():
    """ Copies checksums from image storages to their images.

    Candidate images are those whose ``v1_checksum`` is NULL and whose storage
    has finished uploading and has a non-NULL checksum.  Each candidate is
    written with a single conditional UPDATE (``... WHERE id = ? AND
    v1_checksum IS NULL``) instead of a transaction that re-reads the row
    under a lock, so every row costs one database round trip.

    If the UPDATE touches zero rows, the row was filled in after it was
    selected (presumably by a concurrent worker -- the log message says so),
    and the ``abort`` event supplied by ``yield_random_entries`` is set to
    give up on the current batch.

    Returns:
        None.  Progress and the final count are reported through ``logger``.
    """
    logger.debug('Began execution')
    logger.debug('This may be a long operation!')

    def batch_query():
        # Select the storage alongside the image so that
        # ``candidate_image.storage.checksum`` below needs no extra query.
        return (Image
                .select(Image, ImageStorage)
                .join(ImageStorage)
                # peewee builds SQL from these operators: ``== False`` and
                # ``>> None`` (IS NULL) must not be rewritten as ``not``/``is``.
                .where(Image.v1_checksum >> None,
                       ImageStorage.uploading == False,  # noqa: E712
                       ~(ImageStorage.checksum >> None)))

    max_id = Image.select(fn.Max(Image.id)).scalar()

    written = 0
    for candidate_image, abort in yield_random_entries(batch_query, Image.id, BATCH_SIZE, max_id):
        # Optimistic write: the WHERE clause re-checks that the checksum is
        # still unset, so a row that was already filled is left untouched.
        num_changed = (Image
                       .update(v1_checksum=candidate_image.storage.checksum)
                       .where(Image.id == candidate_image.id,
                              Image.v1_checksum >> None)).execute()

        if num_changed == 0:
            logger.info('Collision with another worker, aborting batch')
            abort.set()
            # Nothing was written, so skip the progress log; otherwise it
            # would report 0 or repeat a milestone that was already logged.
            continue

        written += num_changed
        if (written % BATCH_SIZE) == 0:
            logger.debug('%s entries written', written)

    logger.debug('Completed, updated %s entries', written)
if __name__ == "__main__":