Optimistically update backfill items, reducing database round trips (RTs)

This commit is contained in:
Jake Moshenko 2015-11-10 11:10:09 -05:00
parent 493d077f62
commit a33077b978
3 changed files with 59 additions and 43 deletions

View file

@ -5,12 +5,15 @@ from peewee import JOIN_LEFT_OUTER
from peewee import (CharField, BigIntegerField, BooleanField, ForeignKeyField, DateTimeField, from peewee import (CharField, BigIntegerField, BooleanField, ForeignKeyField, DateTimeField,
TextField, fn) TextField, fn)
from data.database import BaseModel, db, db_for_update, CloseForLongOperation from data.database import BaseModel, CloseForLongOperation
from app import app, storage from app import app, storage
from digest import checksums from digest import checksums
from util.migrate.allocator import yield_random_entries from util.migrate.allocator import yield_random_entries
BATCH_SIZE = 1000
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -69,7 +72,8 @@ def _get_image_storage_locations(storage_id):
def backfill_content_checksums(): def backfill_content_checksums():
""" Copies metadata from image storages to their images. """ """ Copies metadata from image storages to their images. """
logger.debug('Image content checksum backfill: Began execution') logger.debug('Began execution')
logger.debug('This may be a long operation!')
def batch_query(): def batch_query():
return (ImageStorage return (ImageStorage
@ -78,9 +82,9 @@ def backfill_content_checksums():
max_id = ImageStorage.select(fn.Max(ImageStorage.id)).scalar() max_id = ImageStorage.select(fn.Max(ImageStorage.id)).scalar()
for candidate_storage, abort in yield_random_entries(batch_query, ImageStorage.id, 1000, max_id): written = 0
logger.debug('Computing content checksum for storage: %s', candidate_storage.uuid) for candidate_storage, abort in yield_random_entries(batch_query, ImageStorage.id, BATCH_SIZE,
max_id):
locations = _get_image_storage_locations(candidate_storage.id) locations = _get_image_storage_locations(candidate_storage.id)
checksum = None checksum = None
@ -95,15 +99,19 @@ def backfill_content_checksums():
checksum = 'unknown:{0}'.format(exc.__class__.__name__) checksum = 'unknown:{0}'.format(exc.__class__.__name__)
# Now update the ImageStorage with the checksum # Now update the ImageStorage with the checksum
with app.config['DB_TRANSACTION_FACTORY'](db): num_updated = (ImageStorage
to_update = db_for_update(ImageStorage.get(ImageStorage.id == candidate_storage.id)) .update(content_checksum=checksum)
if to_update.content_checksum is not None: .where(ImageStorage.id == candidate_storage.id,
logger.info('Another worker filled in the checksum: %s', candidate_storage.uuid) ImageStorage.content_checksum >> None)).execute()
abort.set() if num_updated == 0:
else: logger.info('Another worker filled in the checksum: %s', candidate_storage.uuid)
logger.debug('Setting content checksum to %s for %s', checksum, candidate_storage.uuid) abort.set()
to_update.content_checksum = checksum
to_update.save() written += num_updated
if (written % BATCH_SIZE) == 0:
logger.debug('%s entries written', written)
logger.debug('Completed, %s entries written', written)
if __name__ == '__main__': if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)

View file

@ -1,11 +1,15 @@
import logging import logging
from data.database import BaseModel, db, db_for_update from data.database import BaseModel
from peewee import (fn, CharField, BigIntegerField, ForeignKeyField, BooleanField, DateTimeField, from peewee import (fn, CharField, BigIntegerField, ForeignKeyField, BooleanField, DateTimeField,
TextField, IntegerField) TextField, IntegerField)
from app import app from app import app
from util.migrate.allocator import yield_random_entries from util.migrate.allocator import yield_random_entries
BATCH_SIZE = 1000
class Repository(BaseModel): class Repository(BaseModel):
pass pass
@ -56,19 +60,20 @@ def backfill_parent_id():
max_id = Image.select(fn.Max(Image.id)).scalar() max_id = Image.select(fn.Max(Image.id)).scalar()
for to_backfill, abort in yield_random_entries(fetch_batch, Image.id, 1000, max_id): written = 0
with app.config['DB_TRANSACTION_FACTORY'](db): for to_backfill, abort in yield_random_entries(fetch_batch, Image.id, BATCH_SIZE, max_id):
try: computed_parent = int(to_backfill.ancestors.split('/')[-2])
image = db_for_update(Image num_changed = (Image
.select() .update(parent_id=computed_parent)
.where(Image.id == to_backfill.id, Image.parent_id >> None)).get() .where(Image.id == to_backfill.id, Image.parent_id >> None)).execute()
image.parent_id = int(to_backfill.ancestors.split('/')[-2]) if num_changed == 0:
image.save() logger.info('Collision with another worker, aborting batch')
except Image.DoesNotExist: abort.set()
logger.info('Collision with another worker, aborting batch') written += num_changed
abort.set() if (written % BATCH_SIZE) == 0:
logger.debug('%s entries written', written)
logger.debug('backfill_parent_id: Completed') logger.debug('backfill_parent_id: Completed, updated %s entries', written)
if __name__ == '__main__': if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)

View file

@ -2,11 +2,14 @@ import logging
from peewee import (CharField, BigIntegerField, BooleanField, ForeignKeyField, DateTimeField, from peewee import (CharField, BigIntegerField, BooleanField, ForeignKeyField, DateTimeField,
TextField, fn) TextField, fn)
from data.database import BaseModel, db, db_for_update from data.database import BaseModel
from util.migrate.allocator import yield_random_entries from util.migrate.allocator import yield_random_entries
from app import app from app import app
BATCH_SIZE = 1000
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -40,30 +43,30 @@ class Image(BaseModel):
def backfill_checksums(): def backfill_checksums():
""" Copies checksums from image storages to their images. """ """ Copies checksums from image storages to their images. """
logger.debug('Image v1 checksum backfill: Began execution') logger.debug('Began execution')
logger.debug('This may be a long operation!')
def batch_query(): def batch_query():
return (Image return (Image
.select(Image.id) .select(Image, ImageStorage)
.join(ImageStorage) .join(ImageStorage)
.where(Image.v1_checksum >> None, ImageStorage.uploading == False, .where(Image.v1_checksum >> None, ImageStorage.uploading == False,
~(ImageStorage.checksum >> None))) ~(ImageStorage.checksum >> None)))
max_id = Image.select(fn.Max(Image.id)).scalar() max_id = Image.select(fn.Max(Image.id)).scalar()
for candidate_image, abort in yield_random_entries(batch_query, Image.id, 1000, max_id): written = 0
with app.config['DB_TRANSACTION_FACTORY'](db): for candidate_image, abort in yield_random_entries(batch_query, Image.id, BATCH_SIZE, max_id):
try: num_changed = (Image
image = db_for_update(Image .update(v1_checksum=candidate_image.storage.checksum)
.select(Image, ImageStorage) .where(Image.id == candidate_image.id, Image.v1_checksum >> None)).execute()
.join(ImageStorage) if num_changed == 0:
.where(Image.id == candidate_image.id, logger.info('Collision with another worker, aborting batch')
Image.v1_checksum >> None)).get() abort.set()
written += num_changed
if (written % BATCH_SIZE) == 0:
logger.debug('%s entries written', written)
image.v1_checksum = image.storage.checksum logger.debug('Completed, updated %s entries', written)
image.save()
except Image.DoesNotExist:
logger.info('Collision with another worker, aborting batch')
abort.set()
if __name__ == "__main__": if __name__ == "__main__":