Database optimizations around image creation and logs lookup

This commit is contained in:
Joseph Schorr 2014-11-06 14:48:16 -05:00
parent a35bc11912
commit c569299e5c
2 changed files with 128 additions and 69 deletions

View file

@ -1034,16 +1034,26 @@ def get_repository(namespace_name, repository_name):
return None return None
def get_repo_image(namespace_name, repository_name, image_id): def get_repo_image_directly(namespace_name, repository_name, docker_image_id):
def limit_to_image_id(query): def limit_to_image_id(query):
return query.where(Image.docker_image_id == image_id) return query.where(Image.docker_image_id == docker_image_id).limit(1)
query = _get_repository_images_directly(namespace_name, repository_name, limit_to_image_id)
try:
return query.get()
except Image.DoesNotExist:
return None
def get_repo_image(namespace_name, repository_name, docker_image_id):
def limit_to_image_id(query):
return query.where(Image.docker_image_id == docker_image_id).limit(1)
images = _get_repository_images_base(namespace_name, repository_name, limit_to_image_id) images = _get_repository_images_base(namespace_name, repository_name, limit_to_image_id)
if not images: if not images:
return None return None
else:
return images[0]
return images[0]
def repository_is_public(namespace_name, repository_name): def repository_is_public(namespace_name, repository_name):
try: try:
@ -1136,22 +1146,20 @@ def __translate_ancestry(old_ancestry, translations, repository, username, prefe
if old_ancestry == '/': if old_ancestry == '/':
return '/' return '/'
def translate_id(old_id): def translate_id(old_id, docker_image_id):
logger.debug('Translating id: %s', old_id) logger.debug('Translating id: %s', old_id)
if old_id not in translations: if old_id not in translations:
# Figure out which docker_image_id the old id refers to, then find a image_in_repo = find_create_or_link_image(docker_image_id, repository, username,
# a local one
old = Image.select(Image.docker_image_id).where(Image.id == old_id).get()
image_in_repo = find_create_or_link_image(old.docker_image_id, repository, username,
translations, preferred_location) translations, preferred_location)
translations[old_id] = image_in_repo.id translations[old_id] = image_in_repo.id
return translations[old_id] return translations[old_id]
# TODO: PERFORMANCE IMPROVMENT: Select all the ancestor Docker IDs in a single query. The values # Select all the ancestor Docker IDs in a single query.
# are retrieved in the above translate call.
old_ids = [int(id_str) for id_str in old_ancestry.split('/')[1:-1]] old_ids = [int(id_str) for id_str in old_ancestry.split('/')[1:-1]]
new_ids = [str(translate_id(old_id)) for old_id in old_ids] old_images = {i.id: i.docker_image_id for i in Image.select().where(Image.id << old_ids)}
# Translate the old images into new ones.
new_ids = [str(translate_id(old_id, old_images[old_id])) for old_id in old_ids]
return '/%s/' % '/'.join(new_ids) return '/%s/' % '/'.join(new_ids)
@ -1163,39 +1171,22 @@ def _create_storage(location_name):
return storage return storage
def find_create_or_link_image(docker_image_id, repository, username, translations, def _find_or_link_image(existing_image, repository, username, translations, preferred_location):
preferred_location): # TODO(jake): This call is currently recursively done under a single transaction. Can we make
# it instead be done under a set of transactions?
with config.app_config['DB_TRANSACTION_FACTORY'](db): with config.app_config['DB_TRANSACTION_FACTORY'](db):
# TODO PERF IMPROVEMENT: Also make this only lookup the image directly, rather than # Check for an existing image, under the transaction, to make sure it doesn't already exist.
# joining image and image storage, etc. The common case doesn't need that information repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name,
# (but other callers might, so double check) existing_image.docker_image_id)
repo_image = get_repo_image(repository.namespace_user.username, repository.name,
docker_image_id)
if repo_image: if repo_image:
return repo_image return repo_image
query = (Image # Make sure the existing base image still exists.
.select(Image, ImageStorage)
.distinct()
.join(ImageStorage)
.switch(Image)
.join(Repository)
.join(Visibility)
.switch(Repository)
.join(RepositoryPermission, JOIN_LEFT_OUTER)
.switch(Repository)
.join(Namespace, on=(Repository.namespace_user == Namespace.id))
.where(ImageStorage.uploading == False))
query = (_filter_to_repos_for_user(query, username)
.where(Image.docker_image_id == docker_image_id))
new_image_ancestry = '/'
origin_image_id = None
try: try:
to_copy = query.get() to_copy = Image.select().join(ImageStorage).where(Image.id == existing_image.id).get()
msg = 'Linking image to existing storage with docker id: %s and uuid: %s' msg = 'Linking image to existing storage with docker id: %s and uuid: %s'
logger.debug(msg, docker_image_id, to_copy.storage.uuid) logger.debug(msg, existing_image.docker_image_id, to_copy.storage.uuid)
new_image_ancestry = __translate_ancestry(to_copy.ancestors, translations, repository, new_image_ancestry = __translate_ancestry(to_copy.ancestors, translations, repository,
username, preferred_location) username, preferred_location)
@ -1203,25 +1194,69 @@ def find_create_or_link_image(docker_image_id, repository, username, translation
storage = to_copy.storage storage = to_copy.storage
storage.locations = {placement.location.name storage.locations = {placement.location.name
for placement in storage.imagestorageplacement_set} for placement in storage.imagestorageplacement_set}
origin_image_id = to_copy.id
new_image = Image.create(docker_image_id=existing_image.docker_image_id,
repository=repository, storage=storage,
ancestors=new_image_ancestry)
logger.debug('Storing translation %s -> %s', existing_image.id, new_image.id)
translations[existing_image.id] = new_image.id
return new_image
except Image.DoesNotExist: except Image.DoesNotExist:
logger.debug('Creating new storage for docker id: %s', docker_image_id) return None
storage = _create_storage(preferred_location)
logger.debug('Storage locations: %s', storage.locations)
new_image = Image.create(docker_image_id=docker_image_id,
repository=repository, storage=storage,
ancestors=new_image_ancestry)
logger.debug('new_image storage locations: %s', new_image.storage.locations)
if origin_image_id: def find_create_or_link_image(docker_image_id, repository, username, translations,
logger.debug('Storing translation %s -> %s', origin_image_id, new_image.id) preferred_location):
translations[origin_image_id] = new_image.id
return new_image # First check for the image existing in the repository. If found, we simply return it.
repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name,
docker_image_id)
if repo_image:
return repo_image
# We next check to see if there is an existing storage the new image can link to.
existing_image_query = (Image
.select(Image, ImageStorage)
.distinct()
.join(ImageStorage)
.switch(Image)
.join(Repository)
.join(Visibility)
.switch(Repository)
.join(RepositoryPermission, JOIN_LEFT_OUTER)
.switch(Repository)
.join(Namespace, on=(Repository.namespace_user == Namespace.id))
.where(ImageStorage.uploading == False))
existing_image_query = (_filter_to_repos_for_user(existing_image_query, username)
.where(Image.docker_image_id == docker_image_id))
# If there is an existing image, we try to translate its ancestry and copy its storage.
new_image = None
try:
existing_image = existing_image_query.get()
new_image = _find_or_link_image(existing_image, repository, username, translations,
preferred_location)
if new_image:
return new_image
except Image.DoesNotExist:
pass
# Otherwise, create a new storage directly.
with config.app_config['DB_TRANSACTION_FACTORY'](db):
# Final check for an existing image, under the transaction.
repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name,
docker_image_id)
if repo_image:
return repo_image
logger.debug('Creating new storage for docker id: %s', docker_image_id)
storage = _create_storage(preferred_location)
return Image.create(docker_image_id=docker_image_id,
repository=repository, storage=storage,
ancestors='/')
def find_or_create_derived_storage(source, transformation_name, preferred_location): def find_or_create_derived_storage(source, transformation_name, preferred_location):
@ -1335,6 +1370,15 @@ def set_image_metadata(docker_image_id, namespace_name, repository_name, created
fetched.storage.save() fetched.storage.save()
return fetched return fetched
def _get_repository_images_directly(namespace_name, repository_name, query_modifier):
query = (Image
.select()
.join(Repository)
.join(Namespace, on=(Repository.namespace_user == Namespace.id))
.where(Repository.name == repository_name, Namespace.username == namespace_name))
query = query_modifier(query)
return query
def _get_repository_images_base(namespace_name, repository_name, query_modifier): def _get_repository_images_base(namespace_name, repository_name, query_modifier):
query = (ImageStoragePlacement query = (ImageStoragePlacement
@ -1371,6 +1415,14 @@ def _get_repository_images_base(namespace_name, repository_name, query_modifier)
return images.values() return images.values()
def lookup_repository_images(namespace_name, repository_name, docker_image_ids):
repo = get_repository(namespace_name, repository_name)
if repo is None:
return []
return Image.select().where(Image.repository == repo, Image.docker_image_id << docker_image_ids)
def get_repository_images(namespace_name, repository_name): def get_repository_images(namespace_name, repository_name):
return _get_repository_images_base(namespace_name, repository_name, lambda q: q) return _get_repository_images_base(namespace_name, repository_name, lambda q: q)
@ -1389,6 +1441,9 @@ def garbage_collect_repository(namespace_name, repository_name):
storage_id_whitelist = {} storage_id_whitelist = {}
with config.app_config['DB_TRANSACTION_FACTORY'](db): with config.app_config['DB_TRANSACTION_FACTORY'](db):
# TODO (jake): We could probably select this and all the images in a single query using
# a different kind of join.
# Get a list of all images used by tags in the repository # Get a list of all images used by tags in the repository
tag_query = (RepositoryTag tag_query = (RepositoryTag
.select(RepositoryTag, Image, ImageStorage) .select(RepositoryTag, Image, ImageStorage)
@ -1407,7 +1462,7 @@ def garbage_collect_repository(namespace_name, repository_name):
referenced_anscestors = referenced_anscestors.union(set(ancestor_list)) referenced_anscestors = referenced_anscestors.union(set(ancestor_list))
referenced_anscestors.add(tag.image.id) referenced_anscestors.add(tag.image.id)
all_repo_images = get_repository_images(namespace_name, repository_name) all_repo_images = _get_repository_images_directly(namespace_name, repository_name, lambda q: q)
all_images = {int(img.id): img for img in all_repo_images} all_images = {int(img.id): img for img in all_repo_images}
to_remove = set(all_images.keys()).difference(referenced_anscestors) to_remove = set(all_images.keys()).difference(referenced_anscestors)
@ -1465,8 +1520,8 @@ def garbage_collect_storage(storage_id_whitelist):
# Note: Both of these deletes must occur in the same transaction (unfortunately) because a # Note: Both of these deletes must occur in the same transaction (unfortunately) because a
# storage without any placement is invalid, and a placement cannot exist without a storage. # storage without any placement is invalid, and a placement cannot exist without a storage.
# TODO: We might want to allow for null storages on placements, which would allow us to delete # TODO(jake): We might want to allow for null storages on placements, which would allow us to
# the storages, then delete the placements in a non-transaction. # delete the storages, then delete the placements in a non-transaction.
logger.debug('Garbage collecting storages from candidates: %s', storage_id_whitelist) logger.debug('Garbage collecting storages from candidates: %s', storage_id_whitelist)
with config.app_config['DB_TRANSACTION_FACTORY'](db): with config.app_config['DB_TRANSACTION_FACTORY'](db):
# Track all of the data that should be removed from blob storage # Track all of the data that should be removed from blob storage
@ -1924,9 +1979,9 @@ def list_logs(start_time, end_time, performer=None, repository=None, namespace=N
if namespace: if namespace:
joined = joined.where(User.username == namespace) joined = joined.where(User.username == namespace)
return joined.where( return list(joined.where(
LogEntry.datetime >= start_time, LogEntry.datetime >= start_time,
LogEntry.datetime < end_time).order_by(LogEntry.datetime.desc()) LogEntry.datetime < end_time).order_by(LogEntry.datetime.desc()))
def log_action(kind_name, user_or_organization_name, performer=None, def log_action(kind_name, user_or_organization_name, performer=None,

View file

@ -223,15 +223,20 @@ def create_repository(namespace, repository):
repo = model.create_repository(namespace, repository, repo = model.create_repository(namespace, repository,
get_authenticated_user()) get_authenticated_user())
profile.debug('Determining added images') profile.debug('Determining already added images')
added_images = OrderedDict([(desc['id'], desc) added_images = OrderedDict([(desc['id'], desc) for desc in image_descriptions])
for desc in image_descriptions])
new_repo_images = dict(added_images) new_repo_images = dict(added_images)
# TODO PERF IMPROVEMENT: Doesn't need the locations OR the imagestorage, so just select the images # Optimization: Lookup any existing images in the repository with matching docker IDs and
# directly. Also use a set here. # remove them from the added dict, so we don't need to look them up one-by-one.
for existing in model.get_repository_images(namespace, repository): def chunks(l, n):
if existing.docker_image_id in new_repo_images: for i in xrange(0, len(l), n):
yield l[i:i+n]
# Note: We do this in chunks in an effort to not hit the SQL query size limit.
for chunk in chunks(new_repo_images.keys(), 50):
existing_images = model.lookup_repository_images(namespace, repository, chunk)
for existing in existing_images:
added_images.pop(existing.docker_image_id) added_images.pop(existing.docker_image_id)
profile.debug('Creating/Linking necessary images') profile.debug('Creating/Linking necessary images')
@ -243,9 +248,8 @@ def create_repository(namespace, repository):
profile.debug('Created images') profile.debug('Created images')
response = make_response('Created', 201)
track_and_log('push_repo', repo) track_and_log('push_repo', repo)
return response return make_response('Created', 201)
@index.route('/repositories/<path:repository>/images', methods=['PUT']) @index.route('/repositories/<path:repository>/images', methods=['PUT'])