From a35bc119121b5c4b3550a572fd310e4dec298156 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 5 Nov 2014 12:27:38 -0500 Subject: [PATCH 1/7] Add perf comments --- data/model/legacy.py | 5 +++++ endpoints/index.py | 2 ++ endpoints/registry.py | 6 ++++++ endpoints/verbs.py | 7 +++++++ 4 files changed, 20 insertions(+) diff --git a/data/model/legacy.py b/data/model/legacy.py index 34be41491..1fb719b00 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -1148,6 +1148,8 @@ def __translate_ancestry(old_ancestry, translations, repository, username, prefe return translations[old_id] + # TODO: PERFORMANCE IMPROVMENT: Select all the ancestor Docker IDs in a single query. The values + # are retrieved in the above translate call. old_ids = [int(id_str) for id_str in old_ancestry.split('/')[1:-1]] new_ids = [str(translate_id(old_id)) for old_id in old_ids] return '/%s/' % '/'.join(new_ids) @@ -1164,6 +1166,9 @@ def _create_storage(location_name): def find_create_or_link_image(docker_image_id, repository, username, translations, preferred_location): with config.app_config['DB_TRANSACTION_FACTORY'](db): + # TODO PERF IMPROVEMENT: Also make this only lookup the image directly, rather than + # joining image and image storage, etc. The common case doesn't need that information + # (but other callers might, so double check) repo_image = get_repo_image(repository.namespace_user.username, repository.name, docker_image_id) if repo_image: diff --git a/endpoints/index.py b/endpoints/index.py index 1c8e551a0..7394671af 100644 --- a/endpoints/index.py +++ b/endpoints/index.py @@ -228,6 +228,8 @@ def create_repository(namespace, repository): for desc in image_descriptions]) new_repo_images = dict(added_images) + # TODO PERF IMPROVEMENT: Doesn't need the locations OR the imagestorage, so just select the images + # directly. Also use a set here. for existing in model.get_repository_images(namespace, repository): if existing.docker_image_id in new_repo_images: added_images.pop(existing.docker_image_id) diff --git a/endpoints/registry.py b/endpoints/registry.py index 48943bf4c..85b4b7963 100644 --- a/endpoints/registry.py +++ b/endpoints/registry.py @@ -151,6 +151,9 @@ def get_image_layer(namespace, repository, image_id, headers): return resp profile.debug('Streaming layer data') + + # TODO: DATABASE: We should disconnect from the database here, so that + # we're not holding the DB handle during this long download. return Response(store.stream_read(repo_image.storage.locations, path), headers=headers) except (IOError, AttributeError): profile.debug('Image not found') @@ -212,6 +215,9 @@ def put_image_layer(namespace, repository, image_id): h, sum_hndlr = checksums.simple_checksum_handler(json_data) sr.add_handler(sum_hndlr) + # TODO: DATABASE: We should disconnect from the database here and reconnect AFTER, so that + # we're not holding the DB handle during this long upload. + # Stream write the data to storage. store.stream_write(repo_image.storage.locations, layer_path, sr) diff --git a/endpoints/verbs.py b/endpoints/verbs.py index b410251d3..fa3bbe574 100644 --- a/endpoints/verbs.py +++ b/endpoints/verbs.py @@ -24,6 +24,8 @@ logger = logging.getLogger(__name__) def _open_stream(namespace, repository, tag, synthetic_image_id, image_json, image_list): store = Storage(app) + # TODO: PERFORMANCE: configure DB, load the images, cache them, then disconnect. + def get_next_image(): for current_image_id in image_list: yield model.get_repo_image(namespace, repository, current_image_id) @@ -55,6 +57,8 @@ def _write_synthetic_image_to_storage(linked_storage_uuid, linked_locations, que queue_file.add_exception_handler(handle_exception) + # TODO: PERFORMANCE: disconnect from the DB and reconnect once the stream write finishes or on + # error. image_path = store.image_layer_path(linked_storage_uuid) store.stream_write(linked_locations, image_path, queue_file) queue_file.close() @@ -132,6 +136,9 @@ def get_squashed_tag(namespace, repository, tag): storage_args = (derived.uuid, derived.locations, storage_queue_file) QueueProcess.run_process(_write_synthetic_image_to_storage, storage_args, finished=_cleanup) + # TODO: PERFORMANCE: close the database handle here for this process before we send the long + # download. + # Return the client's data. return send_file(client_queue_file) From c569299e5c169a2ab20a2214ed91f441f861c739 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 6 Nov 2014 14:48:16 -0500 Subject: [PATCH 2/7] Database optimizations around image creation and logs lookup --- data/model/legacy.py | 175 ++++++++++++++++++++++++++++--------------- endpoints/index.py | 22 +++--- 2 files changed, 128 insertions(+), 69 deletions(-) diff --git a/data/model/legacy.py b/data/model/legacy.py index 1fb719b00..0a8249392 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -1034,16 +1034,26 @@ def get_repository(namespace_name, repository_name): return None -def get_repo_image(namespace_name, repository_name, image_id): +def get_repo_image_directly(namespace_name, repository_name, docker_image_id): def limit_to_image_id(query): - return query.where(Image.docker_image_id == image_id) + return query.where(Image.docker_image_id == docker_image_id).limit(1) + + query = _get_repository_images_directly(namespace_name, repository_name, limit_to_image_id) + try: + return query.get() + except Image.DoesNotExist: + return None + + +def get_repo_image(namespace_name, repository_name, docker_image_id): + def limit_to_image_id(query): + return query.where(Image.docker_image_id == docker_image_id).limit(1) images = _get_repository_images_base(namespace_name, repository_name, limit_to_image_id) if not images: return None - else: - return images[0] + return images[0] def repository_is_public(namespace_name, repository_name): try: @@ -1136,22 +1146,20 @@ def __translate_ancestry(old_ancestry, translations, repository, username, prefe if old_ancestry == '/': return '/' - def translate_id(old_id): + def translate_id(old_id, docker_image_id): logger.debug('Translating id: %s', old_id) if old_id not in translations: - # Figure out which docker_image_id the old id refers to, then find a - # a local one - old = Image.select(Image.docker_image_id).where(Image.id == old_id).get() - image_in_repo = find_create_or_link_image(old.docker_image_id, repository, username, + image_in_repo = find_create_or_link_image(docker_image_id, repository, username, translations, preferred_location) translations[old_id] = image_in_repo.id - return translations[old_id] - # TODO: PERFORMANCE IMPROVMENT: Select all the ancestor Docker IDs in a single query. The values - # are retrieved in the above translate call. + # Select all the ancestor Docker IDs in a single query. old_ids = [int(id_str) for id_str in old_ancestry.split('/')[1:-1]] - new_ids = [str(translate_id(old_id)) for old_id in old_ids] + old_images = {i.id: i.docker_image_id for i in Image.select().where(Image.id << old_ids)} + + # Translate the old images into new ones. + new_ids = [str(translate_id(old_id, old_images[old_id])) for old_id in old_ids] return '/%s/' % '/'.join(new_ids) @@ -1163,39 +1171,22 @@ def _create_storage(location_name): return storage -def find_create_or_link_image(docker_image_id, repository, username, translations, - preferred_location): +def _find_or_link_image(existing_image, repository, username, translations, preferred_location): + # TODO(jake): This call is currently recursively done under a single transaction. Can we make + # it instead be done under a set of transactions? with config.app_config['DB_TRANSACTION_FACTORY'](db): - # TODO PERF IMPROVEMENT: Also make this only lookup the image directly, rather than - # joining image and image storage, etc. The common case doesn't need that information - # (but other callers might, so double check) - repo_image = get_repo_image(repository.namespace_user.username, repository.name, - docker_image_id) + # Check for an existing image, under the transaction, to make sure it doesn't already exist. + repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name, + existing_image.docker_image_id) if repo_image: return repo_image - query = (Image - .select(Image, ImageStorage) - .distinct() - .join(ImageStorage) - .switch(Image) - .join(Repository) - .join(Visibility) - .switch(Repository) - .join(RepositoryPermission, JOIN_LEFT_OUTER) - .switch(Repository) - .join(Namespace, on=(Repository.namespace_user == Namespace.id)) - .where(ImageStorage.uploading == False)) - - query = (_filter_to_repos_for_user(query, username) - .where(Image.docker_image_id == docker_image_id)) - - new_image_ancestry = '/' - origin_image_id = None + # Make sure the existing base image still exists. try: - to_copy = query.get() + to_copy = Image.select().join(ImageStorage).where(Image.id == existing_image.id).get() + msg = 'Linking image to existing storage with docker id: %s and uuid: %s' - logger.debug(msg, docker_image_id, to_copy.storage.uuid) + logger.debug(msg, existing_image.docker_image_id, to_copy.storage.uuid) new_image_ancestry = __translate_ancestry(to_copy.ancestors, translations, repository, username, preferred_location) @@ -1203,25 +1194,69 @@ def find_create_or_link_image(docker_image_id, repository, username, translation storage = to_copy.storage storage.locations = {placement.location.name for placement in storage.imagestorageplacement_set} - origin_image_id = to_copy.id + + new_image = Image.create(docker_image_id=existing_image.docker_image_id, + repository=repository, storage=storage, + ancestors=new_image_ancestry) + + logger.debug('Storing translation %s -> %s', existing_image.id, new_image.id) + translations[existing_image.id] = new_image.id + return new_image except Image.DoesNotExist: - logger.debug('Creating new storage for docker id: %s', docker_image_id) - storage = _create_storage(preferred_location) - - logger.debug('Storage locations: %s', storage.locations) - - new_image = Image.create(docker_image_id=docker_image_id, - repository=repository, storage=storage, - ancestors=new_image_ancestry) - - logger.debug('new_image storage locations: %s', new_image.storage.locations) + return None - if origin_image_id: - logger.debug('Storing translation %s -> %s', origin_image_id, new_image.id) - translations[origin_image_id] = new_image.id +def find_create_or_link_image(docker_image_id, repository, username, translations, + preferred_location): - return new_image + # First check for the image existing in the repository. If found, we simply return it. + repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name, + docker_image_id) + if repo_image: + return repo_image + + # We next check to see if there is an existing storage the new image can link to. + existing_image_query = (Image + .select(Image, ImageStorage) + .distinct() + .join(ImageStorage) + .switch(Image) + .join(Repository) + .join(Visibility) + .switch(Repository) + .join(RepositoryPermission, JOIN_LEFT_OUTER) + .switch(Repository) + .join(Namespace, on=(Repository.namespace_user == Namespace.id)) + .where(ImageStorage.uploading == False)) + + existing_image_query = (_filter_to_repos_for_user(existing_image_query, username) + .where(Image.docker_image_id == docker_image_id)) + + # If there is an existing image, we try to translate its ancestry and copy its storage. + new_image = None + try: + existing_image = existing_image_query.get() + new_image = _find_or_link_image(existing_image, repository, username, translations, + preferred_location) + if new_image: + return new_image + except Image.DoesNotExist: + pass + + # Otherwise, create a new storage directly. + with config.app_config['DB_TRANSACTION_FACTORY'](db): + # Final check for an existing image, under the transaction. + repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name, + docker_image_id) + if repo_image: + return repo_image + + logger.debug('Creating new storage for docker id: %s', docker_image_id) + storage = _create_storage(preferred_location) + + return Image.create(docker_image_id=docker_image_id, + repository=repository, storage=storage, + ancestors='/') def find_or_create_derived_storage(source, transformation_name, preferred_location): @@ -1335,6 +1370,15 @@ def set_image_metadata(docker_image_id, namespace_name, repository_name, created fetched.storage.save() return fetched +def _get_repository_images_directly(namespace_name, repository_name, query_modifier): + query = (Image + .select() + .join(Repository) + .join(Namespace, on=(Repository.namespace_user == Namespace.id)) + .where(Repository.name == repository_name, Namespace.username == namespace_name)) + + query = query_modifier(query) + return query def _get_repository_images_base(namespace_name, repository_name, query_modifier): query = (ImageStoragePlacement @@ -1371,6 +1415,14 @@ def _get_repository_images_base(namespace_name, repository_name, query_modifier) return images.values() +def lookup_repository_images(namespace_name, repository_name, docker_image_ids): + repo = get_repository(namespace_name, repository_name) + if repo is None: + return [] + + return Image.select().where(Image.repository == repo, Image.docker_image_id << docker_image_ids) + + def get_repository_images(namespace_name, repository_name): return _get_repository_images_base(namespace_name, repository_name, lambda q: q) @@ -1389,6 +1441,9 @@ def garbage_collect_repository(namespace_name, repository_name): storage_id_whitelist = {} with config.app_config['DB_TRANSACTION_FACTORY'](db): + # TODO (jake): We could probably select this and all the images in a single query using + # a different kind of join. + # Get a list of all images used by tags in the repository tag_query = (RepositoryTag .select(RepositoryTag, Image, ImageStorage) @@ -1407,7 +1462,7 @@ def garbage_collect_repository(namespace_name, repository_name): referenced_anscestors = referenced_anscestors.union(set(ancestor_list)) referenced_anscestors.add(tag.image.id) - all_repo_images = get_repository_images(namespace_name, repository_name) + all_repo_images = _get_repository_images_directly(namespace_name, repository_name, lambda q: q) all_images = {int(img.id): img for img in all_repo_images} to_remove = set(all_images.keys()).difference(referenced_anscestors) @@ -1465,8 +1520,8 @@ def garbage_collect_storage(storage_id_whitelist): # Note: Both of these deletes must occur in the same transaction (unfortunately) because a # storage without any placement is invalid, and a placement cannot exist without a storage. - # TODO: We might want to allow for null storages on placements, which would allow us to delete - # the storages, then delete the placements in a non-transaction. + # TODO(jake): We might want to allow for null storages on placements, which would allow us to + # delete the storages, then delete the placements in a non-transaction. logger.debug('Garbage collecting storages from candidates: %s', storage_id_whitelist) with config.app_config['DB_TRANSACTION_FACTORY'](db): # Track all of the data that should be removed from blob storage @@ -1924,9 +1979,9 @@ def list_logs(start_time, end_time, performer=None, repository=None, namespace=N if namespace: joined = joined.where(User.username == namespace) - return joined.where( + return list(joined.where( LogEntry.datetime >= start_time, - LogEntry.datetime < end_time).order_by(LogEntry.datetime.desc()) + LogEntry.datetime < end_time).order_by(LogEntry.datetime.desc())) def log_action(kind_name, user_or_organization_name, performer=None, diff --git a/endpoints/index.py b/endpoints/index.py index 7394671af..d97dd94f5 100644 --- a/endpoints/index.py +++ b/endpoints/index.py @@ -223,15 +223,20 @@ def create_repository(namespace, repository): repo = model.create_repository(namespace, repository, get_authenticated_user()) - profile.debug('Determining added images') - added_images = OrderedDict([(desc['id'], desc) - for desc in image_descriptions]) + profile.debug('Determining already added images') + added_images = OrderedDict([(desc['id'], desc) for desc in image_descriptions]) new_repo_images = dict(added_images) - # TODO PERF IMPROVEMENT: Doesn't need the locations OR the imagestorage, so just select the images - # directly. Also use a set here. - for existing in model.get_repository_images(namespace, repository): - if existing.docker_image_id in new_repo_images: + # Optimization: Lookup any existing images in the repository with matching docker IDs and + # remove them from the added dict, so we don't need to look them up one-by-one. + def chunks(l, n): + for i in xrange(0, len(l), n): + yield l[i:i+n] + + # Note: We do this in chunks in an effort to not hit the SQL query size limit. + for chunk in chunks(new_repo_images.keys(), 50): + existing_images = model.lookup_repository_images(namespace, repository, chunk) + for existing in existing_images: added_images.pop(existing.docker_image_id) profile.debug('Creating/Linking necessary images') @@ -243,9 +248,8 @@ def create_repository(namespace, repository): profile.debug('Created images') - response = make_response('Created', 201) track_and_log('push_repo', repo) - return response + return make_response('Created', 201) @index.route('/repositories//images', methods=['PUT']) From 23d9bd2b42faecc161265db80c0670066541664c Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 6 Nov 2014 17:50:48 -0500 Subject: [PATCH 3/7] Change verbs to use short lived database connections --- data/database.py | 13 +++++++++++++ data/model/legacy.py | 6 ++++++ endpoints/verbs.py | 34 ++++++++++++++++++---------------- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/data/database.py b/data/database.py index 1914a954c..3966c30cc 100644 --- a/data/database.py +++ b/data/database.py @@ -35,6 +35,19 @@ class CallableProxy(Proxy): raise AttributeError('Cannot use uninitialized Proxy.') return self.obj(*args, **kwargs) +class UseThenDisconnect(object): + """ Helper object for conducting work with a database and then tearing it down. """ + + def __init__(self, config_object): + self.config_object = config_object + + def __enter__(self): + configure(self.config_object) + + def __exit__(self, type, value, traceback): + close_db_filter(None) + + db = Proxy() read_slave = Proxy() db_random_func = CallableProxy() diff --git a/data/model/legacy.py b/data/model/legacy.py index 0a8249392..74e2d0728 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -1423,6 +1423,12 @@ def lookup_repository_images(namespace_name, repository_name, docker_image_ids): return Image.select().where(Image.repository == repo, Image.docker_image_id << docker_image_ids) +def get_matching_repository_images(namespace_name, repository_name, docker_image_ids): + def modify_query(q): + return q.where(Image.docker_image_id << docker_image_ids) + + return _get_repository_images_base(namespace_name, repository_name, modify_query) + def get_repository_images(namespace_name, repository_name): return _get_repository_images_base(namespace_name, repository_name, lambda q: q) diff --git a/endpoints/verbs.py b/endpoints/verbs.py index fa3bbe574..b38a90d6f 100644 --- a/endpoints/verbs.py +++ b/endpoints/verbs.py @@ -21,26 +21,28 @@ from util.dockerloadformat import build_docker_load_stream verbs = Blueprint('verbs', __name__) logger = logging.getLogger(__name__) -def _open_stream(namespace, repository, tag, synthetic_image_id, image_json, image_list): +def _open_stream(namespace, repository, tag, synthetic_image_id, image_json, image_id_list): store = Storage(app) - # TODO: PERFORMANCE: configure DB, load the images, cache them, then disconnect. + # For performance reasons, we load the full image list here, cache it, then disconnect from + # the database. + with database.UseThenDisconnect(app.config): + image_list = model.get_matching_repository_images(namespace, repository, image_id_list) def get_next_image(): - for current_image_id in image_list: - yield model.get_repo_image(namespace, repository, current_image_id) + for current_image in image_list: + yield current_image def get_next_layer(): - for current_image_id in image_list: - current_image_entry = model.get_repo_image(namespace, repository, current_image_id) + for current_image_entry in image_list: current_image_path = store.image_layer_path(current_image_entry.storage.uuid) current_image_stream = store.stream_read_file(current_image_entry.storage.locations, current_image_path) + current_image_id = current_image_entry.id logger.debug('Returning image layer %s: %s' % (current_image_id, current_image_path)) yield current_image_stream - database.configure(app.config) stream = build_docker_load_stream(namespace, repository, tag, synthetic_image_id, image_json, get_next_image, get_next_layer) @@ -48,25 +50,25 @@ def _open_stream(namespace, repository, tag, synthetic_image_id, image_json, ima def _write_synthetic_image_to_storage(linked_storage_uuid, linked_locations, queue_file): - database.configure(app.config) store = Storage(app) def handle_exception(ex): logger.debug('Exception when building squashed image %s: %s', linked_storage_uuid, ex) - model.delete_derived_storage_by_uuid(linked_storage_uuid) + + with database.UseThenDisconnect(app.config): + model.delete_derived_storage_by_uuid(linked_storage_uuid) queue_file.add_exception_handler(handle_exception) - # TODO: PERFORMANCE: disconnect from the DB and reconnect once the stream write finishes or on - # error. image_path = store.image_layer_path(linked_storage_uuid) store.stream_write(linked_locations, image_path, queue_file) queue_file.close() if not queue_file.raised_exception: - done_uploading = model.get_storage_by_uuid(linked_storage_uuid) - done_uploading.uploading = False - done_uploading.save() + with database.UseThenDisconnect(app.config): + done_uploading = model.get_storage_by_uuid(linked_storage_uuid) + done_uploading.uploading = False + done_uploading.save() @verbs.route('/squash///', methods=['GET']) @@ -136,8 +138,8 @@ def get_squashed_tag(namespace, repository, tag): storage_args = (derived.uuid, derived.locations, storage_queue_file) QueueProcess.run_process(_write_synthetic_image_to_storage, storage_args, finished=_cleanup) - # TODO: PERFORMANCE: close the database handle here for this process before we send the long - # download. + # Close the database handle here for this process before we send the long download. + database.close_db_filter(None) # Return the client's data. return send_file(client_queue_file) From d5bbb5748176f54be40d960a8501dbd55bdbbe3e Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 6 Nov 2014 18:00:52 -0500 Subject: [PATCH 4/7] Change registry code to disconnect from the DB before long I/O operations --- data/database.py | 17 +++++++++++++++++ endpoints/registry.py | 13 ++++++------- endpoints/verbs.py | 3 +++ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/data/database.py b/data/database.py index 3966c30cc..c466a6d3f 100644 --- a/data/database.py +++ b/data/database.py @@ -35,6 +35,22 @@ class CallableProxy(Proxy): raise AttributeError('Cannot use uninitialized Proxy.') return self.obj(*args, **kwargs) + +class CloseForLongOperation(object): + """ Helper object which disconnects the database then reconnects after the nested operation + completes. + """ + + def __init__(self, config_object): + self.config_object = config_object + + def __enter__(self): + close_db_filter(None) + + def __exit__(self, type, value, traceback): + configure(self.config_object) + + class UseThenDisconnect(object): """ Helper object for conducting work with a database and then tearing it down. """ @@ -69,6 +85,7 @@ def _db_from_url(url, db_kwargs): def configure(config_object): + logger.debug('Configuring database') db_kwargs = dict(config_object['DB_CONNECTION_ARGS']) write_db_uri = config_object['DB_URI'] db.initialize(_db_from_url(write_db_uri, db_kwargs)) diff --git a/endpoints/registry.py b/endpoints/registry.py index 85b4b7963..6e6821ed0 100644 --- a/endpoints/registry.py +++ b/endpoints/registry.py @@ -13,7 +13,7 @@ from util import checksums, changes from util.http import abort, exact_abort from auth.permissions import (ReadRepositoryPermission, ModifyRepositoryPermission) -from data import model +from data import model, database from util import gzipstream @@ -152,8 +152,9 @@ def get_image_layer(namespace, repository, image_id, headers): profile.debug('Streaming layer data') - # TODO: DATABASE: We should disconnect from the database here, so that - # we're not holding the DB handle during this long download. + # Close the database handle here for this process before we send the long download. + database.close_db_filter(None) + return Response(store.stream_read(repo_image.storage.locations, path), headers=headers) except (IOError, AttributeError): profile.debug('Image not found') @@ -215,11 +216,9 @@ def put_image_layer(namespace, repository, image_id): h, sum_hndlr = checksums.simple_checksum_handler(json_data) sr.add_handler(sum_hndlr) - # TODO: DATABASE: We should disconnect from the database here and reconnect AFTER, so that - # we're not holding the DB handle during this long upload. - # Stream write the data to storage. - store.stream_write(repo_image.storage.locations, layer_path, sr) + with database.CloseForLongOperation(app.config): + store.stream_write(repo_image.storage.locations, layer_path, sr) # Append the computed checksum. csums = [] diff --git a/endpoints/verbs.py b/endpoints/verbs.py index b38a90d6f..92f3fa24d 100644 --- a/endpoints/verbs.py +++ b/endpoints/verbs.py @@ -101,6 +101,9 @@ def get_squashed_tag(namespace, repository, tag): logger.debug('Redirecting to download URL for derived image %s', derived.uuid) return redirect(download_url) + # Close the database handle here for this process before we send the long download. + database.close_db_filter(None) + logger.debug('Sending cached derived image %s', derived.uuid) return send_file(store.stream_read_file(derived.locations, derived_layer_path)) From 691be498176321e72a25a4ac7c731d8a29dbb8cd Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Fri, 7 Nov 2014 14:36:32 -0500 Subject: [PATCH 5/7] Fix issues with the perf updated code --- data/model/legacy.py | 4 ++++ endpoints/registry.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/data/model/legacy.py b/data/model/legacy.py index 74e2d0728..9cccfa82b 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -1235,12 +1235,16 @@ def find_create_or_link_image(docker_image_id, repository, username, translation # If there is an existing image, we try to translate its ancestry and copy its storage. new_image = None try: + logger.debug('Looking up existing image for ID: %s', docker_image_id) existing_image = existing_image_query.get() + + logger.debug('Existing image %s found for ID: %s', existing_image.id, docker_image_id) new_image = _find_or_link_image(existing_image, repository, username, translations, preferred_location) if new_image: return new_image except Image.DoesNotExist: + logger.debug('No existing image found for ID: %s', docker_image_id) pass # Otherwise, create a new storage directly. diff --git a/endpoints/registry.py b/endpoints/registry.py index 6e6821ed0..bd107116a 100644 --- a/endpoints/registry.py +++ b/endpoints/registry.py @@ -7,7 +7,7 @@ from functools import wraps from datetime import datetime from time import time -from app import storage as store, image_diff_queue +from app import storage as store, image_diff_queue, app from auth.auth import process_auth, extract_namespace_repo_from_session from util import checksums, changes from util.http import abort, exact_abort From 17f605a9ef1f542cc2db9580d7dcce9cfa81a21c Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Sun, 9 Nov 2014 15:50:50 -0500 Subject: [PATCH 6/7] Select only a single token. --- data/model/legacy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/model/legacy.py b/data/model/legacy.py index 9cccfa82b..e5b49bc53 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -1816,7 +1816,7 @@ def get_repository_delegate_tokens(namespace_name, repository_name): def get_repo_delegate_token(namespace_name, repository_name, code): repo_query = get_repository_delegate_tokens(namespace_name, repository_name) - found = list(repo_query.where(AccessToken.code == code)) + found = list(repo_query.where(AccessToken.code == code).limit(1)) if found: return found[0] From 091f821a6a18dce2d34370fa45a0681140a48c03 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Mon, 10 Nov 2014 13:44:36 -0500 Subject: [PATCH 7/7] - Rename get_repo_image to get_repo_image_extended and get_repo_image_directly to get_repo_image - Remove the configure call from CloseForLongOperation - Other small fixes --- data/database.py | 3 ++- data/model/legacy.py | 46 +++++++++++++++++++++--------------------- endpoints/api/image.py | 4 ++-- endpoints/registry.py | 20 +++++++++--------- endpoints/verbs.py | 2 +- tools/auditancestry.py | 4 ++-- 6 files changed, 40 insertions(+), 39 deletions(-) diff --git a/data/database.py b/data/database.py index c466a6d3f..bce76c1bb 100644 --- a/data/database.py +++ b/data/database.py @@ -48,7 +48,8 @@ class CloseForLongOperation(object): close_db_filter(None) def __exit__(self, type, value, traceback): - configure(self.config_object) + # Note: Nothing to do. The next SQL call will reconnect automatically. + pass class UseThenDisconnect(object): diff --git a/data/model/legacy.py b/data/model/legacy.py index e5b49bc53..e797b96b6 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -1034,18 +1034,18 @@ def get_repository(namespace_name, repository_name): return None -def get_repo_image_directly(namespace_name, repository_name, docker_image_id): +def get_repo_image(namespace_name, repository_name, docker_image_id): def limit_to_image_id(query): return query.where(Image.docker_image_id == docker_image_id).limit(1) - query = _get_repository_images_directly(namespace_name, repository_name, limit_to_image_id) + query = _get_repository_images(namespace_name, repository_name, limit_to_image_id) try: return query.get() except Image.DoesNotExist: return None -def get_repo_image(namespace_name, repository_name, docker_image_id): +def get_repo_image_extended(namespace_name, repository_name, docker_image_id): def limit_to_image_id(query): return query.where(Image.docker_image_id == docker_image_id).limit(1) @@ -1156,7 +1156,8 @@ def __translate_ancestry(old_ancestry, translations, repository, username, prefe # Select all the ancestor Docker IDs in a single query. old_ids = [int(id_str) for id_str in old_ancestry.split('/')[1:-1]] - old_images = {i.id: i.docker_image_id for i in Image.select().where(Image.id << old_ids)} + query = Image.select(Image.id, Image.docker_image_id).where(Image.id << old_ids) + old_images = {i.id: i.docker_image_id for i in query} # Translate the old images into new ones. new_ids = [str(translate_id(old_id, old_images[old_id])) for old_id in old_ids] @@ -1176,8 +1177,8 @@ def _find_or_link_image(existing_image, repository, username, translations, pref # it instead be done under a set of transactions? with config.app_config['DB_TRANSACTION_FACTORY'](db): # Check for an existing image, under the transaction, to make sure it doesn't already exist. - repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name, - existing_image.docker_image_id) + repo_image = get_repo_image(repository.namespace_user.username, repository.name, + existing_image.docker_image_id) if repo_image: return repo_image @@ -1210,8 +1211,8 @@ def find_create_or_link_image(docker_image_id, repository, username, translation preferred_location): # First check for the image existing in the repository. If found, we simply return it. - repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name, - docker_image_id) + repo_image = get_repo_image(repository.namespace_user.username, repository.name, + docker_image_id) if repo_image: return repo_image @@ -1250,8 +1251,8 @@ def find_create_or_link_image(docker_image_id, repository, username, translation # Otherwise, create a new storage directly. with config.app_config['DB_TRANSACTION_FACTORY'](db): # Final check for an existing image, under the transaction. - repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name, - docker_image_id) + repo_image = get_repo_image(repository.namespace_user.username, repository.name, + docker_image_id) if repo_image: return repo_image @@ -1374,7 +1375,7 @@ def set_image_metadata(docker_image_id, namespace_name, repository_name, created fetched.storage.save() return fetched -def _get_repository_images_directly(namespace_name, repository_name, query_modifier): +def _get_repository_images(namespace_name, repository_name, query_modifier): query = (Image .select() .join(Repository) @@ -1420,12 +1421,12 @@ def _get_repository_images_base(namespace_name, repository_name, query_modifier) def lookup_repository_images(namespace_name, repository_name, docker_image_ids): - repo = get_repository(namespace_name, repository_name) - if repo is None: - return [] - - return Image.select().where(Image.repository == repo, Image.docker_image_id << docker_image_ids) - + return (Image + .select() + .join(Repository) + .join(Namespace, on=(Repository.namespace_user == Namespace.id)) + .where(Repository.name == repository_name, Namespace.username == namespace_name, + Image.docker_image_id << docker_image_ids)) def get_matching_repository_images(namespace_name, repository_name, docker_image_ids): def modify_query(q): @@ -1472,7 +1473,7 @@ def garbage_collect_repository(namespace_name, repository_name): referenced_anscestors = referenced_anscestors.union(set(ancestor_list)) referenced_anscestors.add(tag.image.id) - all_repo_images = _get_repository_images_directly(namespace_name, repository_name, lambda q: q) + all_repo_images = _get_repository_images(namespace_name, repository_name, lambda q: q) all_images = {int(img.id): img for img in all_repo_images} to_remove = set(all_images.keys()).difference(referenced_anscestors) @@ -1585,7 +1586,7 @@ def get_tag_image(namespace_name, repository_name, tag_name): def get_image_by_id(namespace_name, repository_name, docker_image_id): - image = get_repo_image(namespace_name, repository_name, docker_image_id) + image = get_repo_image_extended(namespace_name, repository_name, docker_image_id) if not image: raise DataModelException('Unable to find image \'%s\' for repo \'%s/%s\'' % (docker_image_id, namespace_name, repository_name)) @@ -1816,11 +1817,10 @@ def get_repository_delegate_tokens(namespace_name, repository_name): def get_repo_delegate_token(namespace_name, repository_name, code): repo_query = get_repository_delegate_tokens(namespace_name, repository_name) - found = list(repo_query.where(AccessToken.code == code).limit(1)) - if found: - return found[0] - else: + try: + return repo_query.where(AccessToken.code == code).get() + except AccessToken.DoesNotExist: raise InvalidTokenException('Unable to find token with code: %s' % code) diff --git a/endpoints/api/image.py b/endpoints/api/image.py index 3a6c62507..c952cc5d7 100644 --- a/endpoints/api/image.py +++ b/endpoints/api/image.py @@ -73,7 +73,7 @@ class RepositoryImage(RepositoryParamResource): @nickname('getImage') def get(self, namespace, repository, image_id): """ Get the information available for the specified image. """ - image = model.get_repo_image(namespace, repository, image_id) + image = model.get_repo_image_extended(namespace, repository, image_id) if not image: raise NotFound() @@ -94,7 +94,7 @@ class RepositoryImageChanges(RepositoryParamResource): @nickname('getImageChanges') def get(self, namespace, repository, image_id): """ Get the list of changes for the specified image. """ - image = model.get_repo_image(namespace, repository, image_id) + image = model.get_repo_image_extended(namespace, repository, image_id) if not image: raise NotFound() diff --git a/endpoints/registry.py b/endpoints/registry.py index bd107116a..77ddbec68 100644 --- a/endpoints/registry.py +++ b/endpoints/registry.py @@ -59,7 +59,7 @@ def require_completion(f): @wraps(f) def wrapper(namespace, repository, *args, **kwargs): image_id = kwargs['image_id'] - repo_image = model.get_repo_image(namespace, repository, image_id) + repo_image = model.get_repo_image_extended(namespace, repository, image_id) if image_is_uploading(repo_image): abort(400, 'Image %(image_id)s is being uploaded, retry later', issue='upload-in-progress', image_id=kwargs['image_id']) @@ -103,7 +103,7 @@ def head_image_layer(namespace, repository, image_id, headers): profile.debug('Checking repo permissions') if permission.can() or model.repository_is_public(namespace, repository): profile.debug('Looking up repo image') - repo_image = model.get_repo_image(namespace, repository, image_id) + repo_image = model.get_repo_image_extended(namespace, repository, image_id) if not repo_image: profile.debug('Image not found') abort(404, 'Image %(image_id)s not found', issue='unknown-image', @@ -136,7 +136,7 @@ def get_image_layer(namespace, repository, image_id, headers): profile.debug('Checking repo permissions') if permission.can() or model.repository_is_public(namespace, repository): profile.debug('Looking up repo image') - repo_image = model.get_repo_image(namespace, repository, image_id) + repo_image = model.get_repo_image_extended(namespace, repository, image_id) profile.debug('Looking up the layer path') try: @@ -174,7 +174,7 @@ def put_image_layer(namespace, repository, image_id): abort(403) profile.debug('Retrieving image') - repo_image = model.get_repo_image(namespace, repository, image_id) + repo_image = model.get_repo_image_extended(namespace, repository, image_id) try: profile.debug('Retrieving image data') uuid = repo_image.storage.uuid @@ -298,7 +298,7 @@ def put_image_checksum(namespace, repository, image_id): issue='missing-checksum-cookie', image_id=image_id) profile.debug('Looking up repo image') - repo_image = model.get_repo_image(namespace, repository, image_id) + repo_image = model.get_repo_image_extended(namespace, repository, image_id) if not repo_image or not repo_image.storage: abort(404, 'Image not found: %(image_id)s', issue='unknown-image', image_id=image_id) @@ -353,7 +353,7 @@ def get_image_json(namespace, repository, image_id, headers): abort(403) profile.debug('Looking up repo image') - repo_image = model.get_repo_image(namespace, repository, image_id) + repo_image = model.get_repo_image_extended(namespace, repository, image_id) profile.debug('Looking up repo layer data') try: @@ -384,7 +384,7 @@ def get_image_ancestry(namespace, repository, image_id, headers): abort(403) profile.debug('Looking up repo image') - repo_image = model.get_repo_image(namespace, repository, image_id) + repo_image = model.get_repo_image_extended(namespace, repository, image_id) profile.debug('Looking up image data') try: @@ -448,7 +448,7 @@ def put_image_json(namespace, repository, image_id): issue='invalid-request', image_id=image_id) profile.debug('Looking up repo image') - repo_image = model.get_repo_image(namespace, repository, image_id) + repo_image = model.get_repo_image_extended(namespace, repository, image_id) if not repo_image: profile.debug('Image not found') abort(404, 'Image %(image_id)s not found', issue='unknown-image', @@ -465,7 +465,7 @@ def put_image_json(namespace, repository, image_id): parent_image = None if parent_id: profile.debug('Looking up parent image') - parent_image = model.get_repo_image(namespace, repository, parent_id) + parent_image = model.get_repo_image_extended(namespace, repository, parent_id) parent_uuid = parent_image and parent_image.storage.uuid parent_locations = parent_image and parent_image.storage.locations @@ -518,7 +518,7 @@ def put_image_json(namespace, repository, image_id): def process_image_changes(namespace, repository, image_id): logger.debug('Generating diffs for image: %s' % image_id) - repo_image = model.get_repo_image(namespace, repository, image_id) + repo_image = model.get_repo_image_extended(namespace, repository, image_id) if not repo_image: logger.warning('No image for id: %s', image_id) return None, None diff --git a/endpoints/verbs.py b/endpoints/verbs.py index 92f3fa24d..33a214788 100644 --- a/endpoints/verbs.py +++ b/endpoints/verbs.py @@ -83,7 +83,7 @@ def get_squashed_tag(namespace, repository, tag): abort(404) # Lookup the tag's image and storage. - repo_image = model.get_repo_image(namespace, repository, tag_image.docker_image_id) + repo_image = model.get_repo_image_extended(namespace, repository, tag_image.docker_image_id) if not repo_image: abort(404) diff --git a/tools/auditancestry.py b/tools/auditancestry.py index 59d636836..27fd11d8c 100644 --- a/tools/auditancestry.py +++ b/tools/auditancestry.py @@ -27,7 +27,7 @@ bad_count = 0 good_count = 0 def resolve_or_create(repo, docker_image_id, new_ancestry): - existing = model.get_repo_image(repo.namespace_user.username, repo.name, docker_image_id) + existing = model.get_repo_image_extended(repo.namespace_user.username, repo.name, docker_image_id) if existing: logger.debug('Found existing image: %s, %s', existing.id, docker_image_id) return existing @@ -63,7 +63,7 @@ def all_ancestors_exist(ancestors): cant_fix = [] for img in query: try: - with_locations = model.get_repo_image(img.repository.namespace_user.username, + with_locations = model.get_repo_image_extended(img.repository.namespace_user.username, img.repository.name, img.docker_image_id) ancestry_storage = store.image_ancestry_path(img.storage.uuid) if store.exists(with_locations.storage.locations, ancestry_storage):