diff --git a/data/model/legacy.py b/data/model/legacy.py index 1fb719b00..0a8249392 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -1034,16 +1034,26 @@ def get_repository(namespace_name, repository_name): return None -def get_repo_image(namespace_name, repository_name, image_id): +def get_repo_image_directly(namespace_name, repository_name, docker_image_id): def limit_to_image_id(query): - return query.where(Image.docker_image_id == image_id) + return query.where(Image.docker_image_id == docker_image_id).limit(1) + + query = _get_repository_images_directly(namespace_name, repository_name, limit_to_image_id) + try: + return query.get() + except Image.DoesNotExist: + return None + + +def get_repo_image(namespace_name, repository_name, docker_image_id): + def limit_to_image_id(query): + return query.where(Image.docker_image_id == docker_image_id).limit(1) images = _get_repository_images_base(namespace_name, repository_name, limit_to_image_id) if not images: return None - else: - return images[0] + return images[0] def repository_is_public(namespace_name, repository_name): try: @@ -1136,22 +1146,20 @@ def __translate_ancestry(old_ancestry, translations, repository, username, prefe if old_ancestry == '/': return '/' - def translate_id(old_id): + def translate_id(old_id, docker_image_id): logger.debug('Translating id: %s', old_id) if old_id not in translations: - # Figure out which docker_image_id the old id refers to, then find a - # a local one - old = Image.select(Image.docker_image_id).where(Image.id == old_id).get() - image_in_repo = find_create_or_link_image(old.docker_image_id, repository, username, + image_in_repo = find_create_or_link_image(docker_image_id, repository, username, translations, preferred_location) translations[old_id] = image_in_repo.id - return translations[old_id] - # TODO: PERFORMANCE IMPROVMENT: Select all the ancestor Docker IDs in a single query. The values - # are retrieved in the above translate call. + # Select all the ancestor Docker IDs in a single query. old_ids = [int(id_str) for id_str in old_ancestry.split('/')[1:-1]] - new_ids = [str(translate_id(old_id)) for old_id in old_ids] + old_images = {i.id: i.docker_image_id for i in Image.select().where(Image.id << old_ids)} + + # Translate the old images into new ones. + new_ids = [str(translate_id(old_id, old_images[old_id])) for old_id in old_ids] return '/%s/' % '/'.join(new_ids) @@ -1163,39 +1171,22 @@ def _create_storage(location_name): return storage -def find_create_or_link_image(docker_image_id, repository, username, translations, - preferred_location): +def _find_or_link_image(existing_image, repository, username, translations, preferred_location): + # TODO(jake): This call is currently recursively done under a single transaction. Can we make + # it instead be done under a set of transactions? with config.app_config['DB_TRANSACTION_FACTORY'](db): - # TODO PERF IMPROVEMENT: Also make this only lookup the image directly, rather than - # joining image and image storage, etc. The common case doesn't need that information - # (but other callers might, so double check) - repo_image = get_repo_image(repository.namespace_user.username, repository.name, - docker_image_id) + # Check for an existing image, under the transaction, to make sure it doesn't already exist. + repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name, + existing_image.docker_image_id) if repo_image: return repo_image - query = (Image - .select(Image, ImageStorage) - .distinct() - .join(ImageStorage) - .switch(Image) - .join(Repository) - .join(Visibility) - .switch(Repository) - .join(RepositoryPermission, JOIN_LEFT_OUTER) - .switch(Repository) - .join(Namespace, on=(Repository.namespace_user == Namespace.id)) - .where(ImageStorage.uploading == False)) - - query = (_filter_to_repos_for_user(query, username) - .where(Image.docker_image_id == docker_image_id)) - - new_image_ancestry = '/' - origin_image_id = None + # Make sure the existing base image still exists. try: - to_copy = query.get() + to_copy = Image.select().join(ImageStorage).where(Image.id == existing_image.id).get() + msg = 'Linking image to existing storage with docker id: %s and uuid: %s' - logger.debug(msg, docker_image_id, to_copy.storage.uuid) + logger.debug(msg, existing_image.docker_image_id, to_copy.storage.uuid) new_image_ancestry = __translate_ancestry(to_copy.ancestors, translations, repository, username, preferred_location) @@ -1203,25 +1194,69 @@ def find_create_or_link_image(docker_image_id, repository, username, translation storage = to_copy.storage storage.locations = {placement.location.name for placement in storage.imagestorageplacement_set} - origin_image_id = to_copy.id + + new_image = Image.create(docker_image_id=existing_image.docker_image_id, + repository=repository, storage=storage, + ancestors=new_image_ancestry) + + logger.debug('Storing translation %s -> %s', existing_image.id, new_image.id) + translations[existing_image.id] = new_image.id + return new_image except Image.DoesNotExist: - logger.debug('Creating new storage for docker id: %s', docker_image_id) - storage = _create_storage(preferred_location) - - logger.debug('Storage locations: %s', storage.locations) - - new_image = Image.create(docker_image_id=docker_image_id, - repository=repository, storage=storage, - ancestors=new_image_ancestry) - - logger.debug('new_image storage locations: %s', new_image.storage.locations) + return None - if origin_image_id: - logger.debug('Storing translation %s -> %s', origin_image_id, new_image.id) - translations[origin_image_id] = new_image.id +def find_create_or_link_image(docker_image_id, repository, username, translations, + preferred_location): - return new_image + # First check for the image existing in the repository. If found, we simply return it. + repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name, + docker_image_id) + if repo_image: + return repo_image + + # We next check to see if there is an existing storage the new image can link to. + existing_image_query = (Image + .select(Image, ImageStorage) + .distinct() + .join(ImageStorage) + .switch(Image) + .join(Repository) + .join(Visibility) + .switch(Repository) + .join(RepositoryPermission, JOIN_LEFT_OUTER) + .switch(Repository) + .join(Namespace, on=(Repository.namespace_user == Namespace.id)) + .where(ImageStorage.uploading == False)) + + existing_image_query = (_filter_to_repos_for_user(existing_image_query, username) + .where(Image.docker_image_id == docker_image_id)) + + # If there is an existing image, we try to translate its ancestry and copy its storage. + new_image = None + try: + existing_image = existing_image_query.get() + new_image = _find_or_link_image(existing_image, repository, username, translations, + preferred_location) + if new_image: + return new_image + except Image.DoesNotExist: + pass + + # Otherwise, create a new storage directly. + with config.app_config['DB_TRANSACTION_FACTORY'](db): + # Final check for an existing image, under the transaction. + repo_image = get_repo_image_directly(repository.namespace_user.username, repository.name, + docker_image_id) + if repo_image: + return repo_image + + logger.debug('Creating new storage for docker id: %s', docker_image_id) + storage = _create_storage(preferred_location) + + return Image.create(docker_image_id=docker_image_id, + repository=repository, storage=storage, + ancestors='/') def find_or_create_derived_storage(source, transformation_name, preferred_location): @@ -1335,6 +1370,15 @@ def set_image_metadata(docker_image_id, namespace_name, repository_name, created fetched.storage.save() return fetched +def _get_repository_images_directly(namespace_name, repository_name, query_modifier): + query = (Image + .select() + .join(Repository) + .join(Namespace, on=(Repository.namespace_user == Namespace.id)) + .where(Repository.name == repository_name, Namespace.username == namespace_name)) + + query = query_modifier(query) + return query def _get_repository_images_base(namespace_name, repository_name, query_modifier): query = (ImageStoragePlacement @@ -1371,6 +1415,14 @@ def _get_repository_images_base(namespace_name, repository_name, query_modifier) return images.values() +def lookup_repository_images(namespace_name, repository_name, docker_image_ids): + repo = get_repository(namespace_name, repository_name) + if repo is None: + return [] + + return Image.select().where(Image.repository == repo, Image.docker_image_id << docker_image_ids) + + def get_repository_images(namespace_name, repository_name): return _get_repository_images_base(namespace_name, repository_name, lambda q: q) @@ -1389,6 +1441,9 @@ def garbage_collect_repository(namespace_name, repository_name): storage_id_whitelist = {} with config.app_config['DB_TRANSACTION_FACTORY'](db): + # TODO (jake): We could probably select this and all the images in a single query using + # a different kind of join. + # Get a list of all images used by tags in the repository tag_query = (RepositoryTag .select(RepositoryTag, Image, ImageStorage) @@ -1407,7 +1462,7 @@ def garbage_collect_repository(namespace_name, repository_name): referenced_anscestors = referenced_anscestors.union(set(ancestor_list)) referenced_anscestors.add(tag.image.id) - all_repo_images = get_repository_images(namespace_name, repository_name) + all_repo_images = _get_repository_images_directly(namespace_name, repository_name, lambda q: q) all_images = {int(img.id): img for img in all_repo_images} to_remove = set(all_images.keys()).difference(referenced_anscestors) @@ -1465,8 +1520,8 @@ def garbage_collect_storage(storage_id_whitelist): # Note: Both of these deletes must occur in the same transaction (unfortunately) because a # storage without any placement is invalid, and a placement cannot exist without a storage. - # TODO: We might want to allow for null storages on placements, which would allow us to delete - # the storages, then delete the placements in a non-transaction. + # TODO(jake): We might want to allow for null storages on placements, which would allow us to + # delete the storages, then delete the placements in a non-transaction. logger.debug('Garbage collecting storages from candidates: %s', storage_id_whitelist) with config.app_config['DB_TRANSACTION_FACTORY'](db): # Track all of the data that should be removed from blob storage @@ -1924,9 +1979,9 @@ def list_logs(start_time, end_time, performer=None, repository=None, namespace=N if namespace: joined = joined.where(User.username == namespace) - return joined.where( + return list(joined.where( LogEntry.datetime >= start_time, - LogEntry.datetime < end_time).order_by(LogEntry.datetime.desc()) + LogEntry.datetime < end_time).order_by(LogEntry.datetime.desc())) def log_action(kind_name, user_or_organization_name, performer=None, diff --git a/endpoints/index.py b/endpoints/index.py index 7394671af..d97dd94f5 100644 --- a/endpoints/index.py +++ b/endpoints/index.py @@ -223,15 +223,20 @@ def create_repository(namespace, repository): repo = model.create_repository(namespace, repository, get_authenticated_user()) - profile.debug('Determining added images') - added_images = OrderedDict([(desc['id'], desc) - for desc in image_descriptions]) + profile.debug('Determining already added images') + added_images = OrderedDict([(desc['id'], desc) for desc in image_descriptions]) new_repo_images = dict(added_images) - # TODO PERF IMPROVEMENT: Doesn't need the locations OR the imagestorage, so just select the images - # directly. Also use a set here. - for existing in model.get_repository_images(namespace, repository): - if existing.docker_image_id in new_repo_images: + # Optimization: Lookup any existing images in the repository with matching docker IDs and + # remove them from the added dict, so we don't need to look them up one-by-one. + def chunks(l, n): + for i in xrange(0, len(l), n): + yield l[i:i+n] + + # Note: We do this in chunks in an effort to not hit the SQL query size limit. + for chunk in chunks(new_repo_images.keys(), 50): + existing_images = model.lookup_repository_images(namespace, repository, chunk) + for existing in existing_images: added_images.pop(existing.docker_image_id) profile.debug('Creating/Linking necessary images') @@ -243,9 +248,8 @@ def create_repository(namespace, repository): profile.debug('Created images') - response = make_response('Created', 201) track_and_log('push_repo', repo) - return response + return make_response('Created', 201) @index.route('/repositories//images', methods=['PUT'])