From 8b2e4d3bcff3a62dd9b33a8498ea425564c92473 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Fri, 28 Apr 2017 19:57:24 -0400 Subject: [PATCH 1/7] Add a test for get_matching_tags --- data/model/test/test_tag.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/data/model/test/test_tag.py b/data/model/test/test_tag.py index 9d372efab..623db281a 100644 --- a/data/model/test/test_tag.py +++ b/data/model/test/test_tag.py @@ -1,8 +1,25 @@ +from data.database import Image, RepositoryTag from data.model.repository import create_repository -from data.model.tag import list_active_repo_tags, create_or_update_tag, delete_tag +from data.model.tag import (list_active_repo_tags, create_or_update_tag, delete_tag, + get_matching_tags, _tag_alive) from data.model.image import find_create_or_link_image + from test.fixtures import * +def test_get_matching_tags(initialized_db): + # Test for every image in the test database. + for image in Image.select(): + matching_query = get_matching_tags(image.docker_image_id, image.storage.uuid) + expected_query = (RepositoryTag + .select() + .join(Image) + .where(RepositoryTag.hidden == False) + .where((Image.id == image.id) | (Image.ancestors ** ('%%/%s/%%' % image.id)))) + + matching_tags = set([tag.id for tag in matching_query]) + expected_tags = set([tag.id for tag in _tag_alive(expected_query)]) + assert matching_tags == expected_tags, "mismatch for image %s" % image.id + def assert_tags(repository, *args): tags = list(list_active_repo_tags(repository)) assert len(tags) == len(args) From e583be391406bc3329b7befb727fba362dba7066 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Fri, 28 Apr 2017 20:10:54 -0400 Subject: [PATCH 2/7] Remove inner query for ancestors lookup on get_matching_tags --- data/model/image.py | 23 +++++++++-------------- data/model/tag.py | 9 +++++++-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/data/model/image.py b/data/model/image.py index 
5ec12682e..7ce160aa7 100644 --- a/data/model/image.py +++ b/data/model/image.py @@ -17,23 +17,18 @@ from util.canonicaljson import canonicalize logger = logging.getLogger(__name__) -def get_repository_image_and_deriving(docker_image_id, storage_uuid): - """ Returns all matching images with the given docker image ID and storage uuid, along with any - images which have the image ID as parents. +def get_image_with_storage(docker_image_id, storage_uuid): + """ Returns the image with the given docker image ID and storage uuid or None if none. """ try: - image_found = (Image - .select() - .join(ImageStorage) - .where(Image.docker_image_id == docker_image_id, - ImageStorage.uuid == storage_uuid) - .get()) + return (Image + .select() + .join(ImageStorage) + .where(Image.docker_image_id == docker_image_id, + ImageStorage.uuid == storage_uuid) + .get()) except Image.DoesNotExist: - return Image.select().where(Image.id < 0) # Empty query - - ancestors_pattern = '%s%s/%%' % (image_found.ancestors, image_found.id) - return Image.select().where((Image.ancestors ** ancestors_pattern) | - (Image.id == image_found.id)) + return None def get_parent_images_with_placements(namespace_name, repository_name, image_obj): diff --git a/data/model/tag.py b/data/model/tag.py index e4c7b90cd..c6112a092 100644 --- a/data/model/tag.py +++ b/data/model/tag.py @@ -56,14 +56,19 @@ def _tag_alive(query, now_ts=None): def get_matching_tags(docker_image_id, storage_uuid, *args): """ Returns a query pointing to all tags that contain the image with the given docker_image_id and storage_uuid. """ - image_query = image.get_repository_image_and_deriving(docker_image_id, storage_uuid) + image_row = image.get_image_with_storage(docker_image_id, storage_uuid) + if image_row is None: + return RepositoryTag.select().where(RepositoryTag.id < 0) # Empty query. 
+ ancestors_str = '%s%s/%%' % (image_row.ancestors, image_row.id) return _tag_alive(RepositoryTag .select(*args) .distinct() .join(Image) .join(ImageStorage) - .where(Image.id << image_query, RepositoryTag.hidden == False)) + .where(RepositoryTag.hidden == False) + .where((Image.id == image_row.id) | + (Image.ancestors ** ancestors_str))) def get_tags_for_image(image_id, *args): From 74dd0ef8e898c796077ae6f35e873f2dc0c2e906 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 2 May 2017 15:38:25 -0400 Subject: [PATCH 3/7] Add a batch `get_matching_tags_for_images` method This will be used in the security notification worker to retrieve the tags needed in a set of batch calls, rather than multiple calls per image --- data/model/_basequery.py | 23 +++++++++++ data/model/storage.py | 29 ++------------ data/model/tag.py | 67 +++++++++++++++++++++++++++++++ data/model/test/test_tag.py | 79 ++++++++++++++++++++++++++++++++----- 4 files changed, 162 insertions(+), 36 deletions(-) diff --git a/data/model/_basequery.py b/data/model/_basequery.py index f37a843ea..c4400d4fb 100644 --- a/data/model/_basequery.py +++ b/data/model/_basequery.py @@ -6,6 +6,29 @@ from data.database import (Repository, User, Team, TeamMember, RepositoryPermiss Namespace, Visibility, ImageStorage, Image, RepositoryKind, db_for_update) +def reduce_as_tree(queries_to_reduce): + """ This method will split a list of queries into halves recursively until we reach individual + queries, at which point it will start unioning the queries, or the already unioned subqueries. + This works around a bug in peewee SQL generation where reducing linearly generates a chain + of queries that will exceed the recursion depth limit when it has around 80 queries. 
+ """ + mid = len(queries_to_reduce)/2 + left = queries_to_reduce[:mid] + right = queries_to_reduce[mid:] + + to_reduce_right = right[0] + if len(right) > 1: + to_reduce_right = reduce_as_tree(right) + + if len(left) > 1: + to_reduce_left = reduce_as_tree(left) + elif len(left) == 1: + to_reduce_left = left[0] + else: + return to_reduce_right + + return to_reduce_left.union_all(to_reduce_right) + def get_existing_repository(namespace_name, repository_name, for_update=False, kind_filter=None): query = (Repository diff --git a/data/model/storage.py b/data/model/storage.py index 112b9cc5f..eef439d39 100644 --- a/data/model/storage.py +++ b/data/model/storage.py @@ -68,7 +68,8 @@ def _orphaned_storage_query(candidate_ids): .from_(storage_subq)) # Build the set of storages that are missing. These storages are orphaned. - nonorphaned_storage_ids = {storage.id for storage in _reduce_as_tree(nonorphaned_queries)} + nonorphaned_storage_ids = {storage.id for storage + in _basequery.reduce_as_tree(nonorphaned_queries)} return list(candidate_ids - nonorphaned_storage_ids) @@ -275,31 +276,7 @@ def lookup_repo_storages_by_content_checksum(repo, checksums): .select(SQL('*')) .from_(candidate_subq)) - return _reduce_as_tree(queries) - - -def _reduce_as_tree(queries_to_reduce): - """ This method will split a list of queries into halves recursively until we reach individual - queries, at which point it will start unioning the queries, or the already unioned subqueries. - This works around a bug in peewee SQL generation where reducing linearly generates a chain - of queries that will exceed the recursion depth limit when it has around 80 queries. 
- """ - mid = len(queries_to_reduce)/2 - left = queries_to_reduce[:mid] - right = queries_to_reduce[mid:] - - to_reduce_right = right[0] - if len(right) > 1: - to_reduce_right = _reduce_as_tree(right) - - if len(left) > 1: - to_reduce_left = _reduce_as_tree(left) - elif len(left) == 1: - to_reduce_left = left[0] - else: - return to_reduce_right - - return to_reduce_left.union_all(to_reduce_right) + return _basequery.reduce_as_tree(queries) def set_image_storage_metadata(docker_image_id, namespace_name, repository_name, image_size, diff --git a/data/model/tag.py b/data/model/tag.py index c6112a092..4e17f7cdb 100644 --- a/data/model/tag.py +++ b/data/model/tag.py @@ -53,6 +53,73 @@ def _tag_alive(query, now_ts=None): (RepositoryTag.lifetime_end_ts > now_ts)) +_MAX_SUB_QUERIES = 100 + +def get_matching_tags_for_images(image_pairs, filter_query=None, selections=None): + """ Returns all tags that contain the images with the given docker_image_id and storage_uuid, + as specified as an iterable of pairs. """ + if not image_pairs: + return [] + + image_pairs = set(image_pairs) + + # Find all possible matching image+storages. + ids = [image_pair[0] for image_pair in image_pairs] + uuids = [image_pair[1] for image_pair in image_pairs] + images_query = (Image + .select(Image.id, Image.docker_image_id, Image.ancestors, ImageStorage.uuid) + .join(ImageStorage) + .where(Image.docker_image_id << ids, ImageStorage.uuid << uuids)) + + # Filter down to those images actually in the pairs set and build the set of queries to run. + individual_image_queries = [] + + for img in images_query: + # Make sure the actual image was requested. + pair = (img.docker_image_id, img.storage.uuid) + if pair not in image_pairs: + continue + + # Remove the pair so we don't try it again. 
+ image_pairs.remove(pair) + + ancestors_str = '%s%s/%%' % (img.ancestors, img.id) + query = (Image + .select(Image.id) + .where((Image.id == img.id) | (Image.ancestors ** ancestors_str))) + + individual_image_queries.append(query) + + if not individual_image_queries: + return [] + + # Shard based on the max subquery count. This is used to prevent going over the DB's max query + # size, as well as to prevent the DB from locking up on a massive query. + sharded_queries = [] + while individual_image_queries: + shard = individual_image_queries[0:_MAX_SUB_QUERIES] + sharded_queries.append(_basequery.reduce_as_tree(shard)) + individual_image_queries = individual_image_queries[_MAX_SUB_QUERIES:] + + # Collect IDs of the tags found for each query. + tags = {} + for query in sharded_queries: + tag_query = (_tag_alive(RepositoryTag + .select(*(selections or [])) + .distinct() + .join(Image) + .where(RepositoryTag.hidden == False) + .where(Image.id << query))) + + if filter_query is not None: + tag_query = filter_query(tag_query) + + for tag in tag_query: + tags[tag.id] = tag + + return tags.values() + + def get_matching_tags(docker_image_id, storage_uuid, *args): """ Returns a query pointing to all tags that contain the image with the given docker_image_id and storage_uuid. 
""" diff --git a/data/model/test/test_tag.py b/data/model/test/test_tag.py index 623db281a..407e5813f 100644 --- a/data/model/test/test_tag.py +++ b/data/model/test/test_tag.py @@ -1,25 +1,84 @@ -from data.database import Image, RepositoryTag +from data.database import Image, RepositoryTag, ImageStorage, Repository from data.model.repository import create_repository from data.model.tag import (list_active_repo_tags, create_or_update_tag, delete_tag, - get_matching_tags, _tag_alive) + get_matching_tags, _tag_alive, get_matching_tags_for_images) from data.model.image import find_create_or_link_image from test.fixtures import * +def _get_expected_tags(image): + expected_query = (RepositoryTag + .select() + .join(Image) + .where(RepositoryTag.hidden == False) + .where((Image.id == image.id) | (Image.ancestors ** ('%%/%s/%%' % image.id)))) + return set([tag.id for tag in _tag_alive(expected_query)]) + + def test_get_matching_tags(initialized_db): # Test for every image in the test database. - for image in Image.select(): + for image in Image.select(Image, ImageStorage).join(ImageStorage): matching_query = get_matching_tags(image.docker_image_id, image.storage.uuid) - expected_query = (RepositoryTag - .select() - .join(Image) - .where(RepositoryTag.hidden == False) - .where((Image.id == image.id) | (Image.ancestors ** ('%%/%s/%%' % image.id)))) - matching_tags = set([tag.id for tag in matching_query]) - expected_tags = set([tag.id for tag in _tag_alive(expected_query)]) + expected_tags = _get_expected_tags(image) assert matching_tags == expected_tags, "mismatch for image %s" % image.id + +def test_get_matching_tag_ids_for_images(initialized_db): + # Try for various sets of the first N images. 
+ for count in [5, 10, 15]: + pairs = [] + expected_tags_ids = set() + for image in Image.select(Image, ImageStorage).join(ImageStorage): + if len(pairs) >= count: + break + + pairs.append((image.docker_image_id, image.storage.uuid)) + expected_tags_ids.update(_get_expected_tags(image)) + + matching_tags_ids = set([tag.id for tag in get_matching_tags_for_images(pairs)]) + assert matching_tags_ids == expected_tags_ids + + +def test_get_matching_tag_ids_for_all_images(initialized_db): + pairs = [] + for image in Image.select(Image, ImageStorage).join(ImageStorage): + pairs.append((image.docker_image_id, image.storage.uuid)) + + expected_tags_ids = set([tag.id for tag in _tag_alive(RepositoryTag.select())]) + matching_tags_ids = set([tag.id for tag in get_matching_tags_for_images(pairs)]) + + # Ensure every alive tag was found. + assert matching_tags_ids == expected_tags_ids + + +def test_get_matching_tag_ids_images_filtered(initialized_db): + def filter_query(query): + return query.join(Repository).where(Repository.name == 'simple') + + filtered_images = filter_query(Image + .select(Image, ImageStorage) + .join(RepositoryTag) + .switch(Image) + .join(ImageStorage) + .switch(Image)) + + expected_tags_query = _tag_alive(filter_query(RepositoryTag + .select())) + + pairs = [] + for image in filtered_images: + pairs.append((image.docker_image_id, image.storage.uuid)) + + matching_tags = get_matching_tags_for_images(pairs, filter_query=filter_query) + + expected_tag_ids = set([tag.id for tag in expected_tags_query]) + matching_tags_ids = set([tag.id for tag in matching_tags]) + + # Ensure every alive tag was found. 
+ assert matching_tags_ids == expected_tag_ids + + def assert_tags(repository, *args): tags = list(list_active_repo_tags(repository)) assert len(tags) == len(args) From 98fcae753bcbe7f3f9c356e27d6f6bbffe1cef67 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 2 May 2017 15:39:27 -0400 Subject: [PATCH 4/7] Change the security notification system to use get_matching_tags_for_images This should vastly reduce the number of database calls we make, as instead of making 2-3 calls per image, we'll make two calls per ~100 images --- util/secscan/notifier.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/util/secscan/notifier.py b/util/secscan/notifier.py index 1cfea4ace..ad31f876b 100644 --- a/util/secscan/notifier.py +++ b/util/secscan/notifier.py @@ -5,7 +5,7 @@ from collections import defaultdict from enum import Enum from app import secscan_api -from data.model.tag import filter_tags_have_repository_event, get_matching_tags +from data.model.tag import filter_tags_have_repository_event, get_matching_tags_for_images from data.database import (Image, ImageStorage, ExternalNotificationEvent, Repository, RepositoryTag) from endpoints.notificationhelper import notification_batch @@ -33,10 +33,10 @@ class SecurityNotificationHandler(object): self.tag_map = defaultdict(set) self.repository_map = {} self.check_map = {} + self.layer_ids = set() self.stream_tracker = None self.results_per_stream = results_per_stream - self.reporting_failed = False self.vulnerability_info = None self.event = ExternalNotificationEvent.get(name='vulnerability_found') @@ -133,10 +133,6 @@ class SecurityNotificationHandler(object): self.stream_tracker.push_new(new_layer_ids) self.stream_tracker.push_old(old_layer_ids) - # If the reporting failed at any point, nothing more we can do. - if self.reporting_failed: - return ProcessNotificationPageResult.FAILED - # Check to see if there are any additional pages to process. 
if 'NextPage' not in notification_page_data: return self._done() @@ -145,21 +141,30 @@ class SecurityNotificationHandler(object): def _done(self): if self.stream_tracker is not None: + # Mark the tracker as done, so that it finishes reporting any outstanding layers. self.stream_tracker.done() - if self.reporting_failed: - return ProcessNotificationPageResult.FAILED + # Process all the layers. + if self.vulnerability_info is not None: + if not self._process_layers(): + return ProcessNotificationPageResult.FAILED return ProcessNotificationPageResult.FINISHED_PROCESSING def _report(self, new_layer_id): - # Split the layer ID into its Docker Image ID and storage ID. - (docker_image_id, storage_uuid) = new_layer_id.split('.', 2) + self.layer_ids.add(new_layer_id) + + def _process_layers(self): + # Builds the pairs of layer ID and storage uuid. + pairs = [tuple(layer_id.split('.', 2)) for layer_id in self.layer_ids] + + def filter_notifying_repos(query): + query = query.join(ImageStorage) + return filter_tags_have_repository_event(query, self.event) # Find the matching tags. - matching = get_matching_tags(docker_image_id, storage_uuid, RepositoryTag, Repository, - Image, ImageStorage) - tags = list(filter_tags_have_repository_event(matching, self.event)) + tags = get_matching_tags_for_images(pairs, selections=[RepositoryTag, Image, ImageStorage], + filter_query=filter_notifying_repos) cve_id = self.vulnerability_info['Name'] for tag in tags: @@ -170,12 +175,14 @@ class SecurityNotificationHandler(object): try: self.check_map[tag_layer_id] = secscan_api.check_layer_vulnerable(tag_layer_id, cve_id) except APIRequestFailure: - self.reporting_failed = True - return + return False logger.debug('Result of layer %s is vulnerable to %s check: %s', tag_layer_id, cve_id, self.check_map[tag_layer_id]) + if self.check_map[tag_layer_id]: # Add the vulnerable tag to the list. 
self.tag_map[tag.repository_id].add(tag.name) self.repository_map[tag.repository_id] = tag.repository + + return True From d77463cc8b1b84bd3f9174c20ec3ad722d313c81 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 2 May 2017 15:50:31 -0400 Subject: [PATCH 5/7] Add sharding to the images lookup in get_matching_tags_for_images Should prevent an issue if we ever get a request with thousands and thousands of images where we would exceed the DB's max packet size --- data/model/tag.py | 30 +++++++++----- data/model/test/test_tag.py | 80 ++++++++++++++++++++++++------------- 2 files changed, 72 insertions(+), 38 deletions(-) diff --git a/data/model/tag.py b/data/model/tag.py index 4e17f7cdb..79f132109 100644 --- a/data/model/tag.py +++ b/data/model/tag.py @@ -54,6 +54,7 @@ def _tag_alive(query, now_ts=None): _MAX_SUB_QUERIES = 100 +_MAX_IMAGE_LOOKUP_COUNT = 500 def get_matching_tags_for_images(image_pairs, filter_query=None, selections=None): """ Returns all tags that contain the images with the given docker_image_id and storage_uuid, @@ -61,27 +62,36 @@ def get_matching_tags_for_images(image_pairs, filter_query=None, selections=None if not image_pairs: return [] - image_pairs = set(image_pairs) + image_pairs_set = set(image_pairs) # Find all possible matching image+storages. 
- ids = [image_pair[0] for image_pair in image_pairs] - uuids = [image_pair[1] for image_pair in image_pairs] - images_query = (Image - .select(Image.id, Image.docker_image_id, Image.ancestors, ImageStorage.uuid) - .join(ImageStorage) - .where(Image.docker_image_id << ids, ImageStorage.uuid << uuids)) + images = [] + + while image_pairs: + image_pairs_slice = image_pairs[0:_MAX_IMAGE_LOOKUP_COUNT] + + ids = [pair[0] for pair in image_pairs_slice] + uuids = [pair[1] for pair in image_pairs_slice] + + images_query = (Image + .select(Image.id, Image.docker_image_id, Image.ancestors, ImageStorage.uuid) + .join(ImageStorage) + .where(Image.docker_image_id << ids, ImageStorage.uuid << uuids)) + + images.extend(list(images_query)) + image_pairs = image_pairs[_MAX_IMAGE_LOOKUP_COUNT:] # Filter down to those images actually in the pairs set and build the set of queries to run. individual_image_queries = [] - for img in images_query: + for img in images: # Make sure the actual image was requested. pair = (img.docker_image_id, img.storage.uuid) - if pair not in image_pairs: + if pair not in image_pairs_set: continue # Remove the pair so we don't try it again. 
- image_pairs.remove(pair) + image_pairs_set.remove(pair) ancestors_str = '%s%s/%%' % (img.ancestors, img.id) query = (Image diff --git a/data/model/test/test_tag.py b/data/model/test/test_tag.py index 407e5813f..fa993eedc 100644 --- a/data/model/test/test_tag.py +++ b/data/model/test/test_tag.py @@ -1,3 +1,7 @@ +import pytest + +from mock import patch + from data.database import Image, RepositoryTag, ImageStorage, Repository from data.model.repository import create_repository from data.model.tag import (list_active_repo_tags, create_or_update_tag, delete_tag, @@ -14,42 +18,62 @@ def _get_expected_tags(image): .where((Image.id == image.id) | (Image.ancestors ** ('%%/%s/%%' % image.id)))) return set([tag.id for tag in _tag_alive(expected_query)]) - -def test_get_matching_tags(initialized_db): - # Test for every image in the test database. - for image in Image.select(Image, ImageStorage).join(ImageStorage): - matching_query = get_matching_tags(image.docker_image_id, image.storage.uuid) - matching_tags = set([tag.id for tag in matching_query]) - expected_tags = _get_expected_tags(image) - assert matching_tags == expected_tags, "mismatch for image %s" % image.id +@pytest.mark.parametrize('max_subqueries,max_image_lookup_count', [ + (1, 1), + (10, 10), + (100, 500), +]) +def test_get_matching_tags(max_subqueries, max_image_lookup_count, initialized_db): + with patch('data.model.tag._MAX_SUB_QUERIES', max_subqueries): + with patch('data.model.tag._MAX_IMAGE_LOOKUP_COUNT', max_image_lookup_count): + # Test for every image in the test database. + for image in Image.select(Image, ImageStorage).join(ImageStorage): + matching_query = get_matching_tags(image.docker_image_id, image.storage.uuid) + matching_tags = set([tag.id for tag in matching_query]) + expected_tags = _get_expected_tags(image) + assert matching_tags == expected_tags, "mismatch for image %s" % image.id -def test_get_matching_tag_ids_for_images(initialized_db): - # Try for various sets of the first N images. 
- for count in [5, 10, 15]: - pairs = [] - expected_tags_ids = set() - for image in Image.select(Image, ImageStorage).join(ImageStorage): - if len(pairs) >= count: - break +@pytest.mark.parametrize('max_subqueries,max_image_lookup_count', [ + (1, 1), + (10, 10), + (100, 500), +]) +def test_get_matching_tag_ids_for_images(max_subqueries, max_image_lookup_count, initialized_db): + with patch('data.model.tag._MAX_SUB_QUERIES', max_subqueries): + with patch('data.model.tag._MAX_IMAGE_LOOKUP_COUNT', max_image_lookup_count): + # Try for various sets of the first N images. + for count in [5, 10, 15]: + pairs = [] + expected_tags_ids = set() + for image in Image.select(Image, ImageStorage).join(ImageStorage): + if len(pairs) >= count: + break - pairs.append((image.docker_image_id, image.storage.uuid)) - expected_tags_ids.update(_get_expected_tags(image)) + pairs.append((image.docker_image_id, image.storage.uuid)) + expected_tags_ids.update(_get_expected_tags(image)) - matching_tags_ids = set([tag.id for tag in get_matching_tags_for_images(pairs)]) - assert matching_tags_ids == expected_tags_ids + matching_tags_ids = set([tag.id for tag in get_matching_tags_for_images(pairs)]) + assert matching_tags_ids == expected_tags_ids -def test_get_matching_tag_ids_for_all_images(initialized_db): - pairs = [] - for image in Image.select(Image, ImageStorage).join(ImageStorage): - pairs.append((image.docker_image_id, image.storage.uuid)) +@pytest.mark.parametrize('max_subqueries,max_image_lookup_count', [ + (1, 1), + (10, 10), + (100, 500), +]) +def test_get_matching_tag_ids_for_all_images(max_subqueries, max_image_lookup_count, initialized_db): + with patch('data.model.tag._MAX_SUB_QUERIES', max_subqueries): + with patch('data.model.tag._MAX_IMAGE_LOOKUP_COUNT', max_image_lookup_count): + pairs = [] + for image in Image.select(Image, ImageStorage).join(ImageStorage): + pairs.append((image.docker_image_id, image.storage.uuid)) - expected_tags_ids = set([tag.id for tag in 
_tag_alive(RepositoryTag.select())]) - matching_tags_ids = set([tag.id for tag in get_matching_tags_for_images(pairs)]) + expected_tags_ids = set([tag.id for tag in _tag_alive(RepositoryTag.select())]) + matching_tags_ids = set([tag.id for tag in get_matching_tags_for_images(pairs)]) - # Ensure every alive tag was found. - assert matching_tags_ids == expected_tags_ids + # Ensure every alive tag was found. + assert matching_tags_ids == expected_tags_ids def test_get_matching_tag_ids_images_filtered(initialized_db): From 977bbc20a250ef91efcf7fd90745f80b4025611f Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 2 May 2017 18:29:14 -0400 Subject: [PATCH 6/7] Add filtering onto the images query in get_matching_tags_for_images Should make the query even faster in the security notification case --- data/model/tag.py | 44 +++++++++++++++++++++++++------------ data/model/test/test_tag.py | 3 ++- util/secscan/notifier.py | 11 +++++++--- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/data/model/tag.py b/data/model/tag.py index 79f132109..14abc026d 100644 --- a/data/model/tag.py +++ b/data/model/tag.py @@ -53,10 +53,30 @@ def _tag_alive(query, now_ts=None): (RepositoryTag.lifetime_end_ts > now_ts)) +def filter_has_repository_event(query, event): + """ Filters the query by ensuring the repositories returned have the given event. """ + return (query + .join(Repository) + .join(RepositoryNotification) + .where(RepositoryNotification.event == event)) + + +def filter_tags_have_repository_event(query, event): + """ Filters the query by ensuring the repository tags live in a repository that has the given + event. Also returns the image storage for the tag's image and orders the results by + lifetime_start_ts. 
+ """ + query = filter_has_repository_event(query, event) + query = query.switch(Image).join(ImageStorage) + query = query.switch(RepositoryTag).order_by(RepositoryTag.lifetime_start_ts.desc()) + return query + + _MAX_SUB_QUERIES = 100 _MAX_IMAGE_LOOKUP_COUNT = 500 -def get_matching_tags_for_images(image_pairs, filter_query=None, selections=None): +def get_matching_tags_for_images(image_pairs, filter_images=None, filter_tags=None, + selections=None): """ Returns all tags that contain the images with the given docker_image_id and storage_uuid, as specified as an iterable of pairs. """ if not image_pairs: @@ -76,7 +96,11 @@ def get_matching_tags_for_images(image_pairs, filter_query=None, selections=None images_query = (Image .select(Image.id, Image.docker_image_id, Image.ancestors, ImageStorage.uuid) .join(ImageStorage) - .where(Image.docker_image_id << ids, ImageStorage.uuid << uuids)) + .where(Image.docker_image_id << ids, ImageStorage.uuid << uuids) + .switch(Image)) + + if filter_images is not None: + images_query = filter_images(images_query) images.extend(list(images_query)) image_pairs = image_pairs[_MAX_IMAGE_LOOKUP_COUNT:] @@ -119,10 +143,11 @@ def get_matching_tags_for_images(image_pairs, filter_query=None, selections=None .distinct() .join(Image) .where(RepositoryTag.hidden == False) - .where(Image.id << query))) + .where(Image.id << query) + .switch(RepositoryTag))) - if filter_query is not None: - tag_query = filter_query(tag_query) + if filter_tags is not None: + tag_query = filter_tags(tag_query) for tag in tag_query: tags[tag.id] = tag @@ -156,15 +181,6 @@ def get_tags_for_image(image_id, *args): RepositoryTag.hidden == False)) -def filter_tags_have_repository_event(query, event): - return (query - .switch(RepositoryTag) - .join(Repository) - .join(RepositoryNotification) - .where(RepositoryNotification.event == event) - .order_by(RepositoryTag.lifetime_start_ts.desc())) - - def get_tag_manifest_digests(tags): """ Returns a map from tag ID to its 
associated manifest digest, if any. """ if not tags: diff --git a/data/model/test/test_tag.py b/data/model/test/test_tag.py index fa993eedc..8fa0eb852 100644 --- a/data/model/test/test_tag.py +++ b/data/model/test/test_tag.py @@ -94,7 +94,8 @@ def test_get_matching_tag_ids_images_filtered(initialized_db): for image in filtered_images: pairs.append((image.docker_image_id, image.storage.uuid)) - matching_tags = get_matching_tags_for_images(pairs, filter_query=filter_query) + matching_tags = get_matching_tags_for_images(pairs, filter_images=filter_query, + filter_tags=filter_query) expected_tag_ids = set([tag.id for tag in expected_tags_query]) matching_tags_ids = set([tag.id for tag in matching_tags]) diff --git a/util/secscan/notifier.py b/util/secscan/notifier.py index ad31f876b..c71209ebe 100644 --- a/util/secscan/notifier.py +++ b/util/secscan/notifier.py @@ -5,7 +5,9 @@ from collections import defaultdict from enum import Enum from app import secscan_api -from data.model.tag import filter_tags_have_repository_event, get_matching_tags_for_images +from data.model.tag import (filter_has_repository_event, filter_tags_have_repository_event, + get_matching_tags_for_images) + from data.database import (Image, ImageStorage, ExternalNotificationEvent, Repository, RepositoryTag) from endpoints.notificationhelper import notification_batch @@ -159,12 +161,15 @@ class SecurityNotificationHandler(object): pairs = [tuple(layer_id.split('.', 2)) for layer_id in self.layer_ids] def filter_notifying_repos(query): - query = query.join(ImageStorage) + return filter_has_repository_event(query, self.event) + + def filter_and_order(query): return filter_tags_have_repository_event(query, self.event) # Find the matching tags. 
tags = get_matching_tags_for_images(pairs, selections=[RepositoryTag, Image, ImageStorage], - filter_query=filter_notifying_repos) + filter_images=filter_notifying_repos, + filter_tags=filter_and_order) cve_id = self.vulnerability_info['Name'] for tag in tags: From 68d473bff450fb776738087c2a595c42eb946768 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 3 May 2017 11:41:39 -0400 Subject: [PATCH 7/7] Clarify comment and make pairs code a little nicer --- data/model/tag.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/data/model/tag.py b/data/model/tag.py index 14abc026d..24926174b 100644 --- a/data/model/tag.py +++ b/data/model/tag.py @@ -88,7 +88,7 @@ def get_matching_tags_for_images(image_pairs, filter_images=None, filter_tags=No images = [] while image_pairs: - image_pairs_slice = image_pairs[0:_MAX_IMAGE_LOOKUP_COUNT] + image_pairs_slice = image_pairs[:_MAX_IMAGE_LOOKUP_COUNT] ids = [pair[0] for pair in image_pairs_slice] uuids = [pair[1] for pair in image_pairs_slice] @@ -109,7 +109,11 @@ def get_matching_tags_for_images(image_pairs, filter_images=None, filter_tags=No individual_image_queries = [] for img in images: - # Make sure the actual image was requested. + # Make sure the image found is in the set of those requested, and that we haven't already + # processed it. We need this check because the query above checks for images with matching + # IDs OR storage UUIDs, rather than the expected ID+UUID pair. We do this for efficiency + # reasons, and it is highly unlikely we'll find an image with a mismatch, but we need this + # check to be absolutely sure. pair = (img.docker_image_id, img.storage.uuid) if pair not in image_pairs_set: continue @@ -131,7 +135,7 @@ def get_matching_tags_for_images(image_pairs, filter_images=None, filter_tags=No # size, as well as to prevent the DB from locking up on a massive query. 
sharded_queries = [] while individual_image_queries: - shard = individual_image_queries[0:_MAX_SUB_QUERIES] + shard = individual_image_queries[:_MAX_SUB_QUERIES] sharded_queries.append(_basequery.reduce_as_tree(shard)) individual_image_queries = individual_image_queries[_MAX_SUB_QUERIES:]