Security scanner flow changes and auto-retry

Changes the security scanner code to raise exceptions now for non-successful operations. One of the new exceptions raised is MissingParentLayerException, which, when raised, will cause the security worker to perform a full rescan of all parent images for the current layer, before trying once more to scan the current layer. This should allow the system to be "self-healing" in the case where the security scanner engine somehow loses or corrupts a parent layer.
This commit is contained in:
Joseph Schorr 2016-12-15 16:27:24 -05:00
parent 9fa16679f8
commit 405eca074c
5 changed files with 228 additions and 82 deletions

View file

@ -8,7 +8,7 @@ from data.database import Image, IMAGE_NOT_SCANNED_ENGINE_VERSION
from endpoints.notificationevent import VulnerabilityFoundEvent
from endpoints.v2 import v2_bp
from initdb import setup_database_for_testing, finished_database_for_testing
from util.secscan.api import SecurityScannerAPI, AnalyzeLayerException
from util.secscan.api import SecurityScannerAPI
from util.secscan.analyzer import LayerAnalyzer
from util.secscan.fake import fake_security_scanner
from util.secscan.notifier import process_notification_data
@ -131,7 +131,7 @@ class TestSecurityScanner(unittest.TestCase):
def test_analyze_layer_failure(self):
""" Tests that failing to analyze a layer marks it as not analyzed. """
""" Tests that failing to analyze a layer (because it 422s) marks it as analyzed but failed. """
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest', include_storage=True)
self.assertFalse(layer.security_indexed)
@ -148,6 +148,8 @@ class TestSecurityScanner(unittest.TestCase):
def test_analyze_layer_internal_error(self):
""" Tests that failing to analyze a layer (because it 500s) marks it as not analyzed. """
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest', include_storage=True)
self.assertFalse(layer.security_indexed)
self.assertEquals(-1, layer.security_indexed_engine)
@ -162,7 +164,29 @@ class TestSecurityScanner(unittest.TestCase):
self.assertAnalyzed(layer, security_scanner, False, -1)
def test_analyze_layer_missing_parent(self):
def test_analyze_layer_error(self):
""" Tests that failing to analyze a layer (because it 400s) marks it as analyzed but failed. """
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest', include_storage=True)
self.assertFalse(layer.security_indexed)
self.assertEquals(-1, layer.security_indexed_engine)
with fake_security_scanner() as security_scanner:
# Make is so trying to analyze the parent will fail with an error.
security_scanner.set_error_layer_id(security_scanner.layer_id(layer.parent))
# Try to the layer and its parents, but with one request causing an error.
analyzer = LayerAnalyzer(app.config, self.api)
analyzer.analyze_recursively(layer)
# Make sure it is marked as analyzed, but in a failed state.
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest')
self.assertAnalyzed(layer, security_scanner, False, 1)
def test_analyze_layer_missing_parent_handled(self):
""" Tests that a missing parent causes an automatic reanalysis, which succeeds. """
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest', include_storage=True)
self.assertFalse(layer.security_indexed)
self.assertEquals(-1, layer.security_indexed_engine)
@ -176,19 +200,81 @@ class TestSecurityScanner(unittest.TestCase):
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest')
self.assertAnalyzed(layer, security_scanner, True, 1)
# Mark the layer as not analyzed and delete its parent layer from the security scanner.
# Mark the layer as not yet scanned.
layer.security_indexed_engine = IMAGE_NOT_SCANNED_ENGINE_VERSION
layer.security_indexed = False
layer.save()
# Remove the layer's parent entirely from the security scanner.
security_scanner.remove_layer(security_scanner.layer_id(layer.parent))
# Try to analyze again; this should fail because the parent is missing.
with self.assertRaisesRegexp(AnalyzeLayerException, 'Bad request to security scanner'):
analyzer.analyze_recursively(layer)
# Analyze again, which should properly re-analyze the missing parent and this layer.
analyzer.analyze_recursively(layer)
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest')
self.assertAnalyzed(layer, security_scanner, True, 1)
def test_analyze_layer_invalid_parent(self):
""" Tests that trying to reanalyze a parent that is invalid causes the layer to be marked
as analyzed, but failed.
"""
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest', include_storage=True)
self.assertFalse(layer.security_indexed)
self.assertEquals(-1, layer.security_indexed_engine)
with fake_security_scanner() as security_scanner:
# Analyze the layer and its parents.
analyzer = LayerAnalyzer(app.config, self.api)
analyzer.analyze_recursively(layer)
# Make sure it was analyzed.
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest')
self.assertAnalyzed(layer, security_scanner, True, 1)
# Mark the layer as not yet scanned.
layer.security_indexed_engine = IMAGE_NOT_SCANNED_ENGINE_VERSION
layer.security_indexed = False
layer.save()
# Remove the layer's parent entirely from the security scanner.
security_scanner.remove_layer(security_scanner.layer_id(layer.parent))
# Make is so trying to analyze the parent will fail.
security_scanner.set_error_layer_id(security_scanner.layer_id(layer.parent))
# Try to analyze again, which should try to reindex the parent and fail.
analyzer.analyze_recursively(layer)
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest')
self.assertAnalyzed(layer, security_scanner, False, 1)
def test_analyze_layer_unsupported_parent(self):
""" Tests that attempting to analyze a layer whose parent is unanalyzable, results in the layer
being marked as analyzed, but failed.
"""
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest', include_storage=True)
self.assertFalse(layer.security_indexed)
self.assertEquals(-1, layer.security_indexed_engine)
with fake_security_scanner() as security_scanner:
# Make is so trying to analyze the parent will fail.
security_scanner.set_fail_layer_id(security_scanner.layer_id(layer.parent))
# Attempt to the layer and its parents. This should mark the layer itself as unanalyzable.
analyzer = LayerAnalyzer(app.config, self.api)
analyzer.analyze_recursively(layer)
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest')
self.assertAnalyzed(layer, security_scanner, False, 1)
def test_analyze_layer_missing_storage(self):
""" Tests trying to analyze a layer with missing storage. """
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest', include_storage=True)
self.assertFalse(layer.security_indexed)
self.assertEquals(-1, layer.security_indexed_engine)
@ -199,16 +285,16 @@ class TestSecurityScanner(unittest.TestCase):
storage.remove(locations, path)
storage.remove(locations, 'all_files_exist')
with fake_security_scanner():
with fake_security_scanner() as security_scanner:
analyzer = LayerAnalyzer(app.config, self.api)
analyzer.analyze_recursively(layer)
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest')
self.assertEquals(False, layer.security_indexed)
self.assertEquals(1, layer.security_indexed_engine)
self.assertAnalyzed(layer, security_scanner, False, 1)
def assert_analyze_layer_notify(self, security_indexed_engine, security_indexed, expect_notification):
def assert_analyze_layer_notify(self, security_indexed_engine, security_indexed,
expect_notification):
layer = model.tag.get_tag_image(ADMIN_ACCESS_USER, SIMPLE_REPO, 'latest', include_storage=True)
self.assertFalse(layer.security_indexed)
self.assertEquals(-1, layer.security_indexed_engine)
@ -218,7 +304,8 @@ class TestSecurityScanner(unittest.TestCase):
# Add a repo event for the layer.
repo = model.repository.get_repository(ADMIN_ACCESS_USER, SIMPLE_REPO)
model.notification.create_repo_notification(repo, 'vulnerability_found', 'quay_notification', {}, {'level': 100})
model.notification.create_repo_notification(repo, 'vulnerability_found',
'quay_notification', {}, {'level': 100})
# Update the layer's state before analyzing.
layer.security_indexed_engine = security_indexed_engine
@ -285,7 +372,8 @@ class TestSecurityScanner(unittest.TestCase):
# Add a repo event for the layer.
repo = model.repository.get_repository(ADMIN_ACCESS_USER, SIMPLE_REPO)
model.notification.create_repo_notification(repo, 'vulnerability_found', 'quay_notification', {}, {'level': 100})
model.notification.create_repo_notification(repo, 'vulnerability_found', 'quay_notification',
{}, {'level': 100})
# Ensure that there are no event queue items for the layer.
self.assertIsNone(notification_queue.get())
@ -314,7 +402,8 @@ class TestSecurityScanner(unittest.TestCase):
# Add a repo event for the layer.
repo = model.repository.get_repository(ADMIN_ACCESS_USER, SIMPLE_REPO)
model.notification.create_repo_notification(repo, 'vulnerability_found', 'quay_notification', {}, {'level': 100})
model.notification.create_repo_notification(repo, 'vulnerability_found', 'quay_notification',
{}, {'level': 100})
# Ensure that there are no event queue items for the layer.
self.assertIsNone(notification_queue.get())
@ -343,7 +432,8 @@ class TestSecurityScanner(unittest.TestCase):
# Add a repo event for the layer.
repo = model.repository.get_repository(ADMIN_ACCESS_USER, SIMPLE_REPO)
model.notification.create_repo_notification(repo, 'vulnerability_found', 'quay_notification', {}, {'level': 100})
model.notification.create_repo_notification(repo, 'vulnerability_found', 'quay_notification',
{}, {'level': 100})
# Ensure that there are no event queue items for the layer.
self.assertIsNone(notification_queue.get())
@ -389,7 +479,8 @@ class TestSecurityScanner(unittest.TestCase):
# Add a repo event for the layer.
repo = model.repository.get_repository(ADMIN_ACCESS_USER, SIMPLE_REPO)
model.notification.create_repo_notification(repo, 'vulnerability_found', 'quay_notification', {}, {'level': 100})
model.notification.create_repo_notification(repo, 'vulnerability_found', 'quay_notification',
{}, {'level': 100})
# Ensure that there are no event queue items for the layer.
self.assertIsNone(notification_queue.get())

View file

@ -6,54 +6,83 @@ import features
from collections import defaultdict
from endpoints.notificationhelper import spawn_notification
from data.database import Image, ExternalNotificationEvent, IMAGE_NOT_SCANNED_ENGINE_VERSION
from data.database import ExternalNotificationEvent, IMAGE_NOT_SCANNED_ENGINE_VERSION, Image
from data.model.tag import filter_tags_have_repository_event, get_tags_for_image
from data.model.image import set_secscan_status, get_image_with_storage_and_parent_base
from util.secscan.api import APIRequestFailure
from util.secscan.api import (APIRequestFailure, AnalyzeLayerException, MissingParentLayerException,
InvalidLayerException, AnalyzeLayerRetryException)
from util.morecollections import AttrDict
logger = logging.getLogger(__name__)
class PreemptedException(Exception):
""" Exception raised if another worker analyzed the image before this worker was able to do so.
"""
class LayerAnalyzer(object):
""" Helper class to perform analysis of a layer via the security scanner. """
def __init__(self, config, api):
self._api = api
self._target_version = config.get('SECURITY_SCANNER_ENGINE_VERSION_TARGET', 2)
def analyze_recursively(self, layer):
""" Analyzes a layer and all its parents.
Return a tuple of two bools:
- The first one tells us if the layer and its parents analyzed successfully.
- The second one is set to False when another call pre-empted the candidate's analysis
for us.
""" Analyzes a layer and all its parents. Raises a PreemptedException if the analysis was
preempted by another worker.
"""
if layer.parent_id and layer.parent.security_indexed_engine < self._target_version:
# The image has a parent that is not analyzed yet with this engine.
# Get the parent to get it's own parent and recurse.
try:
self._analyze_recursively_and_check(layer)
except MissingParentLayerException:
# The parent layer of this layer was missing. Force a reanalyze.
try:
self._analyze_recursively_and_check(layer, force_parents=True)
except MissingParentLayerException:
# Parent is still missing... mark the layer as invalid.
if not set_secscan_status(layer, False, self._target_version):
raise PreemptedException
def _analyze_recursively_and_check(self, layer, force_parents=False):
""" Analyzes a layer and all its parents, optionally forcing parents to be reanalyzed,
and checking for various exceptions that can occur during analysis.
"""
try:
self._analyze_recursively(layer, force_parents=force_parents)
except InvalidLayerException:
# One of the parent layers is invalid, so this layer is invalid as well.
if not set_secscan_status(layer, False, self._target_version):
raise PreemptedException
except AnalyzeLayerRetryException:
# Something went wrong when trying to analyze the layer, but we should retry, so leave
# the layer unindexed. Another worker will come along and handle it.
pass
except MissingParentLayerException:
# Pass upward, as missing parent is handled in the analyze_recursively method.
raise
except AnalyzeLayerException:
# Something went wrong when trying to analyze the layer and we cannot retry, so mark the
# layer as invalid.
if not set_secscan_status(layer, False, self._target_version):
raise PreemptedException
def _analyze_recursively(self, layer, force_parents=False):
# Check if there is a parent layer that needs to be analyzed.
if layer.parent_id and (force_parents or
layer.parent.security_indexed_engine < self._target_version):
try:
base_query = get_image_with_storage_and_parent_base()
parent_layer = base_query.where(Image.id == layer.parent_id).get()
except Image.DoesNotExist:
logger.warning("Image %s has Image %s as parent but doesn't exist.", layer.id,
layer.parent_id)
raise AnalyzeLayerException('Parent image not found')
return False, set_secscan_status(layer, False, self._target_version)
self._analyze_recursively(parent_layer, force_parents=force_parents)
cont, _ = self.analyze_recursively(parent_layer)
if not cont:
# The analysis failed for some reason and did not mark the layer as failed,
# thus we should not try to analyze the children of that layer.
# Interrupt the recursive analysis and return as no-one pre-empted us.
return False, True
# Analyze the layer itself.
self._analyze(layer, force_parents=force_parents)
# Now we know all parents are analyzed.
return self._analyze(layer)
def _analyze(self, layer):
def _analyze(self, layer, force_parents=False):
""" Analyzes a single layer.
Return a tuple of two bools:
@ -63,33 +92,30 @@ class LayerAnalyzer(object):
"""
# If the parent couldn't be analyzed with the target version or higher, we can't analyze
# this image. Mark it as failed with the current target version.
if (layer.parent_id and not layer.parent.security_indexed and
layer.parent.security_indexed_engine >= self._target_version):
return True, set_secscan_status(layer, False, self._target_version)
if not force_parents and (layer.parent_id and not layer.parent.security_indexed and
layer.parent.security_indexed_engine >= self._target_version):
if not set_secscan_status(layer, False, self._target_version):
raise PreemptedException
# Nothing more to do.
return
# Analyze the image.
previously_security_indexed_successfully = layer.security_indexed
previous_security_indexed_engine = layer.security_indexed_engine
logger.info('Analyzing layer %s', layer.docker_image_id)
(analyzed_version, should_requeue) = self._api.analyze_layer(layer)
analyzed_version = self._api.analyze_layer(layer)
# If analysis failed, then determine whether we need to requeue.
if not analyzed_version:
if should_requeue:
# If the layer needs to be requeued, return that the children cannot be analyzed (at this
# time) and there was no collision with another worker.
return False, False
else:
# If the layer cannot be requeued, we allow the children to be analyzed, because the code
# path above will mark them as not analyzable, and we mark the image itself as not being
# analyzable.
return True, set_secscan_status(layer, False, self._target_version)
# Mark the image as analyzed.
logger.info('Analyzed layer %s successfully with version %s', layer.docker_image_id,
analyzed_version)
set_status = set_secscan_status(layer, True, analyzed_version)
# Mark the image as analyzed.
if not set_secscan_status(layer, True, analyzed_version):
# If the image was previously successfully marked as resolved, then set_secscan_status
# might return False because we're not changing it (since this is a fixup).
if not previously_security_indexed_successfully:
raise PreemptedException
# If we are the one who've done the job successfully first, then we need to decide if we should
# send notifications. Notifications are sent if:
@ -99,9 +125,7 @@ class LayerAnalyzer(object):
# feature set in the security scanner, notifications will be spammy.
is_new_image = previous_security_indexed_engine == IMAGE_NOT_SCANNED_ENGINE_VERSION
is_existing_image_unindexed = not is_new_image and not previously_security_indexed_successfully
if (features.SECURITY_NOTIFICATIONS and set_status and
(is_new_image or is_existing_image_unindexed)):
if (features.SECURITY_NOTIFICATIONS and (is_new_image or is_existing_image_unindexed)):
# Get the tags of the layer we analyzed.
repository_map = defaultdict(list)
event = ExternalNotificationEvent.get(name='vulnerability_found')
@ -152,5 +176,3 @@ class LayerAnalyzer(object):
})
spawn_notification(repository, 'vulnerability_found', event_data)
return True, set_status

View file

@ -15,12 +15,24 @@ from util import get_app_url
TOKEN_VALIDITY_LIFETIME_S = 60 # Amount of time the security scanner has to call the layer URL
UNKNOWN_PARENT_LAYER_ERROR_MSG = 'worker: parent layer is unknown, it must be processed first'
logger = logging.getLogger(__name__)
class AnalyzeLayerException(Exception):
""" Exception raised when a layer fails to analyze due to a *client-side* issue. """
""" Exception raised when a layer fails to analyze due to a request issue. """
class AnalyzeLayerRetryException(Exception):
""" Exception raised when a layer fails to analyze due to a request issue, and the request should
be retried.
"""
class MissingParentLayerException(AnalyzeLayerException):
""" Exception raised when the parent of the layer is missing from the security scanner. """
class InvalidLayerException(AnalyzeLayerException):
""" Exception raised when the layer itself cannot be handled by the security scanner. """
class APIRequestFailure(Exception):
""" Exception raised when there is a failure to conduct an API request. """
@ -142,12 +154,12 @@ class SecurityScannerAPI(object):
def analyze_layer(self, layer):
""" Posts the given layer to the security scanner for analysis, blocking until complete.
Returns a tuple containing the analysis version (on success, None on failure) and
whether the request should be retried.
Returns the analysis version on success or raises an exception deriving from
AnalyzeLayerException on failure. Callers should handle all cases of AnalyzeLayerException.
"""
request = self._new_analyze_request(layer)
if not request:
return None, False
raise AnalyzeLayerException
logger.info('Analyzing layer %s', request['Layer']['Name'])
try:
@ -155,13 +167,13 @@ class SecurityScannerAPI(object):
json_response = response.json()
except requests.exceptions.Timeout:
logger.exception('Timeout when trying to post layer data response for %s', layer.id)
return None, True
raise AnalyzeLayerRetryException
except requests.exceptions.ConnectionError:
logger.exception('Connection error when trying to post layer data response for %s', layer.id)
return None, True
raise AnalyzeLayerRetryException
except (requests.exceptions.RequestException, ValueError) as re:
logger.exception('Failed to post layer data response for %s', layer.id)
return None, False
logger.exception('Failed to post layer data response for %s: %s', layer.id, re)
raise AnalyzeLayerException
# Handle any errors from the security scanner.
if response.status_code != 201:
@ -171,17 +183,23 @@ class SecurityScannerAPI(object):
# 400 means the layer could not be analyzed due to a bad request.
if response.status_code == 400:
logger.error('Bad request when calling security scanner for layer %s: %s',
response.status_code, json_response)
raise AnalyzeLayerException('Bad request to security scanner')
if message == UNKNOWN_PARENT_LAYER_ERROR_MSG:
raise MissingParentLayerException('Bad request to security scanner: %s' % message)
else:
raise AnalyzeLayerException('Bad request to security scanner: %s' % message)
# 422 means that the layer could not be analyzed:
# - the layer could not be extracted (manifest?)
# - the layer could not be extracted (might be a manifest or an invalid .tar.gz)
# - the layer operating system / package manager is unsupported
return None, response.status_code != 422
elif response.status_code == 422:
raise InvalidLayerException
api_version = json_response['Layer']['IndexedByVersion']
return api_version, False
# Otherwise, it is some other error and we should retry.
else:
raise AnalyzeLayerRetryException
# Return the parsed API version.
return json_response['Layer']['IndexedByVersion']
def check_layer_vulnerable(self, layer_id, cve_name):

View file

@ -5,6 +5,7 @@ import urlparse
from contextlib import contextmanager
from httmock import urlmatch, HTTMock, all_requests
from util.secscan.api import UNKNOWN_PARENT_LAYER_ERROR_MSG
@contextmanager
def fake_security_scanner(hostname='fakesecurityscanner'):
@ -29,6 +30,7 @@ class FakeSecurityScanner(object):
self.fail_layer_id = None
self.internal_error_layer_id = None
self.error_layer_id = None
def set_fail_layer_id(self, fail_layer_id):
""" Sets a layer ID that, if encountered when the analyze call is made, causes a 422
@ -42,6 +44,12 @@ class FakeSecurityScanner(object):
"""
self.internal_error_layer_id = internal_error_layer_id
def set_error_layer_id(self, error_layer_id):
""" Sets a layer ID that, if encountered when the analyze call is made, causes a 400
to be raised.
"""
self.error_layer_id = error_layer_id
def has_layer(self, layer_id):
""" Returns true if the layer with the given ID has been analyzed. """
return layer_id in self.layers
@ -192,6 +200,12 @@ class FakeSecurityScanner(object):
'content': json.dumps({'Error': {'Message': 'Cannot analyze'}}),
}
if layer['Name'] == self.error_layer_id:
return {
'status_code': 400,
'content': json.dumps({'Error': {'Message': 'Some sort of error'}}),
}
parent_id = layer.get('ParentName', None)
parent_layer = None
@ -200,7 +214,7 @@ class FakeSecurityScanner(object):
if parent_layer is None:
return {
'status_code': 400,
'content': json.dumps({'Error': {'Message': 'Unknown parent'}}),
'content': json.dumps({'Error': {'Message': UNKNOWN_PARENT_LAYER_ERROR_MSG}}),
}
self.add_layer(layer['Name'])

View file

@ -9,7 +9,7 @@ from data.database import UseThenDisconnect
from data.model.image import (get_images_eligible_for_scan, get_max_id_for_sec_scan,
get_min_id_for_sec_scan, get_image_id)
from util.secscan.api import SecurityConfigValidator
from util.secscan.analyzer import LayerAnalyzer
from util.secscan.analyzer import LayerAnalyzer, PreemptedException
from util.migrate.allocator import yield_random_entries
from endpoints.v2 import v2_bp
@ -48,8 +48,9 @@ class SecurityWorker(Worker):
with UseThenDisconnect(app.config):
for candidate, abt in yield_random_entries(batch_query, get_image_id(), BATCH_SIZE, max_id,
self._min_id):
_, continue_batch = self._analyzer.analyze_recursively(candidate)
if not continue_batch:
try:
self._analyzer.analyze_recursively(candidate)
except PreemptedException:
logger.info('Another worker pre-empted us for layer: %s', candidate.id)
abt.set()