Security scanner flow changes and auto-retry
Changes the security scanner code to raise exceptions for unsuccessful operations instead of returning status tuples. One of the new exceptions is MissingParentLayerException; when it is raised, the security worker performs a full rescan of all parent images of the current layer and then tries once more to scan the current layer. This should allow the system to be "self-healing" in the case where the security scanner engine somehow loses or corrupts a parent layer.
This commit is contained in:
parent
9fa16679f8
commit
405eca074c
5 changed files with 228 additions and 82 deletions
|
@ -6,54 +6,83 @@ import features
|
|||
from collections import defaultdict
|
||||
|
||||
from endpoints.notificationhelper import spawn_notification
|
||||
from data.database import Image, ExternalNotificationEvent, IMAGE_NOT_SCANNED_ENGINE_VERSION
|
||||
from data.database import ExternalNotificationEvent, IMAGE_NOT_SCANNED_ENGINE_VERSION, Image
|
||||
from data.model.tag import filter_tags_have_repository_event, get_tags_for_image
|
||||
from data.model.image import set_secscan_status, get_image_with_storage_and_parent_base
|
||||
from util.secscan.api import APIRequestFailure
|
||||
from util.secscan.api import (APIRequestFailure, AnalyzeLayerException, MissingParentLayerException,
|
||||
InvalidLayerException, AnalyzeLayerRetryException)
|
||||
from util.morecollections import AttrDict
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PreemptedException(Exception):
|
||||
""" Exception raised if another worker analyzed the image before this worker was able to do so.
|
||||
"""
|
||||
|
||||
|
||||
class LayerAnalyzer(object):
|
||||
""" Helper class to perform analysis of a layer via the security scanner. """
|
||||
def __init__(self, config, api):
|
||||
self._api = api
|
||||
self._target_version = config.get('SECURITY_SCANNER_ENGINE_VERSION_TARGET', 2)
|
||||
|
||||
|
||||
def analyze_recursively(self, layer):
|
||||
""" Analyzes a layer and all its parents.
|
||||
|
||||
Return a tuple of two bools:
|
||||
- The first one tells us if the layer and its parents analyzed successfully.
|
||||
- The second one is set to False when another call pre-empted the candidate's analysis
|
||||
for us.
|
||||
""" Analyzes a layer and all its parents. Raises a PreemptedException if the analysis was
|
||||
preempted by another worker.
|
||||
"""
|
||||
if layer.parent_id and layer.parent.security_indexed_engine < self._target_version:
|
||||
# The image has a parent that is not analyzed yet with this engine.
|
||||
# Get the parent to get its own parent and recurse.
|
||||
try:
|
||||
self._analyze_recursively_and_check(layer)
|
||||
except MissingParentLayerException:
|
||||
# The parent layer of this layer was missing. Force a reanalyze.
|
||||
try:
|
||||
self._analyze_recursively_and_check(layer, force_parents=True)
|
||||
except MissingParentLayerException:
|
||||
# Parent is still missing... mark the layer as invalid.
|
||||
if not set_secscan_status(layer, False, self._target_version):
|
||||
raise PreemptedException
|
||||
|
||||
def _analyze_recursively_and_check(self, layer, force_parents=False):
|
||||
""" Analyzes a layer and all its parents, optionally forcing parents to be reanalyzed,
|
||||
and checking for various exceptions that can occur during analysis.
|
||||
"""
|
||||
try:
|
||||
self._analyze_recursively(layer, force_parents=force_parents)
|
||||
except InvalidLayerException:
|
||||
# One of the parent layers is invalid, so this layer is invalid as well.
|
||||
if not set_secscan_status(layer, False, self._target_version):
|
||||
raise PreemptedException
|
||||
except AnalyzeLayerRetryException:
|
||||
# Something went wrong when trying to analyze the layer, but we should retry, so leave
|
||||
# the layer unindexed. Another worker will come along and handle it.
|
||||
pass
|
||||
except MissingParentLayerException:
|
||||
# Pass upward, as missing parent is handled in the analyze_recursively method.
|
||||
raise
|
||||
except AnalyzeLayerException:
|
||||
# Something went wrong when trying to analyze the layer and we cannot retry, so mark the
|
||||
# layer as invalid.
|
||||
if not set_secscan_status(layer, False, self._target_version):
|
||||
raise PreemptedException
|
||||
|
||||
def _analyze_recursively(self, layer, force_parents=False):
|
||||
# Check if there is a parent layer that needs to be analyzed.
|
||||
if layer.parent_id and (force_parents or
|
||||
layer.parent.security_indexed_engine < self._target_version):
|
||||
try:
|
||||
base_query = get_image_with_storage_and_parent_base()
|
||||
parent_layer = base_query.where(Image.id == layer.parent_id).get()
|
||||
except Image.DoesNotExist:
|
||||
logger.warning("Image %s has Image %s as parent but doesn't exist.", layer.id,
|
||||
layer.parent_id)
|
||||
raise AnalyzeLayerException('Parent image not found')
|
||||
|
||||
return False, set_secscan_status(layer, False, self._target_version)
|
||||
self._analyze_recursively(parent_layer, force_parents=force_parents)
|
||||
|
||||
cont, _ = self.analyze_recursively(parent_layer)
|
||||
if not cont:
|
||||
# The analysis failed for some reason and did not mark the layer as failed,
|
||||
# thus we should not try to analyze the children of that layer.
|
||||
# Interrupt the recursive analysis and return as no-one pre-empted us.
|
||||
return False, True
|
||||
# Analyze the layer itself.
|
||||
self._analyze(layer, force_parents=force_parents)
|
||||
|
||||
# Now we know all parents are analyzed.
|
||||
return self._analyze(layer)
|
||||
|
||||
|
||||
def _analyze(self, layer):
|
||||
def _analyze(self, layer, force_parents=False):
|
||||
""" Analyzes a single layer.
|
||||
|
||||
Return a tuple of two bools:
|
||||
|
@ -63,33 +92,30 @@ class LayerAnalyzer(object):
|
|||
"""
|
||||
# If the parent couldn't be analyzed with the target version or higher, we can't analyze
|
||||
# this image. Mark it as failed with the current target version.
|
||||
if (layer.parent_id and not layer.parent.security_indexed and
|
||||
layer.parent.security_indexed_engine >= self._target_version):
|
||||
return True, set_secscan_status(layer, False, self._target_version)
|
||||
if not force_parents and (layer.parent_id and not layer.parent.security_indexed and
|
||||
layer.parent.security_indexed_engine >= self._target_version):
|
||||
if not set_secscan_status(layer, False, self._target_version):
|
||||
raise PreemptedException
|
||||
|
||||
# Nothing more to do.
|
||||
return
|
||||
|
||||
# Analyze the image.
|
||||
previously_security_indexed_successfully = layer.security_indexed
|
||||
previous_security_indexed_engine = layer.security_indexed_engine
|
||||
|
||||
logger.info('Analyzing layer %s', layer.docker_image_id)
|
||||
(analyzed_version, should_requeue) = self._api.analyze_layer(layer)
|
||||
analyzed_version = self._api.analyze_layer(layer)
|
||||
|
||||
# If analysis failed, then determine whether we need to requeue.
|
||||
if not analyzed_version:
|
||||
if should_requeue:
|
||||
# If the layer needs to be requeued, return that the children cannot be analyzed (at this
|
||||
# time) and there was no collision with another worker.
|
||||
return False, False
|
||||
else:
|
||||
# If the layer cannot be requeued, we allow the children to be analyzed, because the code
|
||||
# path above will mark them as not analyzable, and we mark the image itself as not being
|
||||
# analyzable.
|
||||
return True, set_secscan_status(layer, False, self._target_version)
|
||||
|
||||
# Mark the image as analyzed.
|
||||
logger.info('Analyzed layer %s successfully with version %s', layer.docker_image_id,
|
||||
analyzed_version)
|
||||
set_status = set_secscan_status(layer, True, analyzed_version)
|
||||
|
||||
# Mark the image as analyzed.
|
||||
if not set_secscan_status(layer, True, analyzed_version):
|
||||
# If the image was previously successfully marked as resolved, then set_secscan_status
|
||||
# might return False because we're not changing it (since this is a fixup).
|
||||
if not previously_security_indexed_successfully:
|
||||
raise PreemptedException
|
||||
|
||||
# If we are the one who has done the job successfully first, then we need to decide if we should
|
||||
# send notifications. Notifications are sent if:
|
||||
|
@ -99,9 +125,7 @@ class LayerAnalyzer(object):
|
|||
# feature set in the security scanner, notifications will be spammy.
|
||||
is_new_image = previous_security_indexed_engine == IMAGE_NOT_SCANNED_ENGINE_VERSION
|
||||
is_existing_image_unindexed = not is_new_image and not previously_security_indexed_successfully
|
||||
if (features.SECURITY_NOTIFICATIONS and set_status and
|
||||
(is_new_image or is_existing_image_unindexed)):
|
||||
|
||||
if (features.SECURITY_NOTIFICATIONS and (is_new_image or is_existing_image_unindexed)):
|
||||
# Get the tags of the layer we analyzed.
|
||||
repository_map = defaultdict(list)
|
||||
event = ExternalNotificationEvent.get(name='vulnerability_found')
|
||||
|
@ -152,5 +176,3 @@ class LayerAnalyzer(object):
|
|||
})
|
||||
|
||||
spawn_notification(repository, 'vulnerability_found', event_data)
|
||||
|
||||
return True, set_status
|
||||
|
|
|
@ -15,12 +15,24 @@ from util import get_app_url
|
|||
|
||||
TOKEN_VALIDITY_LIFETIME_S = 60 # Amount of time the security scanner has to call the layer URL
|
||||
|
||||
UNKNOWN_PARENT_LAYER_ERROR_MSG = 'worker: parent layer is unknown, it must be processed first'
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AnalyzeLayerException(Exception):
|
||||
""" Exception raised when a layer fails to analyze due to a *client-side* issue. """
|
||||
""" Exception raised when a layer fails to analyze due to a request issue. """
|
||||
|
||||
class AnalyzeLayerRetryException(Exception):
|
||||
""" Exception raised when a layer fails to analyze due to a request issue, and the request should
|
||||
be retried.
|
||||
"""
|
||||
|
||||
class MissingParentLayerException(AnalyzeLayerException):
|
||||
""" Exception raised when the parent of the layer is missing from the security scanner. """
|
||||
|
||||
class InvalidLayerException(AnalyzeLayerException):
|
||||
""" Exception raised when the layer itself cannot be handled by the security scanner. """
|
||||
|
||||
class APIRequestFailure(Exception):
|
||||
""" Exception raised when there is a failure to conduct an API request. """
|
||||
|
@ -142,12 +154,12 @@ class SecurityScannerAPI(object):
|
|||
|
||||
def analyze_layer(self, layer):
|
||||
""" Posts the given layer to the security scanner for analysis, blocking until complete.
|
||||
Returns a tuple containing the analysis version (on success, None on failure) and
|
||||
whether the request should be retried.
|
||||
Returns the analysis version on success. On failure, raises AnalyzeLayerException (or one of
|
||||
its subclasses) or AnalyzeLayerRetryException; callers should handle both.
|
||||
"""
|
||||
request = self._new_analyze_request(layer)
|
||||
if not request:
|
||||
return None, False
|
||||
raise AnalyzeLayerException
|
||||
|
||||
logger.info('Analyzing layer %s', request['Layer']['Name'])
|
||||
try:
|
||||
|
@ -155,13 +167,13 @@ class SecurityScannerAPI(object):
|
|||
json_response = response.json()
|
||||
except requests.exceptions.Timeout:
|
||||
logger.exception('Timeout when trying to post layer data response for %s', layer.id)
|
||||
return None, True
|
||||
raise AnalyzeLayerRetryException
|
||||
except requests.exceptions.ConnectionError:
|
||||
logger.exception('Connection error when trying to post layer data response for %s', layer.id)
|
||||
return None, True
|
||||
raise AnalyzeLayerRetryException
|
||||
except (requests.exceptions.RequestException, ValueError) as re:
|
||||
logger.exception('Failed to post layer data response for %s', layer.id)
|
||||
return None, False
|
||||
logger.exception('Failed to post layer data response for %s: %s', layer.id, re)
|
||||
raise AnalyzeLayerException
|
||||
|
||||
# Handle any errors from the security scanner.
|
||||
if response.status_code != 201:
|
||||
|
@ -171,17 +183,23 @@ class SecurityScannerAPI(object):
|
|||
|
||||
# 400 means the layer could not be analyzed due to a bad request.
|
||||
if response.status_code == 400:
|
||||
logger.error('Bad request when calling security scanner for layer %s: %s',
|
||||
response.status_code, json_response)
|
||||
raise AnalyzeLayerException('Bad request to security scanner')
|
||||
if message == UNKNOWN_PARENT_LAYER_ERROR_MSG:
|
||||
raise MissingParentLayerException('Bad request to security scanner: %s' % message)
|
||||
else:
|
||||
raise AnalyzeLayerException('Bad request to security scanner: %s' % message)
|
||||
|
||||
# 422 means that the layer could not be analyzed:
|
||||
# - the layer could not be extracted (manifest?)
|
||||
# - the layer could not be extracted (might be a manifest or an invalid .tar.gz)
|
||||
# - the layer operating system / package manager is unsupported
|
||||
return None, response.status_code != 422
|
||||
elif response.status_code == 422:
|
||||
raise InvalidLayerException
|
||||
|
||||
api_version = json_response['Layer']['IndexedByVersion']
|
||||
return api_version, False
|
||||
# Otherwise, it is some other error and we should retry.
|
||||
else:
|
||||
raise AnalyzeLayerRetryException
|
||||
|
||||
# Return the parsed API version.
|
||||
return json_response['Layer']['IndexedByVersion']
|
||||
|
||||
|
||||
def check_layer_vulnerable(self, layer_id, cve_name):
|
||||
|
|
|
@ -5,6 +5,7 @@ import urlparse
|
|||
|
||||
from contextlib import contextmanager
|
||||
from httmock import urlmatch, HTTMock, all_requests
|
||||
from util.secscan.api import UNKNOWN_PARENT_LAYER_ERROR_MSG
|
||||
|
||||
@contextmanager
|
||||
def fake_security_scanner(hostname='fakesecurityscanner'):
|
||||
|
@ -29,6 +30,7 @@ class FakeSecurityScanner(object):
|
|||
|
||||
self.fail_layer_id = None
|
||||
self.internal_error_layer_id = None
|
||||
self.error_layer_id = None
|
||||
|
||||
def set_fail_layer_id(self, fail_layer_id):
|
||||
""" Sets a layer ID that, if encountered when the analyze call is made, causes a 422
|
||||
|
@ -42,6 +44,12 @@ class FakeSecurityScanner(object):
|
|||
"""
|
||||
self.internal_error_layer_id = internal_error_layer_id
|
||||
|
||||
def set_error_layer_id(self, error_layer_id):
|
||||
""" Sets a layer ID that, if encountered when the analyze call is made, causes a 400
|
||||
to be raised.
|
||||
"""
|
||||
self.error_layer_id = error_layer_id
|
||||
|
||||
def has_layer(self, layer_id):
|
||||
""" Returns true if the layer with the given ID has been analyzed. """
|
||||
return layer_id in self.layers
|
||||
|
@ -192,6 +200,12 @@ class FakeSecurityScanner(object):
|
|||
'content': json.dumps({'Error': {'Message': 'Cannot analyze'}}),
|
||||
}
|
||||
|
||||
if layer['Name'] == self.error_layer_id:
|
||||
return {
|
||||
'status_code': 400,
|
||||
'content': json.dumps({'Error': {'Message': 'Some sort of error'}}),
|
||||
}
|
||||
|
||||
parent_id = layer.get('ParentName', None)
|
||||
parent_layer = None
|
||||
|
||||
|
@ -200,7 +214,7 @@ class FakeSecurityScanner(object):
|
|||
if parent_layer is None:
|
||||
return {
|
||||
'status_code': 400,
|
||||
'content': json.dumps({'Error': {'Message': 'Unknown parent'}}),
|
||||
'content': json.dumps({'Error': {'Message': UNKNOWN_PARENT_LAYER_ERROR_MSG}}),
|
||||
}
|
||||
|
||||
self.add_layer(layer['Name'])
|
||||
|
|
Reference in a new issue