Security scanner flow changes and auto-retry

Changes the security scanner code so that it now raises exceptions for unsuccessful operations. One of the new exceptions is MissingParentLayerException; when it is raised, the security worker performs a full rescan of all parent images of the current layer before trying once more to scan the current layer. This should allow the system to be "self-healing" in the case where the security scanner engine somehow loses or corrupts a parent layer.
This commit is contained in:
Joseph Schorr 2016-12-15 16:27:24 -05:00
parent 9fa16679f8
commit 405eca074c
5 changed files with 228 additions and 82 deletions

View file

@ -6,54 +6,83 @@ import features
from collections import defaultdict
from endpoints.notificationhelper import spawn_notification
from data.database import Image, ExternalNotificationEvent, IMAGE_NOT_SCANNED_ENGINE_VERSION
from data.database import ExternalNotificationEvent, IMAGE_NOT_SCANNED_ENGINE_VERSION, Image
from data.model.tag import filter_tags_have_repository_event, get_tags_for_image
from data.model.image import set_secscan_status, get_image_with_storage_and_parent_base
from util.secscan.api import APIRequestFailure
from util.secscan.api import (APIRequestFailure, AnalyzeLayerException, MissingParentLayerException,
InvalidLayerException, AnalyzeLayerRetryException)
from util.morecollections import AttrDict
logger = logging.getLogger(__name__)
class PreemptedException(Exception):
  """ Signals that a different worker finished analyzing the image before this worker could. """
class LayerAnalyzer(object):
""" Helper class to perform analysis of a layer via the security scanner. """
def __init__(self, config, api):
self._api = api
self._target_version = config.get('SECURITY_SCANNER_ENGINE_VERSION_TARGET', 2)
def analyze_recursively(self, layer):
""" Analyzes a layer and all its parents.
Return a tuple of two bools:
- The first one tells us if the layer and its parents analyzed successfully.
- The second one is set to False when another call pre-empted the candidate's analysis
for us.
""" Analyzes a layer and all its parents. Raises a PreemptedException if the analysis was
preempted by another worker.
"""
if layer.parent_id and layer.parent.security_indexed_engine < self._target_version:
# The image has a parent that is not analyzed yet with this engine.
# Get the parent to get its own parent and recurse.
try:
self._analyze_recursively_and_check(layer)
except MissingParentLayerException:
# The parent layer of this layer was missing. Force a reanalyze.
try:
self._analyze_recursively_and_check(layer, force_parents=True)
except MissingParentLayerException:
# Parent is still missing... mark the layer as invalid.
if not set_secscan_status(layer, False, self._target_version):
raise PreemptedException
def _analyze_recursively_and_check(self, layer, force_parents=False):
""" Analyzes a layer and all its parents, optionally forcing parents to be reanalyzed,
and checking for various exceptions that can occur during analysis.
"""
try:
self._analyze_recursively(layer, force_parents=force_parents)
except InvalidLayerException:
# One of the parent layers is invalid, so this layer is invalid as well.
if not set_secscan_status(layer, False, self._target_version):
raise PreemptedException
except AnalyzeLayerRetryException:
# Something went wrong when trying to analyze the layer, but we should retry, so leave
# the layer unindexed. Another worker will come along and handle it.
pass
except MissingParentLayerException:
# Pass upward, as missing parent is handled in the analyze_recursively method.
raise
except AnalyzeLayerException:
# Something went wrong when trying to analyze the layer and we cannot retry, so mark the
# layer as invalid.
if not set_secscan_status(layer, False, self._target_version):
raise PreemptedException
def _analyze_recursively(self, layer, force_parents=False):
# Check if there is a parent layer that needs to be analyzed.
if layer.parent_id and (force_parents or
layer.parent.security_indexed_engine < self._target_version):
try:
base_query = get_image_with_storage_and_parent_base()
parent_layer = base_query.where(Image.id == layer.parent_id).get()
except Image.DoesNotExist:
logger.warning("Image %s has Image %s as parent but doesn't exist.", layer.id,
layer.parent_id)
raise AnalyzeLayerException('Parent image not found')
return False, set_secscan_status(layer, False, self._target_version)
self._analyze_recursively(parent_layer, force_parents=force_parents)
cont, _ = self.analyze_recursively(parent_layer)
if not cont:
# The analysis failed for some reason and did not mark the layer as failed,
# thus we should not try to analyze the children of that layer.
# Interrupt the recursive analysis and return as no-one pre-empted us.
return False, True
# Analyze the layer itself.
self._analyze(layer, force_parents=force_parents)
# Now we know all parents are analyzed.
return self._analyze(layer)
def _analyze(self, layer):
def _analyze(self, layer, force_parents=False):
""" Analyzes a single layer.
Return a tuple of two bools:
@ -63,33 +92,30 @@ class LayerAnalyzer(object):
"""
# If the parent couldn't be analyzed with the target version or higher, we can't analyze
# this image. Mark it as failed with the current target version.
if (layer.parent_id and not layer.parent.security_indexed and
layer.parent.security_indexed_engine >= self._target_version):
return True, set_secscan_status(layer, False, self._target_version)
if not force_parents and (layer.parent_id and not layer.parent.security_indexed and
layer.parent.security_indexed_engine >= self._target_version):
if not set_secscan_status(layer, False, self._target_version):
raise PreemptedException
# Nothing more to do.
return
# Analyze the image.
previously_security_indexed_successfully = layer.security_indexed
previous_security_indexed_engine = layer.security_indexed_engine
logger.info('Analyzing layer %s', layer.docker_image_id)
(analyzed_version, should_requeue) = self._api.analyze_layer(layer)
analyzed_version = self._api.analyze_layer(layer)
# If analysis failed, then determine whether we need to requeue.
if not analyzed_version:
if should_requeue:
# If the layer needs to be requeued, return that the children cannot be analyzed (at this
# time) and there was no collision with another worker.
return False, False
else:
# If the layer cannot be requeued, we allow the children to be analyzed, because the code
# path above will mark them as not analyzable, and we mark the image itself as not being
# analyzable.
return True, set_secscan_status(layer, False, self._target_version)
# Mark the image as analyzed.
logger.info('Analyzed layer %s successfully with version %s', layer.docker_image_id,
analyzed_version)
set_status = set_secscan_status(layer, True, analyzed_version)
# Mark the image as analyzed.
if not set_secscan_status(layer, True, analyzed_version):
# If the image was previously successfully marked as resolved, then set_secscan_status
# might return False because we're not changing it (since this is a fixup).
if not previously_security_indexed_successfully:
raise PreemptedException
# If we are the first to have completed the job successfully, then we need to decide if we should
# send notifications. Notifications are sent if:
@ -99,9 +125,7 @@ class LayerAnalyzer(object):
# feature set in the security scanner, notifications will be spammy.
is_new_image = previous_security_indexed_engine == IMAGE_NOT_SCANNED_ENGINE_VERSION
is_existing_image_unindexed = not is_new_image and not previously_security_indexed_successfully
if (features.SECURITY_NOTIFICATIONS and set_status and
(is_new_image or is_existing_image_unindexed)):
if (features.SECURITY_NOTIFICATIONS and (is_new_image or is_existing_image_unindexed)):
# Get the tags of the layer we analyzed.
repository_map = defaultdict(list)
event = ExternalNotificationEvent.get(name='vulnerability_found')
@ -152,5 +176,3 @@ class LayerAnalyzer(object):
})
spawn_notification(repository, 'vulnerability_found', event_data)
return True, set_status