Security scanner flow changes and auto-retry
Changes the security scanner code to raise exceptions for unsuccessful operations. One of the new exceptions is MissingParentLayerException; when it is raised, the security worker performs a full rescan of all parent images of the current layer before trying once more to scan the current layer. This should allow the system to be "self-healing" in the case where the security scanner engine somehow loses or corrupts a parent layer.
This commit is contained in:
parent
9fa16679f8
commit
405eca074c
5 changed files with 228 additions and 82 deletions
|
@ -6,54 +6,83 @@ import features
|
|||
from collections import defaultdict
|
||||
|
||||
from endpoints.notificationhelper import spawn_notification
|
||||
from data.database import Image, ExternalNotificationEvent, IMAGE_NOT_SCANNED_ENGINE_VERSION
|
||||
from data.database import ExternalNotificationEvent, IMAGE_NOT_SCANNED_ENGINE_VERSION, Image
|
||||
from data.model.tag import filter_tags_have_repository_event, get_tags_for_image
|
||||
from data.model.image import set_secscan_status, get_image_with_storage_and_parent_base
|
||||
from util.secscan.api import APIRequestFailure
|
||||
from util.secscan.api import (APIRequestFailure, AnalyzeLayerException, MissingParentLayerException,
|
||||
InvalidLayerException, AnalyzeLayerRetryException)
|
||||
from util.morecollections import AttrDict
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PreemptedException(Exception):
|
||||
""" Exception raised if another worker analyzed the image before this worker was able to do so.
|
||||
"""
|
||||
|
||||
|
||||
class LayerAnalyzer(object):
|
||||
""" Helper class to perform analysis of a layer via the security scanner. """
|
||||
def __init__(self, config, api):
|
||||
self._api = api
|
||||
self._target_version = config.get('SECURITY_SCANNER_ENGINE_VERSION_TARGET', 2)
|
||||
|
||||
|
||||
def analyze_recursively(self, layer):
|
||||
""" Analyzes a layer and all its parents.
|
||||
|
||||
Return a tuple of two bools:
|
||||
- The first one tells us if the layer and its parents analyzed successfully.
|
||||
- The second one is set to False when another call pre-empted the candidate's analysis
|
||||
for us.
|
||||
""" Analyzes a layer and all its parents. Raises a PreemptedException if the analysis was
|
||||
preempted by another worker.
|
||||
"""
|
||||
if layer.parent_id and layer.parent.security_indexed_engine < self._target_version:
|
||||
# The image has a parent that is not analyzed yet with this engine.
|
||||
# Get the parent to get its own parent and recurse.
|
||||
try:
|
||||
self._analyze_recursively_and_check(layer)
|
||||
except MissingParentLayerException:
|
||||
# The parent layer of this layer was missing. Force a reanalyze.
|
||||
try:
|
||||
self._analyze_recursively_and_check(layer, force_parents=True)
|
||||
except MissingParentLayerException:
|
||||
# Parent is still missing... mark the layer as invalid.
|
||||
if not set_secscan_status(layer, False, self._target_version):
|
||||
raise PreemptedException
|
||||
|
||||
def _analyze_recursively_and_check(self, layer, force_parents=False):
|
||||
""" Analyzes a layer and all its parents, optionally forcing parents to be reanalyzed,
|
||||
and checking for various exceptions that can occur during analysis.
|
||||
"""
|
||||
try:
|
||||
self._analyze_recursively(layer, force_parents=force_parents)
|
||||
except InvalidLayerException:
|
||||
# One of the parent layers is invalid, so this layer is invalid as well.
|
||||
if not set_secscan_status(layer, False, self._target_version):
|
||||
raise PreemptedException
|
||||
except AnalyzeLayerRetryException:
|
||||
# Something went wrong when trying to analyze the layer, but we should retry, so leave
|
||||
# the layer unindexed. Another worker will come along and handle it.
|
||||
pass
|
||||
except MissingParentLayerException:
|
||||
# Pass upward, as missing parent is handled in the analyze_recursively method.
|
||||
raise
|
||||
except AnalyzeLayerException:
|
||||
# Something went wrong when trying to analyze the layer and we cannot retry, so mark the
|
||||
# layer as invalid.
|
||||
if not set_secscan_status(layer, False, self._target_version):
|
||||
raise PreemptedException
|
||||
|
||||
def _analyze_recursively(self, layer, force_parents=False):
|
||||
# Check if there is a parent layer that needs to be analyzed.
|
||||
if layer.parent_id and (force_parents or
|
||||
layer.parent.security_indexed_engine < self._target_version):
|
||||
try:
|
||||
base_query = get_image_with_storage_and_parent_base()
|
||||
parent_layer = base_query.where(Image.id == layer.parent_id).get()
|
||||
except Image.DoesNotExist:
|
||||
logger.warning("Image %s has Image %s as parent but doesn't exist.", layer.id,
|
||||
layer.parent_id)
|
||||
raise AnalyzeLayerException('Parent image not found')
|
||||
|
||||
return False, set_secscan_status(layer, False, self._target_version)
|
||||
self._analyze_recursively(parent_layer, force_parents=force_parents)
|
||||
|
||||
cont, _ = self.analyze_recursively(parent_layer)
|
||||
if not cont:
|
||||
# The analysis failed for some reason and did not mark the layer as failed,
|
||||
# thus we should not try to analyze the children of that layer.
|
||||
# Interrupt the recursive analysis and return as no-one pre-empted us.
|
||||
return False, True
|
||||
# Analyze the layer itself.
|
||||
self._analyze(layer, force_parents=force_parents)
|
||||
|
||||
# Now we know all parents are analyzed.
|
||||
return self._analyze(layer)
|
||||
|
||||
|
||||
def _analyze(self, layer):
|
||||
def _analyze(self, layer, force_parents=False):
|
||||
""" Analyzes a single layer.
|
||||
|
||||
Return a tuple of two bools:
|
||||
|
@ -63,33 +92,30 @@ class LayerAnalyzer(object):
|
|||
"""
|
||||
# If the parent couldn't be analyzed with the target version or higher, we can't analyze
|
||||
# this image. Mark it as failed with the current target version.
|
||||
if (layer.parent_id and not layer.parent.security_indexed and
|
||||
layer.parent.security_indexed_engine >= self._target_version):
|
||||
return True, set_secscan_status(layer, False, self._target_version)
|
||||
if not force_parents and (layer.parent_id and not layer.parent.security_indexed and
|
||||
layer.parent.security_indexed_engine >= self._target_version):
|
||||
if not set_secscan_status(layer, False, self._target_version):
|
||||
raise PreemptedException
|
||||
|
||||
# Nothing more to do.
|
||||
return
|
||||
|
||||
# Analyze the image.
|
||||
previously_security_indexed_successfully = layer.security_indexed
|
||||
previous_security_indexed_engine = layer.security_indexed_engine
|
||||
|
||||
logger.info('Analyzing layer %s', layer.docker_image_id)
|
||||
(analyzed_version, should_requeue) = self._api.analyze_layer(layer)
|
||||
analyzed_version = self._api.analyze_layer(layer)
|
||||
|
||||
# If analysis failed, then determine whether we need to requeue.
|
||||
if not analyzed_version:
|
||||
if should_requeue:
|
||||
# If the layer needs to be requeued, return that the children cannot be analyzed (at this
|
||||
# time) and there was no collision with another worker.
|
||||
return False, False
|
||||
else:
|
||||
# If the layer cannot be requeued, we allow the children to be analyzed, because the code
|
||||
# path above will mark them as not analyzable, and we mark the image itself as not being
|
||||
# analyzable.
|
||||
return True, set_secscan_status(layer, False, self._target_version)
|
||||
|
||||
# Mark the image as analyzed.
|
||||
logger.info('Analyzed layer %s successfully with version %s', layer.docker_image_id,
|
||||
analyzed_version)
|
||||
set_status = set_secscan_status(layer, True, analyzed_version)
|
||||
|
||||
# Mark the image as analyzed.
|
||||
if not set_secscan_status(layer, True, analyzed_version):
|
||||
# If the image was previously successfully marked as resolved, then set_secscan_status
|
||||
# might return False because we're not changing it (since this is a fixup).
|
||||
if not previously_security_indexed_successfully:
|
||||
raise PreemptedException
|
||||
|
||||
# If we are the first to have done the job successfully, then we need to decide if we should
|
||||
# send notifications. Notifications are sent if:
|
||||
|
@ -99,9 +125,7 @@ class LayerAnalyzer(object):
|
|||
# feature set in the security scanner, notifications will be spammy.
|
||||
is_new_image = previous_security_indexed_engine == IMAGE_NOT_SCANNED_ENGINE_VERSION
|
||||
is_existing_image_unindexed = not is_new_image and not previously_security_indexed_successfully
|
||||
if (features.SECURITY_NOTIFICATIONS and set_status and
|
||||
(is_new_image or is_existing_image_unindexed)):
|
||||
|
||||
if (features.SECURITY_NOTIFICATIONS and (is_new_image or is_existing_image_unindexed)):
|
||||
# Get the tags of the layer we analyzed.
|
||||
repository_map = defaultdict(list)
|
||||
event = ExternalNotificationEvent.get(name='vulnerability_found')
|
||||
|
@ -152,5 +176,3 @@ class LayerAnalyzer(object):
|
|||
})
|
||||
|
||||
spawn_notification(repository, 'vulnerability_found', event_data)
|
||||
|
||||
return True, set_status
|
||||
|
|
Reference in a new issue