initial import for Open Source 🎉
This commit is contained in:
parent
1898c361f3
commit
9c0dd3b722
2048 changed files with 218743 additions and 0 deletions
266
buildman/server.py
Normal file
266
buildman/server.py
Normal file
|
@ -0,0 +1,266 @@
|
|||
import logging
|
||||
import json
|
||||
import trollius
|
||||
|
||||
from threading import Event
|
||||
from datetime import timedelta
|
||||
from trollius.coroutines import From
|
||||
from autobahn.asyncio.wamp import RouterFactory, RouterSessionFactory
|
||||
from autobahn.asyncio.websocket import WampWebSocketServerFactory
|
||||
from autobahn.wamp import types
|
||||
from aiowsgi import create_server as create_wsgi_server
|
||||
from flask import Flask
|
||||
|
||||
from buildman.enums import BuildJobResult, BuildServerStatus, RESULT_PHASES
|
||||
from buildman.jobutil.buildstatus import StatusHandler
|
||||
from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException
|
||||
from data import database, model
|
||||
from app import app, metric_queue
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
WORK_CHECK_TIMEOUT = 10
|
||||
TIMEOUT_PERIOD_MINUTES = 20
|
||||
JOB_TIMEOUT_SECONDS = 300
|
||||
SETUP_LEEWAY_SECONDS = 30
|
||||
MINIMUM_JOB_EXTENSION = timedelta(minutes=1)
|
||||
|
||||
HEARTBEAT_PERIOD_SEC = 30
|
||||
|
||||
|
||||
class BuilderServer(object):
|
||||
""" Server which handles both HTTP and WAMP requests, managing the full state of the build
|
||||
controller.
|
||||
"""
|
||||
def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass,
|
||||
lifecycle_manager_config, manager_hostname):
|
||||
self._loop = None
|
||||
self._current_status = BuildServerStatus.STARTING
|
||||
self._current_components = []
|
||||
self._realm_map = {}
|
||||
self._job_count = 0
|
||||
|
||||
self._session_factory = RouterSessionFactory(RouterFactory())
|
||||
self._registry_hostname = registry_hostname
|
||||
self._queue = queue
|
||||
self._build_logs = build_logs
|
||||
self._user_files = user_files
|
||||
self._lifecycle_manager = lifecycle_manager_klass(
|
||||
self._register_component,
|
||||
self._unregister_component,
|
||||
self._job_heartbeat,
|
||||
self._job_complete,
|
||||
manager_hostname,
|
||||
HEARTBEAT_PERIOD_SEC,
|
||||
)
|
||||
self._lifecycle_manager_config = lifecycle_manager_config
|
||||
|
||||
self._shutdown_event = Event()
|
||||
self._current_status = BuildServerStatus.RUNNING
|
||||
|
||||
self._register_controller()
|
||||
|
||||
def _register_controller(self):
|
||||
controller_app = Flask('controller')
|
||||
server = self
|
||||
|
||||
@controller_app.route('/status')
|
||||
def status():
|
||||
metrics = server._queue.get_metrics()
|
||||
(running_count, available_not_running_count, available_count) = metrics
|
||||
|
||||
workers = [component for component in server._current_components
|
||||
if component.kind() == 'builder']
|
||||
|
||||
data = {
|
||||
'status': server._current_status,
|
||||
'running_local': server._job_count,
|
||||
'running_total': running_count,
|
||||
'workers': len(workers),
|
||||
'job_total': available_count + running_count
|
||||
}
|
||||
|
||||
return json.dumps(data)
|
||||
|
||||
self._controller_app = controller_app
|
||||
|
||||
def run(self, host, websocket_port, controller_port, ssl=None):
|
||||
logger.debug('Initializing the lifecycle manager')
|
||||
self._lifecycle_manager.initialize(self._lifecycle_manager_config)
|
||||
|
||||
logger.debug('Initializing all members of the event loop')
|
||||
loop = trollius.get_event_loop()
|
||||
|
||||
logger.debug('Starting server on port %s, with controller on port %s', websocket_port,
|
||||
controller_port)
|
||||
|
||||
try:
|
||||
loop.run_until_complete(self._initialize(loop, host, websocket_port, controller_port, ssl))
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
finally:
|
||||
loop.close()
|
||||
|
||||
def close(self):
|
||||
logger.debug('Requested server shutdown')
|
||||
self._current_status = BuildServerStatus.SHUTDOWN
|
||||
self._lifecycle_manager.shutdown()
|
||||
self._shutdown_event.wait()
|
||||
logger.debug('Shutting down server')
|
||||
|
||||
def _register_component(self, realm, component_klass, **kwargs):
|
||||
""" Registers a component with the server. The component_klass must derive from
|
||||
BaseComponent.
|
||||
"""
|
||||
logger.debug('Registering component with realm %s', realm)
|
||||
if realm in self._realm_map:
|
||||
logger.debug('Component with realm %s already registered', realm)
|
||||
return self._realm_map[realm]
|
||||
|
||||
component = component_klass(types.ComponentConfig(realm=realm), realm=realm, **kwargs)
|
||||
component.server = self
|
||||
component.parent_manager = self._lifecycle_manager
|
||||
component.build_logs = self._build_logs
|
||||
component.user_files = self._user_files
|
||||
component.registry_hostname = self._registry_hostname
|
||||
|
||||
self._realm_map[realm] = component
|
||||
self._current_components.append(component)
|
||||
self._session_factory.add(component)
|
||||
return component
|
||||
|
||||
def _unregister_component(self, component):
|
||||
logger.debug('Unregistering component with realm %s and token %s',
|
||||
component.builder_realm, component.expected_token)
|
||||
|
||||
self._realm_map.pop(component.builder_realm, None)
|
||||
|
||||
if component in self._current_components:
|
||||
self._current_components.remove(component)
|
||||
self._session_factory.remove(component)
|
||||
|
||||
def _job_heartbeat(self, build_job):
|
||||
self._queue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS,
|
||||
minimum_extension=MINIMUM_JOB_EXTENSION)
|
||||
|
||||
@trollius.coroutine
|
||||
def _job_complete(self, build_job, job_status, executor_name=None, update_phase=False):
|
||||
if job_status == BuildJobResult.INCOMPLETE:
|
||||
logger.warning('[BUILD INCOMPLETE: job complete] Build ID: %s. No retry restore.',
|
||||
build_job.repo_build.uuid)
|
||||
self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30)
|
||||
else:
|
||||
self._queue.complete(build_job.job_item)
|
||||
|
||||
# Update the trigger failure tracking (if applicable).
|
||||
if build_job.repo_build.trigger is not None:
|
||||
model.build.update_trigger_disable_status(build_job.repo_build.trigger,
|
||||
RESULT_PHASES[job_status])
|
||||
|
||||
if update_phase:
|
||||
status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid)
|
||||
yield From(status_handler.set_phase(RESULT_PHASES[job_status]))
|
||||
|
||||
self._job_count = self._job_count - 1
|
||||
|
||||
if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
|
||||
self._shutdown_event.set()
|
||||
|
||||
_report_completion_status(build_job, job_status, executor_name)
|
||||
|
||||
@trollius.coroutine
|
||||
def _work_checker(self):
|
||||
logger.debug('Initializing work checker')
|
||||
while self._current_status == BuildServerStatus.RUNNING:
|
||||
with database.CloseForLongOperation(app.config):
|
||||
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
|
||||
|
||||
logger.debug('Checking for more work for %d active workers',
|
||||
self._lifecycle_manager.num_workers())
|
||||
|
||||
processing_time = self._lifecycle_manager.overall_setup_time() + SETUP_LEEWAY_SECONDS
|
||||
job_item = self._queue.get(processing_time=processing_time, ordering_required=True)
|
||||
if job_item is None:
|
||||
logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
|
||||
continue
|
||||
|
||||
try:
|
||||
build_job = BuildJob(job_item)
|
||||
except BuildJobLoadException as irbe:
|
||||
logger.warning('[BUILD INCOMPLETE: job load exception] Job data: %s. No retry restore.',
|
||||
job_item.body)
|
||||
logger.exception(irbe)
|
||||
self._queue.incomplete(job_item, restore_retry=False)
|
||||
continue
|
||||
|
||||
logger.debug('Checking for an avaliable worker for build job %s',
|
||||
build_job.repo_build.uuid)
|
||||
|
||||
try:
|
||||
schedule_success, retry_timeout = yield From(self._lifecycle_manager.schedule(build_job))
|
||||
except:
|
||||
logger.warning('[BUILD INCOMPLETE: scheduling] Build ID: %s. Retry restored.',
|
||||
build_job.repo_build.uuid)
|
||||
logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid)
|
||||
self._current_status = BuildServerStatus.EXCEPTION
|
||||
self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT)
|
||||
return
|
||||
|
||||
if schedule_success:
|
||||
logger.debug('Marking build %s as scheduled', build_job.repo_build.uuid)
|
||||
status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid)
|
||||
yield From(status_handler.set_phase(database.BUILD_PHASE.BUILD_SCHEDULED))
|
||||
|
||||
self._job_count = self._job_count + 1
|
||||
logger.debug('Build job %s scheduled. Running: %s', build_job.repo_build.uuid,
|
||||
self._job_count)
|
||||
else:
|
||||
logger.warning('[BUILD INCOMPLETE: no schedule] Build ID: %s. Retry restored.',
|
||||
build_job.repo_build.uuid)
|
||||
logger.debug('All workers are busy for job %s Requeuing after %s seconds.',
|
||||
build_job.repo_build.uuid, retry_timeout)
|
||||
self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)
|
||||
|
||||
@trollius.coroutine
|
||||
def _queue_metrics_updater(self):
|
||||
logger.debug('Initializing queue metrics updater')
|
||||
while self._current_status == BuildServerStatus.RUNNING:
|
||||
logger.debug('Writing metrics')
|
||||
self._queue.update_metrics()
|
||||
|
||||
logger.debug('Metrics going to sleep for 30 seconds')
|
||||
yield From(trollius.sleep(30))
|
||||
|
||||
@trollius.coroutine
|
||||
def _initialize(self, loop, host, websocket_port, controller_port, ssl=None):
|
||||
self._loop = loop
|
||||
|
||||
# Create the WAMP server.
|
||||
transport_factory = WampWebSocketServerFactory(self._session_factory, debug_wamp=False)
|
||||
transport_factory.setProtocolOptions(failByDrop=True)
|
||||
|
||||
# Initialize the controller server and the WAMP server
|
||||
create_wsgi_server(self._controller_app, loop=loop, host=host, port=controller_port, ssl=ssl)
|
||||
yield From(loop.create_server(transport_factory, host, websocket_port, ssl=ssl))
|
||||
|
||||
# Initialize the metrics updater
|
||||
trollius.async(self._queue_metrics_updater())
|
||||
|
||||
# Initialize the work queue checker.
|
||||
yield From(self._work_checker())
|
||||
|
||||
def _report_completion_status(build_job, status, executor_name):
|
||||
metric_queue.build_counter.Inc(labelvalues=[status])
|
||||
metric_queue.repository_build_completed.Inc(labelvalues=[build_job.namespace, build_job.repo_name,
|
||||
status, executor_name or 'executor'])
|
||||
if status == BuildJobResult.COMPLETE:
|
||||
status_name = 'CompleteBuilds'
|
||||
elif status == BuildJobResult.ERROR:
|
||||
status_name = 'FailedBuilds'
|
||||
elif status == BuildJobResult.INCOMPLETE:
|
||||
status_name = 'IncompletedBuilds'
|
||||
else:
|
||||
return
|
||||
|
||||
metric_queue.put_deprecated(status_name, 1, unit='Count')
|
Reference in a new issue