2013-10-24 20:37:03 +00:00
|
|
|
import logging
|
|
|
|
import json
|
|
|
|
import daemon
|
|
|
|
import time
|
|
|
|
import argparse
|
|
|
|
import digitalocean
|
2013-10-25 05:14:38 +00:00
|
|
|
import requests
|
2013-10-29 23:59:29 +00:00
|
|
|
import os
|
2013-10-24 20:37:03 +00:00
|
|
|
|
|
|
|
from apscheduler.scheduler import Scheduler
|
|
|
|
from multiprocessing.pool import ThreadPool
|
2013-10-27 23:06:20 +00:00
|
|
|
from base64 import b64encode
|
|
|
|
from requests.exceptions import ConnectionError
|
2013-10-24 20:37:03 +00:00
|
|
|
|
|
|
|
from data.queue import dockerfile_build_queue
|
2013-10-25 05:14:38 +00:00
|
|
|
from data.userfiles import UserRequestFiles
|
2013-10-24 20:37:03 +00:00
|
|
|
from data import model
|
2013-10-30 17:22:00 +00:00
|
|
|
from data.database import db as db_connection
|
2013-10-24 20:37:03 +00:00
|
|
|
from app import app
|
|
|
|
|
|
|
|
|
|
|
|
root_logger = logging.getLogger('')
|
|
|
|
root_logger.setLevel(logging.DEBUG)
|
|
|
|
|
|
|
|
FORMAT = '%(asctime)-15s - %(levelname)s - %(pathname)s - %(funcName)s - %(message)s'
|
|
|
|
formatter = logging.Formatter(FORMAT)
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
2013-10-29 17:42:26 +00:00
|
|
|
BUILD_SERVER_CMD = ('docker run -d -lxc-conf="lxc.aa_profile=unconfined" ' +
|
|
|
|
'-privileged -e \'RESOURCE_URL=%s\' -e \'TAG=%s\' ' +
|
|
|
|
'-e \'TOKEN=%s\' quay.io/quay/buildserver')
|
2013-10-25 05:14:38 +00:00
|
|
|
|
|
|
|
|
2013-10-29 17:42:26 +00:00
|
|
|
def retry_command(to_call, args=[], kwargs={}, retries=5, period=5):
|
2013-10-27 23:06:20 +00:00
|
|
|
try:
|
2013-10-29 17:42:26 +00:00
|
|
|
return to_call(*args, **kwargs)
|
2013-10-27 23:06:20 +00:00
|
|
|
except Exception as ex:
|
|
|
|
if retries:
|
2013-10-29 17:42:26 +00:00
|
|
|
logger.debug('Retrying command after %ss' % period)
|
2013-10-27 23:06:20 +00:00
|
|
|
time.sleep(period)
|
2013-10-29 17:42:26 +00:00
|
|
|
return retry_command(to_call, args, kwargs, retries-1, period)
|
2013-10-27 23:06:20 +00:00
|
|
|
raise ex
|
|
|
|
|
|
|
|
|
2013-10-25 05:14:38 +00:00
|
|
|
def get_status(url):
|
2013-10-29 22:15:12 +00:00
|
|
|
return retry_command(requests.get, [url]).json()['status']
|
2013-10-25 05:14:38 +00:00
|
|
|
|
|
|
|
|
2013-10-24 20:37:03 +00:00
|
|
|
def babysit_builder(request):
|
2013-10-28 18:49:23 +00:00
|
|
|
""" Spin up a build node and ask it to build our job. Retryable errors
|
|
|
|
should return False, while fatal errors should return True.
|
|
|
|
"""
|
2013-10-25 05:14:38 +00:00
|
|
|
try:
|
2013-10-27 23:06:20 +00:00
|
|
|
logger.debug('Starting work item: %s' % request)
|
|
|
|
repository_build = model.get_repository_build(request['build_id'])
|
|
|
|
logger.debug('Request details: %s' % repository_build)
|
|
|
|
|
|
|
|
# Initialize digital ocean API
|
|
|
|
do_client_id = app.config['DO_CLIENT_ID']
|
|
|
|
do_api_key = app.config['DO_CLIENT_SECRET']
|
|
|
|
manager = digitalocean.Manager(client_id=do_client_id, api_key=do_api_key)
|
|
|
|
|
|
|
|
# check if there is already a DO node for this build, if so clean it up
|
|
|
|
old_id = repository_build.build_node_id
|
|
|
|
if old_id:
|
|
|
|
logger.debug('Cleaning up old DO node: %s' % old_id)
|
|
|
|
old_droplet = digitalocean.Droplet(id=old_id, client_id=do_client_id,
|
|
|
|
api_key=do_api_key)
|
2013-10-29 17:42:26 +00:00
|
|
|
retry_command(old_droplet.destroy)
|
2013-10-27 23:06:20 +00:00
|
|
|
|
|
|
|
# Pick the region for the new droplet
|
|
|
|
allowed_regions = app.config['DO_ALLOWED_REGIONS']
|
2013-10-29 17:42:26 +00:00
|
|
|
regions = retry_command(manager.get_all_regions)
|
|
|
|
available_regions = {region.id for region in regions}
|
2013-10-27 23:06:20 +00:00
|
|
|
regions = available_regions.intersection(allowed_regions)
|
|
|
|
if not regions:
|
|
|
|
logger.error('No droplets in our allowed regtions, available: %s' %
|
|
|
|
available_regions)
|
|
|
|
return False
|
|
|
|
|
|
|
|
# start the DO node
|
|
|
|
name = 'dockerfile-build-%s' % repository_build.id
|
|
|
|
logger.debug('Starting DO node: %s' % name)
|
|
|
|
droplet = digitalocean.Droplet(client_id=do_client_id,
|
|
|
|
api_key=do_api_key,
|
|
|
|
name=name,
|
|
|
|
region_id=regions.pop(),
|
|
|
|
image_id=1004145, # Docker on 13.04
|
|
|
|
size_id=66, # 512MB,
|
|
|
|
backup_active=False)
|
2013-10-29 17:42:26 +00:00
|
|
|
retry_command(droplet.create, [],
|
|
|
|
{'ssh_key_ids': [app.config['DO_SSH_KEY_ID']]})
|
2013-10-27 23:06:20 +00:00
|
|
|
repository_build.build_node_id = droplet.id
|
|
|
|
repository_build.phase = 'starting'
|
|
|
|
repository_build.save()
|
|
|
|
|
2013-10-29 17:42:26 +00:00
|
|
|
logger.debug('Waiting for DO node to be available.')
|
|
|
|
|
|
|
|
startup = retry_command(droplet.get_events)[0]
|
2013-10-27 23:06:20 +00:00
|
|
|
while not startup.percentage or int(startup.percentage) != 100:
|
|
|
|
logger.debug('Droplet startup percentage: %s' % startup.percentage)
|
|
|
|
time.sleep(5)
|
2013-10-29 17:42:26 +00:00
|
|
|
retry_command(startup.load)
|
2013-10-27 23:06:20 +00:00
|
|
|
|
2013-10-29 17:42:26 +00:00
|
|
|
retry_command(droplet.load)
|
2013-10-27 23:06:20 +00:00
|
|
|
logger.debug('Droplet started at ip address: %s' % droplet.ip_address)
|
|
|
|
|
|
|
|
# connect to it with ssh
|
|
|
|
repository_build.phase = 'initializing'
|
|
|
|
repository_build.save()
|
|
|
|
|
2013-10-29 22:56:57 +00:00
|
|
|
# We wait until here to import paramiko because otherwise it doesn't work
|
|
|
|
# under the daemon context.
|
|
|
|
import paramiko
|
2013-10-27 23:06:20 +00:00
|
|
|
ssh_client = paramiko.SSHClient()
|
|
|
|
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
2013-10-29 17:42:26 +00:00
|
|
|
|
2013-10-29 22:15:12 +00:00
|
|
|
logger.debug('Connecting to droplet through ssh at ip: %s' %
|
|
|
|
droplet.ip_address)
|
2013-10-29 17:42:26 +00:00
|
|
|
retry_command(ssh_client.connect, [droplet.ip_address, 22, 'root'],
|
2013-10-29 22:15:12 +00:00
|
|
|
{'look_for_keys': False, 'timeout': 10.0,
|
2013-10-29 17:42:26 +00:00
|
|
|
'key_filename': app.config['DO_SSH_PRIVATE_KEY_FILENAME']})
|
2013-10-27 23:06:20 +00:00
|
|
|
|
|
|
|
# Load the node with the pull token
|
|
|
|
token = app.config['BUILD_NODE_PULL_TOKEN']
|
|
|
|
basicauth = b64encode('%s:%s' % ('$token', token))
|
|
|
|
auth_object = {
|
|
|
|
'https://quay.io/v1/': {
|
|
|
|
'auth': basicauth,
|
|
|
|
'email': '',
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
create_auth_cmd = 'echo \'%s\' > .dockercfg' % json.dumps(auth_object)
|
|
|
|
ssh_client.exec_command(create_auth_cmd)
|
|
|
|
|
|
|
|
# Pull and run the buildserver
|
|
|
|
pull_cmd = 'docker pull quay.io/quay/buildserver'
|
|
|
|
_, stdout, _ = ssh_client.exec_command(pull_cmd)
|
|
|
|
pull_status = stdout.channel.recv_exit_status()
|
|
|
|
|
|
|
|
if pull_status != 0:
|
|
|
|
logger.error('Pull command failed for host: %s' % droplet.ip_address)
|
2013-10-29 17:42:26 +00:00
|
|
|
return False
|
2013-10-27 23:06:20 +00:00
|
|
|
else:
|
|
|
|
logger.debug('Pull status was: %s' % pull_status)
|
|
|
|
|
2013-10-28 18:49:23 +00:00
|
|
|
# Remove the credentials we used to pull so crafty users cant steal them
|
|
|
|
remove_auth_cmd = 'rm .dockercfg'
|
|
|
|
ssh_client.exec_command(remove_auth_cmd)
|
|
|
|
|
2013-10-29 17:42:26 +00:00
|
|
|
# Prepare the signed resource url the build node can fetch the job from
|
|
|
|
user_files = UserRequestFiles(app.config['AWS_ACCESS_KEY'],
|
|
|
|
app.config['AWS_SECRET_KEY'],
|
|
|
|
app.config['REGISTRY_S3_BUCKET'])
|
|
|
|
resource_url = user_files.get_file_url(repository_build.resource_key)
|
|
|
|
|
2013-10-28 18:49:23 +00:00
|
|
|
# Start the build server
|
2013-10-29 17:42:26 +00:00
|
|
|
start_cmd = BUILD_SERVER_CMD % (resource_url, repository_build.tag,
|
|
|
|
repository_build.access_token.code)
|
|
|
|
logger.debug('Sending build server request with command: %s' % start_cmd)
|
2013-10-27 23:06:20 +00:00
|
|
|
ssh_client.exec_command(start_cmd)
|
|
|
|
|
2013-10-29 17:42:26 +00:00
|
|
|
status_endpoint = 'http://%s:5002/build/' % droplet.ip_address
|
2013-10-27 23:06:20 +00:00
|
|
|
# wait for the server to be ready
|
|
|
|
logger.debug('Waiting for buildserver to be ready')
|
2013-10-29 17:42:26 +00:00
|
|
|
retry_command(requests.get, [status_endpoint])
|
2013-10-27 23:06:20 +00:00
|
|
|
|
|
|
|
# wait for the job to be complete
|
|
|
|
repository_build.phase = 'building'
|
2013-10-29 17:42:26 +00:00
|
|
|
repository_build.status_url = status_endpoint
|
2013-10-27 23:06:20 +00:00
|
|
|
repository_build.save()
|
|
|
|
|
|
|
|
logger.debug('Waiting for job to be complete')
|
2013-10-29 17:42:26 +00:00
|
|
|
status = get_status(status_endpoint)
|
2013-10-27 23:06:20 +00:00
|
|
|
while status != 'error' and status != 'complete':
|
|
|
|
logger.debug('Job status is: %s' % status)
|
|
|
|
time.sleep(5)
|
2013-10-29 17:42:26 +00:00
|
|
|
status = get_status(status_endpoint)
|
2013-10-25 05:14:38 +00:00
|
|
|
|
2013-10-27 23:06:20 +00:00
|
|
|
logger.debug('Job complete with status: %s' % status)
|
|
|
|
if status == 'error':
|
2013-10-29 17:42:26 +00:00
|
|
|
error_message = requests.get(status_endpoint).json()['message']
|
|
|
|
logger.warning('Job error: %s' % error_message)
|
2013-10-27 23:06:20 +00:00
|
|
|
repository_build.phase = 'error'
|
|
|
|
else:
|
|
|
|
repository_build.phase = 'complete'
|
2013-10-24 20:37:03 +00:00
|
|
|
|
2013-10-27 23:06:20 +00:00
|
|
|
# clean up the DO node
|
|
|
|
logger.debug('Cleaning up DO node.')
|
2013-10-29 19:39:44 +00:00
|
|
|
retry_command(droplet.destroy)
|
2013-10-24 20:37:03 +00:00
|
|
|
|
2013-10-27 23:06:20 +00:00
|
|
|
repository_build.status_url = None
|
|
|
|
repository_build.build_node_id = None;
|
|
|
|
repository_build.save()
|
2013-10-25 19:13:11 +00:00
|
|
|
|
2013-10-27 23:06:20 +00:00
|
|
|
return True
|
|
|
|
|
|
|
|
except Exception as outer_ex:
|
2013-10-28 18:49:23 +00:00
|
|
|
# We don't really know what these are, but they are probably retryable
|
2013-10-27 23:06:20 +00:00
|
|
|
logger.exception('Exception processing job: %s' % outer_ex.message)
|
2013-10-28 18:49:23 +00:00
|
|
|
return False
|
2013-10-24 20:37:03 +00:00
|
|
|
|
2013-10-30 17:22:00 +00:00
|
|
|
finally:
|
|
|
|
if not db_connection.is_closed():
|
|
|
|
logger.debug('Closing thread db connection.')
|
|
|
|
db_connection.close()
|
|
|
|
|
2013-10-24 20:37:03 +00:00
|
|
|
|
|
|
|
def process_work_items(pool):
|
|
|
|
logger.debug('Getting work item from queue.')
|
|
|
|
|
2013-10-25 22:17:43 +00:00
|
|
|
item = dockerfile_build_queue.get(processing_time=60*60) # allow 1 hr
|
2013-10-24 20:37:03 +00:00
|
|
|
|
|
|
|
while item:
|
|
|
|
logger.debug('Queue gave us some work: %s' % item.body)
|
|
|
|
|
|
|
|
request = json.loads(item.body)
|
|
|
|
|
|
|
|
def build_callback(item):
|
|
|
|
local_item = item
|
|
|
|
def complete_callback(completed):
|
|
|
|
if completed:
|
|
|
|
dockerfile_build_queue.complete(local_item)
|
2013-10-29 19:42:19 +00:00
|
|
|
else:
|
|
|
|
# We have a retryable error, add the job back to the queue
|
|
|
|
dockerfile_build_queue.incomplete(local_item)
|
|
|
|
|
2013-10-24 20:37:03 +00:00
|
|
|
return complete_callback
|
|
|
|
|
2013-10-27 23:06:20 +00:00
|
|
|
logger.debug('Sending work item to thread pool: %s' % pool)
|
2013-10-25 05:14:38 +00:00
|
|
|
pool.apply_async(babysit_builder, [request],
|
|
|
|
callback=build_callback(item))
|
2013-10-24 20:37:03 +00:00
|
|
|
|
|
|
|
item = dockerfile_build_queue.get()
|
|
|
|
|
|
|
|
logger.debug('No more work.')
|
|
|
|
|
2013-10-30 17:22:00 +00:00
|
|
|
if not db_connection.is_closed():
|
|
|
|
logger.debug('Closing thread db connection.')
|
|
|
|
db_connection.close()
|
|
|
|
|
2013-10-24 20:37:03 +00:00
|
|
|
|
|
|
|
def start_worker():
|
|
|
|
pool = ThreadPool(3)
|
2013-10-27 23:06:20 +00:00
|
|
|
logger.debug('Scheduling worker.')
|
2013-10-24 20:37:03 +00:00
|
|
|
|
|
|
|
sched = Scheduler()
|
|
|
|
sched.start()
|
|
|
|
|
2013-10-28 18:49:23 +00:00
|
|
|
sched.add_interval_job(process_work_items, args=[pool], seconds=30)
|
2013-10-24 20:37:03 +00:00
|
|
|
|
|
|
|
while True:
|
|
|
|
time.sleep(60 * 60 * 24) # sleep one day, basically forever
|
|
|
|
|
|
|
|
|
|
|
|
desc = 'Worker daemon to monitor dockerfile build'
|
|
|
|
parser = argparse.ArgumentParser(description=desc)
|
|
|
|
parser.add_argument('-D', action='store_true', default=False,
|
|
|
|
help='Run the worker in daemon mode.')
|
|
|
|
parser.add_argument('--log', default='dockerfilebuild.log',
|
|
|
|
help='Specify the log file for the worker as a daemon.')
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
if args.D:
|
|
|
|
handler = logging.FileHandler(args.log)
|
|
|
|
handler.setFormatter(formatter)
|
|
|
|
root_logger.addHandler(handler)
|
2013-10-29 23:59:29 +00:00
|
|
|
with daemon.DaemonContext(files_preserve=[handler.stream],
|
|
|
|
working_directory=os.getcwd()):
|
2013-10-24 20:37:03 +00:00
|
|
|
start_worker()
|
|
|
|
|
|
|
|
else:
|
|
|
|
handler = logging.StreamHandler()
|
|
|
|
handler.setFormatter(formatter)
|
|
|
|
root_logger.addHandler(handler)
|
|
|
|
start_worker()
|