Bug fixes and associated changes for spinning up build nodes and sending jobs to them.
Parent: 3026f83c2c
Commit: d7f51fb764
5 changed files with 169 additions and 102 deletions
config.py (14 lines changed)
@@ -88,16 +88,21 @@ class GitHubProdConfig(GitHubTestConfig):
   GITHUB_CLIENT_SECRET = 'f89d8bb28ea3bd4e1c68808500d185a816be53b1'


-class DigitalOceanConfig():
+class DigitalOceanConfig(object):
   DO_CLIENT_ID = 'LJ44y2wwYj1MD0BRxS6qHA'
   DO_CLIENT_SECRET = 'b9357a6f6ff45a33bb03f6dbbad135f9'
   DO_SSH_KEY_ID = '46986'
   DO_SSH_PRIVATE_KEY_FILENAME = 'certs/digital_ocean'
+  DO_ALLOWED_REGIONS = {1, 4}
+
+
+class BuildNodeConfig(object):
+  BUILD_NODE_PULL_TOKEN = 'F02O2E86CQLKZUQ0O81J8XDHQ6F0N1V36L9JTOEEK6GKKMT1GI8PTJQT4OU88Y6G'


 class DebugConfig(FlaskConfig, MailConfig, LocalStorage, SQLiteDB,
                   StripeTestConfig, MixpanelTestConfig, GitHubTestConfig,
-                  DigitalOceanConfig, AWSCredentials):
+                  DigitalOceanConfig, AWSCredentials, BuildNodeConfig):
   REGISTRY_SERVER = 'localhost:5000'
   LOGGING_CONFIG = {
     'level': logging.DEBUG,

@@ -110,7 +115,8 @@ class DebugConfig(FlaskConfig, MailConfig, LocalStorage, SQLiteDB,


 class LocalHostedConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
                         StripeLiveConfig, MixpanelTestConfig,
-                        GitHubProdConfig, DigitalOceanConfig):
+                        GitHubProdConfig, DigitalOceanConfig,
+                        BuildNodeConfig):
   REGISTRY_SERVER = 'localhost:5000'
   LOGGING_CONFIG = {
     'level': logging.DEBUG,

@@ -121,7 +127,7 @@ class LocalHostedConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,


 class ProductionConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
                        StripeLiveConfig, MixpanelProdConfig,
-                       GitHubProdConfig, DigitalOceanConfig):
+                       GitHubProdConfig, DigitalOceanConfig, BuildNodeConfig):
   REGISTRY_SERVER = 'quay.io'
   LOGGING_CONFIG = {
     'stream': sys.stderr,
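The new BuildNodeConfig settings only reach the app because each composed config class lists the mixin as a base and the loader reads uppercase attributes from the whole class hierarchy. A minimal sketch of that pattern, assuming Flask-style config.from_object loading; class names, the token, and values below are placeholders, not the project's real settings:

from flask import Flask

class BuildNodeConfig(object):
  BUILD_NODE_PULL_TOKEN = 'example-token'  # placeholder, not the real token

class DigitalOceanConfig(object):
  DO_ALLOWED_REGIONS = {1, 4}

class DebugConfig(DigitalOceanConfig, BuildNodeConfig):
  REGISTRY_SERVER = 'localhost:5000'

app = Flask(__name__)
app.config.from_object(DebugConfig)  # collects uppercase attrs from all base classes
assert 'BUILD_NODE_PULL_TOKEN' in app.config

Because from_object walks dir() of the class, attributes inherited from every mixin end up in app.config, which is why adding BuildNodeConfig to the base lists is enough.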
@@ -556,7 +556,7 @@ def load_token_data(code):


 def get_repository_build(request_dbid):
   try:
-    return RepositoryBuild.get(RepositoryBuild == request_dbid)
+    return RepositoryBuild.get(RepositoryBuild.id == request_dbid)
   except RepositoryBuild.DoesNotExist:
     msg = 'Unable to locate a build by id: %s' % request_dbid
     raise InvalidRepositoryBuildException(msg)
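The one-line fix matters because peewee's Model.get() expects a field comparison; comparing the model class itself to an id does not filter on any column. A hedged sketch of the difference, using an illustrative in-memory model rather than the project's actual schema:

from peewee import Model, CharField, SqliteDatabase

db = SqliteDatabase(':memory:')

class RepositoryBuild(Model):
  phase = CharField(default='waiting')

  class Meta:
    database = db

db.create_tables([RepositoryBuild])
build = RepositoryBuild.create()

# Broken: comparing the model class itself does not filter on any column.
# RepositoryBuild.get(RepositoryBuild == build.id)

# Fixed: compare the primary-key field to the requested id.
found = RepositoryBuild.get(RepositoryBuild.id == build.id)
assert found.id == build.id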
@@ -20,9 +20,8 @@ class S3FileWriteException(Exception):


 class UserRequestFiles(object):
   def __init__(self, s3_access_key, s3_secret_key, bucket_name):
-    self._s3_conn = boto.s3.connection.S3Connection(s3_access_key,
-                                                    s3_secret_key,
-                                                    is_secure=False)
+    self._s3_conn = boto.connect_s3(s3_access_key, s3_secret_key,
+                                    is_secure=False)
     self._bucket_name = bucket_name
     self._bucket = self._s3_conn.get_bucket(bucket_name)
     self._access_key = s3_access_key
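The UserRequestFiles change swaps the explicit boto.s3.connection.S3Connection constructor for the boto.connect_s3 helper; both are boto 2 APIs that return an S3Connection. A small usage sketch with placeholder credentials, bucket, and key names:

import boto

s3_conn = boto.connect_s3('AWS_ACCESS_KEY', 'AWS_SECRET_KEY', is_secure=False)
bucket = s3_conn.get_bucket('example-bucket')        # placeholder bucket name
key = bucket.new_key('uploads/example-object')       # placeholder key name
key.set_contents_from_string('example contents')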
@@ -423,7 +423,7 @@ def request_repo_build(namespace, repository):
   tag = '%s/%s/%s' % (host, repo.namespace, repo.name)
   build_request = model.create_repository_build(repo, token, dockerfile_id,
                                                 tag)
-  dockerfile_build_queue.put(json.dumps({'request_id': build_request.id}))
+  dockerfile_build_queue.put(json.dumps({'build_id': build_request.id}))

   return jsonify({
     'started': True
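Renaming the queue payload key from request_id to build_id keeps the producer in sync with the worker, which reads request['build_id']. A sketch of that contract with a stand-in in-memory queue and hypothetical helper names, not the project's dockerfile_build_queue:

import json
from collections import deque

queue = deque()  # stand-in for dockerfile_build_queue

def enqueue_build(build_id):
  # Producer side: the key must match what the worker reads.
  queue.append(json.dumps({'build_id': build_id}))

def read_build_request():
  # Worker side: the consumer looks up request['build_id'].
  request = json.loads(queue.popleft())
  return request['build_id']

enqueue_build(42)
assert read_build_request() == 42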
@@ -5,9 +5,12 @@ import time
 import argparse
 import digitalocean
 import requests
+import paramiko

 from apscheduler.scheduler import Scheduler
 from multiprocessing.pool import ThreadPool
+from base64 import b64encode
+from requests.exceptions import ConnectionError

 from data.queue import dockerfile_build_queue
 from data.userfiles import UserRequestFiles
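The new imports (paramiko, b64encode, ConnectionError) support SSH access to the droplet, encoding the pull token, and retrying until the build server answers. For reference, a retry-until-reachable helper matching the try_connection(url, retries=5, period=5) signature in the hunk header below; this is an illustrative sketch, not the file's exact body:

import time
import requests
from requests.exceptions import ConnectionError

def try_connection(url, retries=5, period=5):
  try:
    return requests.get(url)
  except ConnectionError as ex:
    if retries:
      time.sleep(period)
      return try_connection(url, retries - 1, period)
    raise ex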
@@ -35,113 +38,169 @@ def try_connection(url, retries=5, period=5):
     raise ex


+def try_connect_ssh(client, ip_addr, port, user, key_filename, retries=5,
+                    period=5):
+  try:
+    client.connect(ip_addr, port, user, look_for_keys=False,
+                   key_filename=key_filename)
+  except Exception as ex:
+    if retries:
+      logger.debug('Retrying connection to ssh ip: %s:%s after %ss' %
+                   (ip_addr, port, period))
+      time.sleep(period)
+      return try_connect_ssh(client, ip_addr, port, user, key_filename,
+                             retries-1, period)
+    raise ex
+
+
 def get_status(url):
   return requests.get(url).json()['status']


 def babysit_builder(request):
-  manager = digitalocean.Manager(client_id=app.config['DO_CLIENT_ID'],
-                                 api_key=app.config['DO_CLIENT_SECRET'])
-  repository_build = model.get_repository_build(request['build_id'])
-
-  # check if there is already a DO node for this build job, if so clean it up
-  old_id = repository_build.build_node_id
-  if old_id
-    old_droplet = digitalocean.Droplet(old_id)
-    old_droplet.destroy()
-
-  # start the DO node
-  name = 'dockerfile-build-%s' % repository_build.id
-  droplet = digitalocean.Droplet(client_id=app.config['DO_CLIENT_ID'],
-                                 api_key=app.config['DO_CLIENT_SECRET'],
-                                 name=name,
-                                 region_id=1, # New York,
-                                 image_id=1004145, # Docker on 13.04
-                                 size_id=66, # 512MB,
-                                 backup_active=False)
-  droplet.create(ssh_key_ids=[app.config['DO_SSH_KEY_ID']])
-  repository_build.build_node_id = droplet.id
-  repository_build.phase = 'starting'
-  repository_build.save()
-
-  startup = droplet.get_events()[0]
-  while int(startup.percentage) != 100:
-    logger.debug('Droplet startup percentage: %s' % startup.percentage)
-    time.sleep(5)
-    startup.load()
-
-  droplet.load()
-  logger.debug('Droplet started at ip address: %s' % droplet.ip_address)
-
-  # connect to it with ssh
-  repository_build.phase = 'initializing'
-  repository_build.save()
-
-  ssh_client = paramiko.SSHClient()
-  ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-  ssh_client.connect(self._container_ip, self._config.sshd_port, "root",
-                     look_for_keys=False,
-                     key_filename=app.config['DO_SSH_PRIVATE_KEY_FILENAME'])
-
-  # Pull and run the buildserver
-  pull_cmd = 'docker pull quay.io/quay/buildserver'
-  _, stdout, _ = ssh_client.exec_command(pull_cmd)
-
-  start_cmd = 'sudo docker run -d -privileged quay.io/quay/buildserver'
-  _, stdout, _ = ssh_client.exec_command(start_cmd)
-
-  # wait for the server to be ready
-  logger.debug('Waiting for buildserver to be ready')
-  build_endpoint = 'http://%s:5002/build/' % droplet.ip_address
-  try:
-    try_connection()
-  except ConnectionError:
-    #TODO cleanup
-    pass
-
-  # send it the job
-  logger.debug('Sending build server request')
-
-  user_files = UserRequestFiles(app.config['AWS_ACCESS_KEY'],
-                                app.config['AWS_SECRET_KEY'],
-                                app.config['REGISTRY_S3_BUCKET'])
-
-  repo = repository_build.repository
-  payload = {
-    'tag': repository_build.tag,
-    'resource_url': user_files.get_file_url(repository_build.resource_key),
-    'token': repository_build.access_token.code,
-  }
-  start_build = requests.post(build_endpoint, data=payload)
-
-  # wait for the job to be complete
-  status_url = start_build.headers['Location']
-  repository_build.phase = 'building'
-  repository_build.status_url = status_url
-  repository_build.save()
-
-  logger.debug('Waiting for job to be complete')
-  status = get_status(status_url)
-  while status != 'error' and status != 'complete':
-    logger.debug('Job status is: %s' % status)
-    time.sleep(5)
-    status = get_status(status_url)
-
-  logger.debug('Job complete with status: %s' % status)
-  if status == 'error':
-    repository_build.phase = 'error'
-  else:
-    repository_build.phase = 'complete'
-
-  # clean up the DO node
-  logger.debug('Cleaning up DO node.')
-  droplet.destroy()
-
-  repository_build.status_url = None
-  repository_build.build_node_id = None;
-  repository_build.save()
-
-  return True
+  try:
+    logger.debug('Starting work item: %s' % request)
+    repository_build = model.get_repository_build(request['build_id'])
+    logger.debug('Request details: %s' % repository_build)
+
+    # Initialize digital ocean API
+    do_client_id = app.config['DO_CLIENT_ID']
+    do_api_key = app.config['DO_CLIENT_SECRET']
+    manager = digitalocean.Manager(client_id=do_client_id, api_key=do_api_key)
+
+    # check if there is already a DO node for this build, if so clean it up
+    old_id = repository_build.build_node_id
+    if old_id:
+      logger.debug('Cleaning up old DO node: %s' % old_id)
+      old_droplet = digitalocean.Droplet(id=old_id, client_id=do_client_id,
+                                         api_key=do_api_key)
+      old_droplet.destroy()
+
+    # Pick the region for the new droplet
+    allowed_regions = app.config['DO_ALLOWED_REGIONS']
+    available_regions = {region.id for region in manager.get_all_regions()}
+    regions = available_regions.intersection(allowed_regions)
+    if not regions:
+      logger.error('No droplets in our allowed regtions, available: %s' %
+                   available_regions)
+      return False
+
+    # start the DO node
+    name = 'dockerfile-build-%s' % repository_build.id
+    logger.debug('Starting DO node: %s' % name)
+    droplet = digitalocean.Droplet(client_id=do_client_id,
+                                   api_key=do_api_key,
+                                   name=name,
+                                   region_id=regions.pop(),
+                                   image_id=1004145, # Docker on 13.04
+                                   size_id=66, # 512MB,
+                                   backup_active=False)
+    droplet.create(ssh_key_ids=[app.config['DO_SSH_KEY_ID']])
+    repository_build.build_node_id = droplet.id
+    repository_build.phase = 'starting'
+    repository_build.save()
+
+    startup = droplet.get_events()[0]
+    startup.load()
+    while not startup.percentage or int(startup.percentage) != 100:
+      logger.debug('Droplet startup percentage: %s' % startup.percentage)
+      time.sleep(5)
+      startup.load()
+
+    droplet.load()
+    logger.debug('Droplet started at ip address: %s' % droplet.ip_address)
+
+    # connect to it with ssh
+    repository_build.phase = 'initializing'
+    repository_build.save()
+
+    ssh_client = paramiko.SSHClient()
+    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    try_connect_ssh(ssh_client, droplet.ip_address, 22, 'root',
+                    key_filename=app.config['DO_SSH_PRIVATE_KEY_FILENAME'])
+
+    # Load the node with the pull token
+    token = app.config['BUILD_NODE_PULL_TOKEN']
+    basicauth = b64encode('%s:%s' % ('$token', token))
+    auth_object = {
+      'https://quay.io/v1/': {
+        'auth': basicauth,
+        'email': '',
+      },
+    }
+
+    create_auth_cmd = 'echo \'%s\' > .dockercfg' % json.dumps(auth_object)
+    ssh_client.exec_command(create_auth_cmd)
+
+    # Pull and run the buildserver
+    pull_cmd = 'docker pull quay.io/quay/buildserver'
+    _, stdout, _ = ssh_client.exec_command(pull_cmd)
+    pull_status = stdout.channel.recv_exit_status()
+
+    if pull_status != 0:
+      logger.error('Pull command failed for host: %s' % droplet.ip_address)
+    else:
+      logger.debug('Pull status was: %s' % pull_status)
+
+    start_cmd = 'docker run -d -privileged -lxc-conf="lxc.aa_profile=unconfined" quay.io/quay/buildserver'
+    ssh_client.exec_command(start_cmd)
+
+    # wait for the server to be ready
+    logger.debug('Waiting for buildserver to be ready')
+    build_endpoint = 'http://%s:5002/build/' % droplet.ip_address
+    try:
+      try_connection(build_endpoint)
+    except ConnectionError:
+      #TODO cleanup
+      pass
+
+    # send it the job
+    logger.debug('Sending build server request')
+
+    user_files = UserRequestFiles(app.config['AWS_ACCESS_KEY'],
+                                  app.config['AWS_SECRET_KEY'],
+                                  app.config['REGISTRY_S3_BUCKET'])
+
+    repo = repository_build.repository
+    payload = {
+      'tag': repository_build.tag,
+      'resource_url': user_files.get_file_url(repository_build.resource_key),
+      'token': repository_build.access_token.code,
+    }
+    start_build = requests.post(build_endpoint, data=payload)
+
+    # wait for the job to be complete
+    status_url = start_build.headers['Location']
+    repository_build.phase = 'building'
+    repository_build.status_url = status_url
+    repository_build.save()
+
+    logger.debug('Waiting for job to be complete')
+    status = get_status(status_url)
+    while status != 'error' and status != 'complete':
+      logger.debug('Job status is: %s' % status)
+      time.sleep(5)
+      status = get_status(status_url)
+
+    logger.debug('Job complete with status: %s' % status)
+    if status == 'error':
+      repository_build.phase = 'error'
+    else:
+      repository_build.phase = 'complete'
+
+    # clean up the DO node
+    logger.debug('Cleaning up DO node.')
+    # droplet.destroy()
+
+    repository_build.status_url = None
+    repository_build.build_node_id = None;
+    repository_build.save()
+
+    return True
+
+  except Exception as outer_ex:
+    logger.exception('Exception processing job: %s' % outer_ex.message)


 def process_work_items(pool):
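One notable addition in the new worker body is seeding the build node with a .dockercfg so it can pull the private quay.io/quay/buildserver image using the '$token' pull token. A standalone sketch of that payload construction, with a placeholder token and Python 2 era string handling as in the diff:

import json
from base64 import b64encode

token = 'EXAMPLE_PULL_TOKEN'  # placeholder, not the configured token
basicauth = b64encode('%s:%s' % ('$token', token))  # Python 2 str-based b64encode
auth_object = {
  'https://quay.io/v1/': {
    'auth': basicauth,
    'email': '',
  },
}
create_auth_cmd = "echo '%s' > .dockercfg" % json.dumps(auth_object)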
@@ -161,6 +220,7 @@ def process_work_items(pool):
        dockerfile_build_queue.complete(local_item)
      return complete_callback

+    logger.debug('Sending work item to thread pool: %s' % pool)
     pool.apply_async(babysit_builder, [request],
                      callback=build_callback(item))
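process_work_items hands each queue item to the thread pool with a completion callback; the added line simply logs the dispatch. A sketch of that ThreadPool.apply_async pattern with stand-in job and callback functions:

from multiprocessing.pool import ThreadPool

def run_build(request):
  # stand-in for babysit_builder(request)
  return True

def build_callback(item):
  def complete_callback(completed):
    if completed:
      print('completing queue item: %s' % item)
  return complete_callback

pool = ThreadPool(3)
result = pool.apply_async(run_build, [{'build_id': 1}],
                          callback=build_callback('item-1'))
result.wait()
pool.close()
pool.join()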
@@ -171,12 +231,14 @@ def process_work_items(pool):


 def start_worker():
   pool = ThreadPool(3)
-  logger.debug("Scheduling worker.")
+  logger.debug('Scheduling worker.')

   sched = Scheduler()
   sched.start()
-  sched.add_interval_job(process_work_items, args=[pool], seconds=30)
+  # sched.add_interval_job(process_work_items, args=[pool], seconds=30)
+
+  process_work_items(pool)

   while True:
     time.sleep(60 * 60 * 24) # sleep one day, basically forever
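The worker now calls process_work_items(pool) once directly and leaves the APScheduler interval job commented out. For comparison, the disabled polling setup would look roughly like this under APScheduler 2.x; this is an illustrative sketch mirroring the commented-out line, not the project's running configuration:

import time
from multiprocessing.pool import ThreadPool
from apscheduler.scheduler import Scheduler

def process_work_items(pool):
  # stand-in for the real queue-draining function
  pass

pool = ThreadPool(3)
sched = Scheduler()
sched.start()
sched.add_interval_job(process_work_items, args=[pool], seconds=30)

while True:
  time.sleep(60 * 60 * 24)  # sleep one day, basically forever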