Bug fixes and associated changes involved with spinning up build nodes and sending jobs to them.
This commit is contained in:
parent
3026f83c2c
commit
d7f51fb764
5 changed files with 169 additions and 102 deletions
14
config.py
14
config.py
|
@ -88,16 +88,21 @@ class GitHubProdConfig(GitHubTestConfig):
|
|||
GITHUB_CLIENT_SECRET = 'f89d8bb28ea3bd4e1c68808500d185a816be53b1'
|
||||
|
||||
|
||||
class DigitalOceanConfig():
|
||||
class DigitalOceanConfig(object):
|
||||
DO_CLIENT_ID = 'LJ44y2wwYj1MD0BRxS6qHA'
|
||||
DO_CLIENT_SECRET = 'b9357a6f6ff45a33bb03f6dbbad135f9'
|
||||
DO_SSH_KEY_ID = '46986'
|
||||
DO_SSH_PRIVATE_KEY_FILENAME = 'certs/digital_ocean'
|
||||
DO_ALLOWED_REGIONS = {1, 4}
|
||||
|
||||
|
||||
class BuildNodeConfig(object):
|
||||
BUILD_NODE_PULL_TOKEN = 'F02O2E86CQLKZUQ0O81J8XDHQ6F0N1V36L9JTOEEK6GKKMT1GI8PTJQT4OU88Y6G'
|
||||
|
||||
|
||||
class DebugConfig(FlaskConfig, MailConfig, LocalStorage, SQLiteDB,
|
||||
StripeTestConfig, MixpanelTestConfig, GitHubTestConfig,
|
||||
DigitalOceanConfig, AWSCredentials):
|
||||
DigitalOceanConfig, AWSCredentials, BuildNodeConfig):
|
||||
REGISTRY_SERVER = 'localhost:5000'
|
||||
LOGGING_CONFIG = {
|
||||
'level': logging.DEBUG,
|
||||
|
@ -110,7 +115,8 @@ class DebugConfig(FlaskConfig, MailConfig, LocalStorage, SQLiteDB,
|
|||
|
||||
class LocalHostedConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
|
||||
StripeLiveConfig, MixpanelTestConfig,
|
||||
GitHubProdConfig, DigitalOceanConfig):
|
||||
GitHubProdConfig, DigitalOceanConfig,
|
||||
BuildNodeConfig):
|
||||
REGISTRY_SERVER = 'localhost:5000'
|
||||
LOGGING_CONFIG = {
|
||||
'level': logging.DEBUG,
|
||||
|
@ -121,7 +127,7 @@ class LocalHostedConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
|
|||
|
||||
class ProductionConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
|
||||
StripeLiveConfig, MixpanelProdConfig,
|
||||
GitHubProdConfig, DigitalOceanConfig):
|
||||
GitHubProdConfig, DigitalOceanConfig, BuildNodeConfig):
|
||||
REGISTRY_SERVER = 'quay.io'
|
||||
LOGGING_CONFIG = {
|
||||
'stream': sys.stderr,
|
||||
|
|
|
@ -556,7 +556,7 @@ def load_token_data(code):
|
|||
|
||||
def get_repository_build(request_dbid):
|
||||
try:
|
||||
return RepositoryBuild.get(RepositoryBuild == request_dbid)
|
||||
return RepositoryBuild.get(RepositoryBuild.id == request_dbid)
|
||||
except RepositoryBuild.DoesNotExist:
|
||||
msg = 'Unable to locate a build by id: %s' % request_dbid
|
||||
raise InvalidRepositoryBuildException(msg)
|
||||
|
|
|
@ -20,9 +20,8 @@ class S3FileWriteException(Exception):
|
|||
|
||||
class UserRequestFiles(object):
|
||||
def __init__(self, s3_access_key, s3_secret_key, bucket_name):
|
||||
self._s3_conn = boto.s3.connection.S3Connection(s3_access_key,
|
||||
s3_secret_key,
|
||||
is_secure=False)
|
||||
self._s3_conn = boto.connect_s3(s3_access_key, s3_secret_key,
|
||||
is_secure=False)
|
||||
self._bucket_name = bucket_name
|
||||
self._bucket = self._s3_conn.get_bucket(bucket_name)
|
||||
self._access_key = s3_access_key
|
||||
|
|
|
@ -423,7 +423,7 @@ def request_repo_build(namespace, repository):
|
|||
tag = '%s/%s/%s' % (host, repo.namespace, repo.name)
|
||||
build_request = model.create_repository_build(repo, token, dockerfile_id,
|
||||
tag)
|
||||
dockerfile_build_queue.put(json.dumps({'request_id': build_request.id}))
|
||||
dockerfile_build_queue.put(json.dumps({'build_id': build_request.id}))
|
||||
|
||||
return jsonify({
|
||||
'started': True
|
||||
|
|
|
@ -5,9 +5,12 @@ import time
|
|||
import argparse
|
||||
import digitalocean
|
||||
import requests
|
||||
import paramiko
|
||||
|
||||
from apscheduler.scheduler import Scheduler
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from base64 import b64encode
|
||||
from requests.exceptions import ConnectionError
|
||||
|
||||
from data.queue import dockerfile_build_queue
|
||||
from data.userfiles import UserRequestFiles
|
||||
|
@ -35,113 +38,169 @@ def try_connection(url, retries=5, period=5):
|
|||
raise ex
|
||||
|
||||
|
||||
def try_connect_ssh(client, ip_addr, port, user, key_filename, retries=5,
|
||||
period=5):
|
||||
try:
|
||||
client.connect(ip_addr, port, user, look_for_keys=False,
|
||||
key_filename=key_filename)
|
||||
except Exception as ex:
|
||||
if retries:
|
||||
logger.debug('Retrying connection to ssh ip: %s:%s after %ss' %
|
||||
(ip_addr, port, period))
|
||||
time.sleep(period)
|
||||
return try_connect_ssh(client, ip_addr, port, user, key_filename,
|
||||
retries-1, period)
|
||||
raise ex
|
||||
|
||||
|
||||
def get_status(url):
|
||||
return requests.get(url).json()['status']
|
||||
|
||||
|
||||
def babysit_builder(request):
|
||||
manager = digitalocean.Manager(client_id=app.config['DO_CLIENT_ID'],
|
||||
api_key=app.config['DO_CLIENT_SECRET'])
|
||||
repository_build = model.get_repository_build(request['build_id'])
|
||||
|
||||
# check if there is already a DO node for this build job, if so clean it up
|
||||
old_id = repository_build.build_node_id
|
||||
if old_id
|
||||
old_droplet = digitalocean.Droplet(old_id)
|
||||
old_droplet.destroy()
|
||||
|
||||
# start the DO node
|
||||
name = 'dockerfile-build-%s' % repository_build.id
|
||||
droplet = digitalocean.Droplet(client_id=app.config['DO_CLIENT_ID'],
|
||||
api_key=app.config['DO_CLIENT_SECRET'],
|
||||
name=name,
|
||||
region_id=1, # New York,
|
||||
image_id=1004145, # Docker on 13.04
|
||||
size_id=66, # 512MB,
|
||||
backup_active=False)
|
||||
droplet.create(ssh_key_ids=[app.config['DO_SSH_KEY_ID']])
|
||||
repository_build.build_node_id = droplet.id
|
||||
repository_build.phase = 'starting'
|
||||
repository_build.save()
|
||||
|
||||
startup = droplet.get_events()[0]
|
||||
while int(startup.percentage) != 100:
|
||||
logger.debug('Droplet startup percentage: %s' % startup.percentage)
|
||||
time.sleep(5)
|
||||
startup.load()
|
||||
|
||||
droplet.load()
|
||||
logger.debug('Droplet started at ip address: %s' % droplet.ip_address)
|
||||
|
||||
# connect to it with ssh
|
||||
repository_build.phase = 'initializing'
|
||||
repository_build.save()
|
||||
|
||||
ssh_client = paramiko.SSHClient()
|
||||
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
ssh_client.connect(self._container_ip, self._config.sshd_port, "root",
|
||||
look_for_keys=False,
|
||||
key_filename=app.config['DO_SSH_PRIVATE_KEY_FILENAME'])
|
||||
|
||||
# Pull and run the buildserver
|
||||
pull_cmd = 'docker pull quay.io/quay/buildserver'
|
||||
_, stdout, _ = ssh_client.exec_command(pull_cmd)
|
||||
|
||||
start_cmd = 'sudo docker run -d -privileged quay.io/quay/buildserver'
|
||||
_, stdout, _ = ssh_client.exec_command(start_cmd)
|
||||
|
||||
# wait for the server to be ready
|
||||
logger.debug('Waiting for buildserver to be ready')
|
||||
build_endpoint = 'http://%s:5002/build/' % droplet.ip_address
|
||||
try:
|
||||
try_connection()
|
||||
except ConnectionError:
|
||||
#TODO cleanup
|
||||
pass
|
||||
|
||||
# send it the job
|
||||
logger.debug('Sending build server request')
|
||||
logger.debug('Starting work item: %s' % request)
|
||||
repository_build = model.get_repository_build(request['build_id'])
|
||||
logger.debug('Request details: %s' % repository_build)
|
||||
|
||||
user_files = UserRequestFiles(app.config['AWS_ACCESS_KEY'],
|
||||
app.config['AWS_SECRET_KEY'],
|
||||
app.config['REGISTRY_S3_BUCKET'])
|
||||
# Initialize digital ocean API
|
||||
do_client_id = app.config['DO_CLIENT_ID']
|
||||
do_api_key = app.config['DO_CLIENT_SECRET']
|
||||
manager = digitalocean.Manager(client_id=do_client_id, api_key=do_api_key)
|
||||
|
||||
repo = repository_build.repository
|
||||
payload = {
|
||||
'tag': repository_build.tag,
|
||||
'resource_url': user_files.get_file_url(repository_build.resource_key),
|
||||
'token': repository_build.access_token.code,
|
||||
}
|
||||
start_build = requests.post(build_endpoint, data=payload)
|
||||
# check if there is already a DO node for this build, if so clean it up
|
||||
old_id = repository_build.build_node_id
|
||||
if old_id:
|
||||
logger.debug('Cleaning up old DO node: %s' % old_id)
|
||||
old_droplet = digitalocean.Droplet(id=old_id, client_id=do_client_id,
|
||||
api_key=do_api_key)
|
||||
old_droplet.destroy()
|
||||
|
||||
# wait for the job to be complete
|
||||
status_url = start_build.headers['Location']
|
||||
repository_build.phase = 'building'
|
||||
repository_build.status_url = status_url
|
||||
repository_build.save()
|
||||
# Pick the region for the new droplet
|
||||
allowed_regions = app.config['DO_ALLOWED_REGIONS']
|
||||
available_regions = {region.id for region in manager.get_all_regions()}
|
||||
regions = available_regions.intersection(allowed_regions)
|
||||
if not regions:
|
||||
logger.error('No droplets in our allowed regtions, available: %s' %
|
||||
available_regions)
|
||||
return False
|
||||
|
||||
logger.debug('Waiting for job to be complete')
|
||||
status = get_status(status_url)
|
||||
while status != 'error' and status != 'complete':
|
||||
logger.debug('Job status is: %s' % status)
|
||||
time.sleep(5)
|
||||
# start the DO node
|
||||
name = 'dockerfile-build-%s' % repository_build.id
|
||||
logger.debug('Starting DO node: %s' % name)
|
||||
droplet = digitalocean.Droplet(client_id=do_client_id,
|
||||
api_key=do_api_key,
|
||||
name=name,
|
||||
region_id=regions.pop(),
|
||||
image_id=1004145, # Docker on 13.04
|
||||
size_id=66, # 512MB,
|
||||
backup_active=False)
|
||||
droplet.create(ssh_key_ids=[app.config['DO_SSH_KEY_ID']])
|
||||
repository_build.build_node_id = droplet.id
|
||||
repository_build.phase = 'starting'
|
||||
repository_build.save()
|
||||
|
||||
startup = droplet.get_events()[0]
|
||||
startup.load()
|
||||
while not startup.percentage or int(startup.percentage) != 100:
|
||||
logger.debug('Droplet startup percentage: %s' % startup.percentage)
|
||||
time.sleep(5)
|
||||
startup.load()
|
||||
|
||||
droplet.load()
|
||||
logger.debug('Droplet started at ip address: %s' % droplet.ip_address)
|
||||
|
||||
# connect to it with ssh
|
||||
repository_build.phase = 'initializing'
|
||||
repository_build.save()
|
||||
|
||||
ssh_client = paramiko.SSHClient()
|
||||
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
try_connect_ssh(ssh_client, droplet.ip_address, 22, 'root',
|
||||
key_filename=app.config['DO_SSH_PRIVATE_KEY_FILENAME'])
|
||||
|
||||
# Load the node with the pull token
|
||||
token = app.config['BUILD_NODE_PULL_TOKEN']
|
||||
basicauth = b64encode('%s:%s' % ('$token', token))
|
||||
auth_object = {
|
||||
'https://quay.io/v1/': {
|
||||
'auth': basicauth,
|
||||
'email': '',
|
||||
},
|
||||
}
|
||||
|
||||
create_auth_cmd = 'echo \'%s\' > .dockercfg' % json.dumps(auth_object)
|
||||
ssh_client.exec_command(create_auth_cmd)
|
||||
|
||||
# Pull and run the buildserver
|
||||
pull_cmd = 'docker pull quay.io/quay/buildserver'
|
||||
_, stdout, _ = ssh_client.exec_command(pull_cmd)
|
||||
pull_status = stdout.channel.recv_exit_status()
|
||||
|
||||
if pull_status != 0:
|
||||
logger.error('Pull command failed for host: %s' % droplet.ip_address)
|
||||
else:
|
||||
logger.debug('Pull status was: %s' % pull_status)
|
||||
|
||||
start_cmd = 'docker run -d -privileged -lxc-conf="lxc.aa_profile=unconfined" quay.io/quay/buildserver'
|
||||
ssh_client.exec_command(start_cmd)
|
||||
|
||||
# wait for the server to be ready
|
||||
logger.debug('Waiting for buildserver to be ready')
|
||||
build_endpoint = 'http://%s:5002/build/' % droplet.ip_address
|
||||
try:
|
||||
try_connection(build_endpoint)
|
||||
except ConnectionError:
|
||||
#TODO cleanup
|
||||
pass
|
||||
|
||||
# send it the job
|
||||
logger.debug('Sending build server request')
|
||||
|
||||
user_files = UserRequestFiles(app.config['AWS_ACCESS_KEY'],
|
||||
app.config['AWS_SECRET_KEY'],
|
||||
app.config['REGISTRY_S3_BUCKET'])
|
||||
|
||||
repo = repository_build.repository
|
||||
payload = {
|
||||
'tag': repository_build.tag,
|
||||
'resource_url': user_files.get_file_url(repository_build.resource_key),
|
||||
'token': repository_build.access_token.code,
|
||||
}
|
||||
start_build = requests.post(build_endpoint, data=payload)
|
||||
|
||||
# wait for the job to be complete
|
||||
status_url = start_build.headers['Location']
|
||||
repository_build.phase = 'building'
|
||||
repository_build.status_url = status_url
|
||||
repository_build.save()
|
||||
|
||||
logger.debug('Waiting for job to be complete')
|
||||
status = get_status(status_url)
|
||||
while status != 'error' and status != 'complete':
|
||||
logger.debug('Job status is: %s' % status)
|
||||
time.sleep(5)
|
||||
status = get_status(status_url)
|
||||
|
||||
logger.debug('Job complete with status: %s' % status)
|
||||
if status == 'error':
|
||||
repository_build.phase = 'error'
|
||||
else:
|
||||
repository_build.phase = 'complete'
|
||||
logger.debug('Job complete with status: %s' % status)
|
||||
if status == 'error':
|
||||
repository_build.phase = 'error'
|
||||
else:
|
||||
repository_build.phase = 'complete'
|
||||
|
||||
# clean up the DO node
|
||||
logger.debug('Cleaning up DO node.')
|
||||
droplet.destroy()
|
||||
# clean up the DO node
|
||||
logger.debug('Cleaning up DO node.')
|
||||
# droplet.destroy()
|
||||
|
||||
repository_build.status_url = None
|
||||
repository_build.build_node_id = None;
|
||||
repository_build.save()
|
||||
repository_build.status_url = None
|
||||
repository_build.build_node_id = None;
|
||||
repository_build.save()
|
||||
|
||||
return True
|
||||
return True
|
||||
|
||||
except Exception as outer_ex:
|
||||
logger.exception('Exception processing job: %s' % outer_ex.message)
|
||||
|
||||
|
||||
def process_work_items(pool):
|
||||
|
@ -161,6 +220,7 @@ def process_work_items(pool):
|
|||
dockerfile_build_queue.complete(local_item)
|
||||
return complete_callback
|
||||
|
||||
logger.debug('Sending work item to thread pool: %s' % pool)
|
||||
pool.apply_async(babysit_builder, [request],
|
||||
callback=build_callback(item))
|
||||
|
||||
|
@ -171,12 +231,14 @@ def process_work_items(pool):
|
|||
|
||||
def start_worker():
|
||||
pool = ThreadPool(3)
|
||||
logger.debug("Scheduling worker.")
|
||||
logger.debug('Scheduling worker.')
|
||||
|
||||
sched = Scheduler()
|
||||
sched.start()
|
||||
|
||||
sched.add_interval_job(process_work_items, args=[pool], seconds=30)
|
||||
# sched.add_interval_job(process_work_items, args=[pool], seconds=30)
|
||||
|
||||
process_work_items(pool)
|
||||
|
||||
while True:
|
||||
time.sleep(60 * 60 * 24) # sleep one day, basically forever
|
||||
|
|
Reference in a new issue