Bug fixes and associated changes involved with spinning up build nodes and sending jobs to them.

This commit is contained in:
yackob03 2013-10-27 19:06:20 -04:00
parent 3026f83c2c
commit d7f51fb764
5 changed files with 169 additions and 102 deletions

View file

@ -88,16 +88,21 @@ class GitHubProdConfig(GitHubTestConfig):
GITHUB_CLIENT_SECRET = 'f89d8bb28ea3bd4e1c68808500d185a816be53b1'
class DigitalOceanConfig():
class DigitalOceanConfig(object):
DO_CLIENT_ID = 'LJ44y2wwYj1MD0BRxS6qHA'
DO_CLIENT_SECRET = 'b9357a6f6ff45a33bb03f6dbbad135f9'
DO_SSH_KEY_ID = '46986'
DO_SSH_PRIVATE_KEY_FILENAME = 'certs/digital_ocean'
DO_ALLOWED_REGIONS = {1, 4}
class BuildNodeConfig(object):
BUILD_NODE_PULL_TOKEN = 'F02O2E86CQLKZUQ0O81J8XDHQ6F0N1V36L9JTOEEK6GKKMT1GI8PTJQT4OU88Y6G'
class DebugConfig(FlaskConfig, MailConfig, LocalStorage, SQLiteDB,
StripeTestConfig, MixpanelTestConfig, GitHubTestConfig,
DigitalOceanConfig, AWSCredentials):
DigitalOceanConfig, AWSCredentials, BuildNodeConfig):
REGISTRY_SERVER = 'localhost:5000'
LOGGING_CONFIG = {
'level': logging.DEBUG,
@ -110,7 +115,8 @@ class DebugConfig(FlaskConfig, MailConfig, LocalStorage, SQLiteDB,
class LocalHostedConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
StripeLiveConfig, MixpanelTestConfig,
GitHubProdConfig, DigitalOceanConfig):
GitHubProdConfig, DigitalOceanConfig,
BuildNodeConfig):
REGISTRY_SERVER = 'localhost:5000'
LOGGING_CONFIG = {
'level': logging.DEBUG,
@ -121,7 +127,7 @@ class LocalHostedConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
class ProductionConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
StripeLiveConfig, MixpanelProdConfig,
GitHubProdConfig, DigitalOceanConfig):
GitHubProdConfig, DigitalOceanConfig, BuildNodeConfig):
REGISTRY_SERVER = 'quay.io'
LOGGING_CONFIG = {
'stream': sys.stderr,

View file

@ -556,7 +556,7 @@ def load_token_data(code):
def get_repository_build(request_dbid):
try:
return RepositoryBuild.get(RepositoryBuild == request_dbid)
return RepositoryBuild.get(RepositoryBuild.id == request_dbid)
except RepositoryBuild.DoesNotExist:
msg = 'Unable to locate a build by id: %s' % request_dbid
raise InvalidRepositoryBuildException(msg)

View file

@ -20,9 +20,8 @@ class S3FileWriteException(Exception):
class UserRequestFiles(object):
def __init__(self, s3_access_key, s3_secret_key, bucket_name):
self._s3_conn = boto.s3.connection.S3Connection(s3_access_key,
s3_secret_key,
is_secure=False)
self._s3_conn = boto.connect_s3(s3_access_key, s3_secret_key,
is_secure=False)
self._bucket_name = bucket_name
self._bucket = self._s3_conn.get_bucket(bucket_name)
self._access_key = s3_access_key

View file

@ -423,7 +423,7 @@ def request_repo_build(namespace, repository):
tag = '%s/%s/%s' % (host, repo.namespace, repo.name)
build_request = model.create_repository_build(repo, token, dockerfile_id,
tag)
dockerfile_build_queue.put(json.dumps({'request_id': build_request.id}))
dockerfile_build_queue.put(json.dumps({'build_id': build_request.id}))
return jsonify({
'started': True

View file

@ -5,9 +5,12 @@ import time
import argparse
import digitalocean
import requests
import paramiko
from apscheduler.scheduler import Scheduler
from multiprocessing.pool import ThreadPool
from base64 import b64encode
from requests.exceptions import ConnectionError
from data.queue import dockerfile_build_queue
from data.userfiles import UserRequestFiles
@ -35,113 +38,169 @@ def try_connection(url, retries=5, period=5):
raise ex
def try_connect_ssh(client, ip_addr, port, user, key_filename, retries=5,
period=5):
try:
client.connect(ip_addr, port, user, look_for_keys=False,
key_filename=key_filename)
except Exception as ex:
if retries:
logger.debug('Retrying connection to ssh ip: %s:%s after %ss' %
(ip_addr, port, period))
time.sleep(period)
return try_connect_ssh(client, ip_addr, port, user, key_filename,
retries-1, period)
raise ex
def get_status(url):
return requests.get(url).json()['status']
def babysit_builder(request):
manager = digitalocean.Manager(client_id=app.config['DO_CLIENT_ID'],
api_key=app.config['DO_CLIENT_SECRET'])
repository_build = model.get_repository_build(request['build_id'])
# check if there is already a DO node for this build job, if so clean it up
old_id = repository_build.build_node_id
if old_id
old_droplet = digitalocean.Droplet(old_id)
old_droplet.destroy()
# start the DO node
name = 'dockerfile-build-%s' % repository_build.id
droplet = digitalocean.Droplet(client_id=app.config['DO_CLIENT_ID'],
api_key=app.config['DO_CLIENT_SECRET'],
name=name,
region_id=1, # New York,
image_id=1004145, # Docker on 13.04
size_id=66, # 512MB,
backup_active=False)
droplet.create(ssh_key_ids=[app.config['DO_SSH_KEY_ID']])
repository_build.build_node_id = droplet.id
repository_build.phase = 'starting'
repository_build.save()
startup = droplet.get_events()[0]
while int(startup.percentage) != 100:
logger.debug('Droplet startup percentage: %s' % startup.percentage)
time.sleep(5)
startup.load()
droplet.load()
logger.debug('Droplet started at ip address: %s' % droplet.ip_address)
# connect to it with ssh
repository_build.phase = 'initializing'
repository_build.save()
ssh_client = paramiko.SSHClient()
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh_client.connect(self._container_ip, self._config.sshd_port, "root",
look_for_keys=False,
key_filename=app.config['DO_SSH_PRIVATE_KEY_FILENAME'])
# Pull and run the buildserver
pull_cmd = 'docker pull quay.io/quay/buildserver'
_, stdout, _ = ssh_client.exec_command(pull_cmd)
start_cmd = 'sudo docker run -d -privileged quay.io/quay/buildserver'
_, stdout, _ = ssh_client.exec_command(start_cmd)
# wait for the server to be ready
logger.debug('Waiting for buildserver to be ready')
build_endpoint = 'http://%s:5002/build/' % droplet.ip_address
try:
try_connection()
except ConnectionError:
#TODO cleanup
pass
# send it the job
logger.debug('Sending build server request')
logger.debug('Starting work item: %s' % request)
repository_build = model.get_repository_build(request['build_id'])
logger.debug('Request details: %s' % repository_build)
user_files = UserRequestFiles(app.config['AWS_ACCESS_KEY'],
app.config['AWS_SECRET_KEY'],
app.config['REGISTRY_S3_BUCKET'])
# Initialize digital ocean API
do_client_id = app.config['DO_CLIENT_ID']
do_api_key = app.config['DO_CLIENT_SECRET']
manager = digitalocean.Manager(client_id=do_client_id, api_key=do_api_key)
repo = repository_build.repository
payload = {
'tag': repository_build.tag,
'resource_url': user_files.get_file_url(repository_build.resource_key),
'token': repository_build.access_token.code,
}
start_build = requests.post(build_endpoint, data=payload)
# check if there is already a DO node for this build, if so clean it up
old_id = repository_build.build_node_id
if old_id:
logger.debug('Cleaning up old DO node: %s' % old_id)
old_droplet = digitalocean.Droplet(id=old_id, client_id=do_client_id,
api_key=do_api_key)
old_droplet.destroy()
# wait for the job to be complete
status_url = start_build.headers['Location']
repository_build.phase = 'building'
repository_build.status_url = status_url
repository_build.save()
# Pick the region for the new droplet
allowed_regions = app.config['DO_ALLOWED_REGIONS']
available_regions = {region.id for region in manager.get_all_regions()}
regions = available_regions.intersection(allowed_regions)
if not regions:
logger.error('No droplets in our allowed regtions, available: %s' %
available_regions)
return False
logger.debug('Waiting for job to be complete')
status = get_status(status_url)
while status != 'error' and status != 'complete':
logger.debug('Job status is: %s' % status)
time.sleep(5)
# start the DO node
name = 'dockerfile-build-%s' % repository_build.id
logger.debug('Starting DO node: %s' % name)
droplet = digitalocean.Droplet(client_id=do_client_id,
api_key=do_api_key,
name=name,
region_id=regions.pop(),
image_id=1004145, # Docker on 13.04
size_id=66, # 512MB,
backup_active=False)
droplet.create(ssh_key_ids=[app.config['DO_SSH_KEY_ID']])
repository_build.build_node_id = droplet.id
repository_build.phase = 'starting'
repository_build.save()
startup = droplet.get_events()[0]
startup.load()
while not startup.percentage or int(startup.percentage) != 100:
logger.debug('Droplet startup percentage: %s' % startup.percentage)
time.sleep(5)
startup.load()
droplet.load()
logger.debug('Droplet started at ip address: %s' % droplet.ip_address)
# connect to it with ssh
repository_build.phase = 'initializing'
repository_build.save()
ssh_client = paramiko.SSHClient()
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
try_connect_ssh(ssh_client, droplet.ip_address, 22, 'root',
key_filename=app.config['DO_SSH_PRIVATE_KEY_FILENAME'])
# Load the node with the pull token
token = app.config['BUILD_NODE_PULL_TOKEN']
basicauth = b64encode('%s:%s' % ('$token', token))
auth_object = {
'https://quay.io/v1/': {
'auth': basicauth,
'email': '',
},
}
create_auth_cmd = 'echo \'%s\' > .dockercfg' % json.dumps(auth_object)
ssh_client.exec_command(create_auth_cmd)
# Pull and run the buildserver
pull_cmd = 'docker pull quay.io/quay/buildserver'
_, stdout, _ = ssh_client.exec_command(pull_cmd)
pull_status = stdout.channel.recv_exit_status()
if pull_status != 0:
logger.error('Pull command failed for host: %s' % droplet.ip_address)
else:
logger.debug('Pull status was: %s' % pull_status)
start_cmd = 'docker run -d -privileged -lxc-conf="lxc.aa_profile=unconfined" quay.io/quay/buildserver'
ssh_client.exec_command(start_cmd)
# wait for the server to be ready
logger.debug('Waiting for buildserver to be ready')
build_endpoint = 'http://%s:5002/build/' % droplet.ip_address
try:
try_connection(build_endpoint)
except ConnectionError:
#TODO cleanup
pass
# send it the job
logger.debug('Sending build server request')
user_files = UserRequestFiles(app.config['AWS_ACCESS_KEY'],
app.config['AWS_SECRET_KEY'],
app.config['REGISTRY_S3_BUCKET'])
repo = repository_build.repository
payload = {
'tag': repository_build.tag,
'resource_url': user_files.get_file_url(repository_build.resource_key),
'token': repository_build.access_token.code,
}
start_build = requests.post(build_endpoint, data=payload)
# wait for the job to be complete
status_url = start_build.headers['Location']
repository_build.phase = 'building'
repository_build.status_url = status_url
repository_build.save()
logger.debug('Waiting for job to be complete')
status = get_status(status_url)
while status != 'error' and status != 'complete':
logger.debug('Job status is: %s' % status)
time.sleep(5)
status = get_status(status_url)
logger.debug('Job complete with status: %s' % status)
if status == 'error':
repository_build.phase = 'error'
else:
repository_build.phase = 'complete'
logger.debug('Job complete with status: %s' % status)
if status == 'error':
repository_build.phase = 'error'
else:
repository_build.phase = 'complete'
# clean up the DO node
logger.debug('Cleaning up DO node.')
droplet.destroy()
# clean up the DO node
logger.debug('Cleaning up DO node.')
# droplet.destroy()
repository_build.status_url = None
repository_build.build_node_id = None;
repository_build.save()
repository_build.status_url = None
repository_build.build_node_id = None;
repository_build.save()
return True
return True
except Exception as outer_ex:
logger.exception('Exception processing job: %s' % outer_ex.message)
def process_work_items(pool):
@ -161,6 +220,7 @@ def process_work_items(pool):
dockerfile_build_queue.complete(local_item)
return complete_callback
logger.debug('Sending work item to thread pool: %s' % pool)
pool.apply_async(babysit_builder, [request],
callback=build_callback(item))
@ -171,12 +231,14 @@ def process_work_items(pool):
def start_worker():
pool = ThreadPool(3)
logger.debug("Scheduling worker.")
logger.debug('Scheduling worker.')
sched = Scheduler()
sched.start()
sched.add_interval_job(process_work_items, args=[pool], seconds=30)
# sched.add_interval_job(process_work_items, args=[pool], seconds=30)
process_work_items(pool)
while True:
time.sleep(60 * 60 * 24) # sleep one day, basically forever