Bug fixes and associated changes for spinning up build nodes and sending jobs to them.

yackob03 2013-10-27 19:06:20 -04:00
parent 3026f83c2c
commit d7f51fb764
5 changed files with 169 additions and 102 deletions


@@ -88,16 +88,21 @@ class GitHubProdConfig(GitHubTestConfig):
   GITHUB_CLIENT_SECRET = 'f89d8bb28ea3bd4e1c68808500d185a816be53b1'
 
 
-class DigitalOceanConfig():
+class DigitalOceanConfig(object):
   DO_CLIENT_ID = 'LJ44y2wwYj1MD0BRxS6qHA'
   DO_CLIENT_SECRET = 'b9357a6f6ff45a33bb03f6dbbad135f9'
   DO_SSH_KEY_ID = '46986'
   DO_SSH_PRIVATE_KEY_FILENAME = 'certs/digital_ocean'
+  DO_ALLOWED_REGIONS = {1, 4}
+
+
+class BuildNodeConfig(object):
+  BUILD_NODE_PULL_TOKEN = 'F02O2E86CQLKZUQ0O81J8XDHQ6F0N1V36L9JTOEEK6GKKMT1GI8PTJQT4OU88Y6G'
 
 
 class DebugConfig(FlaskConfig, MailConfig, LocalStorage, SQLiteDB,
                   StripeTestConfig, MixpanelTestConfig, GitHubTestConfig,
-                  DigitalOceanConfig, AWSCredentials):
+                  DigitalOceanConfig, AWSCredentials, BuildNodeConfig):
   REGISTRY_SERVER = 'localhost:5000'
   LOGGING_CONFIG = {
     'level': logging.DEBUG,
@@ -110,7 +115,8 @@ class DebugConfig(FlaskConfig, MailConfig, LocalStorage, SQLiteDB,
 
 class LocalHostedConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
                         StripeLiveConfig, MixpanelTestConfig,
-                        GitHubProdConfig, DigitalOceanConfig):
+                        GitHubProdConfig, DigitalOceanConfig,
+                        BuildNodeConfig):
   REGISTRY_SERVER = 'localhost:5000'
   LOGGING_CONFIG = {
     'level': logging.DEBUG,
@@ -121,7 +127,7 @@ class LocalHostedConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
 
 class ProductionConfig(FlaskConfig, MailConfig, S3Storage, RDSMySQL,
                        StripeLiveConfig, MixpanelProdConfig,
-                       GitHubProdConfig, DigitalOceanConfig):
+                       GitHubProdConfig, DigitalOceanConfig, BuildNodeConfig):
   REGISTRY_SERVER = 'quay.io'
   LOGGING_CONFIG = {
     'stream': sys.stderr,
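
Illustrative sketch (not part of the diff): the new DO_ALLOWED_REGIONS and BUILD_NODE_PULL_TOKEN values only take effect once one of these config classes is loaded into the Flask app, presumably via app.config.from_object(); from_object walks dir() of the class, so uppercase attributes inherited from the DigitalOceanConfig and BuildNodeConfig mixins end up in app.config. The keys mirror the hunk above; the Flask wiring itself is an assumption about the surrounding code, and the token is a placeholder.

# Minimal sketch, assuming the app loads config with Flask's from_object().
from flask import Flask

class DigitalOceanConfig(object):
  DO_ALLOWED_REGIONS = {1, 4}

class BuildNodeConfig(object):
  BUILD_NODE_PULL_TOKEN = 'example-token'  # placeholder, not the real token

class DebugConfig(DigitalOceanConfig, BuildNodeConfig):
  pass

app = Flask(__name__)
app.config.from_object(DebugConfig)  # inherited uppercase attrs are picked up

assert app.config['DO_ALLOWED_REGIONS'] == {1, 4}
assert app.config['BUILD_NODE_PULL_TOKEN'] == 'example-token'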


@@ -556,7 +556,7 @@ def load_token_data(code):
 
 def get_repository_build(request_dbid):
   try:
-    return RepositoryBuild.get(RepositoryBuild == request_dbid)
+    return RepositoryBuild.get(RepositoryBuild.id == request_dbid)
   except RepositoryBuild.DoesNotExist:
     msg = 'Unable to locate a build by id: %s' % request_dbid
     raise InvalidRepositoryBuildException(msg)
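
Illustrative sketch (not part of the diff): in peewee, RepositoryBuild == request_dbid compares the model class itself to an integer, an ordinary Python comparison rather than a field expression, so .get() never receives the intended WHERE clause; the fix builds the expression from the primary-key field. A self-contained toy version of the corrected lookup, using a hypothetical in-memory model rather than the real schema:

from peewee import SqliteDatabase, Model, CharField

db = SqliteDatabase(':memory:')

class RepositoryBuild(Model):
  tag = CharField()

  class Meta:
    database = db

db.connect()
RepositoryBuild.create_table()

build = RepositoryBuild.create(tag='quay.io/example/repo')

# Correct form: a field expression on the implicit primary key.
fetched = RepositoryBuild.get(RepositoryBuild.id == build.id)
assert fetched.tag == 'quay.io/example/repo'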


@@ -20,9 +20,8 @@ class S3FileWriteException(Exception):
 
 class UserRequestFiles(object):
   def __init__(self, s3_access_key, s3_secret_key, bucket_name):
-    self._s3_conn = boto.s3.connection.S3Connection(s3_access_key,
-                                                    s3_secret_key,
-                                                    is_secure=False)
+    self._s3_conn = boto.connect_s3(s3_access_key, s3_secret_key,
+                                    is_secure=False)
     self._bucket_name = bucket_name
     self._bucket = self._s3_conn.get_bucket(bucket_name)
     self._access_key = s3_access_key
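
Illustrative sketch (not part of the diff): boto.connect_s3() is a convenience wrapper that returns an S3Connection, so the new one-liner is equivalent to the old explicit constructor, with is_secure=False forwarded as a keyword argument. Placeholder credentials and bucket name below:

import boto

# Equivalent spellings; connect_s3 forwards extra kwargs to S3Connection.
conn = boto.connect_s3('ACCESS_KEY', 'SECRET_KEY', is_secure=False)
bucket = conn.get_bucket('example-bucket')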


@@ -423,7 +423,7 @@ def request_repo_build(namespace, repository):
   tag = '%s/%s/%s' % (host, repo.namespace, repo.name)
   build_request = model.create_repository_build(repo, token, dockerfile_id,
                                                 tag)
-  dockerfile_build_queue.put(json.dumps({'request_id': build_request.id}))
+  dockerfile_build_queue.put(json.dumps({'build_id': build_request.id}))
 
   return jsonify({
     'started': True
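
Illustrative sketch (not part of the diff): the key rename matters because the worker reads the queue message as request['build_id']; under the old 'request_id' key that lookup would raise a KeyError. A round-trip of the message as it is now enqueued, with a hypothetical id:

import json

message = json.dumps({'build_id': 42})   # what request_repo_build enqueues
request = json.loads(message)            # what the build worker dequeues
assert request['build_id'] == 42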


@@ -5,9 +5,12 @@ import time
 import argparse
 import digitalocean
 import requests
+import paramiko
 
 from apscheduler.scheduler import Scheduler
 from multiprocessing.pool import ThreadPool
+from base64 import b64encode
+from requests.exceptions import ConnectionError
 
 from data.queue import dockerfile_build_queue
 from data.userfiles import UserRequestFiles
@@ -35,113 +38,169 @@ def try_connection(url, retries=5, period=5):
     raise ex
 
 
+def try_connect_ssh(client, ip_addr, port, user, key_filename, retries=5,
+                    period=5):
+  try:
+    client.connect(ip_addr, port, user, look_for_keys=False,
+                   key_filename=key_filename)
+  except Exception as ex:
+    if retries:
+      logger.debug('Retrying connection to ssh ip: %s:%s after %ss' %
+                   (ip_addr, port, period))
+      time.sleep(period)
+      return try_connect_ssh(client, ip_addr, port, user, key_filename,
+                             retries-1, period)
+
+    raise ex
+
+
 def get_status(url):
   return requests.get(url).json()['status']
 
 
 def babysit_builder(request):
-  manager = digitalocean.Manager(client_id=app.config['DO_CLIENT_ID'],
-                                 api_key=app.config['DO_CLIENT_SECRET'])
-
-  repository_build = model.get_repository_build(request['build_id'])
-
-  # check if there is already a DO node for this build job, if so clean it up
-  old_id = repository_build.build_node_id
-  if old_id:
-    old_droplet = digitalocean.Droplet(old_id)
-    old_droplet.destroy()
-
-  # start the DO node
-  name = 'dockerfile-build-%s' % repository_build.id
-  droplet = digitalocean.Droplet(client_id=app.config['DO_CLIENT_ID'],
-                                 api_key=app.config['DO_CLIENT_SECRET'],
-                                 name=name,
-                                 region_id=1, # New York,
-                                 image_id=1004145, # Docker on 13.04
-                                 size_id=66, # 512MB,
-                                 backup_active=False)
-  droplet.create(ssh_key_ids=[app.config['DO_SSH_KEY_ID']])
-
-  repository_build.build_node_id = droplet.id
-  repository_build.phase = 'starting'
-  repository_build.save()
-
-  startup = droplet.get_events()[0]
-
-  while int(startup.percentage) != 100:
-    logger.debug('Droplet startup percentage: %s' % startup.percentage)
-    time.sleep(5)
-    startup.load()
-
-  droplet.load()
-  logger.debug('Droplet started at ip address: %s' % droplet.ip_address)
-
-  # connect to it with ssh
-  repository_build.phase = 'initializing'
-  repository_build.save()
-
-  ssh_client = paramiko.SSHClient()
-  ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-  ssh_client.connect(self._container_ip, self._config.sshd_port, "root",
-                     look_for_keys=False,
-                     key_filename=app.config['DO_SSH_PRIVATE_KEY_FILENAME'])
-
-  # Pull and run the buildserver
-  pull_cmd = 'docker pull quay.io/quay/buildserver'
-  _, stdout, _ = ssh_client.exec_command(pull_cmd)
-  start_cmd = 'sudo docker run -d -privileged quay.io/quay/buildserver'
-  _, stdout, _ = ssh_client.exec_command(start_cmd)
-
-  # wait for the server to be ready
-  logger.debug('Waiting for buildserver to be ready')
-  build_endpoint = 'http://%s:5002/build/' % droplet.ip_address
-  try:
-    try_connection()
-  except ConnectionError:
-    #TODO cleanup
-    pass
-
-  # send it the job
-  logger.debug('Sending build server request')
-
-  user_files = UserRequestFiles(app.config['AWS_ACCESS_KEY'],
-                                app.config['AWS_SECRET_KEY'],
-                                app.config['REGISTRY_S3_BUCKET'])
-
-  repo = repository_build.repository
-  payload = {
-    'tag': repository_build.tag,
-    'resource_url': user_files.get_file_url(repository_build.resource_key),
-    'token': repository_build.access_token.code,
-  }
-  start_build = requests.post(build_endpoint, data=payload)
-
-  # wait for the job to be complete
-  status_url = start_build.headers['Location']
-  repository_build.phase = 'building'
-  repository_build.status_url = status_url
-  repository_build.save()
-
-  logger.debug('Waiting for job to be complete')
-
-  status = get_status(status_url)
-  while status != 'error' and status != 'complete':
-    logger.debug('Job status is: %s' % status)
-    time.sleep(5)
-    status = get_status(status_url)
-
-  logger.debug('Job complete with status: %s' % status)
-  if status == 'error':
-    repository_build.phase = 'error'
-  else:
-    repository_build.phase = 'complete'
-
-  # clean up the DO node
-  logger.debug('Cleaning up DO node.')
-  droplet.destroy()
-
-  repository_build.status_url = None
-  repository_build.build_node_id = None;
-  repository_build.save()
-
-  return True
+  try:
+    logger.debug('Starting work item: %s' % request)
+    repository_build = model.get_repository_build(request['build_id'])
+    logger.debug('Request details: %s' % repository_build)
+
+    # Initialize digital ocean API
+    do_client_id = app.config['DO_CLIENT_ID']
+    do_api_key = app.config['DO_CLIENT_SECRET']
+    manager = digitalocean.Manager(client_id=do_client_id, api_key=do_api_key)
+
+    # check if there is already a DO node for this build, if so clean it up
+    old_id = repository_build.build_node_id
+    if old_id:
+      logger.debug('Cleaning up old DO node: %s' % old_id)
+      old_droplet = digitalocean.Droplet(id=old_id, client_id=do_client_id,
+                                         api_key=do_api_key)
+      old_droplet.destroy()
+
+    # Pick the region for the new droplet
+    allowed_regions = app.config['DO_ALLOWED_REGIONS']
+    available_regions = {region.id for region in manager.get_all_regions()}
+    regions = available_regions.intersection(allowed_regions)
+    if not regions:
+      logger.error('No droplets in our allowed regtions, available: %s' %
+                   available_regions)
+      return False
+
+    # start the DO node
+    name = 'dockerfile-build-%s' % repository_build.id
+    logger.debug('Starting DO node: %s' % name)
+    droplet = digitalocean.Droplet(client_id=do_client_id,
+                                   api_key=do_api_key,
+                                   name=name,
+                                   region_id=regions.pop(),
+                                   image_id=1004145, # Docker on 13.04
+                                   size_id=66, # 512MB,
+                                   backup_active=False)
+    droplet.create(ssh_key_ids=[app.config['DO_SSH_KEY_ID']])
+
+    repository_build.build_node_id = droplet.id
+    repository_build.phase = 'starting'
+    repository_build.save()
+
+    startup = droplet.get_events()[0]
+    startup.load()
+    while not startup.percentage or int(startup.percentage) != 100:
+      logger.debug('Droplet startup percentage: %s' % startup.percentage)
+      time.sleep(5)
+      startup.load()
+
+    droplet.load()
+    logger.debug('Droplet started at ip address: %s' % droplet.ip_address)
+
+    # connect to it with ssh
+    repository_build.phase = 'initializing'
+    repository_build.save()
+
+    ssh_client = paramiko.SSHClient()
+    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    try_connect_ssh(ssh_client, droplet.ip_address, 22, 'root',
+                    key_filename=app.config['DO_SSH_PRIVATE_KEY_FILENAME'])
+
+    # Load the node with the pull token
+    token = app.config['BUILD_NODE_PULL_TOKEN']
+    basicauth = b64encode('%s:%s' % ('$token', token))
+    auth_object = {
+      'https://quay.io/v1/': {
+        'auth': basicauth,
+        'email': '',
+      },
+    }
+    create_auth_cmd = 'echo \'%s\' > .dockercfg' % json.dumps(auth_object)
+    ssh_client.exec_command(create_auth_cmd)
+
+    # Pull and run the buildserver
+    pull_cmd = 'docker pull quay.io/quay/buildserver'
+    _, stdout, _ = ssh_client.exec_command(pull_cmd)
+    pull_status = stdout.channel.recv_exit_status()
+    if pull_status != 0:
+      logger.error('Pull command failed for host: %s' % droplet.ip_address)
+    else:
+      logger.debug('Pull status was: %s' % pull_status)
+
+    start_cmd = 'docker run -d -privileged -lxc-conf="lxc.aa_profile=unconfined" quay.io/quay/buildserver'
+    ssh_client.exec_command(start_cmd)
+
+    # wait for the server to be ready
+    logger.debug('Waiting for buildserver to be ready')
+    build_endpoint = 'http://%s:5002/build/' % droplet.ip_address
+    try:
+      try_connection(build_endpoint)
+    except ConnectionError:
+      #TODO cleanup
+      pass
+
+    # send it the job
+    logger.debug('Sending build server request')
+
+    user_files = UserRequestFiles(app.config['AWS_ACCESS_KEY'],
+                                  app.config['AWS_SECRET_KEY'],
+                                  app.config['REGISTRY_S3_BUCKET'])
+
+    repo = repository_build.repository
+    payload = {
+      'tag': repository_build.tag,
+      'resource_url': user_files.get_file_url(repository_build.resource_key),
+      'token': repository_build.access_token.code,
+    }
+    start_build = requests.post(build_endpoint, data=payload)
+
+    # wait for the job to be complete
+    status_url = start_build.headers['Location']
+    repository_build.phase = 'building'
+    repository_build.status_url = status_url
+    repository_build.save()
+
+    logger.debug('Waiting for job to be complete')
+    status = get_status(status_url)
+    while status != 'error' and status != 'complete':
+      logger.debug('Job status is: %s' % status)
+      time.sleep(5)
+      status = get_status(status_url)
+
+    logger.debug('Job complete with status: %s' % status)
+    if status == 'error':
+      repository_build.phase = 'error'
+    else:
+      repository_build.phase = 'complete'
+
+    # clean up the DO node
+    logger.debug('Cleaning up DO node.')
+    # droplet.destroy()
+
+    repository_build.status_url = None
+    repository_build.build_node_id = None;
+    repository_build.save()
+
+    return True
+
+  except Exception as outer_ex:
+    logger.exception('Exception processing job: %s' % outer_ex.message)
 
 
 def process_work_items(pool):
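
Illustrative sketch (not part of the diff): the pull-token step above writes a docker client .dockercfg on the build node so it can pull the private quay.io/quay/buildserver image; the entry is a base64-encoded '$token:<pull token>' pair keyed by the registry endpoint. A standalone version with a dummy token, using Python 2 string semantics as in the worker:

import json
from base64 import b64encode

token = 'DUMMY_PULL_TOKEN'  # stands in for app.config['BUILD_NODE_PULL_TOKEN']
auth_object = {
  'https://quay.io/v1/': {
    'auth': b64encode('%s:%s' % ('$token', token)),
    'email': '',
  },
}

# Written to the node over ssh; the node's docker client reads it on pull.
create_auth_cmd = 'echo \'%s\' > .dockercfg' % json.dumps(auth_object)
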
@@ -161,6 +220,7 @@ def process_work_items(pool):
           dockerfile_build_queue.complete(local_item)
       return complete_callback
 
+    logger.debug('Sending work item to thread pool: %s' % pool)
     pool.apply_async(babysit_builder, [request],
                      callback=build_callback(item))
@@ -171,12 +231,14 @@ def process_work_items(pool):
 
 def start_worker():
   pool = ThreadPool(3)
-  logger.debug("Scheduling worker.")
+  logger.debug('Scheduling worker.')
 
   sched = Scheduler()
   sched.start()
-  sched.add_interval_job(process_work_items, args=[pool], seconds=30)
+  # sched.add_interval_job(process_work_items, args=[pool], seconds=30)
+  process_work_items(pool)
 
   while True:
     time.sleep(60 * 60 * 24) # sleep one day, basically forever
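
Illustrative sketch (not part of the diff): the dispatch pattern in process_work_items/start_worker is a ThreadPool running babysit_builder asynchronously, with a per-item callback that completes the queue entry only when the builder returns True. A toy standalone version, with a dummy job and a print in place of the real queue call:

from multiprocessing.pool import ThreadPool

def toy_job(request):
  # Stands in for babysit_builder(request); True means the item may be completed.
  return True

def build_callback(local_item):
  def complete_callback(completed):
    if completed:
      # Stands in for dockerfile_build_queue.complete(local_item).
      print('completing work item: %s' % local_item)
  return complete_callback

pool = ThreadPool(3)
pool.apply_async(toy_job, [{'build_id': 1}], callback=build_callback('item-1'))
pool.close()
pool.join()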