quay/workers/worker.py

import logging
import json
import signal
import sys

from threading import Event
from apscheduler.scheduler import Scheduler
from datetime import datetime, timedelta
from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
from threading import Thread
from time import sleep

from data.model import db

logger = logging.getLogger(__name__)

class JobException(Exception):
  """ Raised by a worker when the job itself is at fault, e.g. because it is malformed.

      The polling loop answers this exception by marking the queue item incomplete without
      handing its retry back to the queue. """


class WorkerUnhealthyException(Exception):
  """ Raised to signal that the worker is no longer healthy and must stop accepting work.

      If it is raised while a queue item is being processed, that item should go back to the
      queue with its retry restored. """


class WorkerStatusServer(HTTPServer):
  """ HTTPServer that remembers the worker it reports on, so that request handlers can reach it
      through `self.server.worker`. """

  def __init__(self, worker, *server_args, **server_kwargs):
    """ Store `worker` and forward every remaining argument to HTTPServer unchanged. """
    self.worker = worker
    # Explicit base-class call (rather than super()) keeps this usable with old-style classes.
    HTTPServer.__init__(self, *server_args, **server_kwargs)


class WorkerStatusHandler(BaseHTTPRequestHandler):
  """ Request handler for the worker status server.

      Expects `self.server` to expose a `worker` attribute (WorkerStatusServer does).

      Routes:
        GET  /status     200 while the worker is healthy, 503 otherwise; the body is always 'OK'.
        GET  /terminate  200 once the worker reports that it has terminated, 503 otherwise.
        POST /terminate  asks the worker to shut down gracefully; 200 on success, 500 on failure.
      Every other path is answered with a 404. """

  def _send_status_only(self, code):
    """ Send a body-less response carrying only the status line and the default headers. """
    self.send_response(code)
    # The header section must be terminated explicitly; without this call the client receives
    # a truncated response (or nothing at all where headers are buffered until end_headers()).
    self.end_headers()

  def do_GET(self):
    """ Serve the read-only /status and /terminate probes. """
    if self.path == '/status':
      # Return the worker status
      code = 200 if self.server.worker.is_healthy() else 503
      self.send_response(code)
      self.send_header('Content-Type', 'text/plain')
      self.end_headers()
      # Bytes literal: identical to 'OK' on Python 2 and valid wherever wfile is a binary stream.
      self.wfile.write(b'OK')
    elif self.path == '/terminate':
      # Return whether it is safe to terminate the worker process
      code = 200 if self.server.worker.is_terminated() else 503
      self._send_status_only(code)
    else:
      self.send_error(404)

  def do_POST(self):
    """ Handle POST /terminate by asking the worker to stop gracefully. """
    if self.path != '/terminate':
      self.send_error(404)
      return

    try:
      self.server.worker.join()
    except (Exception, SystemExit):
      # SystemExit is caught on purpose: Worker.join() ends in sys.exit() once the worker has
      # already terminated, and that case has always been reported to the client as a 500.
      logger.exception('Failed to gracefully terminate the worker.')
      code = 500
    else:
      code = 200

    self._send_status_only(code)


class Worker(object):
  """ Base class for queue workers.

      A worker periodically polls its queue, decodes each item body as JSON, hands the result to
      process_queue_item() and reports the outcome back to the queue. Subclasses must override
      process_queue_item() and may override watchdog(). """

  def __init__(self, queue, poll_period_seconds=30, reservation_seconds=300,
               watchdog_period_seconds=60):
    """ Create a worker bound to `queue`.

        queue: the work queue. This class calls get(), complete(), incomplete(),
               extend_processing() and update_metrics() on it.
        poll_period_seconds: interval between poll_queue() runs.
        reservation_seconds: handed to queue.get() as `processing_time`.
        watchdog_period_seconds: interval between run_watchdog() runs. """
    self._sched = Scheduler()
    self._poll_period_seconds = poll_period_seconds
    self._reservation_seconds = reservation_seconds
    self._watchdog_period_seconds = watchdog_period_seconds
    # Set once the worker must stop taking work (shutdown requested or worker unhealthy).
    self._stop = Event()
    # Set by start() after the scheduler has been shut down.
    self._terminated = Event()
    self._queue = queue
    # The queue item currently being processed, or None between items.
    self.current_queue_item = None

  def process_queue_item(self, job_details):
    """ Process one job; `job_details` is the JSON-decoded body of the queue item.

        The return value is not inspected by poll_queue(): returning normally marks the item
        complete. Raise JobException to have the item marked incomplete, or
        WorkerUnhealthyException to requeue it with its retry restored and stop the worker. """
    raise NotImplementedError('Workers must implement process_queue_item.')

  def watchdog(self):
    """ Function that gets run once every watchdog_period_seconds. May raise
        WorkerUnhealthyException to stop the worker from taking new jobs. """
    pass

  def _close_db_handle(self):
    """ Close the shared database connection if it is currently open. """
    if not db.is_closed():
      logger.debug('Disconnecting from database.')
      db.close()

  def _decode_job_details(self, body):
    """ Decode a queue item body as JSON, raising JobException if it cannot be decoded. """
    try:
      return json.loads(body)
    except (TypeError, ValueError):
      # A body that is not valid JSON is a malformed job, which is what JobException signals.
      raise JobException('Queue item body is not valid JSON.')

  def is_healthy(self):
    """ Return True until the worker has been asked to stop or has declared itself unhealthy. """
    return not self._stop.is_set()

  def is_terminated(self):
    """ Return True once start() has shut the scheduler down. """
    return self._terminated.is_set()

  def extend_processing(self, seconds_from_now):
    """ Ask the queue to extend processing of the current item by `seconds_from_now`; does
        nothing when no item is being processed. """
    if self.current_queue_item is not None:
      self._queue.extend_processing(self.current_queue_item, seconds_from_now)

  def run_watchdog(self):
    """ Run watchdog() and stop accepting work if it reports the worker as unhealthy. """
    logger.debug('Running watchdog.')
    try:
      self.watchdog()
    except WorkerUnhealthyException:
      logger.error('The worker has encountered an error and will not take new jobs.')
      self._stop.set()

  def poll_queue(self):
    """ Pull items off the queue and process them until the queue is empty or the worker stops.

        Outcome per item:
          - processed without error: marked complete;
          - JobException (including an undecodable body): marked incomplete;
          - WorkerUnhealthyException: marked incomplete with its retry restored, and the worker
            stops taking new work.
        Any other exception propagates to the caller after the per-item cleanup has run. """
    logger.debug('Getting work item from queue.')

    # Reserve the first item for the configured time too, exactly like every later item.
    self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)
    while self.current_queue_item:
      logger.debug('Queue gave us some work: %s', self.current_queue_item.body)

      try:
        # Decoding happens inside the try so that a malformed body is reported back to the
        # queue instead of escaping with the item still marked as current.
        job_details = self._decode_job_details(self.current_queue_item.body)
        self.process_queue_item(job_details)
        self._queue.complete(self.current_queue_item)
      except JobException:
        logger.warning('An error occurred processing request: %s', self.current_queue_item.body)
        self._queue.incomplete(self.current_queue_item)
      except WorkerUnhealthyException:
        logger.error('The worker has encountered an error and will not take new jobs. Job is being requeued.')
        self._stop.set()
        self._queue.incomplete(self.current_queue_item, restore_retry=True)
      finally:
        self.current_queue_item = None

        # Close the db handle periodically
        self._close_db_handle()

      # Once stopped, current_queue_item stays None and the loop ends.
      if not self._stop.is_set():
        self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)

    if not self._stop.is_set():
      logger.debug('No more work.')

  def update_queue_metrics(self):
    """ Ask the queue to refresh its metrics. """
    self._queue.update_metrics()

  def start(self, start_status_server_port=None):
    """ Run the worker until it is asked to stop.

        start_status_server_port: when not None, a WorkerStatusServer is started on that port
            (all interfaces) on a daemon thread, and this method never returns.

        Schedules poll_queue(), update_queue_metrics() and run_watchdog(), installs terminate()
        as the SIGTERM/SIGINT handler, then blocks until the stop event is set. """
    if start_status_server_port is not None:
      # Start a status server on a thread
      server_address = ('', start_status_server_port)
      httpd = WorkerStatusServer(self, server_address, WorkerStatusHandler)
      server_thread = Thread(target=httpd.serve_forever)
      server_thread.daemon = True
      server_thread.start()

    logger.debug("Scheduling worker.")

    # Start the polling and metrics jobs (almost) immediately instead of one period from now.
    soon = datetime.now() + timedelta(seconds=.001)

    self._sched.start()
    self._sched.add_interval_job(self.poll_queue, seconds=self._poll_period_seconds,
                                 start_date=soon)
    self._sched.add_interval_job(self.update_queue_metrics, seconds=60, start_date=soon)
    self._sched.add_interval_job(self.run_watchdog, seconds=self._watchdog_period_seconds)

    signal.signal(signal.SIGTERM, self.terminate)
    signal.signal(signal.SIGINT, self.terminate)

    # Wake up every second instead of blocking indefinitely.
    # NOTE(review): presumably this keeps the calling thread responsive to signals - confirm.
    while not self._stop.wait(1):
      pass

    logger.debug('Waiting for running tasks to complete.')
    self._sched.shutdown()
    logger.debug('Finished.')

    self._terminated.set()

    # Wait forever if we're running a server; it lives on a daemon thread, which would not
    # keep the process alive on its own.
    while start_status_server_port is not None:
      sleep(60)

  def terminate(self, signal_num=None, stack_frame=None, graceful=False):
    """ Ask the worker to stop.

        signal_num, stack_frame: unused; accepted so this method can serve as a signal handler.
        graceful: when False, the item being processed (if any) is handed back to the queue
            with its retry restored.

        If the worker has already terminated, this calls sys.exit(1) instead. """
    if self._terminated.is_set():
      sys.exit(1)

    else:
      logger.debug('Shutting down worker.')
      self._stop.set()

      if not graceful:
        # Give back the retry that we took for this queue item so that if it were down to zero
        # retries it will still be picked up by another worker
        if self.current_queue_item is not None:
          self._queue.incomplete(self.current_queue_item, restore_retry=True)

  def join(self):
    """ Request a graceful shutdown: stop taking work without requeueing the current item. """
    self.terminate(graceful=True)
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00			`import logging`
			`import json`
Switch over to phusion baseimage. Prevent everything from daemonizing and start it with runit under phusion. Make workers trap and handle sigint and sigterm. Extend the reservation to 1hr for dockerfilebuild. Update nginx to remove the dependency on libgd. Merge the requirements and requirements enterprise files. 2014-04-11 17:32:45 +00:00			`import signal`
Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`import sys`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
			`from threading import Event`
			`from apscheduler.scheduler import Scheduler`
Run a worker task immediately when it starts. 2014-04-22 17:55:54 +00:00			`from datetime import datetime, timedelta`
Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler`
			`from threading import Thread`
			`from time import sleep`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`from data.model import db`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
			`logger = logging.getLogger(__name__)`

Improve the builder response to being terminated or dying. 2014-05-06 22:46:19 +00:00			`class JobException(Exception):`
			`""" A job exception is an exception that is caused by something being malformed in the job. When`
			`a worker raises this exception the job will be terminated and the retry will not be returned`
			`to the queue. """`
			`pass`


			`class WorkerUnhealthyException(Exception):`
			`""" When this exception is raised, the worker is no longer healthy and will not accept any more`
			`work. When this is raised while processing a queue item, the item should be returned to the`
			`queue along with another retry. """`
			`pass`

Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`class WorkerStatusServer(HTTPServer):`
			`def __init__(self, worker, *args, **kwargs):`
			`HTTPServer.__init__(self, *args, **kwargs)`
			`self.worker = worker`


			`class WorkerStatusHandler(BaseHTTPRequestHandler):`
			`def do_GET(self):`
			`if self.path == '/status':`
			`# Return the worker status`
			`code = 200 if self.server.worker.is_healthy() else 503`
			`self.send_response(code)`
Update the worker status endpoint to be ELB friendly. 2014-07-18 19:04:20 +00:00			`self.send_header('Content-Type', 'text/plain')`
			`self.end_headers()`
			`self.wfile.write('OK')`
Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`elif self.path == '/terminate':`
			`# Return whether it is safe to terminate the worker process`
			`code = 200 if self.server.worker.is_terminated() else 503`
			`self.send_response(code)`
			`else:`
			`self.send_error(404)`

			`def do_POST(self):`
			`if self.path == '/terminate':`
			`try:`
			`self.server.worker.join()`
			`self.send_response(200)`
			`except:`
			`self.send_response(500)`
			`else:`
			`self.send_error(404)`


Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00			`class Worker(object):`
Add a watchdog timer to the build worker to kill a build step that takes more than 20 minutes. 2014-04-02 23:32:41 +00:00			`def __init__(self, queue, poll_period_seconds=30, reservation_seconds=300,`
			`watchdog_period_seconds=60):`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00			`self._sched = Scheduler()`
			`self._poll_period_seconds = poll_period_seconds`
			`self._reservation_seconds = reservation_seconds`
Add a watchdog timer to the build worker to kill a build step that takes more than 20 minutes. 2014-04-02 23:32:41 +00:00			`self._watchdog_period_seconds = watchdog_period_seconds`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00			`self._stop = Event()`
Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`self._terminated = Event()`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00			`self._queue = queue`
Improve the builder response to being terminated or dying. 2014-05-06 22:46:19 +00:00			`self.current_queue_item = None`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
			`def process_queue_item(self, job_details):`
			`""" Return True if complete, False if it should be retried. """`
			`raise NotImplementedError('Workers must implement run.')`

Add a watchdog timer to the build worker to kill a build step that takes more than 20 minutes. 2014-04-02 23:32:41 +00:00			`def watchdog(self):`
			`""" Function that gets run once every watchdog_period_seconds. """`
			`pass`

Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`def _close_db_handle(self):`
			`if not db.is_closed():`
			`logger.debug('Disconnecting from database.')`
			`db.close()`

			`def is_healthy(self):`
			`return not self._stop.is_set()`

			`def is_terminated(self):`
			`return self._terminated.is_set()`

Improve the builder response to being terminated or dying. 2014-05-06 22:46:19 +00:00			`def extend_processing(self, seconds_from_now):`
			`if self.current_queue_item is not None:`
			`self._queue.extend_processing(self.current_queue_item, seconds_from_now)`

Make build workers report that they are unhealthy when we get an LXC error or a Docker connection issue 2014-07-30 21:54:58 +00:00			`def run_watchdog(self):`
			`logger.debug('Running watchdog.')`
			`try:`
			`self.watchdog()`
			`except WorkerUnhealthyException:`
			`logger.error('The worker has encountered an error and will not take new jobs.')`
			`self._stop.set()`

Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00			`def poll_queue(self):`
			`logger.debug('Getting work item from queue.')`

Improve the builder response to being terminated or dying. 2014-05-06 22:46:19 +00:00			`self.current_queue_item = self._queue.get()`
			`while self.current_queue_item:`
Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`logger.debug('Queue gave us some work: %s', self.current_queue_item.body)`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
Improve the builder response to being terminated or dying. 2014-05-06 22:46:19 +00:00			`job_details = json.loads(self.current_queue_item.body)`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
Improve the builder response to being terminated or dying. 2014-05-06 22:46:19 +00:00			`try:`
			`self.process_queue_item(job_details)`
			`self._queue.complete(self.current_queue_item)`
			`except JobException:`
			`logger.warning('An error occurred processing request: %s', self.current_queue_item.body)`
			`self._queue.incomplete(self.current_queue_item)`
			`except WorkerUnhealthyException:`
Make build workers report that they are unhealthy when we get an LXC error or a Docker connection issue 2014-07-30 21:54:58 +00:00			`logger.error('The worker has encountered an error and will not take new jobs. Job is being requeued.')`
Improve the builder response to being terminated or dying. 2014-05-06 22:46:19 +00:00			`self._stop.set()`
			`self._queue.incomplete(self.current_queue_item, restore_retry=True)`
			`finally:`
			`self.current_queue_item = None`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`# Close the db handle periodically`
			`self._close_db_handle()`

Improve the builder response to being terminated or dying. 2014-05-06 22:46:19 +00:00			`if not self._stop.is_set():`
			`self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
Improve the builder response to being terminated or dying. 2014-05-06 22:46:19 +00:00			`if not self._stop.is_set():`
			`logger.debug('No more work.')`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
Fix the metrics so they are usable for scaling the workers down and up. Switch all datetimes which touch the database from now to utcnow. Fix the worker Dockerfile. 2014-05-23 18:16:26 +00:00			`def update_queue_metrics(self):`
			`self._queue.update_metrics()`

Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`def start(self, start_status_server_port=None):`
			`if start_status_server_port is not None:`
			`# Start a status server on a thread`
			`server_address = ('', start_status_server_port)`
			`httpd = WorkerStatusServer(self, server_address, WorkerStatusHandler)`
			`server_thread = Thread(target=httpd.serve_forever)`
			`server_thread.daemon = True`
			`server_thread.start()`

Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00			`logger.debug("Scheduling worker.")`

Run a worker task immediately when it starts. 2014-04-22 17:55:54 +00:00			`soon = datetime.now() + timedelta(seconds=.001)`

Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00			`self._sched.start()`
Run a worker task immediately when it starts. 2014-04-22 17:55:54 +00:00			`self._sched.add_interval_job(self.poll_queue, seconds=self._poll_period_seconds,`
			`start_date=soon)`
Fix the metrics so they are usable for scaling the workers down and up. Switch all datetimes which touch the database from now to utcnow. Fix the worker Dockerfile. 2014-05-23 18:16:26 +00:00			`self._sched.add_interval_job(self.update_queue_metrics, seconds=60, start_date=soon)`
Make build workers report that they are unhealthy when we get an LXC error or a Docker connection issue 2014-07-30 21:54:58 +00:00			`self._sched.add_interval_job(self.run_watchdog, seconds=self._watchdog_period_seconds)`
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00
Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`signal.signal(signal.SIGTERM, self.terminate)`
			`signal.signal(signal.SIGINT, self.terminate)`
Switch over to phusion baseimage. Prevent everything from daemonizing and start it with runit under phusion. Make workers trap and handle sigint and sigterm. Extend the reservation to 1hr for dockerfilebuild. Update nginx to remove the dependency on libgd. Merge the requirements and requirements enterprise files. 2014-04-11 17:32:45 +00:00
Extract some boilerplate from the worker and create a base class. Port the diffs worker over to the base. 2013-11-15 20:50:20 +00:00			`while not self._stop.wait(1):`
			`pass`

Switch over to phusion baseimage. Prevent everything from daemonizing and start it with runit under phusion. Make workers trap and handle sigint and sigterm. Extend the reservation to 1hr for dockerfilebuild. Update nginx to remove the dependency on libgd. Merge the requirements and requirements enterprise files. 2014-04-11 17:32:45 +00:00			`logger.debug('Waiting for running tasks to complete.')`
			`self._sched.shutdown()`
			`logger.debug('Finished.')`

Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`self._terminated.set()`
Improve the builder response to being terminated or dying. 2014-05-06 22:46:19 +00:00
Upgrade to the 0.11.1 tutum version of docker. Package it as a Dockerfile using Docker in Docker. Add a status server option to the workers to utilize the new termination signal and status features of gantry. 2014-05-16 22:31:24 +00:00			`# Wait forever if we're running a server`
			`while start_status_server_port is not None:`
			`sleep(60)`

			`def terminate(self, signal_num=None, stack_frame=None, graceful=False):`
			`if self._terminated.is_set():`
			`sys.exit(1)`

			`else:`
			`logger.debug('Shutting down worker.')`
			`self._stop.set()`

			`if not graceful:`
			`# Give back the retry that we took for this queue item so that if it were down to zero`
			`# retries it will still be picked up by another worker`
			`if self.current_queue_item is not None:`
			`self._queue.incomplete(self.current_queue_item, restore_retry=True)`

			`def join(self):`
			`self.terminate(graceful=True)`