Switch to a unified worker system
- Handles logging
- Handles reporting to Sentry
- Removes old code around serving a web endpoint (unused now)
parent dbd9a32c85
commit ac0cca2d90

7 changed files with 264 additions and 268 deletions
@@ -1,93 +1,33 @@
 import logging
-import json
 import signal
 import sys
+import socket
 
-from threading import Event, Lock
+from threading import Event
 from apscheduler.schedulers.background import BackgroundScheduler
 from datetime import datetime, timedelta
-from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
-from threading import Thread
-from time import sleep
+from raven import Client
 
 from app import app
-from data.model import db
-from data.queue import WorkQueue
+from functools import wraps
 
 logger = logging.getLogger(__name__)
 
 
-class JobException(Exception):
-  """ A job exception is an exception that is caused by something being malformed in the job. When
-      a worker raises this exception the job will be terminated and the retry will not be returned
-      to the queue. """
-  pass
-
-
-class WorkerUnhealthyException(Exception):
-  """ When this exception is raised, the worker is no longer healthy and will not accept any more
-      work. When this is raised while processing a queue item, the item should be returned to the
-      queue along with another retry. """
-  pass
-
-
-class WorkerStatusServer(HTTPServer):
-  def __init__(self, worker, *args, **kwargs):
-    HTTPServer.__init__(self, *args, **kwargs)
-    self.worker = worker
-
-
-class WorkerStatusHandler(BaseHTTPRequestHandler):
-  def do_GET(self):
-    if self.path == '/status':
-      # Return the worker status
-      code = 200 if self.server.worker.is_healthy() else 503
-      self.send_response(code)
-      self.send_header('Content-Type', 'text/plain')
-      self.end_headers()
-      self.wfile.write('OK')
-    elif self.path == '/terminate':
-      # Return whether it is safe to terminate the worker process
-      code = 200 if self.server.worker.is_terminated() else 503
-      self.send_response(code)
-    else:
-      self.send_error(404)
-
-  def do_POST(self):
-    if self.path == '/terminate':
-      try:
-        self.server.worker.join()
-        self.send_response(200)
-      except:
-        self.send_response(500)
-    else:
-      self.send_error(404)
-
-
 class Worker(object):
-  def __init__(self, queue, poll_period_seconds=30, reservation_seconds=300,
-               watchdog_period_seconds=60, retry_after_seconds=300):
+  """ Base class for workers which perform some work periodically. """
+  def __init__(self):
     self._sched = BackgroundScheduler()
-    self._poll_period_seconds = poll_period_seconds
-    self._reservation_seconds = reservation_seconds
-    self._watchdog_period_seconds = watchdog_period_seconds
-    self._retry_after_seconds = retry_after_seconds
+    self._operations = []
     self._stop = Event()
     self._terminated = Event()
-    self._queue = queue
-    self._current_item_lock = Lock()
-    self.current_queue_item = None
-
-  def process_queue_item(self, job_details):
-    """ Return True if complete, False if it should be retried. """
-    raise NotImplementedError('Workers must implement run.')
-
-  def watchdog(self):
-    """ Function that gets run once every watchdog_period_seconds. """
-    pass
-
-  def _close_db_handle(self):
-    if not db.is_closed():
-      logger.debug('Disconnecting from database.')
-      db.close()
+    self._raven_client = None
+
+    if app.config.get('EXCEPTION_LOG_TYPE', 'FakeSentry') == 'Sentry':
+      worker_name = '%s:worker-%s' % (socket.gethostname(), self.__class__.__name__)
+      self._raven_client = Client(app.config.get('SENTRY_DSN', ''), name=worker_name)
 
   def is_healthy(self):
     return not self._stop.is_set()
@@ -95,90 +35,33 @@ class Worker(object):
   def is_terminated(self):
     return self._terminated.is_set()
 
-  def extend_processing(self, seconds_from_now):
-    with self._current_item_lock:
-      if self.current_queue_item is not None:
-        self._queue.extend_processing(self.current_queue_item, seconds_from_now)
-
-  def run_watchdog(self):
-    logger.debug('Running watchdog.')
-    try:
-      self.watchdog()
-    except WorkerUnhealthyException as exc:
-      logger.error('The worker has encountered an error via watchdog and will not take new jobs')
-      logger.error(exc.message)
-      self.mark_current_incomplete(restore_retry=True)
-      self._stop.set()
-
-  def poll_queue(self):
-    logger.debug('Getting work item from queue.')
-
-    with self._current_item_lock:
-      self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)
-
-    while True:
-      # Retrieve the current item in the queue over which to operate. We do so under
-      # a lock to make sure we are always retrieving an item when in a healthy state.
-      current_queue_item = None
-      with self._current_item_lock:
-        current_queue_item = self.current_queue_item
-        if current_queue_item is None:
-          # Close the db handle.
-          self._close_db_handle()
-          break
-
-      logger.debug('Queue gave us some work: %s', current_queue_item.body)
-      job_details = json.loads(current_queue_item.body)
+  def ungracefully_terminated(self):
+    """ Method called when the worker has been terminated in an ungraceful fashion. """
+    pass
 
-      try:
-        self.process_queue_item(job_details)
-        self.mark_current_complete()
+  def add_operation(self, operation_func, operation_sec):
+    @wraps(operation_func)
+    def _operation_func():
+      try:
+        return operation_func()
+      except Exception:
+        logger.exception('Operation raised exception')
+        if self._raven_client:
+          logger.debug('Logging exception to Sentry')
+          self._raven_client.captureException()
 
-      except JobException as jex:
-        logger.warning('An error occurred processing request: %s', current_queue_item.body)
-        logger.warning('Job exception: %s' % jex)
-        self.mark_current_incomplete(restore_retry=False)
+    self._operations.append((_operation_func, operation_sec))
 
-      except WorkerUnhealthyException as exc:
-        logger.error('The worker has encountered an error via the job and will not take new jobs')
-        logger.error(exc.message)
-        self.mark_current_incomplete(restore_retry=True)
-        self._stop.set()
-
-      finally:
-        # Close the db handle.
-        self._close_db_handle()
-
-      if not self._stop.is_set():
-        with self._current_item_lock:
-          self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)
-
-    if not self._stop.is_set():
-      logger.debug('No more work.')
-
-  def update_queue_metrics(self):
-    self._queue.update_metrics()
-
-  def start(self, start_status_server_port=None):
-    if start_status_server_port is not None:
-      # Start a status server on a thread
-      server_address = ('', start_status_server_port)
-      httpd = WorkerStatusServer(self, server_address, WorkerStatusHandler)
-      server_thread = Thread(target=httpd.serve_forever)
-      server_thread.daemon = True
-      server_thread.start()
-
-    logger.debug("Scheduling worker.")
+  def start(self):
+    logging.config.fileConfig('conf/logging.conf', disable_existing_loggers=False)
+
+    logger.debug('Scheduling worker.')
 
     soon = datetime.now() + timedelta(seconds=.001)
 
     self._sched.start()
-    self._sched.add_job(self.poll_queue, 'interval', seconds=self._poll_period_seconds,
-                        start_date=soon, max_instances=1)
-    self._sched.add_job(self.update_queue_metrics, 'interval', seconds=60, start_date=soon,
-                        max_instances=1)
-    self._sched.add_job(self.run_watchdog, 'interval', seconds=self._watchdog_period_seconds,
-                        max_instances=1)
+    for operation_func, operation_sec in self._operations:
+      self._sched.add_job(operation_func, 'interval', seconds=operation_sec,
+                          start_date=soon, max_instances=1)
 
     signal.signal(signal.SIGTERM, self.terminate)
     signal.signal(signal.SIGINT, self.terminate)
@@ -192,23 +75,6 @@ class Worker(object):
     self._terminated.set()
 
-    # Wait forever if we're running a server
-    while start_status_server_port is not None:
-      sleep(60)
-
-  def mark_current_incomplete(self, restore_retry=False):
-    with self._current_item_lock:
-      if self.current_queue_item is not None:
-        self._queue.incomplete(self.current_queue_item, restore_retry=restore_retry,
-                               retry_after=self._retry_after_seconds)
-        self.current_queue_item = None
-
-  def mark_current_complete(self):
-    with self._current_item_lock:
-      if self.current_queue_item is not None:
-        self._queue.complete(self.current_queue_item)
-        self.current_queue_item = None
-
   def terminate(self, signal_num=None, stack_frame=None, graceful=False):
     if self._terminated.is_set():
       sys.exit(1)
@@ -218,9 +84,7 @@ class Worker(object):
       self._stop.set()
 
       if not graceful:
-        # Give back the retry that we took for this queue item so that if it were down to zero
-        # retries it will still be picked up by another worker
-        self.mark_current_incomplete()
+        self.ungracefully_terminated()
 
   def join(self):
     self.terminate(graceful=True)
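For reference, a minimal sketch of how a worker might be written against the new unified base class. The ExampleWorker name, the _do_work method, and the 60-second period are hypothetical, invented for illustration, and the sketch assumes the base class is importable as workers.worker as this diff suggests. Each callable registered through add_operation is wrapped so that exceptions are logged and, when EXCEPTION_LOG_TYPE is set to 'Sentry', reported through the raven client.

import logging

from workers.worker import Worker

logger = logging.getLogger(__name__)


class ExampleWorker(Worker):
  """ Hypothetical worker built on the unified base class (illustration only). """
  def __init__(self):
    super(ExampleWorker, self).__init__()

    # Register a periodic operation; the scheduler runs it every 60 seconds
    # with max_instances=1, so overlapping runs are never started.
    self.add_operation(self._do_work, 60)

  def _do_work(self):
    # Any exception raised here is caught by the wrapper that add_operation
    # installs, logged, and reported to Sentry when configured.
    logger.debug('Performing periodic work.')


if __name__ == '__main__':
  worker = ExampleWorker()
  worker.start()

Note that start() wires terminate to SIGTERM and SIGINT, so stopping the process marks the worker as no longer healthy and, for a non-graceful stop, invokes ungracefully_terminated().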