Switch to a unified worker system
- Handles logging
- Handles reporting to Sentry
- Removes old code around serving a web endpoint (unused now)
parent dbd9a32c85
commit ac0cca2d90
7 changed files with 264 additions and 268 deletions
144  workers/queueworker.py  (Normal file)
@@ -0,0 +1,144 @@
import logging
import json
import signal
import sys

from threading import Event, Lock
from datetime import datetime, timedelta
from threading import Thread
from time import sleep

from data.model import db
from data.queue import WorkQueue

from workers.worker import Worker


logger = logging.getLogger(__name__)


class JobException(Exception):
  """ A job exception is an exception that is caused by something being malformed in the job. When
      a worker raises this exception, the job will be terminated and the retry will not be returned
      to the queue. """
  pass


class WorkerUnhealthyException(Exception):
  """ When this exception is raised, the worker is no longer healthy and will not accept any more
      work. When this is raised while processing a queue item, the item should be returned to the
      queue along with another retry. """
  pass


class QueueWorker(Worker):
  def __init__(self, queue, poll_period_seconds=30, reservation_seconds=300,
               watchdog_period_seconds=60, retry_after_seconds=300):
    super(QueueWorker, self).__init__()

    self._poll_period_seconds = poll_period_seconds
    self._reservation_seconds = reservation_seconds
    self._watchdog_period_seconds = watchdog_period_seconds
    self._retry_after_seconds = retry_after_seconds
    self._stop = Event()
    self._terminated = Event()
    self._queue = queue
    self._current_item_lock = Lock()
    self.current_queue_item = None

    # Add the various operations.
    self.add_operation(self.poll_queue, self._poll_period_seconds)
    self.add_operation(self.update_queue_metrics, 60)
    self.add_operation(self.run_watchdog, self._watchdog_period_seconds)

  def process_queue_item(self, job_details):
    """ Return True if complete, False if it should be retried. """
    raise NotImplementedError('Workers must implement process_queue_item.')

  def watchdog(self):
    """ Function that gets run once every watchdog_period_seconds. """
    pass

  def _close_db_handle(self):
    if not db.is_closed():
      logger.debug('Disconnecting from database.')
      db.close()

  def extend_processing(self, seconds_from_now):
    with self._current_item_lock:
      if self.current_queue_item is not None:
        self._queue.extend_processing(self.current_queue_item, seconds_from_now)

  def run_watchdog(self):
    logger.debug('Running watchdog.')
    try:
      self.watchdog()
    except WorkerUnhealthyException as exc:
      logger.error('The worker has encountered an error via watchdog and will not take new jobs')
      logger.error(exc.message)
      self.mark_current_incomplete(restore_retry=True)
      self._stop.set()

  def poll_queue(self):
    logger.debug('Getting work item from queue.')

    with self._current_item_lock:
      self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)

    while True:
      # Retrieve the current item in the queue over which to operate. We do so under
      # a lock to make sure we are always retrieving an item when in a healthy state.
      current_queue_item = None
      with self._current_item_lock:
        current_queue_item = self.current_queue_item
        if current_queue_item is None:
          # Close the db handle.
          self._close_db_handle()
          break

      logger.debug('Queue gave us some work: %s', current_queue_item.body)
      job_details = json.loads(current_queue_item.body)

      try:
        self.process_queue_item(job_details)
        self.mark_current_complete()

      except JobException as jex:
        logger.warning('An error occurred processing request: %s', current_queue_item.body)
        logger.warning('Job exception: %s', jex)
        self.mark_current_incomplete(restore_retry=False)

      except WorkerUnhealthyException as exc:
        logger.error('The worker has encountered an error via the job and will not take new jobs')
        logger.error(exc.message)
        self.mark_current_incomplete(restore_retry=True)
        self._stop.set()

      finally:
        # Close the db handle.
        self._close_db_handle()

      if not self._stop.is_set():
        with self._current_item_lock:
          self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)

    if not self._stop.is_set():
      logger.debug('No more work.')

  def update_queue_metrics(self):
    self._queue.update_metrics()

  def mark_current_incomplete(self, restore_retry=False):
    with self._current_item_lock:
      if self.current_queue_item is not None:
        self._queue.incomplete(self.current_queue_item, restore_retry=restore_retry,
                               retry_after=self._retry_after_seconds)
        self.current_queue_item = None

  def mark_current_complete(self):
    with self._current_item_lock:
      if self.current_queue_item is not None:
        self._queue.complete(self.current_queue_item)
        self.current_queue_item = None

  def ungracefully_terminated(self):
    # Give back the retry that we took for this queue item so that if it were down to zero
    # retries it will still be picked up by another worker.
    self.mark_current_incomplete(restore_retry=True)
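
A consumer of this class overrides process_queue_item and hands the worker a queue to drain. The sketch below is illustrative only and not part of this commit: the worker name, the queue name, the job fields, and the start() entry point are assumptions based on how QueueWorker wires up its operations via add_operation.

# Illustrative QueueWorker subclass. ExampleWorker, 'examplequeue', and
# 'item_id' are hypothetical names, not part of this commit.
import logging

from data.queue import WorkQueue
from workers.queueworker import QueueWorker, JobException

logger = logging.getLogger(__name__)


class ExampleWorker(QueueWorker):
  def process_queue_item(self, job_details):
    # job_details is the JSON-decoded body of the queue item.
    if 'item_id' not in job_details:
      # Malformed job: raising JobException terminates the job without
      # returning the retry to the queue.
      raise JobException('Job body is missing item_id')

    # A long-running job can extend its reservation so the item is not
    # handed out to another worker while it is still being processed.
    self.extend_processing(600)
    logger.debug('Processed item %s', job_details['item_id'])


if __name__ == '__main__':
  logging.basicConfig(level=logging.DEBUG)
  worker = ExampleWorker(WorkQueue('examplequeue'), poll_period_seconds=15)
  worker.start()  # assumes the Worker base class exposes a start() loop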