Switch to a unified worker system
- Handles logging - Handles reporting to Sentry - Removes old code around serving a web endpoint (unused now)
This commit is contained in:
parent
dbd9a32c85
commit
ac0cca2d90
7 changed files with 264 additions and 268 deletions
|
@ -1,6 +1,5 @@
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from apscheduler.schedulers.blocking import BlockingScheduler
|
|
||||||
from peewee import fn
|
from peewee import fn
|
||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
from gzip import GzipFile
|
from gzip import GzipFile
|
||||||
|
@ -10,47 +9,51 @@ from data.archivedlogs import JSON_MIMETYPE
|
||||||
from data.database import RepositoryBuild, db_random_func
|
from data.database import RepositoryBuild, db_random_func
|
||||||
from app import build_logs, log_archive
|
from app import build_logs, log_archive
|
||||||
from util.streamingjsonencoder import StreamingJSONEncoder
|
from util.streamingjsonencoder import StreamingJSONEncoder
|
||||||
|
from workers.worker import Worker
|
||||||
|
|
||||||
POLL_PERIOD_SECONDS = 30
|
POLL_PERIOD_SECONDS = 30
|
||||||
MEMORY_TEMPFILE_SIZE = 64 * 1024 # Large enough to handle approximately 99% of builds in memory
|
MEMORY_TEMPFILE_SIZE = 64 * 1024 # Large enough to handle approximately 99% of builds in memory
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
sched = BlockingScheduler()
|
|
||||||
|
|
||||||
@sched.scheduled_job(trigger='interval', seconds=30)
|
class ArchiveBuildLogsWorker(Worker):
|
||||||
def archive_redis_buildlogs():
|
def __init__(self):
|
||||||
""" Archive a single build, choosing a candidate at random. This process must be idempotent to
|
super(ArchiveBuildLogsWorker, self).__init__()
|
||||||
avoid needing two-phase commit. """
|
self.add_operation(self._archive_redis_buildlogs, POLL_PERIOD_SECONDS)
|
||||||
try:
|
|
||||||
# Get a random build to archive
|
|
||||||
to_archive = model.build.archivable_buildlogs_query().order_by(db_random_func()).get()
|
|
||||||
logger.debug('Archiving: %s', to_archive.uuid)
|
|
||||||
|
|
||||||
length, entries = build_logs.get_log_entries(to_archive.uuid, 0)
|
def _archive_redis_buildlogs(self):
|
||||||
to_encode = {
|
""" Archive a single build, choosing a candidate at random. This process must be idempotent to
|
||||||
'start': 0,
|
avoid needing two-phase commit. """
|
||||||
'total': length,
|
try:
|
||||||
'logs': entries,
|
# Get a random build to archive
|
||||||
}
|
to_archive = model.build.archivable_buildlogs_query().order_by(db_random_func()).get()
|
||||||
|
logger.debug('Archiving: %s', to_archive.uuid)
|
||||||
|
|
||||||
with SpooledTemporaryFile(MEMORY_TEMPFILE_SIZE) as tempfile:
|
length, entries = build_logs.get_log_entries(to_archive.uuid, 0)
|
||||||
with GzipFile('testarchive', fileobj=tempfile) as zipstream:
|
to_encode = {
|
||||||
for chunk in StreamingJSONEncoder().iterencode(to_encode):
|
'start': 0,
|
||||||
zipstream.write(chunk)
|
'total': length,
|
||||||
|
'logs': entries,
|
||||||
|
}
|
||||||
|
|
||||||
tempfile.seek(0)
|
with SpooledTemporaryFile(MEMORY_TEMPFILE_SIZE) as tempfile:
|
||||||
log_archive.store_file(tempfile, JSON_MIMETYPE, content_encoding='gzip',
|
with GzipFile('testarchive', fileobj=tempfile) as zipstream:
|
||||||
file_id=to_archive.uuid)
|
for chunk in StreamingJSONEncoder().iterencode(to_encode):
|
||||||
|
zipstream.write(chunk)
|
||||||
|
|
||||||
to_archive.logs_archived = True
|
tempfile.seek(0)
|
||||||
to_archive.save()
|
log_archive.store_file(tempfile, JSON_MIMETYPE, content_encoding='gzip',
|
||||||
|
file_id=to_archive.uuid)
|
||||||
|
|
||||||
build_logs.expire_log_entries(to_archive.uuid)
|
to_archive.logs_archived = True
|
||||||
|
to_archive.save()
|
||||||
|
|
||||||
except RepositoryBuild.DoesNotExist:
|
build_logs.expire_log_entries(to_archive.uuid)
|
||||||
logger.debug('No more builds to archive')
|
|
||||||
|
except RepositoryBuild.DoesNotExist:
|
||||||
|
logger.debug('No more builds to archive')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
worker = ArchiveBuildLogsWorker()
|
||||||
sched.start()
|
worker.start()
|
||||||
|
|
|
@ -3,22 +3,17 @@ import logging
|
||||||
from app import image_diff_queue
|
from app import image_diff_queue
|
||||||
from data import model
|
from data import model
|
||||||
from endpoints.v1.registry import process_image_changes
|
from endpoints.v1.registry import process_image_changes
|
||||||
from workers.worker import Worker
|
from workers.queueworker import QueueWorker
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class DiffsWorker(Worker):
|
class DiffsWorker(QueueWorker):
|
||||||
def process_queue_item(self, job_details):
|
def process_queue_item(self, job_details):
|
||||||
image_id = job_details['image_id']
|
image_id = job_details['image_id']
|
||||||
repository = job_details['repository']
|
repository = job_details['repository']
|
||||||
|
namespace = model.user.get_namespace_by_user_id(job_details['namespace_user_id'])
|
||||||
# TODO switch to the namespace_user_id branch only once exisiting jobs have all gone through
|
|
||||||
if 'namespace_user_id' in job_details:
|
|
||||||
namespace = model.get_namespace_by_user_id(job_details['namespace_user_id'])
|
|
||||||
else:
|
|
||||||
namespace = job_details['namespace']
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
process_image_changes(namespace, repository, image_id)
|
process_image_changes(namespace, repository, image_id)
|
||||||
|
@ -32,7 +27,5 @@ class DiffsWorker(Worker):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logging.config.fileConfig('conf/logging.conf', disable_existing_loggers=False)
|
|
||||||
|
|
||||||
worker = DiffsWorker(image_diff_queue)
|
worker = DiffsWorker(image_diff_queue)
|
||||||
worker.start()
|
worker.start()
|
||||||
|
|
|
@ -1,30 +1,30 @@
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from apscheduler.schedulers.blocking import BlockingScheduler
|
|
||||||
|
|
||||||
from app import app
|
from app import app
|
||||||
from data.database import UseThenDisconnect
|
from data.database import UseThenDisconnect
|
||||||
from data.model.repository import find_repository_with_garbage, garbage_collect_repo
|
from data.model.repository import find_repository_with_garbage, garbage_collect_repo
|
||||||
|
from workers.worker import Worker
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
sched = BlockingScheduler()
|
|
||||||
|
|
||||||
@sched.scheduled_job(trigger='interval', seconds=10)
|
class GarbageCollectionWorker(Worker):
|
||||||
def garbage_collect_repositories():
|
def __init__(self):
|
||||||
""" Performs garbage collection on repositories. """
|
super(GarbageCollectionWorker, self).__init__()
|
||||||
|
self.add_operation(self._garbage_collection_repos, 10)
|
||||||
|
|
||||||
with UseThenDisconnect(app.config):
|
def _garbage_collection_repos(self):
|
||||||
repository = find_repository_with_garbage(app.config.get('EXP_ASYNC_GARBAGE_COLLECTION', []))
|
""" Performs garbage collection on repositories. """
|
||||||
if repository is None:
|
with UseThenDisconnect(app.config):
|
||||||
logger.debug('No repository with garbage found')
|
repository = find_repository_with_garbage(app.config.get('EXP_ASYNC_GARBAGE_COLLECTION', []))
|
||||||
return False
|
if repository is None:
|
||||||
|
logger.debug('No repository with garbage found')
|
||||||
logger.debug('Starting GC of repository #%s (%s)', repository.id, repository.name)
|
return
|
||||||
garbage_collect_repo(repository)
|
|
||||||
logger.debug('Finished GC of repository #%s (%s)', repository.id, repository.name)
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
logger.debug('Starting GC of repository #%s (%s)', repository.id, repository.name)
|
||||||
|
garbage_collect_repo(repository)
|
||||||
|
logger.debug('Finished GC of repository #%s (%s)', repository.id, repository.name)
|
||||||
|
return
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
worker = GarbageCollectionWorker()
|
||||||
sched.start()
|
worker.start()
|
||||||
|
|
|
@ -1,19 +1,16 @@
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from app import notification_queue
|
from app import notification_queue
|
||||||
from workers.worker import Worker
|
|
||||||
|
|
||||||
from endpoints.notificationmethod import NotificationMethod, InvalidNotificationMethodException
|
from endpoints.notificationmethod import NotificationMethod, InvalidNotificationMethodException
|
||||||
from endpoints.notificationevent import NotificationEvent, InvalidNotificationEventException
|
from endpoints.notificationevent import NotificationEvent, InvalidNotificationEventException
|
||||||
from workers.worker import JobException
|
from workers.queueworker import QueueWorker, JobException
|
||||||
|
|
||||||
from data import model
|
from data import model
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class NotificationWorker(QueueWorker):
|
||||||
class NotificationWorker(Worker):
|
|
||||||
def process_queue_item(self, job_details):
|
def process_queue_item(self, job_details):
|
||||||
notification_uuid = job_details['notification_uuid']
|
notification_uuid = job_details['notification_uuid']
|
||||||
|
|
||||||
|
@ -39,8 +36,6 @@ class NotificationWorker(Worker):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logging.config.fileConfig('conf/logging.conf', disable_existing_loggers=False)
|
|
||||||
|
|
||||||
worker = NotificationWorker(notification_queue, poll_period_seconds=10, reservation_seconds=30,
|
worker = NotificationWorker(notification_queue, poll_period_seconds=10, reservation_seconds=30,
|
||||||
retry_after_seconds=30)
|
retry_after_seconds=30)
|
||||||
worker.start()
|
worker.start()
|
||||||
|
|
144
workers/queueworker.py
Normal file
144
workers/queueworker.py
Normal file
|
@ -0,0 +1,144 @@
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from threading import Event, Lock
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from threading import Thread
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
|
from data.model import db
|
||||||
|
from data.queue import WorkQueue
|
||||||
|
|
||||||
|
from workers.worker import Worker
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class JobException(Exception):
|
||||||
|
""" A job exception is an exception that is caused by something being malformed in the job. When
|
||||||
|
a worker raises this exception the job will be terminated and the retry will not be returned
|
||||||
|
to the queue. """
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class WorkerUnhealthyException(Exception):
|
||||||
|
""" When this exception is raised, the worker is no longer healthy and will not accept any more
|
||||||
|
work. When this is raised while processing a queue item, the item should be returned to the
|
||||||
|
queue along with another retry. """
|
||||||
|
pass
|
||||||
|
|
||||||
|
class QueueWorker(Worker):
|
||||||
|
def __init__(self, queue, poll_period_seconds=30, reservation_seconds=300,
|
||||||
|
watchdog_period_seconds=60, retry_after_seconds=300):
|
||||||
|
super(QueueWorker, self).__init__()
|
||||||
|
|
||||||
|
self._poll_period_seconds = poll_period_seconds
|
||||||
|
self._reservation_seconds = reservation_seconds
|
||||||
|
self._watchdog_period_seconds = watchdog_period_seconds
|
||||||
|
self._retry_after_seconds = retry_after_seconds
|
||||||
|
self._stop = Event()
|
||||||
|
self._terminated = Event()
|
||||||
|
self._queue = queue
|
||||||
|
self._current_item_lock = Lock()
|
||||||
|
self.current_queue_item = None
|
||||||
|
|
||||||
|
# Add the various operations.
|
||||||
|
self.add_operation(self.poll_queue, self._poll_period_seconds)
|
||||||
|
self.add_operation(self.update_queue_metrics, 60)
|
||||||
|
self.add_operation(self.run_watchdog, self._watchdog_period_seconds)
|
||||||
|
|
||||||
|
def process_queue_item(self, job_details):
|
||||||
|
""" Return True if complete, False if it should be retried. """
|
||||||
|
raise NotImplementedError('Workers must implement run.')
|
||||||
|
|
||||||
|
def watchdog(self):
|
||||||
|
""" Function that gets run once every watchdog_period_seconds. """
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _close_db_handle(self):
|
||||||
|
if not db.is_closed():
|
||||||
|
logger.debug('Disconnecting from database.')
|
||||||
|
db.close()
|
||||||
|
|
||||||
|
def extend_processing(self, seconds_from_now):
|
||||||
|
with self._current_item_lock:
|
||||||
|
if self.current_queue_item is not None:
|
||||||
|
self._queue.extend_processing(self.current_queue_item, seconds_from_now)
|
||||||
|
|
||||||
|
def run_watchdog(self):
|
||||||
|
logger.debug('Running watchdog.')
|
||||||
|
try:
|
||||||
|
self.watchdog()
|
||||||
|
except WorkerUnhealthyException as exc:
|
||||||
|
logger.error('The worker has encountered an error via watchdog and will not take new jobs')
|
||||||
|
logger.error(exc.message)
|
||||||
|
self.mark_current_incomplete(restore_retry=True)
|
||||||
|
self._stop.set()
|
||||||
|
|
||||||
|
def poll_queue(self):
|
||||||
|
logger.debug('Getting work item from queue.')
|
||||||
|
|
||||||
|
with self._current_item_lock:
|
||||||
|
self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
# Retrieve the current item in the queue over which to operate. We do so under
|
||||||
|
# a lock to make sure we are always retrieving an item when in a healthy state.
|
||||||
|
current_queue_item = None
|
||||||
|
with self._current_item_lock:
|
||||||
|
current_queue_item = self.current_queue_item
|
||||||
|
if current_queue_item is None:
|
||||||
|
# Close the db handle.
|
||||||
|
self._close_db_handle()
|
||||||
|
break
|
||||||
|
|
||||||
|
logger.debug('Queue gave us some work: %s', current_queue_item.body)
|
||||||
|
job_details = json.loads(current_queue_item.body)
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.process_queue_item(job_details)
|
||||||
|
self.mark_current_complete()
|
||||||
|
|
||||||
|
except JobException as jex:
|
||||||
|
logger.warning('An error occurred processing request: %s', current_queue_item.body)
|
||||||
|
logger.warning('Job exception: %s' % jex)
|
||||||
|
self.mark_current_incomplete(restore_retry=False)
|
||||||
|
|
||||||
|
except WorkerUnhealthyException as exc:
|
||||||
|
logger.error('The worker has encountered an error via the job and will not take new jobs')
|
||||||
|
logger.error(exc.message)
|
||||||
|
self.mark_current_incomplete(restore_retry=True)
|
||||||
|
self._stop.set()
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# Close the db handle.
|
||||||
|
self._close_db_handle()
|
||||||
|
|
||||||
|
if not self._stop.is_set():
|
||||||
|
with self._current_item_lock:
|
||||||
|
self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)
|
||||||
|
|
||||||
|
if not self._stop.is_set():
|
||||||
|
logger.debug('No more work.')
|
||||||
|
|
||||||
|
def update_queue_metrics(self):
|
||||||
|
self._queue.update_metrics()
|
||||||
|
|
||||||
|
def mark_current_incomplete(self, restore_retry=False):
|
||||||
|
with self._current_item_lock:
|
||||||
|
if self.current_queue_item is not None:
|
||||||
|
self._queue.incomplete(self.current_queue_item, restore_retry=restore_retry,
|
||||||
|
retry_after=self._retry_after_seconds)
|
||||||
|
self.current_queue_item = None
|
||||||
|
|
||||||
|
def mark_current_complete(self):
|
||||||
|
with self._current_item_lock:
|
||||||
|
if self.current_queue_item is not None:
|
||||||
|
self._queue.complete(self.current_queue_item)
|
||||||
|
self.current_queue_item = None
|
||||||
|
|
||||||
|
def ungracefully_terminated(self):
|
||||||
|
# Give back the retry that we took for this queue item so that if it were down to zero
|
||||||
|
# retries it will still be picked up by another worker
|
||||||
|
self.mark_current_incomplete()
|
|
@ -1,54 +1,51 @@
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from apscheduler.schedulers.blocking import BlockingScheduler
|
|
||||||
|
|
||||||
from app import app
|
from app import app
|
||||||
from data.database import (Repository, LogEntry, RepositoryActionCount, db_random_func, fn,
|
from data.database import (Repository, LogEntry, RepositoryActionCount, db_random_func, fn,
|
||||||
UseThenDisconnect)
|
UseThenDisconnect)
|
||||||
from datetime import date, datetime, timedelta
|
from datetime import date, datetime, timedelta
|
||||||
|
from workers.worker import Worker
|
||||||
|
|
||||||
POLL_PERIOD_SECONDS = 30
|
POLL_PERIOD_SECONDS = 10
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
sched = BlockingScheduler()
|
|
||||||
|
|
||||||
@sched.scheduled_job(trigger='interval', seconds=10)
|
class RepositoryActionCountWorker(Worker):
|
||||||
def count_repository_actions():
|
def __init__(self):
|
||||||
""" Counts actions for a random repository for the previous day. """
|
super(RepositoryActionCountWorker, self).__init__()
|
||||||
|
self.add_operation(self._count_repository_actions, POLL_PERIOD_SECONDS)
|
||||||
|
|
||||||
with UseThenDisconnect(app.config):
|
def _count_repository_actions(self):
|
||||||
try:
|
""" Counts actions for a random repository for the previous day. """
|
||||||
# Get a random repository to count.
|
|
||||||
today = date.today()
|
|
||||||
yesterday = today - timedelta(days=1)
|
|
||||||
has_yesterday_actions = (RepositoryActionCount.select(RepositoryActionCount.repository)
|
|
||||||
.where(RepositoryActionCount.date == yesterday))
|
|
||||||
|
|
||||||
to_count = (Repository.select()
|
with UseThenDisconnect(app.config):
|
||||||
.where(~(Repository.id << (has_yesterday_actions)))
|
|
||||||
.order_by(db_random_func()).get())
|
|
||||||
|
|
||||||
logger.debug('Counting: %s', to_count.id)
|
|
||||||
|
|
||||||
actions = (LogEntry.select()
|
|
||||||
.where(LogEntry.repository == to_count,
|
|
||||||
LogEntry.datetime >= yesterday,
|
|
||||||
LogEntry.datetime < today)
|
|
||||||
.count())
|
|
||||||
|
|
||||||
# Create the row.
|
|
||||||
try:
|
try:
|
||||||
RepositoryActionCount.create(repository=to_count, date=yesterday, count=actions)
|
# Get a random repository to count.
|
||||||
except:
|
today = date.today()
|
||||||
logger.exception('Exception when writing count')
|
yesterday = today - timedelta(days=1)
|
||||||
|
has_yesterday_actions = (RepositoryActionCount.select(RepositoryActionCount.repository)
|
||||||
|
.where(RepositoryActionCount.date == yesterday))
|
||||||
|
|
||||||
return True
|
to_count = (Repository.select()
|
||||||
|
.where(~(Repository.id << (has_yesterday_actions)))
|
||||||
|
.order_by(db_random_func()).get())
|
||||||
|
|
||||||
except Repository.DoesNotExist:
|
logger.debug('Counting: %s', to_count.id)
|
||||||
logger.debug('No further repositories to count')
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
actions = (LogEntry.select()
|
||||||
|
.where(LogEntry.repository == to_count,
|
||||||
|
LogEntry.datetime >= yesterday,
|
||||||
|
LogEntry.datetime < today)
|
||||||
|
.count())
|
||||||
|
|
||||||
|
# Create the row.
|
||||||
|
try:
|
||||||
|
RepositoryActionCount.create(repository=to_count, date=yesterday, count=actions)
|
||||||
|
except:
|
||||||
|
logger.exception('Exception when writing count')
|
||||||
|
except Repository.DoesNotExist:
|
||||||
|
logger.debug('No further repositories to count')
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
worker = RepositoryActionCountWorker()
|
||||||
sched.start()
|
worker.start()
|
||||||
|
|
|
@ -1,93 +1,33 @@
|
||||||
import logging
|
import logging
|
||||||
import json
|
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
|
import socket
|
||||||
|
|
||||||
from threading import Event, Lock
|
from threading import Event
|
||||||
from apscheduler.schedulers.background import BackgroundScheduler
|
from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
|
|
||||||
from threading import Thread
|
from threading import Thread
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
from raven import Client
|
||||||
|
|
||||||
|
from app import app
|
||||||
from data.model import db
|
from data.model import db
|
||||||
from data.queue import WorkQueue
|
from functools import wraps
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class JobException(Exception):
|
|
||||||
""" A job exception is an exception that is caused by something being malformed in the job. When
|
|
||||||
a worker raises this exception the job will be terminated and the retry will not be returned
|
|
||||||
to the queue. """
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class WorkerUnhealthyException(Exception):
|
|
||||||
""" When this exception is raised, the worker is no longer healthy and will not accept any more
|
|
||||||
work. When this is raised while processing a queue item, the item should be returned to the
|
|
||||||
queue along with another retry. """
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class WorkerStatusServer(HTTPServer):
|
|
||||||
def __init__(self, worker, *args, **kwargs):
|
|
||||||
HTTPServer.__init__(self, *args, **kwargs)
|
|
||||||
self.worker = worker
|
|
||||||
|
|
||||||
|
|
||||||
class WorkerStatusHandler(BaseHTTPRequestHandler):
|
|
||||||
def do_GET(self):
|
|
||||||
if self.path == '/status':
|
|
||||||
# Return the worker status
|
|
||||||
code = 200 if self.server.worker.is_healthy() else 503
|
|
||||||
self.send_response(code)
|
|
||||||
self.send_header('Content-Type', 'text/plain')
|
|
||||||
self.end_headers()
|
|
||||||
self.wfile.write('OK')
|
|
||||||
elif self.path == '/terminate':
|
|
||||||
# Return whether it is safe to terminate the worker process
|
|
||||||
code = 200 if self.server.worker.is_terminated() else 503
|
|
||||||
self.send_response(code)
|
|
||||||
else:
|
|
||||||
self.send_error(404)
|
|
||||||
|
|
||||||
def do_POST(self):
|
|
||||||
if self.path == '/terminate':
|
|
||||||
try:
|
|
||||||
self.server.worker.join()
|
|
||||||
self.send_response(200)
|
|
||||||
except:
|
|
||||||
self.send_response(500)
|
|
||||||
else:
|
|
||||||
self.send_error(404)
|
|
||||||
|
|
||||||
|
|
||||||
class Worker(object):
|
class Worker(object):
|
||||||
def __init__(self, queue, poll_period_seconds=30, reservation_seconds=300,
|
""" Base class for workers which perform some work periodically. """
|
||||||
watchdog_period_seconds=60, retry_after_seconds=300):
|
def __init__(self):
|
||||||
self._sched = BackgroundScheduler()
|
self._sched = BackgroundScheduler()
|
||||||
self._poll_period_seconds = poll_period_seconds
|
self._operations = []
|
||||||
self._reservation_seconds = reservation_seconds
|
|
||||||
self._watchdog_period_seconds = watchdog_period_seconds
|
|
||||||
self._retry_after_seconds = retry_after_seconds
|
|
||||||
self._stop = Event()
|
self._stop = Event()
|
||||||
self._terminated = Event()
|
self._terminated = Event()
|
||||||
self._queue = queue
|
|
||||||
self._current_item_lock = Lock()
|
|
||||||
self.current_queue_item = None
|
|
||||||
|
|
||||||
def process_queue_item(self, job_details):
|
if app.config.get('EXCEPTION_LOG_TYPE', 'FakeSentry') == 'Sentry':
|
||||||
""" Return True if complete, False if it should be retried. """
|
worker_name = '%s:worker-%s' % (socket.gethostname(), self.__class__.__name__)
|
||||||
raise NotImplementedError('Workers must implement run.')
|
self._raven_client = Client(app.config.get('SENTRY_DSN', ''), name=worker_name)
|
||||||
|
|
||||||
def watchdog(self):
|
|
||||||
""" Function that gets run once every watchdog_period_seconds. """
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _close_db_handle(self):
|
|
||||||
if not db.is_closed():
|
|
||||||
logger.debug('Disconnecting from database.')
|
|
||||||
db.close()
|
|
||||||
|
|
||||||
def is_healthy(self):
|
def is_healthy(self):
|
||||||
return not self._stop.is_set()
|
return not self._stop.is_set()
|
||||||
|
@ -95,90 +35,33 @@ class Worker(object):
|
||||||
def is_terminated(self):
|
def is_terminated(self):
|
||||||
return self._terminated.is_set()
|
return self._terminated.is_set()
|
||||||
|
|
||||||
def extend_processing(self, seconds_from_now):
|
def ungracefully_terminated(self):
|
||||||
with self._current_item_lock:
|
""" Method called when the worker has been terminated in an ungraceful fashion. """
|
||||||
if self.current_queue_item is not None:
|
pass
|
||||||
self._queue.extend_processing(self.current_queue_item, seconds_from_now)
|
|
||||||
|
|
||||||
def run_watchdog(self):
|
|
||||||
logger.debug('Running watchdog.')
|
|
||||||
try:
|
|
||||||
self.watchdog()
|
|
||||||
except WorkerUnhealthyException as exc:
|
|
||||||
logger.error('The worker has encountered an error via watchdog and will not take new jobs')
|
|
||||||
logger.error(exc.message)
|
|
||||||
self.mark_current_incomplete(restore_retry=True)
|
|
||||||
self._stop.set()
|
|
||||||
|
|
||||||
def poll_queue(self):
|
|
||||||
logger.debug('Getting work item from queue.')
|
|
||||||
|
|
||||||
with self._current_item_lock:
|
|
||||||
self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)
|
|
||||||
|
|
||||||
while True:
|
|
||||||
# Retrieve the current item in the queue over which to operate. We do so under
|
|
||||||
# a lock to make sure we are always retrieving an item when in a healthy state.
|
|
||||||
current_queue_item = None
|
|
||||||
with self._current_item_lock:
|
|
||||||
current_queue_item = self.current_queue_item
|
|
||||||
if current_queue_item is None:
|
|
||||||
# Close the db handle.
|
|
||||||
self._close_db_handle()
|
|
||||||
break
|
|
||||||
|
|
||||||
logger.debug('Queue gave us some work: %s', current_queue_item.body)
|
|
||||||
job_details = json.loads(current_queue_item.body)
|
|
||||||
|
|
||||||
|
def add_operation(self, operation_func, operation_sec):
|
||||||
|
@wraps(operation_func)
|
||||||
|
def _operation_func():
|
||||||
try:
|
try:
|
||||||
self.process_queue_item(job_details)
|
return operation_func()
|
||||||
self.mark_current_complete()
|
except Exception:
|
||||||
|
logger.exception('Operation raised exception')
|
||||||
|
if self._raven_client:
|
||||||
|
logger.debug('Logging exception to Sentry')
|
||||||
|
self._raven_client.captureException()
|
||||||
|
|
||||||
except JobException as jex:
|
self._operations.append((_operation_func, operation_sec))
|
||||||
logger.warning('An error occurred processing request: %s', current_queue_item.body)
|
|
||||||
logger.warning('Job exception: %s' % jex)
|
|
||||||
self.mark_current_incomplete(restore_retry=False)
|
|
||||||
|
|
||||||
except WorkerUnhealthyException as exc:
|
def start(self):
|
||||||
logger.error('The worker has encountered an error via the job and will not take new jobs')
|
logging.config.fileConfig('conf/logging.conf', disable_existing_loggers=False)
|
||||||
logger.error(exc.message)
|
logger.debug('Scheduling worker.')
|
||||||
self.mark_current_incomplete(restore_retry=True)
|
|
||||||
self._stop.set()
|
|
||||||
|
|
||||||
finally:
|
|
||||||
# Close the db handle.
|
|
||||||
self._close_db_handle()
|
|
||||||
|
|
||||||
if not self._stop.is_set():
|
|
||||||
with self._current_item_lock:
|
|
||||||
self.current_queue_item = self._queue.get(processing_time=self._reservation_seconds)
|
|
||||||
|
|
||||||
if not self._stop.is_set():
|
|
||||||
logger.debug('No more work.')
|
|
||||||
|
|
||||||
def update_queue_metrics(self):
|
|
||||||
self._queue.update_metrics()
|
|
||||||
|
|
||||||
def start(self, start_status_server_port=None):
|
|
||||||
if start_status_server_port is not None:
|
|
||||||
# Start a status server on a thread
|
|
||||||
server_address = ('', start_status_server_port)
|
|
||||||
httpd = WorkerStatusServer(self, server_address, WorkerStatusHandler)
|
|
||||||
server_thread = Thread(target=httpd.serve_forever)
|
|
||||||
server_thread.daemon = True
|
|
||||||
server_thread.start()
|
|
||||||
|
|
||||||
logger.debug("Scheduling worker.")
|
|
||||||
|
|
||||||
soon = datetime.now() + timedelta(seconds=.001)
|
soon = datetime.now() + timedelta(seconds=.001)
|
||||||
|
|
||||||
self._sched.start()
|
self._sched.start()
|
||||||
self._sched.add_job(self.poll_queue, 'interval', seconds=self._poll_period_seconds,
|
for operation_func, operation_sec in self._operations:
|
||||||
start_date=soon, max_instances=1)
|
self._sched.add_job(operation_func, 'interval', seconds=operation_sec,
|
||||||
self._sched.add_job(self.update_queue_metrics, 'interval', seconds=60, start_date=soon,
|
start_date=soon, max_instances=1)
|
||||||
max_instances=1)
|
|
||||||
self._sched.add_job(self.run_watchdog, 'interval', seconds=self._watchdog_period_seconds,
|
|
||||||
max_instances=1)
|
|
||||||
|
|
||||||
signal.signal(signal.SIGTERM, self.terminate)
|
signal.signal(signal.SIGTERM, self.terminate)
|
||||||
signal.signal(signal.SIGINT, self.terminate)
|
signal.signal(signal.SIGINT, self.terminate)
|
||||||
|
@ -192,23 +75,6 @@ class Worker(object):
|
||||||
|
|
||||||
self._terminated.set()
|
self._terminated.set()
|
||||||
|
|
||||||
# Wait forever if we're running a server
|
|
||||||
while start_status_server_port is not None:
|
|
||||||
sleep(60)
|
|
||||||
|
|
||||||
def mark_current_incomplete(self, restore_retry=False):
|
|
||||||
with self._current_item_lock:
|
|
||||||
if self.current_queue_item is not None:
|
|
||||||
self._queue.incomplete(self.current_queue_item, restore_retry=restore_retry,
|
|
||||||
retry_after=self._retry_after_seconds)
|
|
||||||
self.current_queue_item = None
|
|
||||||
|
|
||||||
def mark_current_complete(self):
|
|
||||||
with self._current_item_lock:
|
|
||||||
if self.current_queue_item is not None:
|
|
||||||
self._queue.complete(self.current_queue_item)
|
|
||||||
self.current_queue_item = None
|
|
||||||
|
|
||||||
def terminate(self, signal_num=None, stack_frame=None, graceful=False):
|
def terminate(self, signal_num=None, stack_frame=None, graceful=False):
|
||||||
if self._terminated.is_set():
|
if self._terminated.is_set():
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
@ -218,9 +84,7 @@ class Worker(object):
|
||||||
self._stop.set()
|
self._stop.set()
|
||||||
|
|
||||||
if not graceful:
|
if not graceful:
|
||||||
# Give back the retry that we took for this queue item so that if it were down to zero
|
self.ungracefully_terminated()
|
||||||
# retries it will still be picked up by another worker
|
|
||||||
self.mark_current_incomplete()
|
|
||||||
|
|
||||||
def join(self):
|
def join(self):
|
||||||
self.terminate(graceful=True)
|
self.terminate(graceful=True)
|
||||||
|
|
Reference in a new issue