From 72fdf93d29e397c4b4a3727bf80bd85dd6dbe71e Mon Sep 17 00:00:00 2001
From: Joseph Schorr
Date: Wed, 2 Nov 2016 14:06:07 -0400
Subject: [PATCH] Add monit-based monitoring of build manager

Should catch when the build manager freezes and restart it.

monit polls the build manager on localhost:8787 every 10 seconds and
restarts the runit service after three consecutive failed checks.

The run script now execs the builder through env(1): `exec` takes its
first word as the command name, so `exec VAR=value cmd` fails with
"VAR=value: not found" instead of setting the variable.
---
Notes (ignored by git am):
  * Line breaks and context-line indentation were reconstructed from a
    whitespace-collapsed copy of this patch. If the Dockerfile context
    does not match, apply with `git apply --ignore-whitespace`.
  * The post-image blob hash on the buildmanager/run index line predates
    the `exec env` correction and no longer matches the new content.

 Dockerfile                         |  6 +++++-
 conf/init/service/buildmanager/run |  6 +++++-
 conf/monitrc                       | 17 +++++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 conf/monitrc

diff --git a/Dockerfile b/Dockerfile
index 8df4cd357..94bd55d3d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,7 +6,7 @@ ENV DEBIAN_FRONTEND noninteractive
 ENV HOME /root
 
 # Install system packages
-RUN apt-get update # 07SEP2016
+RUN apt-get update # 02NOV2016
 RUN apt-get install -y \
     g++ \
     gdebi-core \
@@ -27,6 +27,7 @@ RUN apt-get install -y \
     libpq5 \
     libsasl2-dev \
     libsasl2-modules \
+    monit \
     nginx \
     nodejs \
     npm \
@@ -112,6 +113,9 @@ ADD conf/init/zz_boot.sh /etc/my_init.d/
 ADD conf/init/service/ /etc/service/
 RUN rm -rf /etc/service/syslog-forwarder
 
+ADD conf/monitrc /etc/monit/monitrc
+RUN chmod 0600 /etc/monit/monitrc
+
 # remove after phusion/baseimage-docker#338 is fixed
 ADD conf/init/logrotate.conf /etc/logrotate.conf
 
diff --git a/conf/init/service/buildmanager/run b/conf/init/service/buildmanager/run
index b8ba199b2..3c753782a 100755
--- a/conf/init/service/buildmanager/run
+++ b/conf/init/service/buildmanager/run
@@ -2,7 +2,11 @@
 
 echo 'Starting internal build manager'
 
+# Run monit to ensure the build manager is restarted if/when it locks up.
+monit
+
+# Run the build manager. exec replaces this shell with the python process; env sets the variable because exec does not accept a VAR=value prefix.
 cd /
-TROLLIUSDEBUG=1 venv/bin/python -m buildman.builder 2>&1
+exec env TROLLIUSDEBUG=1 venv/bin/python -m buildman.builder 2>&1
 
 echo 'Internal build manager exited'
\ No newline at end of file
diff --git a/conf/monitrc b/conf/monitrc
new file mode 100644
index 000000000..7e3021067
--- /dev/null
+++ b/conf/monitrc
@@ -0,0 +1,17 @@
+set daemon 10 with start delay 30
+
+set httpd port 2812 and
+    use address localhost # only accept connection from localhost
+    allow localhost # allow localhost to connect to the server and
+    allow admin:monit # require user 'admin' with password 'monit'
+
+check host buildmanager with address localhost
+    start program = "/usr/bin/sv start /etc/service/buildmanager" with timeout 10 seconds
+    stop program = "/usr/bin/sv kill /etc/service/buildmanager"
+    if failed port 8787 protocol http
+        request "/"
+        status = 405
+        with timeout 3 seconds
+        for 3 cycles
+    then restart
+    if 10 restarts within 10 cycles then timeout