Fix worker count to use CPU affinity correctly and be properly bounded

We were using `cpu_count`, which doesn't respect container CPU affinity. Now we use `cpu_affinity`, and we also bound the count to make sure we don't start a million workers

Fixes https://jira.coreos.com/browse/QUAY-928
This commit is contained in:
Joseph Schorr 2018-05-03 11:57:20 +03:00
parent e80c56e441
commit b26a131085
7 changed files with 115 additions and 10 deletions

View file

@ -2,16 +2,16 @@ import sys
import os import os
sys.path.append(os.path.join(os.path.dirname(__file__), "../")) sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
import multiprocessing
import logging import logging
from Crypto import Random from Crypto import Random
from util.log import logfile_path from util.log import logfile_path
from util.workers import get_worker_count
logconfig = logfile_path(debug=True) logconfig = logfile_path(debug=True)
bind = '0.0.0.0:5000' bind = '0.0.0.0:5000'
workers = max(int(os.getenv("WORKER_COUNT_LOCAL", multiprocessing.cpu_count())), 2) workers = get_worker_count('local', 2, minimum=2, maximum=8)
worker_class = 'gevent' worker_class = 'gevent'
daemon = False daemon = False
pythonpath = '.' pythonpath = '.'

View file

@ -2,16 +2,16 @@ import sys
import os import os
sys.path.append(os.path.join(os.path.dirname(__file__), "../")) sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
import multiprocessing
import logging import logging
from Crypto import Random from Crypto import Random
from util.log import logfile_path from util.log import logfile_path
from util.workers import get_worker_count
logconfig = logfile_path(debug=False) logconfig = logfile_path(debug=False)
bind = 'unix:/tmp/gunicorn_registry.sock' bind = 'unix:/tmp/gunicorn_registry.sock'
workers = max(int(os.getenv("WORKER_COUNT_REGISTRY", multiprocessing.cpu_count() * 4)), 8) workers = get_worker_count('registry', 4, minimum=8, maximum=64)
worker_class = 'gevent' worker_class = 'gevent'
pythonpath = '.' pythonpath = '.'
preload_app = True preload_app = True

View file

@ -2,16 +2,16 @@ import sys
import os import os
sys.path.append(os.path.join(os.path.dirname(__file__), "../")) sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
import multiprocessing
import logging import logging
from Crypto import Random from Crypto import Random
from util.log import logfile_path from util.log import logfile_path
from util.workers import get_worker_count
logconfig = logfile_path(debug=False) logconfig = logfile_path(debug=False)
bind = 'unix:/tmp/gunicorn_secscan.sock' bind = 'unix:/tmp/gunicorn_secscan.sock'
workers = max(int(os.getenv("WORKER_COUNT_SECSCAN", multiprocessing.cpu_count())), 2) workers = get_worker_count('secscan', 2, minimum=2, maximum=4)
worker_class = 'gevent' worker_class = 'gevent'
pythonpath = '.' pythonpath = '.'
preload_app = True preload_app = True

View file

@ -2,16 +2,16 @@ import sys
import os import os
sys.path.append(os.path.join(os.path.dirname(__file__), "../")) sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
import multiprocessing
import logging import logging
from Crypto import Random from Crypto import Random
from util.log import logfile_path from util.log import logfile_path
from util.workers import get_worker_count
logconfig = logfile_path(debug=False) logconfig = logfile_path(debug=False)
bind = 'unix:/tmp/gunicorn_verbs.sock' bind = 'unix:/tmp/gunicorn_verbs.sock'
workers = max(int(os.getenv("WORKER_COUNT_VERBS", multiprocessing.cpu_count())), 2) workers = get_worker_count('verbs', 2, minimum=2, maximum=32)
pythonpath = '.' pythonpath = '.'
preload_app = True preload_app = True
timeout = 2000 # Because sync workers timeout = 2000 # Because sync workers

View file

@ -2,17 +2,17 @@ import sys
import os import os
sys.path.append(os.path.join(os.path.dirname(__file__), "../")) sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
import multiprocessing
import logging import logging
from Crypto import Random from Crypto import Random
from util.log import logfile_path from util.log import logfile_path
from util.workers import get_worker_count
logconfig = logfile_path(debug=False) logconfig = logfile_path(debug=False)
bind = 'unix:/tmp/gunicorn_web.sock' bind = 'unix:/tmp/gunicorn_web.sock'
workers = max(int(os.getenv("WORKER_COUNT_WEB", multiprocessing.cpu_count())), 2) workers = get_worker_count('web', 2, minimum=2, maximum=32)
worker_class = 'gevent' worker_class = 'gevent'
pythonpath = '.' pythonpath = '.'
preload_app = True preload_app = True

74
util/test/test_workers.py Normal file
View file

@ -0,0 +1,74 @@
from mock import patch
import pytest
from util.workers import get_worker_count
@pytest.mark.parametrize('kind_name,env_vars,cpu_affinity,multiplier,minimum,maximum,expected', [
  # No override and CPU affinity * multiplier is between min and max => cpu affinity * multiplier.
  ('registry', {}, [0, 1], 10, 8, 64, 20),

  # No override and CPU affinity * multiplier is below min => min.
  ('registry', {}, [0], 1, 8, 64, 8),

  # No override and CPU affinity * multiplier is above max => max.
  ('registry', {}, [0, 1, 2, 3], 20, 8, 64, 64),

  # Override based on specific env var.
  ('registry', {
    'WORKER_COUNT_REGISTRY': 12,
  }, [0, 1], 10, 8, 64, 12),

  # Override based on specific env var (ignores maximum).
  ('registry', {
    'WORKER_COUNT_REGISTRY': 120,
  }, [0, 1], 10, 8, 64, 120),

  # Override based on specific env var (respects minimum).
  ('registry', {
    'WORKER_COUNT_REGISTRY': 1,
  }, [0, 1], 10, 8, 64, 8),

  # Override based on generic env var.
  ('registry', {
    'WORKER_COUNT': 12,
  }, [0, 1], 10, 8, 64, 12),

  # Override based on generic env var (ignores maximum).
  ('registry', {
    'WORKER_COUNT': 120,
  }, [0, 1], 10, 8, 64, 120),

  # Override based on generic env var (respects minimum).
  ('registry', {
    'WORKER_COUNT': 1,
  }, [0, 1], 10, 8, 64, 8),

  # Override always uses specific first.
  ('registry', {
    'WORKER_COUNT_REGISTRY': 120,
    'WORKER_COUNT': 12,
  }, [0, 1], 10, 8, 64, 120),

  # Non-matching override.
  ('verbs', {
    'WORKER_COUNT_REGISTRY': 120,
  }, [0, 1], 10, 8, 64, 20),

  # Zero worker count (use defaults).
  ('verbs', {
    'WORKER_COUNT': 0,
  }, [0, 1], 10, 8, 64, 8),
])
def test_get_worker_count(kind_name, env_vars, cpu_affinity, multiplier, minimum, maximum,
                          expected):
  """ Checks get_worker_count() across env-var overrides and the CPU-affinity
      based calculation, without touching the real environment or psutil.
  """
  # Stand-in for psutil.Process that reports the parametrized CPU affinity
  # list instead of querying the real process.
  class FakeProcess(object):
    def __init__(self, pid):
      pass

    def cpu_affinity(self):
      return cpu_affinity

  # NOTE(review): the mocked environ supplies *int* values, but a real
  # os.environ.get always returns strings — so this test does not exercise
  # the string-to-int handling that production will hit. Consider using
  # string values (e.g. '12') here; verify get_worker_count coerces them.
  with patch('os.environ.get', env_vars.get):
    with patch('psutil.Process', FakeProcess):
      assert get_worker_count(kind_name, multiplier, minimum, maximum) == expected

31
util/workers.py Normal file
View file

@ -0,0 +1,31 @@
import os
import psutil
def get_worker_count(worker_kind_name, multiplier, minimum=None, maximum=None):
  """ Returns the number of gunicorn workers to run for the given worker kind.

      The count is determined, in priority order, by:
        1. The kind-specific `WORKER_COUNT_<KIND>` environment variable, if set.
        2. The generic `WORKER_COUNT` environment variable, if set.
        3. The number of CPU cores the process may actually use (its CPU
           affinity mask) times `multiplier`.

      Environment overrides are clamped up to `minimum` but are deliberately
      allowed to exceed `maximum`; the affinity-based count is clamped to both.
      `minimum` defaults to `multiplier` and `maximum` to `multiplier ** 2`.
  """
  minimum = minimum or multiplier
  maximum = maximum or (multiplier * multiplier)

  # Check for an override via an environment variable. The kind-specific
  # variable takes precedence over the generic one. Note that os.environ
  # values are strings, so the override must be coerced to an int before
  # it can be compared against the minimum.
  for env_var_name in ('WORKER_COUNT_' + worker_kind_name.upper(), 'WORKER_COUNT'):
    override_value = os.environ.get(env_var_name)
    if override_value is not None:
      return max(int(override_value), minimum)

  # Load the number of CPU cores via affinity (cpu_count() would ignore
  # container/cgroup restrictions), and use that to calculate the number
  # of workers to run.
  process = psutil.Process(os.getpid())

  try:
    cpu_count = len(process.cpu_affinity())
  except AttributeError:
    # cpu_affinity isn't supported on this platform. Assume 2.
    cpu_count = 2

  return min(max(cpu_count * multiplier, minimum), maximum)