Merge pull request #3348 from quay/optimize-log-listing

Optimize log listing
Authored by Joseph Schorr on 2019-01-29 11:25:32 -05:00, committed by GitHub
commit 6c3eab47e0
3 changed files with 63 additions and 64 deletions


@@ -16,8 +16,8 @@ from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage, _
 logger = logging.getLogger(__name__)

-MINIMUM_RANGE_SIZE = 1000
-MAXIMUM_RANGE_SIZE = 100000
+MINIMUM_RANGE_SIZE = 1  # second
+MAXIMUM_RANGE_SIZE = 60 * 60 * 24 * 30  # seconds ~= 1 month
 EXPECTED_ITERATION_LOG_COUNT = 1000
@@ -48,9 +48,11 @@ class TableLogsModel(ActionLogsDataInterface):
                                               ignore=filter_kinds, model=m)
       logs, next_page_token = model.modelutil.paginate(logs_query, m,
-                                                       descending=True, page_token=page_token,
+                                                       descending=True,
+                                                       page_token=page_token,
                                                        limit=20,
-                                                       max_page=max_page_count)
+                                                       max_page=max_page_count,
+                                                       sort_field_name='datetime')
       return LogEntriesPage([Log.for_logentry(log) for log in logs], next_page_token)

     # First check the LogEntry3 table for the most recent logs, unless we've been expressly told
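With sort_field_name='datetime', the page token handed back for log listings carries an ISO-formatted timestamp rather than a numeric start ID (see the modelutil changes below). Roughly, with hypothetical values:

# Token shape before this change (ID-based):
old_token = {'start_id': 12345, 'page_number': 2}

# Token shape after this change (datetime-based):
new_token = {'start_index': '2019-01-29T16:25:32Z', 'page_number': 2, 'is_datetime': True}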
@@ -149,40 +151,16 @@ class TableLogsModel(ActionLogsDataInterface):
   def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                             namespace_id=None, max_query_time=None):
-    # Lookup the starting and ending IDs for the log range in the table. This operation is quite
-    # quick, so we use it as a bounding box for the later lookups.
-    min_id, elapsed = _run_and_time(lambda: model.log.get_minimum_id_for_logs(start_datetime,
-                                                                              repository_id,
-                                                                              namespace_id))
-    if elapsed > max_query_time:
-      logger.error('Retrieval of min ID for export logs `%s/%s` timed out with time of `%s`',
-                   namespace_id, repository_id, elapsed)
-      raise LogsIterationTimeout()
-
-    max_id, elapsed = _run_and_time(lambda: model.log.get_maximum_id_for_logs(end_datetime,
-                                                                              repository_id,
-                                                                              namespace_id))
-    if elapsed > max_query_time:
-      logger.error('Retrieval of max ID for export logs `%s/%s` timed out with time of `%s`',
-                   namespace_id, repository_id, elapsed)
-      raise LogsIterationTimeout()
-
-    min_id = min_id or 1
-    max_id = max_id or 1
-
-    logger.info('Found log range of %s to %s for export logs `%s/%s`', min_id, max_id,
-                namespace_id, repository_id)
-
     # Using an adjusting scale, start downloading log rows in batches, starting at
     # MINIMUM_RANGE_SIZE and doubling until we've reached EXPECTED_ITERATION_LOG_COUNT or
     # the lookup range has reached MAXIMUM_RANGE_SIZE. If at any point this operation takes
     # longer than the MAXIMUM_WORK_PERIOD_SECONDS, terminate the batch operation as timed out.
     batch_start_time = datetime.utcnow()
-    current_start_id = min_id
-    current_batch_size = MINIMUM_RANGE_SIZE
+    current_start_datetime = start_datetime
+    current_batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE)

-    while current_start_id <= max_id:
+    while current_start_datetime < end_datetime:
       # Verify we haven't been working for too long.
       work_elapsed = datetime.utcnow() - batch_start_time
       if work_elapsed > max_query_time:
@@ -190,32 +168,36 @@ class TableLogsModel(ActionLogsDataInterface):
                      namespace_id, repository_id, work_elapsed)
         raise LogsIterationTimeout()

-      id_range = [current_start_id, min(max_id, current_start_id + current_batch_size)]
+      current_end_datetime = current_start_datetime + current_batch_size
+      current_end_datetime = min(current_end_datetime, end_datetime)

       # Load the next set of logs.
       def load_logs():
-        logger.debug('Retrieving logs over range %s with namespace %s and repository %s',
-                     id_range, namespace_id, repository_id)
+        logger.debug('Retrieving logs over range %s-%s with namespace %s and repository %s',
+                     current_start_datetime, current_end_datetime, namespace_id, repository_id)

         logs_query = model.log.get_logs_query(namespace=namespace_id,
                                               repository=repository_id,
-                                              id_range=id_range)
+                                              start_time=current_start_datetime,
+                                              end_time=current_end_datetime)
         return [Log.for_logentry(log) for log in logs_query]

       logs, elapsed = _run_and_time(load_logs)
       if elapsed > max_query_time:
-        logger.error('Retrieval of logs for export logs `%s/%s` with range `%s` timed out at `%s`',
-                     namespace_id, repository_id, id_range, elapsed)
+        logger.error('Retrieval of logs for export `%s/%s` with range `%s-%s` timed out at `%s`',
+                     namespace_id, repository_id, current_start_datetime, current_end_datetime,
+                     elapsed)
         raise LogsIterationTimeout()

       yield logs

       # Move forward.
-      current_start_id = id_range[1] + 1
+      current_start_datetime = current_end_datetime

       # Increase the batch size if necessary.
       if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
-        current_batch_size = min(MAXIMUM_RANGE_SIZE, current_batch_size * 2)
+        seconds = min(MAXIMUM_RANGE_SIZE, current_batch_size.total_seconds() * 2)
+        current_batch_size = timedelta(seconds=seconds)


 def _run_and_time(fn):
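The export loop above now walks a datetime range with an adaptive window instead of an ID range: the window doubles whenever a batch comes back sparse, capped at MAXIMUM_RANGE_SIZE seconds. A minimal standalone sketch of that pattern (the yield_log_batches name and fetch_logs callback are hypothetical, not part of this change):

from datetime import timedelta

MINIMUM_RANGE_SIZE = 1                  # second
MAXIMUM_RANGE_SIZE = 60 * 60 * 24 * 30  # seconds ~= 1 month
EXPECTED_ITERATION_LOG_COUNT = 1000

def yield_log_batches(start_datetime, end_datetime, fetch_logs):
  """ Yields batches of logs between the two datetimes. fetch_logs(start, end) should
      return the list of logs in [start, end); the window doubles whenever a batch is
      smaller than EXPECTED_ITERATION_LOG_COUNT.
  """
  current_start = start_datetime
  window = timedelta(seconds=MINIMUM_RANGE_SIZE)

  while current_start < end_datetime:
    current_end = min(current_start + window, end_datetime)
    logs = fetch_logs(current_start, current_end)
    yield logs

    # Move forward: the next window begins where this one ended.
    current_start = current_end

    # Widen the window if this batch was sparse, capped at MAXIMUM_RANGE_SIZE seconds.
    if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
      seconds = min(MAXIMUM_RANGE_SIZE, window.total_seconds() * 2)
      window = timedelta(seconds=seconds)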


@@ -1,31 +1,39 @@
+import dateutil.parser
+
+from datetime import datetime
 from peewee import SQL


-def paginate(query, model, descending=False, page_token=None, limit=50, id_alias=None,
-             max_page=None):
-  """ Paginates the given query using an ID range, starting at the optional page_token.
+def paginate(query, model, descending=False, page_token=None, limit=50, sort_field_alias=None,
+             max_page=None, sort_field_name=None):
+  """ Paginates the given query using a field range, starting at the optional page_token.

       Returns a *list* of matching results along with an unencrypted page_token for the
-      next page, if any. If descending is set to True, orders by the ID descending rather
+      next page, if any. If descending is set to True, orders by the field descending rather
       than ascending.
   """
-  # Note: We use the id_alias for the order_by, but not the where below. The alias is necessary
-  # for certain queries that use unions in MySQL, as it gets confused on which ID to order by.
-  # The where clause, on the other hand, cannot use the alias because Postgres does not allow
-  # aliases in where clauses.
-  id_field = model.id
-  if id_alias is not None:
-    id_field = SQL(id_alias)
+  # Note: We use the sort_field_alias for the order_by, but not the where below. The alias is
+  # necessary for certain queries that use unions in MySQL, as it gets confused on which field
+  # to order by. The where clause, on the other hand, cannot use the alias because Postgres does
+  # not allow aliases in where clauses.
+  sort_field_name = sort_field_name or 'id'
+  sort_field = getattr(model, sort_field_name)
+
+  if sort_field_alias is not None:
+    sort_field_name = sort_field_alias
+    sort_field = SQL(sort_field_alias)

   if descending:
-    query = query.order_by(id_field.desc())
+    query = query.order_by(sort_field.desc())
   else:
-    query = query.order_by(id_field)
+    query = query.order_by(sort_field)

-  start_id = pagination_start(page_token)
-  if start_id is not None:
+  start_index = pagination_start(page_token)
+  if start_index is not None:
     if descending:
-      query = query.where(model.id <= start_id)
+      query = query.where(sort_field <= start_index)
     else:
-      query = query.where(model.id >= start_id)
+      query = query.where(sort_field >= start_index)

   query = query.limit(limit + 1)
@@ -33,28 +41,37 @@ def paginate(query, model, descending=False, page_token=None, limit=50, id_alias
   if page_number is not None and max_page is not None and page_number > max_page:
     return [], None

-  return paginate_query(query, limit=limit, id_alias=id_alias, page_number=page_number)
+  return paginate_query(query, limit=limit, sort_field_name=sort_field_name,
+                        page_number=page_number)


 def pagination_start(page_token=None):
-  """ Returns the start ID for pagination for the given page token. Will return None if None. """
+  """ Returns the start index for pagination for the given page token. Will return None if None. """
   if page_token is not None:
-    return page_token.get('start_id')
+    start_index = page_token.get('start_index')
+    if page_token.get('is_datetime'):
+      start_index = dateutil.parser.parse(start_index)
+    return start_index
   return None


-def paginate_query(query, limit=50, id_alias=None, page_number=None):
+def paginate_query(query, limit=50, sort_field_name=None, page_number=None):
   """ Executes the given query and returns a page's worth of results, as well as the page token
       for the next page (if any).
   """
   results = list(query)

   page_token = None
   if len(results) > limit:
-    start_id = getattr(results[limit], id_alias or 'id')
+    start_index = getattr(results[limit], sort_field_name or 'id')
+    is_datetime = False
+    if isinstance(start_index, datetime):
+      start_index = start_index.isoformat() + "Z"
+      is_datetime = True
+
     page_token = {
-      'start_id': start_id,
+      'start_index': start_index,
       'page_number': page_number + 1 if page_number else 1,
+      'is_datetime': is_datetime,
     }

   return results[0:limit], page_token
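A minimal end-to-end sketch of the reworked pagination, assuming the paginate() defined above is importable and python-dateutil is installed; the LogEntry3 stand-in model, its columns, and the sample data are hypothetical:

from datetime import datetime, timedelta

from peewee import SqliteDatabase, Model, CharField, DateTimeField

db = SqliteDatabase(':memory:')

class LogEntry3(Model):  # hypothetical stand-in for the real log table
  kind = CharField()
  datetime = DateTimeField(index=True)

  class Meta:
    database = db

db.create_tables([LogEntry3])
now = datetime.utcnow()
for i in range(45):
  LogEntry3.create(kind='push_repo', datetime=now - timedelta(minutes=i))

# First page: the 20 newest rows, ordered and bounded by `datetime` rather than `id`.
logs, token = paginate(LogEntry3.select(), LogEntry3, descending=True, limit=20,
                       sort_field_name='datetime')

# The token stores the resume point as an ISO string with is_datetime set, so the next
# call parses it back into a datetime before applying the where clause on the sort field.
more_logs, next_token = paginate(LogEntry3.select(), LogEntry3, descending=True,
                                 limit=20, page_token=token,
                                 sort_field_name='datetime')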


@@ -89,7 +89,7 @@ class PreOCIModel(RepositoryDataInterface):
                                                  kind_filter=repo_kind)
     repos, next_page_token = model.modelutil.paginate_query(repo_query, limit=REPOS_PER_PAGE,
-                                                            id_alias='rid')
+                                                            sort_field_name='rid')

     # Collect the IDs of the repositories found for subsequent lookup of popularity
     # and/or last modified.