Merge pull request #3348 from quay/optimize-log-listing
Optimize log listing
commit 6c3eab47e0
3 changed files with 63 additions and 64 deletions

@@ -16,8 +16,8 @@ from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage, _

 logger = logging.getLogger(__name__)

-MINIMUM_RANGE_SIZE = 1000
-MAXIMUM_RANGE_SIZE = 100000
+MINIMUM_RANGE_SIZE = 1  # second
+MAXIMUM_RANGE_SIZE = 60 * 60 * 24 * 30  # seconds ~= 1 month
 EXPECTED_ITERATION_LOG_COUNT = 1000

@@ -48,9 +48,11 @@ class TableLogsModel(ActionLogsDataInterface):
                                            ignore=filter_kinds, model=m)

      logs, next_page_token = model.modelutil.paginate(logs_query, m,
-                                                      descending=True, page_token=page_token,
+                                                      descending=True,
+                                                      page_token=page_token,
                                                       limit=20,
-                                                      max_page=max_page_count)
+                                                      max_page=max_page_count,
+                                                      sort_field_name='datetime')
      return LogEntriesPage([Log.for_logentry(log) for log in logs], next_page_token)

    # First check the LogEntry3 table for the most recent logs, unless we've been expressly told
@@ -149,40 +151,16 @@ class TableLogsModel(ActionLogsDataInterface):

   def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                             namespace_id=None, max_query_time=None):
-    # Lookup the starting and ending IDs for the log range in the table. This operation is quite
-    # quick, so we use it as a bounding box for the later lookups.
-    min_id, elapsed = _run_and_time(lambda: model.log.get_minimum_id_for_logs(start_datetime,
-                                                                              repository_id,
-                                                                              namespace_id))
-    if elapsed > max_query_time:
-      logger.error('Retrieval of min ID for export logs `%s/%s` timed out with time of `%s`',
-                   namespace_id, repository_id, elapsed)
-      raise LogsIterationTimeout()
-
-    max_id, elapsed = _run_and_time(lambda: model.log.get_maximum_id_for_logs(end_datetime,
-                                                                              repository_id,
-                                                                              namespace_id))
-    if elapsed > max_query_time:
-      logger.error('Retrieval of max ID for export logs `%s/%s` timed out with time of `%s`',
-                   namespace_id, repository_id, elapsed)
-      raise LogsIterationTimeout()
-
-    min_id = min_id or 1
-    max_id = max_id or 1
-
-    logger.info('Found log range of %s to %s for export logs `%s/%s`', min_id, max_id,
-                namespace_id, repository_id)
-
     # Using an adjusting scale, start downloading log rows in batches, starting at
     # MINIMUM_RANGE_SIZE and doubling until we've reached EXPECTED_ITERATION_LOG_COUNT or
     # the lookup range has reached MAXIMUM_RANGE_SIZE. If at any point this operation takes
     # longer than the MAXIMUM_WORK_PERIOD_SECONDS, terminate the batch operation as timed out.
     batch_start_time = datetime.utcnow()

-    current_start_id = min_id
-    current_batch_size = MINIMUM_RANGE_SIZE
+    current_start_datetime = start_datetime
+    current_batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE)

-    while current_start_id <= max_id:
+    while current_start_datetime < end_datetime:
       # Verify we haven't been working for too long.
       work_elapsed = datetime.utcnow() - batch_start_time
       if work_elapsed > max_query_time:
@ -190,32 +168,36 @@ class TableLogsModel(ActionLogsDataInterface):
|
|||
namespace_id, repository_id, work_elapsed)
|
||||
raise LogsIterationTimeout()
|
||||
|
||||
id_range = [current_start_id, min(max_id, current_start_id + current_batch_size)]
|
||||
current_end_datetime = current_start_datetime + current_batch_size
|
||||
current_end_datetime = min(current_end_datetime, end_datetime)
|
||||
|
||||
# Load the next set of logs.
|
||||
def load_logs():
|
||||
logger.debug('Retrieving logs over range %s with namespace %s and repository %s',
|
||||
id_range, namespace_id, repository_id)
|
||||
logger.debug('Retrieving logs over range %s-%s with namespace %s and repository %s',
|
||||
current_start_datetime, current_end_datetime, namespace_id, repository_id)
|
||||
|
||||
logs_query = model.log.get_logs_query(namespace=namespace_id,
|
||||
repository=repository_id,
|
||||
id_range=id_range)
|
||||
start_time=current_start_datetime,
|
||||
end_time=current_end_datetime)
|
||||
return [Log.for_logentry(log) for log in logs_query]
|
||||
|
||||
logs, elapsed = _run_and_time(load_logs)
|
||||
if elapsed > max_query_time:
|
||||
logger.error('Retrieval of logs for export logs `%s/%s` with range `%s` timed out at `%s`',
|
||||
namespace_id, repository_id, id_range, elapsed)
|
||||
logger.error('Retrieval of logs for export `%s/%s` with range `%s-%s` timed out at `%s`',
|
||||
namespace_id, repository_id, current_start_datetime, current_end_datetime,
|
||||
elapsed)
|
||||
raise LogsIterationTimeout()
|
||||
|
||||
yield logs
|
||||
|
||||
# Move forward.
|
||||
current_start_id = id_range[1] + 1
|
||||
current_start_datetime = current_end_datetime
|
||||
|
||||
# Increase the batch size if necessary.
|
||||
if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
|
||||
current_batch_size = min(MAXIMUM_RANGE_SIZE, current_batch_size * 2)
|
||||
seconds = min(MAXIMUM_RANGE_SIZE, current_batch_size.total_seconds() * 2)
|
||||
current_batch_size = timedelta(seconds=seconds)
|
||||
|
||||
|
||||
def _run_and_time(fn):
|
||||
|
|
|
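The change above replaces the old ID-bounded batching with a datetime-bounded window. For reference, here is a minimal standalone sketch of that adjusting-range loop: the window starts at MINIMUM_RANGE_SIZE seconds and doubles, capped at MAXIMUM_RANGE_SIZE, whenever a batch returns fewer than EXPECTED_ITERATION_LOG_COUNT rows. The fetch_logs callable is a hypothetical stand-in for the model query, not Quay's actual API, and the sketch omits the max_query_time checks and LogsIterationTimeout handling of the real method.

from datetime import timedelta

MINIMUM_RANGE_SIZE = 1                   # seconds
MAXIMUM_RANGE_SIZE = 60 * 60 * 24 * 30   # seconds ~= 1 month
EXPECTED_ITERATION_LOG_COUNT = 1000

def yield_log_batches(start_datetime, end_datetime, fetch_logs):
  """ Yields lists of logs between the two datetimes, widening the lookup window
      whenever a batch comes back smaller than expected. `fetch_logs(start, end)`
      is a hypothetical callable returning a list of log rows.
  """
  current_start = start_datetime
  batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE)

  while current_start < end_datetime:
    current_end = min(current_start + batch_size, end_datetime)
    logs = fetch_logs(current_start, current_end)
    yield logs

    # Move the window forward.
    current_start = current_end

    # Double the window, up to the maximum, if the batch was smaller than expected.
    if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
      seconds = min(MAXIMUM_RANGE_SIZE, batch_size.total_seconds() * 2)
      batch_size = timedelta(seconds=seconds)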
@@ -1,31 +1,39 @@
+import dateutil.parser
+
+from datetime import datetime
+
 from peewee import SQL

-def paginate(query, model, descending=False, page_token=None, limit=50, id_alias=None,
-             max_page=None):
-  """ Paginates the given query using an ID range, starting at the optional page_token.
+
+def paginate(query, model, descending=False, page_token=None, limit=50, sort_field_alias=None,
+             max_page=None, sort_field_name=None):
+  """ Paginates the given query using a field range, starting at the optional page_token.
       Returns a *list* of matching results along with an unencrypted page_token for the
-      next page, if any. If descending is set to True, orders by the ID descending rather
+      next page, if any. If descending is set to True, orders by the field descending rather
       than ascending.
   """
-  # Note: We use the id_alias for the order_by, but not the where below. The alias is necessary
-  # for certain queries that use unions in MySQL, as it gets confused on which ID to order by.
-  # The where clause, on the other hand, cannot use the alias because Postgres does not allow
-  # aliases in where clauses.
-  id_field = model.id
-  if id_alias is not None:
-    id_field = SQL(id_alias)
+  # Note: We use the sort_field_alias for the order_by, but not the where below. The alias is
+  # necessary for certain queries that use unions in MySQL, as it gets confused on which field
+  # to order by. The where clause, on the other hand, cannot use the alias because Postgres does
+  # not allow aliases in where clauses.
+  sort_field_name = sort_field_name or 'id'
+  sort_field = getattr(model, sort_field_name)
+
+  if sort_field_alias is not None:
+    sort_field_name = sort_field_alias
+    sort_field = SQL(sort_field_alias)

   if descending:
-    query = query.order_by(id_field.desc())
+    query = query.order_by(sort_field.desc())
   else:
-    query = query.order_by(id_field)
+    query = query.order_by(sort_field)

-  start_id = pagination_start(page_token)
-  if start_id is not None:
+  start_index = pagination_start(page_token)
+  if start_index is not None:
     if descending:
-      query = query.where(model.id <= start_id)
+      query = query.where(sort_field <= start_index)
     else:
-      query = query.where(model.id >= start_id)
+      query = query.where(sort_field >= start_index)

   query = query.limit(limit + 1)
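As a rough illustration of the ordering logic in the hunk above, the sketch below builds a peewee query the same way the new paginate does: the alias, when given, is used only for ORDER BY (needed for MySQL UNION queries, which get confused about which field to order by), while the WHERE clause always uses the real model field, since Postgres rejects aliases there. The LogEntry model and its columns are assumptions made up for this example, not the actual Quay schema.

from peewee import SQL, SqliteDatabase, Model, CharField, DateTimeField

db = SqliteDatabase(':memory:')

class LogEntry(Model):      # hypothetical stand-in model for the example
  kind = CharField()
  datetime = DateTimeField(index=True)

  class Meta:
    database = db

def ordered_page_query(sort_field_name='id', sort_field_alias=None,
                       start_index=None, descending=True, limit=20):
  """ Mirrors the order_by/where construction used by paginate(). """
  sort_field = getattr(LogEntry, sort_field_name)
  order_field = SQL(sort_field_alias) if sort_field_alias else sort_field

  query = LogEntry.select()
  # The alias (if any) is only safe in ORDER BY.
  query = query.order_by(order_field.desc() if descending else order_field)

  # The WHERE clause must use the real field.
  if start_index is not None:
    query = query.where(sort_field <= start_index if descending else sort_field >= start_index)

  return query.limit(limit + 1)

For the log listing, the call is effectively ordered_page_query(sort_field_name='datetime'), which matches the paginate(..., sort_field_name='datetime') call added in the first file.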
@@ -33,28 +41,37 @@ def paginate(query, model, descending=False, page_token=None, limit=50, id_alias
   if page_number is not None and max_page is not None and page_number > max_page:
     return [], None

-  return paginate_query(query, limit=limit, id_alias=id_alias, page_number=page_number)
+  return paginate_query(query, limit=limit, sort_field_name=sort_field_name,
+                        page_number=page_number)


 def pagination_start(page_token=None):
-  """ Returns the start ID for pagination for the given page token. Will return None if None. """
+  """ Returns the start index for pagination for the given page token. Will return None if None. """
   if page_token is not None:
-    return page_token.get('start_id')
+    start_index = page_token.get('start_index')
+    if page_token.get('is_datetime'):
+      start_index = dateutil.parser.parse(start_index)
+    return start_index
   return None


-def paginate_query(query, limit=50, id_alias=None, page_number=None):
+def paginate_query(query, limit=50, sort_field_name=None, page_number=None):
   """ Executes the given query and returns a page's worth of results, as well as the page token
       for the next page (if any).
   """
   results = list(query)
   page_token = None
   if len(results) > limit:
-    start_id = getattr(results[limit], id_alias or 'id')
+    start_index = getattr(results[limit], sort_field_name or 'id')
+    is_datetime = False
+    if isinstance(start_index, datetime):
+      start_index = start_index.isoformat() + "Z"
+      is_datetime = True
+
     page_token = {
-      'start_id': start_id,
+      'start_index': start_index,
       'page_number': page_number + 1 if page_number else 1,
+      'is_datetime': is_datetime,
     }

   return results[0:limit], page_token
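A small sketch of the page-token round trip that the two helpers above perform: paginate_query serializes a datetime start index with isoformat() plus a trailing "Z" and flags it with is_datetime, and pagination_start parses it back with dateutil on the next request. The helper names and sample value below are made up for illustration; only the token shape follows the code above.

import dateutil.parser
from datetime import datetime

def encode_page_token(start_index, page_number=None):
  """ Builds a next-page token the way paginate_query does. """
  is_datetime = isinstance(start_index, datetime)
  if is_datetime:
    start_index = start_index.isoformat() + "Z"
  return {
    'start_index': start_index,
    'page_number': page_number + 1 if page_number else 1,
    'is_datetime': is_datetime,
  }

def decode_page_token(page_token):
  """ Recovers the start index the way pagination_start does. """
  if page_token is None:
    return None
  start_index = page_token.get('start_index')
  if page_token.get('is_datetime'):
    start_index = dateutil.parser.parse(start_index)
  return start_index

token = encode_page_token(datetime(2019, 1, 30, 12, 0, 0))
# token == {'start_index': '2019-01-30T12:00:00Z', 'page_number': 1, 'is_datetime': True}
restored = decode_page_token(token)   # timezone-aware datetime for 2019-01-30 12:00:00 UTC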
@@ -89,7 +89,7 @@ class PreOCIModel(RepositoryDataInterface):
                                           kind_filter=repo_kind)

     repos, next_page_token = model.modelutil.paginate_query(repo_query, limit=REPOS_PER_PAGE,
-                                                            id_alias='rid')
+                                                            sort_field_name='rid')

     # Collect the IDs of the repositories found for subsequent lookup of popularity
     # and/or last modified.
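In this last hunk the repository listing passes the rid alias through sort_field_name, so paginate_query simply reads the rid attribute off the first row beyond the page boundary to build the next-page token. A tiny sketch with a made-up row type:

from collections import namedtuple

Row = namedtuple('Row', ['rid', 'name'])   # stand-in for a query row exposing the 'rid' alias

def next_page_start(results, limit, sort_field_name):
  """ Mirrors how paginate_query picks the start index from the row past the page boundary. """
  if len(results) > limit:
    return getattr(results[limit], sort_field_name or 'id')
  return None

rows = [Row(rid=i, name='repo%s' % i) for i in range(30)]
print(next_page_start(rows, limit=25, sort_field_name='rid'))   # prints 25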