Merge pull request #3348 from quay/optimize-log-listing

Optimize log listing

commit 6c3eab47e0
3 changed files with 63 additions and 64 deletions
@@ -16,8 +16,8 @@ from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage, _
 
 logger = logging.getLogger(__name__)
 
-MINIMUM_RANGE_SIZE = 1000
-MAXIMUM_RANGE_SIZE = 100000
+MINIMUM_RANGE_SIZE = 1  # second
+MAXIMUM_RANGE_SIZE = 60 * 60 * 24 * 30  # seconds ~= 1 month
 EXPECTED_ITERATION_LOG_COUNT = 1000
 
@@ -48,9 +48,11 @@ class TableLogsModel(ActionLogsDataInterface):
                                             ignore=filter_kinds, model=m)
 
       logs, next_page_token = model.modelutil.paginate(logs_query, m,
-                                                       descending=True, page_token=page_token,
+                                                       descending=True,
+                                                       page_token=page_token,
                                                        limit=20,
-                                                       max_page=max_page_count)
+                                                       max_page=max_page_count,
+                                                       sort_field_name='datetime')
       return LogEntriesPage([Log.for_logentry(log) for log in logs], next_page_token)
 
     # First check the LogEntry3 table for the most recent logs, unless we've been expressly told
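With sort_field_name='datetime', the next_page_token handed back to the caller is keyed on a log's datetime rather than its row ID. A hypothetical token, using the field names that paginate_query (further below) actually emits, would look like:

# Hypothetical next_page_token after this change; it previously carried a
# 'start_id' key holding the row ID of the 21st result.
next_page_token = {
  'start_index': '2019-01-02T03:04:05Z',  # serialized datetime of the 21st log
  'page_number': 1,
  'is_datetime': True,
}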
@@ -149,40 +151,16 @@ class TableLogsModel(ActionLogsDataInterface):
 
   def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                             namespace_id=None, max_query_time=None):
-    # Lookup the starting and ending IDs for the log range in the table. This operation is quite
-    # quick, so we use it as a bounding box for the later lookups.
-    min_id, elapsed = _run_and_time(lambda: model.log.get_minimum_id_for_logs(start_datetime,
-                                                                              repository_id,
-                                                                              namespace_id))
-    if elapsed > max_query_time:
-      logger.error('Retrieval of min ID for export logs `%s/%s` timed out with time of `%s`',
-                   namespace_id, repository_id, elapsed)
-      raise LogsIterationTimeout()
-
-    max_id, elapsed = _run_and_time(lambda: model.log.get_maximum_id_for_logs(end_datetime,
-                                                                              repository_id,
-                                                                              namespace_id))
-    if elapsed > max_query_time:
-      logger.error('Retrieval of max ID for export logs `%s/%s` timed out with time of `%s`',
-                   namespace_id, repository_id, elapsed)
-      raise LogsIterationTimeout()
-
-    min_id = min_id or 1
-    max_id = max_id or 1
-
-    logger.info('Found log range of %s to %s for export logs `%s/%s`', min_id, max_id,
-                namespace_id, repository_id)
-
     # Using an adjusting scale, start downloading log rows in batches, starting at
     # MINIMUM_RANGE_SIZE and doubling until we've reached EXPECTED_ITERATION_LOG_COUNT or
     # the lookup range has reached MAXIMUM_RANGE_SIZE. If at any point this operation takes
     # longer than the MAXIMUM_WORK_PERIOD_SECONDS, terminate the batch operation as timed out.
     batch_start_time = datetime.utcnow()
 
-    current_start_id = min_id
-    current_batch_size = MINIMUM_RANGE_SIZE
+    current_start_datetime = start_datetime
+    current_batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE)
 
-    while current_start_id <= max_id:
+    while current_start_datetime < end_datetime:
       # Verify we haven't been working for too long.
       work_elapsed = datetime.utcnow() - batch_start_time
       if work_elapsed > max_query_time:
@@ -190,32 +168,36 @@ class TableLogsModel(ActionLogsDataInterface):
                      namespace_id, repository_id, work_elapsed)
         raise LogsIterationTimeout()
 
-      id_range = [current_start_id, min(max_id, current_start_id + current_batch_size)]
+      current_end_datetime = current_start_datetime + current_batch_size
+      current_end_datetime = min(current_end_datetime, end_datetime)
 
       # Load the next set of logs.
       def load_logs():
-        logger.debug('Retrieving logs over range %s with namespace %s and repository %s',
-                     id_range, namespace_id, repository_id)
+        logger.debug('Retrieving logs over range %s-%s with namespace %s and repository %s',
+                     current_start_datetime, current_end_datetime, namespace_id, repository_id)
 
         logs_query = model.log.get_logs_query(namespace=namespace_id,
                                               repository=repository_id,
-                                              id_range=id_range)
+                                              start_time=current_start_datetime,
+                                              end_time=current_end_datetime)
         return [Log.for_logentry(log) for log in logs_query]
 
       logs, elapsed = _run_and_time(load_logs)
       if elapsed > max_query_time:
-        logger.error('Retrieval of logs for export logs `%s/%s` with range `%s` timed out at `%s`',
-                     namespace_id, repository_id, id_range, elapsed)
+        logger.error('Retrieval of logs for export `%s/%s` with range `%s-%s` timed out at `%s`',
+                     namespace_id, repository_id, current_start_datetime, current_end_datetime,
+                     elapsed)
         raise LogsIterationTimeout()
 
       yield logs
 
       # Move forward.
-      current_start_id = id_range[1] + 1
+      current_start_datetime = current_end_datetime
 
       # Increase the batch size if necessary.
       if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
-        current_batch_size = min(MAXIMUM_RANGE_SIZE, current_batch_size * 2)
+        seconds = min(MAXIMUM_RANGE_SIZE, current_batch_size.total_seconds() * 2)
+        current_batch_size = timedelta(seconds=seconds)
 
 
 def _run_and_time(fn):
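Taken together with the constants above, the new export path walks the requested time span with an adaptive window instead of an ID range. A minimal sketch of just that batching logic, assuming a hypothetical fetch_logs(start, end) callable in place of model.log.get_logs_query and omitting the timeout handling:

from datetime import datetime, timedelta

MINIMUM_RANGE_SIZE = 1  # second
MAXIMUM_RANGE_SIZE = 60 * 60 * 24 * 30  # seconds ~= 1 month
EXPECTED_ITERATION_LOG_COUNT = 1000

def yield_log_batches(start_datetime, end_datetime, fetch_logs):
  # fetch_logs(start, end) is a hypothetical stand-in for the real log query.
  current_start_datetime = start_datetime
  current_batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE)

  while current_start_datetime < end_datetime:
    current_end_datetime = min(current_start_datetime + current_batch_size, end_datetime)
    yield fetch_logs(current_start_datetime, current_end_datetime)

    # Move forward, widening the window while batches come back light.
    current_start_datetime = current_end_datetime
    logs_seen = EXPECTED_ITERATION_LOG_COUNT  # replaced below for clarity

def example():
  # Example: export an empty day; the 1-second window doubles each iteration.
  for batch in yield_log_batches(datetime(2019, 1, 1), datetime(2019, 1, 2),
                                 lambda s, e: []):
    if len(batch) < EXPECTED_ITERATION_LOG_COUNT:
      break  # in the real loop this is where the window would double

The window starts at one second, doubles whenever a batch returns fewer than EXPECTED_ITERATION_LOG_COUNT rows, and is capped at roughly one month, so sparse histories are covered in a few queries while dense ones stay bounded per query.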
@@ -1,31 +1,39 @@
+import dateutil.parser
+
+from datetime import datetime
+
 from peewee import SQL
 
 
-def paginate(query, model, descending=False, page_token=None, limit=50, id_alias=None,
-             max_page=None):
-  """ Paginates the given query using an ID range, starting at the optional page_token.
+def paginate(query, model, descending=False, page_token=None, limit=50, sort_field_alias=None,
+             max_page=None, sort_field_name=None):
+  """ Paginates the given query using a field range, starting at the optional page_token.
       Returns a *list* of matching results along with an unencrypted page_token for the
-      next page, if any. If descending is set to True, orders by the ID descending rather
+      next page, if any. If descending is set to True, orders by the field descending rather
       than ascending.
   """
-  # Note: We use the id_alias for the order_by, but not the where below. The alias is necessary
-  # for certain queries that use unions in MySQL, as it gets confused on which ID to order by.
-  # The where clause, on the other hand, cannot use the alias because Postgres does not allow
-  # aliases in where clauses.
-  id_field = model.id
-  if id_alias is not None:
-    id_field = SQL(id_alias)
+  # Note: We use the sort_field_alias for the order_by, but not the where below. The alias is
+  # necessary for certain queries that use unions in MySQL, as it gets confused on which field
+  # to order by. The where clause, on the other hand, cannot use the alias because Postgres does
+  # not allow aliases in where clauses.
+  sort_field_name = sort_field_name or 'id'
+  sort_field = getattr(model, sort_field_name)
+
+  if sort_field_alias is not None:
+    sort_field_name = sort_field_alias
+    sort_field = SQL(sort_field_alias)
 
   if descending:
-    query = query.order_by(id_field.desc())
+    query = query.order_by(sort_field.desc())
   else:
-    query = query.order_by(id_field)
+    query = query.order_by(sort_field)
 
-  start_id = pagination_start(page_token)
-  if start_id is not None:
+  start_index = pagination_start(page_token)
+  if start_index is not None:
     if descending:
-      query = query.where(model.id <= start_id)
+      query = query.where(sort_field <= start_index)
     else:
-      query = query.where(model.id >= start_id)
+      query = query.where(sort_field >= start_index)
 
   query = query.limit(limit + 1)
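paginate fetches limit + 1 rows so that the extra row, if present, supplies the token for the next page. A minimal sketch of the same keyset technique against an in-memory SQLite database; the Entry model and its rows here are hypothetical:

from datetime import datetime

from peewee import SqliteDatabase, Model, CharField, DateTimeField

db = SqliteDatabase(':memory:')

class Entry(Model):
  body = CharField()
  datetime = DateTimeField()

  class Meta:
    database = db

db.create_tables([Entry])
for day in range(1, 6):
  Entry.create(body='entry-%s' % day, datetime=datetime(2019, 1, day))

limit = 2
sort_field = Entry.datetime

# Page 1: order by the sort field and over-fetch by one row.
results = list(Entry.select().order_by(sort_field.desc()).limit(limit + 1))
page = results[0:limit]  # bodies: 'entry-5', 'entry-4'
next_start = results[limit].datetime if len(results) > limit else None

# Page 2: filter on the token value; the extra row becomes its first row.
if next_start is not None:
  next_page = list(Entry.select()
                   .where(sort_field <= next_start)
                   .order_by(sort_field.desc())
                   .limit(limit + 1))[0:limit]  # bodies: 'entry-3', 'entry-2'

Because each page is located by a field value rather than an OFFSET, the database can seek directly via the index on the sort field, which is what makes this cheaper than offset pagination on large log tables.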
@@ -33,28 +41,37 @@ def paginate(query, model, descending=False, page_token=None, limit=50, id_alias
   if page_number is not None and max_page is not None and page_number > max_page:
     return [], None
 
-  return paginate_query(query, limit=limit, id_alias=id_alias, page_number=page_number)
+  return paginate_query(query, limit=limit, sort_field_name=sort_field_name,
+                        page_number=page_number)
 
 
 def pagination_start(page_token=None):
-  """ Returns the start ID for pagination for the given page token. Will return None if None. """
+  """ Returns the start index for pagination for the given page token. Will return None if None. """
   if page_token is not None:
-    return page_token.get('start_id')
+    start_index = page_token.get('start_index')
+    if page_token.get('is_datetime'):
+      start_index = dateutil.parser.parse(start_index)
+    return start_index
   return None
 
 
-def paginate_query(query, limit=50, id_alias=None, page_number=None):
+def paginate_query(query, limit=50, sort_field_name=None, page_number=None):
   """ Executes the given query and returns a page's worth of results, as well as the page token
       for the next page (if any).
   """
   results = list(query)
   page_token = None
   if len(results) > limit:
-    start_id = getattr(results[limit], id_alias or 'id')
+    start_index = getattr(results[limit], sort_field_name or 'id')
+    is_datetime = False
+    if isinstance(start_index, datetime):
+      start_index = start_index.isoformat() + "Z"
+      is_datetime = True
+
     page_token = {
-      'start_id': start_id,
+      'start_index': start_index,
       'page_number': page_number + 1 if page_number else 1,
+      'is_datetime': is_datetime,
     }
 
   return results[0:limit], page_token
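The token stores the datetime as an ISO 8601 string alongside an is_datetime flag (a raw datetime would not survive token serialization), and pagination_start parses it back with dateutil. A minimal sketch of that round trip:

from datetime import datetime

import dateutil.parser

start_index = datetime(2019, 1, 2, 3, 4, 5)
token = {
  'start_index': start_index.isoformat() + 'Z',  # '2019-01-02T03:04:05Z'
  'page_number': 1,
  'is_datetime': True,
}

# pagination_start() above reverses the serialization:
parsed = dateutil.parser.parse(token['start_index'])
assert parsed.replace(tzinfo=None) == start_index  # parsed carries a UTC tz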
@@ -89,7 +89,7 @@ class PreOCIModel(RepositoryDataInterface):
                                                   kind_filter=repo_kind)
 
     repos, next_page_token = model.modelutil.paginate_query(repo_query, limit=REPOS_PER_PAGE,
-                                                            id_alias='rid')
+                                                            sort_field_name='rid')
 
     # Collect the IDs of the repositories found for subsequent lookup of popularity
     # and/or last modified.