quay/data/model/log.py

import json

from calendar import timegm
from peewee import JOIN_LEFT_OUTER, fn
from datetime import datetime, timedelta
from cachetools import lru_cache

from data.database import LogEntry, LogEntryKind, User, RepositoryActionCount, db
from data.model import config, user, DataModelException

def _logs_query(selections, start_time, end_time, performer=None, repository=None, namespace=None,
                ignore=None):
  """ Builds the base LogEntry query shared by the log lookup functions.

      Selects `selections` from LogEntry, limited to entries whose datetime is at or after
      `start_time` and strictly before `end_time`. Each optional filter is only applied when its
      argument is truthy:
        - repository: compared against LogEntry.repository.
        - performer: compared against LogEntry.performer.
        - namespace: resolved with user.get_user_or_org and matched on LogEntry.account.
        - ignore: iterable of kind names whose entries are excluded from the results.

      Raises DataModelException when `namespace` does not resolve to a user or organization.
  """
  in_window = (LogEntry.datetime >= start_time, LogEntry.datetime < end_time)
  query = LogEntry.select(*selections).switch(LogEntry).where(*in_window)

  if repository:
    query = query.where(LogEntry.repository == repository)

  if performer:
    query = query.where(LogEntry.performer == performer)

  if namespace:
    owner = user.get_user_or_org(namespace)
    if owner is None:
      raise DataModelException('Invalid namespace requested')

    # Filter on the account ID directly rather than joining against the user table.
    query = query.where(LogEntry.account == owner.id)

  if not ignore:
    return query

  # Translate the ignored kind names into IDs using the cached kind map.
  kinds = get_log_entry_kinds()
  excluded_ids = [kinds[name] for name in ignore]
  return query.where(~(LogEntry.kind << excluded_ids))


@lru_cache(maxsize=1)
def get_log_entry_kinds():
  """ Returns a dict that maps every log entry kind in both directions: ID to name and name to
      ID. The function is wrapped in lru_cache, so repeated calls reuse the cached dict.
  """
  lookup = {}
  for entry_kind in LogEntryKind.select():
    # Register both directions so callers can resolve by either key.
    lookup.update({entry_kind.id: entry_kind.name, entry_kind.name: entry_kind.id})

  return lookup


def _get_log_entry_kind(name):
  """ Returns the value stored under `name` in the cached kind map. Raises KeyError if the map
      has no such key.
  """
  return get_log_entry_kinds()[name]


def get_aggregated_logs(start_time, end_time, performer=None, repository=None, namespace=None,
                        ignore=None):
  """ Returns a query yielding the number of log entries per (day, kind) pair.

      The time range and optional filters are applied by _logs_query. Each result row carries
      the entry kind, the extracted day (aliased 'day') and the entry count (aliased 'count').
  """
  # The day is extracted through the db helper and reused for both selection and grouping.
  day = db.extract_date('day', LogEntry.datetime)
  entry_count = fn.Count(LogEntry.id)
  columns = [LogEntry.kind, day.alias('day'), entry_count.alias('count')]

  base = _logs_query(columns, start_time, end_time, performer=performer, repository=repository,
                     namespace=namespace, ignore=ignore)
  return base.group_by(day, LogEntry.kind)


def get_logs_query(start_time, end_time, performer=None, repository=None, namespace=None,
                   ignore=None):
  """ Returns a query for the individual log entries matching the given range and filters.

      The performer is always selected and left-outer-joined (aliased 'performer'). The account
      (aliased 'account') is only selected and joined when neither `namespace` nor `repository`
      was supplied.
  """
  Performer = User.alias()
  Account = User.alias()

  with_account = namespace is None and repository is None
  selections = [LogEntry, Performer, Account] if with_account else [LogEntry, Performer]

  query = _logs_query(selections, start_time, end_time, performer=performer,
                      repository=repository, namespace=namespace, ignore=ignore)

  performer_on = (LogEntry.performer == Performer.id).alias('performer')
  query = query.switch(LogEntry).join(Performer, JOIN_LEFT_OUTER, on=performer_on)

  if with_account:
    account_on = (LogEntry.account == Account.id).alias('account')
    query = query.switch(LogEntry).join(Account, JOIN_LEFT_OUTER, on=account_on)

  return query


def _json_serialize(obj):
  if isinstance(obj, datetime):
    return timegm(obj.utctimetuple())

  return obj


def log_action(kind_name, user_or_organization_name, performer=None, repository=None,
               ip=None, metadata=None, timestamp=None):
  """ Creates a LogEntry row describing an action.

      Args:
        kind_name: name of the log entry kind; resolved to its ID through the cached kind map
          (a KeyError is raised if the name is not in the map).
        user_or_organization_name: username of the account the entry is recorded under. When
          None, the account falls back to the SERVICE_LOG_ACCOUNT_ID config value, and if that
          is unset, to the lowest user ID in the database.
        performer: optional object whose `id` is stored as the performer.
        repository: optional object whose `id` is stored as the repository.
        ip: optional IP value stored on the entry as-is.
        metadata: optional JSON-serializable value stored as `metadata_json`; datetime values
          are converted by _json_serialize. Defaults to an empty dict.
        timestamp: optional datetime for the entry; defaults to datetime.today() when falsy.

      NOTE(review): User.get is presumably expected to raise when no user matches the given
      name (peewee behavior); confirm callers rely on that.
  """
  if metadata is None:
    # A fresh dict per call; avoids sharing one mutable default across invocations.
    metadata = {}

  if not timestamp:
    timestamp = datetime.today()

  if user_or_organization_name is not None:
    account = User.get(User.username == user_or_organization_name).id
  else:
    # No account was named: use the configured service account, else the lowest user ID.
    account = config.app_config.get('SERVICE_LOG_ACCOUNT_ID')
    if account is None:
      account = User.select(fn.Min(User.id)).tuples().get()[0]

  # Store plain IDs rather than model instances.
  if performer is not None:
    performer = performer.id

  if repository is not None:
    repository = repository.id

  kind = _get_log_entry_kind(kind_name)
  metadata_json = json.dumps(metadata, default=_json_serialize)
  LogEntry.create(kind=kind, account=account, performer=performer,
                  repository=repository, ip=ip, metadata_json=metadata_json,
                  datetime=timestamp)


def get_stale_logs_start_id():
  """ Gets the lowest log entry ID, or None when the lookup yields no row. """
  first_row = LogEntry.select(LogEntry.id).order_by(LogEntry.id).limit(1).tuples()
  try:
    return first_row[0][0]
  except IndexError:
    return None


def get_stale_logs_cutoff_id(cutoff_date):
  """ Gets the highest ID among log entries whose datetime is at or before cutoff_date.
      Returns None when the lookup yields no row.
  """
  newest_stale = (LogEntry
                  .select(fn.Max(LogEntry.id))
                  .where(LogEntry.datetime <= cutoff_date)
                  .tuples())
  try:
    return newest_stale[0][0]
  except IndexError:
    return None


def get_stale_logs(start_id, end_id):
  """ Returns a query for all logs whose ID lies in the inclusive range [start_id, end_id]. """
  at_or_after_start = (LogEntry.id >= start_id)
  at_or_before_end = (LogEntry.id <= end_id)
  return LogEntry.select().where(at_or_after_start, at_or_before_end)


def delete_stale_logs(start_id, end_id):
  """ Deletes all logs whose ID lies in the inclusive range [start_id, end_id]. """
  at_or_after_start = (LogEntry.id >= start_id)
  at_or_before_end = (LogEntry.id <= end_id)
  removal = LogEntry.delete().where(at_or_after_start, at_or_before_end)
  removal.execute()


def get_repository_action_counts(repo, start_date):
  """ Returns a query for the RepositoryActionCount rows of `repo` dated on or after
      `start_date`.
  """
  for_repo = RepositoryActionCount.repository == repo
  since_start = RepositoryActionCount.date >= start_date
  return RepositoryActionCount.select().where(for_repo, since_start)


def get_repositories_action_sums(repository_ids):
  """ Returns a dict mapping repository ID to the sum of its RepositoryActionCount counts over
      the last week. An empty `repository_ids` yields an empty dict without querying.
  """
  if not repository_ids:
    return {}

  # Only count entries dated within the past week.
  week_ago = datetime.now() - timedelta(weeks=1)
  summed = (RepositoryActionCount
            .select(RepositoryActionCount.repository, fn.Sum(RepositoryActionCount.count))
            .where(RepositoryActionCount.repository << repository_ids,
                   RepositoryActionCount.date >= week_ago)
            .group_by(RepositoryActionCount.repository)
            .tuples())

  return {row[0]: row[1] for row in summed}
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`import json`

keys ui WIP 2016-04-01 17:55:29 +00:00			`from calendar import timegm`
Add an index for lookup by account to log entries Also fixes the query to require one less join 2016-08-12 20:53:17 +00:00			`from peewee import JOIN_LEFT_OUTER, fn`
			`from datetime import datetime, timedelta`
Fix logs view and API - We needed to use an engine-agnostic way to extract the days - Joining with the LogEntryKind table has horrible performance in MySQL, so do it ourselves - Limit to 50 logs per page 2015-08-05 21:36:17 +00:00			`from cachetools import lru_cache`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00
Change repo stats to use the RAC table and a nice UI 2016-06-22 18:50:59 +00:00			`from data.database import LogEntry, LogEntryKind, User, RepositoryActionCount, db`
Add an index for lookup by account to log entries Also fixes the query to require one less join 2016-08-12 20:53:17 +00:00			`from data.model import config, user, DataModelException`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00
Change account-less logs to use a user and not null This allows us to skip the migration 2016-04-26 19:16:55 +00:00			`def _logs_query(selections, start_time, end_time, performer=None, repository=None, namespace=None,`
			`ignore=None):`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`joined = (LogEntry`
Switch to using an aggregated logs query and infinite scrolling This should allow users to work with large logs set. Fixes #294 2015-07-31 17:38:02 +00:00			`.select(*selections)`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`.switch(LogEntry)`
Switch to using an aggregated logs query and infinite scrolling This should allow users to work with large logs set. Fixes #294 2015-07-31 17:38:02 +00:00			`.where(LogEntry.datetime >= start_time, LogEntry.datetime < end_time))`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00
			`if repository:`
			`joined = joined.where(LogEntry.repository == repository)`

			`if performer:`
			`joined = joined.where(LogEntry.performer == performer)`

			`if namespace:`
Add an index for lookup by account to log entries Also fixes the query to require one less join 2016-08-12 20:53:17 +00:00			`namespace_user = user.get_user_or_org(namespace)`
			`if namespace_user is None:`
			`raise DataModelException('Invalid namespace requested')`

			`joined = joined.where(LogEntry.account == namespace_user.id)`
Switch to using an aggregated logs query and infinite scrolling This should allow users to work with large logs set. Fixes #294 2015-07-31 17:38:02 +00:00
Change account-less logs to use a user and not null This allows us to skip the migration 2016-04-26 19:16:55 +00:00			`if ignore:`
			`kind_map = get_log_entry_kinds()`
			`ignore_ids = [kind_map[kind_name] for kind_name in ignore]`
			`joined = joined.where(~(LogEntry.kind << ignore_ids))`

Switch to using an aggregated logs query and infinite scrolling This should allow users to work with large logs set. Fixes #294 2015-07-31 17:38:02 +00:00			`return joined`


Fix logs view and API - We needed to use an engine-agnostic way to extract the days - Joining with the LogEntryKind table has horrible performance in MySQL, so do it ourselves - Limit to 50 logs per page 2015-08-05 21:36:17 +00:00			`@lru_cache(maxsize=1)`
			`def get_log_entry_kinds():`
			`kind_map = {}`
			`for kind in LogEntryKind.select():`
			`kind_map[kind.id] = kind.name`
Change account-less logs to use a user and not null This allows us to skip the migration 2016-04-26 19:16:55 +00:00			`kind_map[kind.name] = kind.id`
Fix logs view and API - We needed to use an engine-agnostic way to extract the days - Joining with the LogEntryKind table has horrible performance in MySQL, so do it ourselves - Limit to 50 logs per page 2015-08-05 21:36:17 +00:00
			`return kind_map`


Always use log entry kind cache Fixes #1445 2016-05-13 19:20:55 +00:00			`def _get_log_entry_kind(name):`
			`kinds = get_log_entry_kinds()`
			`return kinds[name]`


Change account-less logs to use a user and not null This allows us to skip the migration 2016-04-26 19:16:55 +00:00			`def get_aggregated_logs(start_time, end_time, performer=None, repository=None, namespace=None,`
			`ignore=None):`
Fix logs view and API - We needed to use an engine-agnostic way to extract the days - Joining with the LogEntryKind table has horrible performance in MySQL, so do it ourselves - Limit to 50 logs per page 2015-08-05 21:36:17 +00:00			`date = db.extract_date('day', LogEntry.datetime)`
MySQL and Postgres complain about the group by, so calculate dates ourselves 2015-08-06 16:52:55 +00:00			`selections = [LogEntry.kind, date.alias('day'), fn.Count(LogEntry.id).alias('count')]`
Change account-less logs to use a user and not null This allows us to skip the migration 2016-04-26 19:16:55 +00:00			`query = _logs_query(selections, start_time, end_time, performer, repository, namespace, ignore)`
Fix logs view and API - We needed to use an engine-agnostic way to extract the days - Joining with the LogEntryKind table has horrible performance in MySQL, so do it ourselves - Limit to 50 logs per page 2015-08-05 21:36:17 +00:00			`return query.group_by(date, LogEntry.kind)`
Switch to using an aggregated logs query and infinite scrolling This should allow users to work with large logs set. Fixes #294 2015-07-31 17:38:02 +00:00

Change account-less logs to use a user and not null This allows us to skip the migration 2016-04-26 19:16:55 +00:00			`def get_logs_query(start_time, end_time, performer=None, repository=None, namespace=None,`
			`ignore=None):`
Switch to using an aggregated logs query and infinite scrolling This should allow users to work with large logs set. Fixes #294 2015-07-31 17:38:02 +00:00			`Performer = User.alias()`
Log more information to the action logs and display the namespaces for superusers This helps superusers understand better what, exactly, is going on in the registry 2017-02-14 19:55:24 +00:00			`Account = User.alias()`
Fix logs view and API - We needed to use an engine-agnostic way to extract the days - Joining with the LogEntryKind table has horrible performance in MySQL, so do it ourselves - Limit to 50 logs per page 2015-08-05 21:36:17 +00:00			`selections = [LogEntry, Performer]`
Switch to using an aggregated logs query and infinite scrolling This should allow users to work with large logs set. Fixes #294 2015-07-31 17:38:02 +00:00
Log more information to the action logs and display the namespaces for superusers This helps superusers understand better what, exactly, is going on in the registry 2017-02-14 19:55:24 +00:00			`if namespace is None and repository is None:`
			`selections.append(Account)`

Change account-less logs to use a user and not null This allows us to skip the migration 2016-04-26 19:16:55 +00:00			`query = _logs_query(selections, start_time, end_time, performer, repository, namespace, ignore)`
Switch to using an aggregated logs query and infinite scrolling This should allow users to work with large logs set. Fixes #294 2015-07-31 17:38:02 +00:00			`query = (query.switch(LogEntry)`
			`.join(Performer, JOIN_LEFT_OUTER,`
			`on=(LogEntry.performer == Performer.id).alias('performer')))`

Log more information to the action logs and display the namespaces for superusers This helps superusers understand better what, exactly, is going on in the registry 2017-02-14 19:55:24 +00:00			`if namespace is None and repository is None:`
			`query = (query.switch(LogEntry)`
			`.join(Account, JOIN_LEFT_OUTER,`
			`on=(LogEntry.account == Account.id).alias('account')))`

Add ID-based pagination to logs using new decorators and an encrypted token Fixes #599 2015-12-22 14:05:17 +00:00			`return query`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00

keys ui WIP 2016-04-01 17:55:29 +00:00			`def _json_serialize(obj):`
			`if isinstance(obj, datetime):`
			`return timegm(obj.utctimetuple())`

			`return obj`


Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`def log_action(kind_name, user_or_organization_name, performer=None, repository=None,`
			`ip=None, metadata={}, timestamp=None):`
			`if not timestamp:`
			`timestamp = datetime.today()`

keys ui WIP 2016-04-01 17:55:29 +00:00			`account = None`
			`if user_or_organization_name is not None:`
			`account = User.get(User.username == user_or_organization_name).id`
Change account-less logs to use a user and not null This allows us to skip the migration 2016-04-26 19:16:55 +00:00			`else:`
			`account = config.app_config.get('SERVICE_LOG_ACCOUNT_ID')`
			`if account is None:`
			`account = User.select(fn.Min(User.id)).tuples().get()[0]`
keys ui WIP 2016-04-01 17:55:29 +00:00
modify log_action to internally resolve IDs 2016-08-05 20:37:04 +00:00			`if performer is not None:`
			`performer = performer.id`

			`if repository is not None:`
			`repository = repository.id`

Always use log entry kind cache Fixes #1445 2016-05-13 19:20:55 +00:00			`kind = _get_log_entry_kind(kind_name)`
keys ui WIP 2016-04-01 17:55:29 +00:00			`metadata_json = json.dumps(metadata, default=_json_serialize)`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`LogEntry.create(kind=kind, account=account, performer=performer,`
keys ui WIP 2016-04-01 17:55:29 +00:00			`repository=repository, ip=ip, metadata_json=metadata_json,`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`datetime=timestamp)`


add a log rotation worker Fixes #609. 2015-10-09 19:41:56 +00:00			`def get_stale_logs_start_id():`
			`""" Gets the oldest log entry. """`
			`try:`
			`return (LogEntry`
			`.select(LogEntry.id)`
			`.order_by(LogEntry.id)`
			`.limit(1)`
			`.tuples())[0][0]`
			`except IndexError:`
			`return None`


vastly simplify log rotation 2016-02-09 20:20:52 +00:00			`def get_stale_logs_cutoff_id(cutoff_date):`
add a log rotation worker Fixes #609. 2015-10-09 19:41:56 +00:00			`""" Gets the most recent ID created before the cutoff_date. """`
			`try:`
			`return (LogEntry`
log: cutoff at the max id past the cutoff_date Previously we were using the min, which is always going to be equivalant to the min id in the table. 2016-05-11 03:13:10 +00:00			`.select(fn.Max(LogEntry.id))`
add a log rotation worker Fixes #609. 2015-10-09 19:41:56 +00:00			`.where(LogEntry.datetime <= cutoff_date)`
			`.tuples())[0][0]`
			`except IndexError:`
			`return None`


			`def get_stale_logs(start_id, end_id):`
			`""" Returns all the logs with IDs between start_id and end_id inclusively. """`
			`return LogEntry.select().where((LogEntry.id >= start_id), (LogEntry.id <= end_id))`


			`def delete_stale_logs(start_id, end_id):`
			`""" Deletes all the logs with IDs between start_id and end_id. """`
			`LogEntry.delete().where((LogEntry.id >= start_id), (LogEntry.id <= end_id)).execute()`
Change repo stats to use the RAC table and a nice UI 2016-06-22 18:50:59 +00:00

			`def get_repository_action_counts(repo, start_date):`
			`return RepositoryActionCount.select().where(RepositoryActionCount.repository == repo,`
			`RepositoryActionCount.date >= start_date)`


			`def get_repositories_action_sums(repository_ids):`
			`if not repository_ids:`
			`return {}`

			`# Filter the join to recent entries only.`
			`last_week = datetime.now() - timedelta(weeks=1)`
			`tuples = (RepositoryActionCount`
			`.select(RepositoryActionCount.repository, fn.Sum(RepositoryActionCount.count))`
			`.where(RepositoryActionCount.repository << repository_ids)`
			`.where(RepositoryActionCount.date >= last_week)`
			`.group_by(RepositoryActionCount.repository)`
			`.tuples())`

			`action_count_map = {}`
			`for record in tuples:`
			`action_count_map[record[0]] = record[1]`

			`return action_count_map`