quay/data/model/_basequery.py

import logging

from peewee import fn, PeeweeException
from cachetools import lru_cache

from datetime import datetime, timedelta

from data.model import DataModelException, config
from data.database import (Repository, User, Team, TeamMember, RepositoryPermission, TeamRole,
                           Namespace, Visibility, ImageStorage, Image, RepositoryKind,
                           db_for_update)

logger = logging.getLogger(__name__)

def reduce_as_tree(queries_to_reduce):
  """ Combines a list of queries into a single query by unioning them as a balanced tree.

      The list is split into halves recursively until we reach individual queries, at which point
      the queries (or the already unioned subqueries) are combined with `union_all`.
      This works around a bug in peewee SQL generation where reducing linearly generates a chain
      of queries that will exceed the recursion depth limit when it has around 80 queries.

      `queries_to_reduce` must be a non-empty list: an empty list raises IndexError. A list holding
      a single query returns that query unchanged.
      """
  # Floor division keeps the midpoint an integer on both Python 2 and Python 3; a plain `/`
  # produces a float on Python 3, which cannot be used as a slice index.
  mid = len(queries_to_reduce) // 2
  left = queries_to_reduce[:mid]
  right = queries_to_reduce[mid:]

  # The right half always receives the extra element, so it is never empty for non-empty input.
  to_reduce_right = right[0]
  if len(right) > 1:
    to_reduce_right = reduce_as_tree(right)

  if len(left) > 1:
    to_reduce_left = reduce_as_tree(left)
  elif len(left) == 1:
    to_reduce_left = left[0]
  else:
    # Only a single query was supplied; there is nothing to union it with.
    return to_reduce_right

  return to_reduce_left.union_all(to_reduce_right)


def get_existing_repository(namespace_name, repository_name, for_update=False, kind_filter=None):
  """ Fetches the repository called `repository_name` under the namespace whose username is
      `namespace_name`, selecting its Namespace row alongside it.

      If `kind_filter` is truthy, only a repository whose RepositoryKind has that name is matched.
      If `for_update` is truthy, the query is passed through `db_for_update` before executing.
      The lookup is performed with peewee's `get()`, which is expected to raise the model's
      DoesNotExist exception when no row matches.
      """
  namespace_join = (Repository.namespace_user == Namespace.id)
  found = Repository.select(Repository, Namespace).join(Namespace, on=namespace_join)
  found = found.where(Namespace.username == namespace_name, Repository.name == repository_name)

  if kind_filter:
    # Join from Repository (not Namespace) to reach the kind table.
    found = found.switch(Repository).join(RepositoryKind)
    found = found.where(RepositoryKind.name == kind_filter)

  if for_update:
    found = db_for_update(found)

  return found.get()


@lru_cache(maxsize=1)
def get_public_repo_visibility():
  """ Returns the Visibility row named 'public'. The result is memoized by `lru_cache`. """
  public_visibility = Visibility.get(name='public')
  return public_visibility


def _lookup_team_role(name):
  """ Returns the TeamRole with the given name, using the cached name -> role mapping.
      Raises KeyError if there is no role with that name.
      """
  roles_by_name = _lookup_team_roles()
  return roles_by_name[name]


@lru_cache(maxsize=1)
def _lookup_team_roles():
  """ Returns a dict mapping each TeamRole's name to its row. Memoized by `lru_cache`. """
  roles_by_name = {}
  for team_role in TeamRole.select():
    roles_by_name[team_role.name] = team_role

  return roles_by_name


def filter_to_repos_for_user(query, user_id=None, namespace=None, repo_kind='image',
                             include_public=True, start_id=None):
  """ Restricts the given repository query to the repositories visible under the given filters.

      query: the base query to filter. NOTE(review): the filters below reference Namespace and
             switch to RepositoryPermission, so the incoming query presumably already joins both
             tables; confirm against callers.
      user_id: if not None, repositories are included when the user has a direct permission, is
               a member of a team with a permission, or is a member of an admin-role team in the
               repository's namespace.
      namespace: if truthy, only repositories whose namespace username equals it are returned.
      repo_kind: name of the repository kind to filter on, or None to skip the kind filter.
                 Raises DataModelException if the kind is unknown.
      include_public: whether repositories with the public visibility are included.
      start_id: if not None, only repositories with an ID >= start_id are returned.

      Returns the union (`|`) of the per-rule queries. If neither public repositories nor a user
      were requested, returns a query filtered on an ID of '-1' (presumably matching no rows).
      """
  # `reduce` is only a builtin on Python 2; importing it from functools works on both 2 and 3.
  from functools import reduce

  if not include_public and not user_id:
    return Repository.select().where(Repository.id == '-1')

  # Filter on the type of repository.
  if repo_kind is not None:
    try:
      query = query.where(Repository.kind == Repository.kind.get_id(repo_kind))
    except RepositoryKind.DoesNotExist:
      raise DataModelException('Unknown repository kind')

  # Add the start ID if necessary.
  if start_id is not None:
    query = query.where(Repository.id >= start_id)

  # Add a namespace filter if necessary.
  if namespace:
    query = query.where(Namespace.username == namespace)

  # Build a set of queries that, when unioned together, return the full set of visible repositories
  # for the filters specified.
  queries = []

  if include_public:
    queries.append(query.where(Repository.visibility == get_public_repo_visibility()))

  if user_id is not None:
    # Aliases keep the org-admin joins distinct from the Team/TeamMember joins used by the
    # team-permission query.
    AdminTeam = Team.alias()
    AdminTeamMember = TeamMember.alias()

    # Add repositories in which the user has permission.
    queries.append(query
                   .switch(RepositoryPermission)
                   .where(RepositoryPermission.user == user_id))

    # Add repositories in which the user is a member of a team that has permission.
    queries.append(query
                   .switch(RepositoryPermission)
                   .join(Team)
                   .join(TeamMember)
                   .where(TeamMember.user == user_id))

    # Add repositories under namespaces in which the user is the org admin.
    queries.append(query
                   .switch(Repository)
                   .join(AdminTeam, on=(Repository.namespace_user == AdminTeam.organization))
                   .join(AdminTeamMember, on=(AdminTeam.id == AdminTeamMember.team))
                   .where(AdminTeam.role == _lookup_team_role('admin'))
                   .where(AdminTeamMember.user == user_id))

  # At this point `queries` holds at least one entry: either the public query, or the user
  # queries (the early return above handles the case where neither applies).
  return reduce(lambda l, r: l | r, queries)


def get_user_organizations(username):
  """ Returns a query for the distinct organization User rows that have at least one team of which
      the user with the given username is a member.
      """
  # The member is itself a User row, so alias the table to tell it apart from the organization.
  Member = User.alias()

  organizations = User.select().distinct()
  organizations = organizations.join(Team).join(TeamMember)
  organizations = organizations.join(Member, on=(Member.id == TeamMember.user))
  return organizations.where(User.organization == True, Member.username == username)


def calculate_image_aggregate_size(ancestors_str, image_size, parent_image):
  """ Computes the aggregate size of an image: its own size plus the size of its ancestors.

      ancestors_str: '/'-delimited ancestor ID string; the first and last segments are ignored.
      image_size: the size of the image itself.
      parent_image: the parent image, whose `aggregate_size` is used when available.

      Returns `image_size` when there are no ancestors, or None if the ancestor sizes cannot be
      summed. Raises DataModelException if there are ancestors but `parent_image` is None.
      """
  ancestor_ids = ancestors_str.split('/')[1:-1]
  if len(ancestor_ids) == 0:
    return image_size

  if parent_image is None:
    raise DataModelException('Could not load parent image')

  ancestors_total = parent_image.aggregate_size
  if ancestors_total is None:
    # Fallback to a slower path if the parent doesn't have an aggregate size saved.
    # TODO: remove this code if/when we do a full backfill.
    size_query = (ImageStorage
                  .select(fn.Sum(ImageStorage.image_size))
                  .join(Image)
                  .where(Image.id << ancestor_ids))
    ancestors_total = size_query.scalar()
    if ancestors_total is None:
      return None

  return ancestors_total + image_size


def update_last_accessed(token_or_user):
  """ Records the current UTC time in the `last_accessed` field of the given token or user, both
      in the database and on the passed-in object. The write is skipped when the existing value
      is more recent than the configured `LAST_ACCESSED_UPDATE_THRESHOLD_S` (default: 120s).

      Database errors are logged and swallowed if `ALLOW_PULLS_WITHOUT_STRICT_LOGGING` is set in
      the app config; otherwise they are re-raised.
      """
  threshold_seconds = config.app_config.get('LAST_ACCESSED_UPDATE_THRESHOLD_S', 120)
  threshold = timedelta(seconds=threshold_seconds)

  if token_or_user.last_accessed is not None:
    if datetime.utcnow() - token_or_user.last_accessed < threshold:
      # Skip updating, as we don't want to put undue pressure on the database.
      return

  model_class = token_or_user.__class__
  accessed_at = datetime.utcnow()

  try:
    update_query = model_class.update(last_accessed=accessed_at)
    update_query.where(model_class.id == token_or_user.id).execute()
    token_or_user.last_accessed = accessed_at
  except PeeweeException as ex:
    # If there is any form of DB exception, only fail if strict logging is enabled.
    if not config.app_config.get('ALLOW_PULLS_WITHOUT_STRICT_LOGGING'):
      raise

    log_context = {
      'exception': ex,
      'token_or_user': token_or_user.id,
      'class': str(model_class),
    }
    logger.exception('update last_accessed for token/user failed', extra=log_context)
Add last_accessed information to User and expose for robot accounts Fixes https://jira.coreos.com/browse/QUAY-848 2018-03-13 00:30:19 +00:00			`import logging`

			`from peewee import fn, PeeweeException`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`from cachetools import lru_cache`

Add last_accessed information to User and expose for robot accounts Fixes https://jira.coreos.com/browse/QUAY-848 2018-03-13 00:30:19 +00:00			`from datetime import datetime, timedelta`

			`from data.model import DataModelException, config`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`from data.database import (Repository, User, Team, TeamMember, RepositoryPermission, TeamRole,`
Disallow non-apps-supported APIs for application repositories 2017-03-22 18:30:13 +00:00			`Namespace, Visibility, ImageStorage, Image, RepositoryKind,`
			`db_for_update)`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00
Add last_accessed information to User and expose for robot accounts Fixes https://jira.coreos.com/browse/QUAY-848 2018-03-13 00:30:19 +00:00			`logger = logging.getLogger(__name__)`

Add a batch `get_matching_tags_for_images` method This will be used in the security notification worker to retrieving the tags needed in a set of batch calls, rather than multiple calls per image 2017-05-02 19:38:25 +00:00			`def reduce_as_tree(queries_to_reduce):`
			`""" This method will split a list of queries into halves recursively until we reach individual`
			`queries, at which point it will start unioning the queries, or the already unioned subqueries.`
			`This works around a bug in peewee SQL generation where reducing linearly generates a chain`
			`of queries that will exceed the recursion depth limit when it has around 80 queries.`
			`"""`
			`mid = len(queries_to_reduce)/2`
			`left = queries_to_reduce[:mid]`
			`right = queries_to_reduce[mid:]`

			`to_reduce_right = right[0]`
			`if len(right) > 1:`
			`to_reduce_right = reduce_as_tree(right)`

			`if len(left) > 1:`
			`to_reduce_left = reduce_as_tree(left)`
			`elif len(left) == 1:`
			`to_reduce_left = left[0]`
			`else:`
			`return to_reduce_right`

			`return to_reduce_left.union_all(to_reduce_right)`

Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00
Disallow non-apps-supported APIs for application repositories 2017-03-22 18:30:13 +00:00			`def get_existing_repository(namespace_name, repository_name, for_update=False, kind_filter=None):`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`query = (Repository`
			`.select(Repository, Namespace)`
			`.join(Namespace, on=(Repository.namespace_user == Namespace.id))`
data.model._basequery: audited for repo_kind usage 2017-03-20 23:05:25 +00:00			`.where(Namespace.username == namespace_name,`
			`Repository.name == repository_name))`
Disallow non-apps-supported APIs for application repositories 2017-03-22 18:30:13 +00:00
			`if kind_filter:`
			`query = (query`
			`.switch(Repository)`
			`.join(RepositoryKind)`
			`.where(RepositoryKind.name == kind_filter))`

Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`if for_update:`
			`query = db_for_update(query)`

			`return query.get()`


			`@lru_cache(maxsize=1)`
			`def get_public_repo_visibility():`
			`return Visibility.get(name='public')`


Fix filtering of repos only visible to org admins 2016-08-31 17:51:53 +00:00			`def _lookup_team_role(name):`
Fix build by pre-calling the caches They were being called in a test-dependent order, which caused any tests which relied on query count to fail 2017-06-27 15:11:46 +00:00			`return _lookup_team_roles()[name]`


			`@lru_cache(maxsize=1)`
			`def _lookup_team_roles():`
			`return {role.name:role for role in TeamRole.select()}`
Fix filtering of repos only visible to org admins 2016-08-31 17:51:53 +00:00

Change repo filtering for users to use a user ID reference, rather than the username While this means we need an additional query for initial lookup, it makes the filtering query (which is the heavy part) require far fewer joins, thus making it more efficient. Also adds a new unit test to verify that our filter filters to the correct set of repositories. 2018-06-19 14:51:30 +00:00			`def filter_to_repos_for_user(query, user_id=None, namespace=None, repo_kind='image',`
data.model._basequery: audited for repo_kind usage 2017-03-20 23:05:25 +00:00			`include_public=True, start_id=None):`
Change repo filtering for users to use a user ID reference, rather than the username While this means we need an additional query for initial lookup, it makes the filtering query (which is the heavy part) require far fewer joins, thus making it more efficient. Also adds a new unit test to verify that our filter filters to the correct set of repositories. 2018-06-19 14:51:30 +00:00			`if not include_public and not user_id:`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`return Repository.select().where(Repository.id == '-1')`

data.model._basequery: audited for repo_kind usage 2017-03-20 23:05:25 +00:00			`# Filter on the type of repository.`
Implement new search UI We now have both autocomplete-based searching for quick results, as well as a full search page for a full listing of results 2017-04-07 21:25:44 +00:00			`if repo_kind is not None:`
			`try:`
			`query = query.where(Repository.kind == Repository.kind.get_id(repo_kind))`
			`except RepositoryKind.DoesNotExist:`
			`raise DataModelException('Unknown repository kind')`
data.model._basequery: audited for repo_kind usage 2017-03-20 23:05:25 +00:00
Fix pagination of repositories Fixes #1725 2016-08-15 20:11:45 +00:00			`# Add the start ID if necessary.`
			`if start_id is not None:`
			`query = query.where(Repository.id >= start_id)`

Change repo filtering for users to use a user ID reference, rather than the username While this means we need an additional query for initial lookup, it makes the filtering query (which is the heavy part) require far fewer joins, thus making it more efficient. Also adds a new unit test to verify that our filter filters to the correct set of repositories. 2018-06-19 14:51:30 +00:00			`# Add a namespace filter if necessary.`
			`if namespace:`
			`query = query.where(Namespace.username == namespace)`

Unionize the mega query - It needed more performance-based benefits 2015-10-07 17:00:12 +00:00			`# Build a set of queries that, when unioned together, return the full set of visible repositories`
			`# for the filters specified.`
			`queries = []`

			`if include_public:`
Upgrade Peewee to latest 3.x This requires a number of small changes in the data model code, as well as additional testing. 2018-04-06 17:48:01 +00:00			`queries.append(query.where(Repository.visibility == get_public_repo_visibility()))`
Unionize the mega query - It needed more performance-based benefits 2015-10-07 17:00:12 +00:00
Change repo filtering for users to use a user ID reference, rather than the username While this means we need an additional query for initial lookup, it makes the filtering query (which is the heavy part) require far fewer joins, thus making it more efficient. Also adds a new unit test to verify that our filter filters to the correct set of repositories. 2018-06-19 14:51:30 +00:00			`if user_id is not None:`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00			`AdminTeam = Team.alias()`
			`AdminTeamMember = TeamMember.alias()`

Unionize the mega query - It needed more performance-based benefits 2015-10-07 17:00:12 +00:00			`# Add repositories in which the user has permission.`
Cleanup some indentation and imports 2016-01-05 17:12:57 +00:00			`queries.append(query`
			`.switch(RepositoryPermission)`
Change repo filtering for users to use a user ID reference, rather than the username While this means we need an additional query for initial lookup, it makes the filtering query (which is the heavy part) require far fewer joins, thus making it more efficient. Also adds a new unit test to verify that our filter filters to the correct set of repositories. 2018-06-19 14:51:30 +00:00			`.where(RepositoryPermission.user == user_id))`
Unionize the mega query - It needed more performance-based benefits 2015-10-07 17:00:12 +00:00
			`# Add repositories in which the user is a member of a team that has permission.`
Cleanup some indentation and imports 2016-01-05 17:12:57 +00:00			`queries.append(query`
			`.switch(RepositoryPermission)`
			`.join(Team)`
			`.join(TeamMember)`
Change repo filtering for users to use a user ID reference, rather than the username While this means we need an additional query for initial lookup, it makes the filtering query (which is the heavy part) require far fewer joins, thus making it more efficient. Also adds a new unit test to verify that our filter filters to the correct set of repositories. 2018-06-19 14:51:30 +00:00			`.where(TeamMember.user == user_id))`
Unionize the mega query - It needed more performance-based benefits 2015-10-07 17:00:12 +00:00
			`# Add repositories under namespaces in which the user is the org admin.`
Cleanup some indentation and imports 2016-01-05 17:12:57 +00:00			`queries.append(query`
			`.switch(Repository)`
Change repo filtering for users to use a user ID reference, rather than the username While this means we need an additional query for initial lookup, it makes the filtering query (which is the heavy part) require far fewer joins, thus making it more efficient. Also adds a new unit test to verify that our filter filters to the correct set of repositories. 2018-06-19 14:51:30 +00:00			`.join(AdminTeam, on=(Repository.namespace_user == AdminTeam.organization))`
Cleanup some indentation and imports 2016-01-05 17:12:57 +00:00			`.join(AdminTeamMember, on=(AdminTeam.id == AdminTeamMember.team))`
Change repo filtering for users to use a user ID reference, rather than the username While this means we need an additional query for initial lookup, it makes the filtering query (which is the heavy part) require far fewer joins, thus making it more efficient. Also adds a new unit test to verify that our filter filters to the correct set of repositories. 2018-06-19 14:51:30 +00:00			`.where(AdminTeam.role == _lookup_team_role('admin'))`
			`.where(AdminTeamMember.user == user_id))`
Unionize the mega query - It needed more performance-based benefits 2015-10-07 17:00:12 +00:00
			`return reduce(lambda l, r: l \| r, queries)`
Accidental refactor, split out legacy.py into separate sumodules and update all call sites. 2015-07-15 21:25:41 +00:00

			`def get_user_organizations(username):`
			`UserAlias = User.alias()`
			`return (User`
			`.select()`
			`.distinct()`
			`.join(Team)`
			`.join(TeamMember)`
			`.join(UserAlias, on=(UserAlias.id == TeamMember.user))`
			`.where(User.organization == True, UserAlias.username == username))`
Hash v1 uploads for torrent chunks 2016-01-05 17:14:52 +00:00

			`def calculate_image_aggregate_size(ancestors_str, image_size, parent_image):`
			`ancestors = ancestors_str.split('/')[1:-1]`
			`if not ancestors:`
			`return image_size`

			`if parent_image is None:`
			`raise DataModelException('Could not load parent image')`

			`ancestor_size = parent_image.aggregate_size`
			`if ancestor_size is not None:`
			`return ancestor_size + image_size`

			`# Fallback to a slower path if the parent doesn't have an aggregate size saved.`
			`# TODO: remove this code if/when we do a full backfill.`
			`ancestor_size = (ImageStorage`
			`.select(fn.Sum(ImageStorage.image_size))`
			`.join(Image)`
			`.where(Image.id << ancestors)`
			`.scalar())`
			`if ancestor_size is None:`
			`return None`

			`return ancestor_size + image_size`
Add last_accessed information to User and expose for robot accounts Fixes https://jira.coreos.com/browse/QUAY-848 2018-03-13 00:30:19 +00:00

			`def update_last_accessed(token_or_user):`
			""" Updates the `last_accessed` field on the given token or user. If the existing field's value
			`is within the configured threshold, the update is skipped. """`
			`threshold = timedelta(seconds=config.app_config.get('LAST_ACCESSED_UPDATE_THRESHOLD_S', 120))`
			`if (token_or_user.last_accessed is not None and`
			`datetime.utcnow() - token_or_user.last_accessed < threshold):`
			`# Skip updating, as we don't want to put undue pressure on the database.`
			`return`

			`model_class = token_or_user.__class__`
			`last_accessed = datetime.utcnow()`

			`try:`
			`(model_class`
			`.update(last_accessed=last_accessed)`
			`.where(model_class.id == token_or_user.id)`
			`.execute())`
			`token_or_user.last_accessed = last_accessed`
			`except PeeweeException as ex:`
			`# If there is any form of DB exception, only fail if strict logging is enabled.`
			`strict_logging_disabled = config.app_config.get('ALLOW_PULLS_WITHOUT_STRICT_LOGGING')`
			`if strict_logging_disabled:`
			`data = {`
			`'exception': ex,`
			`'token_or_user': token_or_user.id,`
			`'class': str(model_class),`
			`}`

			`logger.exception('update last_accessed for token/user failed', extra=data)`
			`else:`
			`raise`