Merge branch 'master' into nomenclature

Conflicts:
	test/data/test.db
Jake Moshenko 2014-11-17 17:59:59 -05:00
commit f4681f2c18
60 changed files with 1716 additions and 496 deletions


@@ -3,7 +3,7 @@ import logging
 import dateutil.parser
 import json
 
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, date
 from data.database import (User, Repository, Image, AccessToken, Role, RepositoryPermission,
                            Visibility, RepositoryTag, EmailConfirmation, FederatedLogin,
@@ -14,7 +14,7 @@ from data.database import (User, Repository, Image, AccessToken, Role, Repositor
                            ExternalNotificationEvent, ExternalNotificationMethod,
                            RepositoryNotification, RepositoryAuthorizedEmail, TeamMemberInvite,
                            DerivedImageStorage, ImageStorageTransformation, random_string_generator,
-                           db, BUILD_PHASE)
+                           db, BUILD_PHASE, QuayUserField)
 from peewee import JOIN_LEFT_OUTER, fn
 from util.validation import (validate_username, validate_email, validate_password,
                              INVALID_PASSWORD_MESSAGE)
@@ -288,6 +288,7 @@ def delete_robot(robot_username):
   try:
     robot = User.get(username=robot_username, robot=True)
     robot.delete_instance(recursive=True, delete_nullable=True)
   except User.DoesNotExist:
     raise InvalidRobotException('Could not find robot with username: %s' %
                                 robot_username)
@@ -632,7 +633,7 @@ def get_matching_users(username_prefix, robot_namespace=None,
   query = (User
            .select(User.username, User.robot)
-           .group_by(User.username)
+           .group_by(User.username, User.robot)
            .where(direct_user_query))
 
   if organization:
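
Editor's note: the extra GROUP BY column above is a correctness fix, not a cosmetic one: selecting `User.robot` while grouping only by `User.username` is ambiguous, and strict SQL modes (e.g. MySQL's ONLY_FULL_GROUP_BY) reject it. A minimal sqlite sketch of the corrected shape, with a toy `user` table standing in for the real model:

```python
import sqlite3

# Toy stand-in for the User table; grouping by every selected column is the
# portable form of the query above.
conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE user (username TEXT, robot INTEGER)')
conn.executemany('INSERT INTO user VALUES (?, ?)',
                 [('alice', 0), ('alice+builder', 1), ('alice+builder', 1)])

rows = conn.execute('SELECT username, robot FROM user '
                    'GROUP BY username, robot ORDER BY username').fetchall()
assert rows == [('alice', 0), ('alice+builder', 1)]
```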
@@ -829,8 +830,10 @@ def _filter_to_repos_for_user(query, username=None, namespace=None,
   if namespace:
     where_clause = where_clause & (Namespace.username == namespace)
 
+  # TODO(jschorr, jake): Figure out why the old join on Visibility was so darn slow and
+  # remove this hack.
   if include_public:
-    new_clause = (Visibility.name == 'public')
+    new_clause = (Repository.visibility == _get_public_repo_visibility())
     if where_clause:
       where_clause = where_clause | new_clause
     else:
@@ -839,6 +842,16 @@ def _filter_to_repos_for_user(query, username=None, namespace=None,
   return query.where(where_clause)
 
+
+_public_repo_visibility_cache = None
+def _get_public_repo_visibility():
+  global _public_repo_visibility_cache
+  if not _public_repo_visibility_cache:
+    _public_repo_visibility_cache = Visibility.get(name='public')
+  return _public_repo_visibility_cache
+
 
 def get_matching_repositories(repo_term, username=None):
   namespace_term = repo_term
   name_term = repo_term
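
Editor's note: `_get_public_repo_visibility` above trades the old per-query join against `Visibility` for a one-time row lookup cached at module scope (see the TODO about the slow join). A self-contained sqlite sketch of the two query shapes; schema and names are toys, illustrative only:

```python
import sqlite3

# Two equivalent filters: joining the small lookup table on every query (the
# old shape), vs. resolving the 'public' row once and filtering on the
# foreign key directly (the new, cacheable shape).
conn = sqlite3.connect(':memory:')
conn.executescript("""
  CREATE TABLE visibility (id INTEGER PRIMARY KEY, name TEXT);
  CREATE TABLE repository (id INTEGER PRIMARY KEY, visibility_id INTEGER);
  INSERT INTO visibility VALUES (1, 'public'), (2, 'private');
  INSERT INTO repository VALUES (10, 1), (11, 2);
""")

# Old shape: join + name comparison, repeated on every query.
old = conn.execute("""
  SELECT repository.id FROM repository
  JOIN visibility ON visibility.id = repository.visibility_id
  WHERE visibility.name = 'public'
""").fetchall()

# New shape: look the id up once (cacheable), then a plain column filter.
public_id, = conn.execute(
    "SELECT id FROM visibility WHERE name = 'public'").fetchone()
new = conn.execute(
    'SELECT id FROM repository WHERE visibility_id = ?', (public_id,)).fetchall()

assert old == new == [(10,)]
```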
@@ -1059,16 +1072,26 @@ def get_repository(namespace_name, repository_name):
     return None
 
-def get_repo_image(namespace_name, repository_name, image_id):
+def get_repo_image(namespace_name, repository_name, docker_image_id):
   def limit_to_image_id(query):
-    return query.where(Image.docker_image_id == image_id)
+    return query.where(Image.docker_image_id == docker_image_id).limit(1)
+
+  query = _get_repository_images(namespace_name, repository_name, limit_to_image_id)
+  try:
+    return query.get()
+  except Image.DoesNotExist:
+    return None
+
+
+def get_repo_image_extended(namespace_name, repository_name, docker_image_id):
+  def limit_to_image_id(query):
+    return query.where(Image.docker_image_id == docker_image_id).limit(1)
 
   images = _get_repository_images_base(namespace_name, repository_name, limit_to_image_id)
   if not images:
     return None
-  else:
-    return images[0]
+
+  return images[0]
def repository_is_public(namespace_name, repository_name):
try:
@@ -1161,20 +1184,21 @@ def __translate_ancestry(old_ancestry, translations, repository, username, prefe
   if old_ancestry == '/':
     return '/'
 
-  def translate_id(old_id):
+  def translate_id(old_id, docker_image_id):
     logger.debug('Translating id: %s', old_id)
     if old_id not in translations:
-      # Figure out which docker_image_id the old id refers to, then find a
-      # a local one
-      old = Image.select(Image.docker_image_id).where(Image.id == old_id).get()
-      image_in_repo = find_create_or_link_image(old.docker_image_id, repository, username,
+      image_in_repo = find_create_or_link_image(docker_image_id, repository, username,
                                                 translations, preferred_location)
       translations[old_id] = image_in_repo.id
     return translations[old_id]
 
+  # Select all the ancestor Docker IDs in a single query.
   old_ids = [int(id_str) for id_str in old_ancestry.split('/')[1:-1]]
-  new_ids = [str(translate_id(old_id)) for old_id in old_ids]
+  query = Image.select(Image.id, Image.docker_image_id).where(Image.id << old_ids)
+  old_images = {i.id: i.docker_image_id for i in query}
+
+  # Translate the old images into new ones.
+  new_ids = [str(translate_id(old_id, old_images[old_id])) for old_id in old_ids]
   return '/%s/' % '/'.join(new_ids)
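
Editor's note: the rewrite above removes an N+1 pattern: instead of one `Image.select(...).get()` per ancestor, all ancestor rows are fetched with a single `Image.id << old_ids` query and held in a dict. A small self-contained sketch of the same idea, with a plain dict standing in for the Image table:

```python
# id -> docker_image_id, standing in for the Image table.
_IMAGES = {1: 'sha-aaa', 2: 'sha-bbb', 3: 'sha-ccc'}

def translate_ancestry(old_ancestry):
    # Ancestry strings look like '/1/2/3/'; the slice drops the empty ends.
    old_ids = [int(part) for part in old_ancestry.split('/')[1:-1]]

    # One batched "query" for every id, instead of one lookup per ancestor.
    old_images = {image_id: _IMAGES[image_id] for image_id in old_ids}

    return [old_images[old_id] for old_id in old_ids]

assert translate_ancestry('/3/1/') == ['sha-ccc', 'sha-aaa']
```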
@@ -1186,36 +1210,22 @@ def _create_storage(location_name):
   return storage
 
-def find_create_or_link_image(docker_image_id, repository, username, translations,
-                              preferred_location):
+def _find_or_link_image(existing_image, repository, username, translations, preferred_location):
+  # TODO(jake): This call is currently recursively done under a single transaction. Can we make
+  # it instead be done under a set of transactions?
   with config.app_config['DB_TRANSACTION_FACTORY'](db):
+    # Check for an existing image, under the transaction, to make sure it doesn't already exist.
     repo_image = get_repo_image(repository.namespace_user.username, repository.name,
-                                docker_image_id)
+                                existing_image.docker_image_id)
     if repo_image:
       return repo_image
 
-    query = (Image
-             .select(Image, ImageStorage)
-             .distinct()
-             .join(ImageStorage)
-             .switch(Image)
-             .join(Repository)
-             .join(Visibility)
-             .switch(Repository)
-             .join(RepositoryPermission, JOIN_LEFT_OUTER)
-             .switch(Repository)
-             .join(Namespace, on=(Repository.namespace_user == Namespace.id))
-             .where(ImageStorage.uploading == False))
-
-    query = (_filter_to_repos_for_user(query, username)
-             .where(Image.docker_image_id == docker_image_id))
-
-    new_image_ancestry = '/'
-    origin_image_id = None
+    # Make sure the existing base image still exists.
     try:
-      to_copy = query.get()
+      to_copy = Image.select().join(ImageStorage).where(Image.id == existing_image.id).get()
 
       msg = 'Linking image to existing storage with docker id: %s and uuid: %s'
-      logger.debug(msg, docker_image_id, to_copy.storage.uuid)
+      logger.debug(msg, existing_image.docker_image_id, to_copy.storage.uuid)
 
       new_image_ancestry = __translate_ancestry(to_copy.ancestors, translations, repository,
                                                 username, preferred_location)
@@ -1223,25 +1233,71 @@ def find_create_or_link_image(docker_image_id, repository, username, translation
       storage = to_copy.storage
       storage.locations = {placement.location.name
                            for placement in storage.imagestorageplacement_set}
-      origin_image_id = to_copy.id
+
+      new_image = Image.create(docker_image_id=existing_image.docker_image_id,
+                               repository=repository, storage=storage,
+                               ancestors=new_image_ancestry)
+
+      logger.debug('Storing translation %s -> %s', existing_image.id, new_image.id)
+      translations[existing_image.id] = new_image.id
+      return new_image
     except Image.DoesNotExist:
-      logger.debug('Creating new storage for docker id: %s', docker_image_id)
-      storage = _create_storage(preferred_location)
-      logger.debug('Storage locations: %s', storage.locations)
-
-    new_image = Image.create(docker_image_id=docker_image_id,
-                             repository=repository, storage=storage,
-                             ancestors=new_image_ancestry)
-    logger.debug('new_image storage locations: %s', new_image.storage.locations)
+      return None
 
-    if origin_image_id:
-      logger.debug('Storing translation %s -> %s', origin_image_id, new_image.id)
-      translations[origin_image_id] = new_image.id
-    return new_image
 
+def find_create_or_link_image(docker_image_id, repository, username, translations,
+                              preferred_location):
+  # First check for the image existing in the repository. If found, we simply return it.
+  repo_image = get_repo_image(repository.namespace_user.username, repository.name,
+                              docker_image_id)
+  if repo_image:
+    return repo_image
+
+  # We next check to see if there is an existing storage the new image can link to.
+  existing_image_query = (Image
+                          .select(Image, ImageStorage)
+                          .distinct()
+                          .join(ImageStorage)
+                          .switch(Image)
+                          .join(Repository)
+                          .join(RepositoryPermission, JOIN_LEFT_OUTER)
+                          .switch(Repository)
+                          .join(Namespace, on=(Repository.namespace_user == Namespace.id))
+                          .where(ImageStorage.uploading == False))
+
+  existing_image_query = (_filter_to_repos_for_user(existing_image_query, username)
+                          .where(Image.docker_image_id == docker_image_id))
+
+  # If there is an existing image, we try to translate its ancestry and copy its storage.
+  new_image = None
+  try:
+    logger.debug('Looking up existing image for ID: %s', docker_image_id)
+    existing_image = existing_image_query.get()
+    logger.debug('Existing image %s found for ID: %s', existing_image.id, docker_image_id)
+    new_image = _find_or_link_image(existing_image, repository, username, translations,
+                                    preferred_location)
+    if new_image:
+      return new_image
+  except Image.DoesNotExist:
+    logger.debug('No existing image found for ID: %s', docker_image_id)
+    pass
+
+  # Otherwise, create a new storage directly.
+  with config.app_config['DB_TRANSACTION_FACTORY'](db):
+    # Final check for an existing image, under the transaction.
+    repo_image = get_repo_image(repository.namespace_user.username, repository.name,
+                                docker_image_id)
+    if repo_image:
+      return repo_image
+
+    logger.debug('Creating new storage for docker id: %s', docker_image_id)
+    storage = _create_storage(preferred_location)
+
+    return Image.create(docker_image_id=docker_image_id,
+                        repository=repository, storage=storage,
+                        ancestors='/')
def find_or_create_derived_storage(source, transformation_name, preferred_location):
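
Editor's note: the refactored `find_create_or_link_image` narrows the transaction: the expensive existing-image lookup and storage linking run outside it, and only the final create re-checks for a concurrent insert inside the transaction. A sketch of that check / work / re-check shape, with a lock standing in for `DB_TRANSACTION_FACTORY` and toy names throughout:

```python
import threading

_lock = threading.Lock()   # stand-in for the database transaction
_images = {}               # (repo, docker_id) -> record

def find_create_or_link(repo, docker_id, make_record):
    # Cheap lookup outside the critical section.
    record = _images.get((repo, docker_id))
    if record is not None:
        return record

    # ... expensive linking work would happen here, also outside ...

    with _lock:
        # Re-check: another writer may have created the row in the meantime.
        record = _images.get((repo, docker_id))
        if record is None:
            record = make_record()
            _images[(repo, docker_id)] = record
        return record

assert find_create_or_link('library/ubuntu', 'abc123', dict) == {}
```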
@@ -1355,6 +1411,15 @@ def set_image_metadata(docker_image_id, namespace_name, repository_name, created
     fetched.storage.save()
     return fetched
 
+
+def _get_repository_images(namespace_name, repository_name, query_modifier):
+  query = (Image
+           .select()
+           .join(Repository)
+           .join(Namespace, on=(Repository.namespace_user == Namespace.id))
+           .where(Repository.name == repository_name, Namespace.username == namespace_name))
+  query = query_modifier(query)
+  return query
+
 
 def _get_repository_images_base(namespace_name, repository_name, query_modifier):
   query = (ImageStoragePlacement
@@ -1391,6 +1456,20 @@ def _get_repository_images_base(namespace_name, repository_name, query_modifier)
   return images.values()
 
+
+def lookup_repository_images(namespace_name, repository_name, docker_image_ids):
+  return (Image
+          .select()
+          .join(Repository)
+          .join(Namespace, on=(Repository.namespace_user == Namespace.id))
+          .where(Repository.name == repository_name, Namespace.username == namespace_name,
+                 Image.docker_image_id << docker_image_ids))
+
+
+def get_matching_repository_images(namespace_name, repository_name, docker_image_ids):
+  def modify_query(q):
+    return q.where(Image.docker_image_id << docker_image_ids)
+
+  return _get_repository_images_base(namespace_name, repository_name, modify_query)
 
 
 def get_repository_images(namespace_name, repository_name):
   return _get_repository_images_base(namespace_name, repository_name, lambda q: q)
@@ -1406,7 +1485,12 @@ def list_repository_tags(namespace_name, repository_name):
 def garbage_collect_repository(namespace_name, repository_name):
+  storage_id_whitelist = {}
+
   with config.app_config['DB_TRANSACTION_FACTORY'](db):
+    # TODO (jake): We could probably select this and all the images in a single query using
+    # a different kind of join.
+
     # Get a list of all images used by tags in the repository
     tag_query = (RepositoryTag
                  .select(RepositoryTag, Image, ImageStorage)
@@ -1425,29 +1509,31 @@ def garbage_collect_repository(namespace_name, repository_name):
       referenced_anscestors = referenced_anscestors.union(set(ancestor_list))
       referenced_anscestors.add(tag.image.id)
 
-    all_repo_images = get_repository_images(namespace_name, repository_name)
+    all_repo_images = _get_repository_images(namespace_name, repository_name, lambda q: q)
     all_images = {int(img.id): img for img in all_repo_images}
     to_remove = set(all_images.keys()).difference(referenced_anscestors)
 
     if len(to_remove) > 0:
       logger.info('Cleaning up unreferenced images: %s', to_remove)
       storage_id_whitelist = {all_images[to_remove_id].storage.id for to_remove_id in to_remove}
       Image.delete().where(Image.id << list(to_remove)).execute()
-      garbage_collect_storage(storage_id_whitelist)
 
+  if len(to_remove) > 0:
+    logger.info('Garbage collecting storage for images: %s', to_remove)
+    garbage_collect_storage(storage_id_whitelist)
+
   return len(to_remove)
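
Editor's note: the loop above is a mark-and-sweep over the image DAG: every image reachable from a tag via its ancestor list is marked, everything else is deleted, and storage collection now runs after the image transaction commits. A toy version of the mark phase, illustrative names only:

```python
def unreferenced_images(ancestors_by_image, tagged_images):
    """ancestors_by_image: {image_id: [ancestor ids]}; tagged_images: tag targets."""
    referenced = set()
    for image_id in tagged_images:
        referenced.add(image_id)               # the tagged image itself
        referenced.update(ancestors_by_image[image_id])  # and its whole ancestry
    return set(ancestors_by_image) - referenced

# Image 4 is neither tagged nor an ancestor of a tagged image, so it is swept.
images = {1: [], 2: [1], 3: [2, 1], 4: [1]}
assert unreferenced_images(images, tagged_images=[3]) == {4}
```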
 def garbage_collect_storage(storage_id_whitelist):
-  # We are going to make the conscious decision to not delete image storage inside the transaction
-  # This may end up producing garbage in s3, trading off for higher availability in the database
   if len(storage_id_whitelist) == 0:
     return
 
   def placements_query_to_paths_set(placements_query):
     return {(placement.location.name, config.store.image_path(placement.storage.uuid))
             for placement in placements_query}
 
-  def orphaned_storage_query(select_base_query, candidates):
+  def orphaned_storage_query(select_base_query, candidates, group_by):
     return (select_base_query
             .switch(ImageStorage)
             .join(Image, JOIN_LEFT_OUTER)
@@ -1455,14 +1541,19 @@ def garbage_collect_storage(storage_id_whitelist):
             .join(DerivedImageStorage, JOIN_LEFT_OUTER,
                   on=(ImageStorage.id == DerivedImageStorage.derivative))
             .where(ImageStorage.id << list(candidates))
-            .group_by(ImageStorage)
+            .group_by(*group_by)
             .having((fn.Count(Image.id) == 0) & (fn.Count(DerivedImageStorage.id) == 0)))
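
Editor's note: `orphaned_storage_query` finds storages with no remaining references by LEFT JOINing both referencing tables and keeping only groups where both counts are zero; the new `group_by` parameter apparently exists so the placement variant can group by every selected column. A minimal sqlite rendering of the counting trick, toy schema only:

```python
import sqlite3

conn = sqlite3.connect(':memory:')
conn.executescript("""
  CREATE TABLE storage (id INTEGER PRIMARY KEY);
  CREATE TABLE image   (id INTEGER PRIMARY KEY, storage_id INTEGER);
  INSERT INTO storage VALUES (1), (2);
  INSERT INTO image VALUES (10, 1);   -- storage 1 still referenced, 2 orphaned
""")

orphans = conn.execute("""
  SELECT storage.id FROM storage
  LEFT JOIN image ON image.storage_id = storage.id
  WHERE storage.id IN (1, 2)          -- the candidate whitelist
  GROUP BY storage.id
  HAVING COUNT(image.id) = 0
""").fetchall()

assert orphans == [(2,)]
```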
-  logger.debug('Garbage collecting storage from candidates: %s', storage_id_whitelist)
+  # Note: We remove the derived image storage in its own transaction as a way to reduce the
+  # time that the transaction holds on the database indicies. This could result in a derived
+  # image storage being deleted for an image storage which is later reused during this time,
+  # but since these are caches anyway, it isn't terrible and worth the tradeoff (for now).
+  logger.debug('Garbage collecting derived storage from candidates: %s', storage_id_whitelist)
   with config.app_config['DB_TRANSACTION_FACTORY'](db):
     # Find out which derived storages will be removed, and add them to the whitelist
     orphaned_from_candidates = list(orphaned_storage_query(ImageStorage.select(ImageStorage.id),
-                                                           storage_id_whitelist))
+                                                           storage_id_whitelist,
+                                                           (ImageStorage.id,)))
 
     if len(orphaned_from_candidates) > 0:
       derived_to_remove = (ImageStorage
@@ -1478,6 +1569,12 @@ def garbage_collect_storage(storage_id_whitelist):
                            .where(DerivedImageStorage.source << orphaned_from_candidates)
                            .execute())
 
+  # Note: Both of these deletes must occur in the same transaction (unfortunately) because a
+  # storage without any placement is invalid, and a placement cannot exist without a storage.
+  # TODO(jake): We might want to allow for null storages on placements, which would allow us to
+  # delete the storages, then delete the placements in a non-transaction.
+  logger.debug('Garbage collecting storages from candidates: %s', storage_id_whitelist)
+  with config.app_config['DB_TRANSACTION_FACTORY'](db):
     # Track all of the data that should be removed from blob storage
     placements_to_remove = orphaned_storage_query(ImageStoragePlacement
                                                   .select(ImageStoragePlacement,
@@ -1486,7 +1583,10 @@ def garbage_collect_storage(storage_id_whitelist):
                                                   .join(ImageStorageLocation)
                                                   .switch(ImageStoragePlacement)
                                                   .join(ImageStorage),
-                                                  storage_id_whitelist)
+                                                  storage_id_whitelist,
+                                                  (ImageStorage, ImageStoragePlacement,
+                                                   ImageStorageLocation))
 
     paths_to_remove = placements_query_to_paths_set(placements_to_remove.clone())
 
     # Remove the placements for orphaned storages
@@ -1499,14 +1599,17 @@ def garbage_collect_storage(storage_id_whitelist):
     # Remove the all orphaned storages
     orphaned_storages = list(orphaned_storage_query(ImageStorage.select(ImageStorage.id),
-                                                    storage_id_whitelist))
+                                                    storage_id_whitelist,
+                                                    (ImageStorage.id,)))
     if len(orphaned_storages) > 0:
       (ImageStorage
        .delete()
        .where(ImageStorage.id << orphaned_storages)
        .execute())
 
-  # Delete the actual blob storage
+  # We are going to make the conscious decision to not delete image storage blobs inside
+  # transactions.
+  # This may end up producing garbage in s3, trading off for higher availability in the database.
   for location_name, image_path in paths_to_remove:
     logger.debug('Removing %s from %s', image_path, location_name)
     config.store.remove({location_name}, image_path)
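
Editor's note: the comment above records a deliberate ordering: blob paths are computed while the rows still exist, the row deletes commit with the transaction, and only then does blob removal run, so a failed S3 delete strands an unreferenced blob rather than leaving a half-deleted database. A sketch of that ordering under an assumed transaction-factory interface (all parameters are hypothetical stand-ins):

```python
import contextlib

def sweep_blobs(transaction_factory, collect_paths, delete_rows, remove_blob):
    with transaction_factory():
        paths = list(collect_paths())  # materialize before the rows disappear
        delete_rows()                  # row deletes commit with the transaction
    # Outside the transaction: best-effort blob removal. A crash here leaves
    # stray blobs in storage, never a half-committed database.
    for location, path in paths:
        remove_blob(location, path)

# Toy usage; contextlib.nullcontext stands in for a real transaction.
sweep_blobs(contextlib.nullcontext, lambda: [('s3', 'images/abc')],
            lambda: None, lambda loc, path: None)
```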
@@ -1527,7 +1630,7 @@ def get_tag_image(namespace_name, repository_name, tag_name):
 
 def get_image_by_id(namespace_name, repository_name, docker_image_id):
-  image = get_repo_image(namespace_name, repository_name, docker_image_id)
+  image = get_repo_image_extended(namespace_name, repository_name, docker_image_id)
   if not image:
     raise DataModelException('Unable to find image \'%s\' for repo \'%s/%s\'' %
                              (docker_image_id, namespace_name, repository_name))
@@ -1714,7 +1817,7 @@ def purge_repository(namespace_name, repository_name):
   # Delete the rest of the repository metadata
   fetched = _get_repository(namespace_name, repository_name)
-  fetched.delete_instance(recursive=True)
+  fetched.delete_instance(recursive=True, delete_nullable=True)
def get_private_repo_count(username):
@@ -1758,11 +1861,10 @@ def get_repository_delegate_tokens(namespace_name, repository_name):
 def get_repo_delegate_token(namespace_name, repository_name, code):
   repo_query = get_repository_delegate_tokens(namespace_name, repository_name)
 
-  found = list(repo_query.where(AccessToken.code == code))
-  if found:
-    return found[0]
-  else:
+  try:
+    return repo_query.where(AccessToken.code == code).get()
+  except AccessToken.DoesNotExist:
     raise InvalidTokenException('Unable to find token with code: %s' % code)
@@ -1937,9 +2039,9 @@ def list_logs(start_time, end_time, performer=None, repository=None, namespace=N
   if namespace:
     joined = joined.where(User.username == namespace)
 
-  return joined.where(
+  return list(joined.where(
     LogEntry.datetime >= start_time,
-    LogEntry.datetime < end_time).order_by(LogEntry.datetime.desc())
+    LogEntry.datetime < end_time).order_by(LogEntry.datetime.desc()))
 
 
 def log_action(kind_name, user_or_organization_name, performer=None,
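
Editor's note: wrapping the result in `list(...)` makes `list_logs` return concrete rows instead of a lazy query object, so callers get something they can index, take the length of, and iterate more than once. A toy illustration with a generator standing in for the lazy cursor (peewee's own caching semantics aside):

```python
def run_query():
    # Stand-in for a lazy database cursor: a generator can be consumed once.
    return (('pull_repo', n) for n in range(2))

lazy = run_query()
assert list(lazy) == [('pull_repo', 0), ('pull_repo', 1)]
assert list(lazy) == []            # exhausted: a second pass sees nothing

snapshot = list(run_query())       # materialized once, like list(joined.where(...))
assert snapshot == snapshot[:]     # indexable, reusable, stable
```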
@@ -1951,7 +2053,7 @@ def log_action(kind_name, user_or_organization_name, performer=None,
   kind = LogEntryKind.get(LogEntryKind.name == kind_name)
   account = User.get(User.username == user_or_organization_name)
   LogEntry.create(kind=kind, account=account, performer=performer,
-                  repository=repository, access_token=access_token, ip=ip,
+                  repository=repository, ip=ip,
                   metadata_json=json.dumps(metadata), datetime=timestamp)
@@ -2239,6 +2341,18 @@ def confirm_team_invite(code, user):
   found.delete_instance()
   return (team, inviter)
 
+
+def get_repository_usage():
+  one_month_ago = date.today() - timedelta(weeks=4)
+  repo_pull = LogEntryKind.get(name = 'pull_repo')
+  repo_verb = LogEntryKind.get(name = 'repo_verb')
+
+  return (LogEntry.select(LogEntry.ip, LogEntry.repository)
+          .where((LogEntry.kind == repo_pull) | (LogEntry.kind == repo_verb))
+          .where(~(LogEntry.repository >> None))
+          .where(LogEntry.datetime >= one_month_ago)
+          .group_by(LogEntry.ip, LogEntry.repository)
+          .count())
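
Editor's note: the new `get_repository_usage` measures usage as the number of distinct (IP, repository) pairs among pull and verb events from the trailing four weeks; grouping on both columns and counting yields one row per pair. The same computation over plain dicts, illustrative field names only:

```python
from datetime import date, timedelta

def repository_usage(log_entries, today):
    cutoff = today - timedelta(weeks=4)
    # A set of (ip, repo) pairs plays the role of GROUP BY ip, repository.
    pairs = {(entry['ip'], entry['repo'])
             for entry in log_entries
             if entry['kind'] in ('pull_repo', 'repo_verb')
             and entry['repo'] is not None
             and entry['when'] >= cutoff}
    return len(pairs)

logs = [
    {'kind': 'pull_repo', 'ip': '10.0.0.1', 'repo': 'a/b', 'when': date(2014, 11, 10)},
    {'kind': 'pull_repo', 'ip': '10.0.0.1', 'repo': 'a/b', 'when': date(2014, 11, 12)},
    {'kind': 'push_repo', 'ip': '10.0.0.2', 'repo': 'a/b', 'when': date(2014, 11, 12)},
]
assert repository_usage(logs, today=date(2014, 11, 17)) == 1
```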
def archivable_buildlogs_query():
presumed_dead_date = datetime.utcnow() - PRESUMED_DEAD_BUILD_AGE
return (RepositoryBuild.select()