2015-07-15 21:25:41 +00:00
|
|
|
import logging
|
2016-08-01 22:22:38 +00:00
|
|
|
import random
|
2015-07-15 21:25:41 +00:00
|
|
|
|
2017-03-23 14:46:04 +00:00
|
|
|
from enum import Enum
|
2015-07-15 21:25:41 +00:00
|
|
|
from datetime import timedelta, datetime
|
2016-08-17 19:09:19 +00:00
|
|
|
from peewee import JOIN_LEFT_OUTER, fn, SQL, IntegrityError
|
2017-05-03 21:02:24 +00:00
|
|
|
from playhouse.shortcuts import case
|
2016-08-01 22:22:38 +00:00
|
|
|
from cachetools import ttl_cache
|
2015-07-15 21:25:41 +00:00
|
|
|
|
2017-07-24 15:05:15 +00:00
|
|
|
from data.model import (
|
|
|
|
config, DataModelException, tag, db_transaction, storage, permission, _basequery)
|
|
|
|
from data.database import (
|
2017-10-18 21:03:27 +00:00
|
|
|
Repository, Namespace, RepositoryTag, Star, Image, ImageStorage, User, Visibility, Tag,
|
2017-07-24 15:05:15 +00:00
|
|
|
RepositoryPermission, RepositoryActionCount, Role, RepositoryAuthorizedEmail, TagManifest,
|
|
|
|
DerivedStorageForImage, Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
|
|
|
|
db_random_func, db_concat_func, RepositorySearchScore)
|
2017-01-11 19:53:14 +00:00
|
|
|
from data.text import prefix_search
|
Optimize repository search by changing our lookup strategy
Previous to this change, repositories were looked up unfiltered in six different queries, and then filtered using the permissions model, which issued a query per repository found, making search incredibly slow. Instead, we now lookup a chunk of repositories unfiltered and then filter them via a single query to the database. By layering the filtering on top of the lookup, each as queries, we can minimize the number of queries necessary, without (at the same time) using a super expensive join.
Other changes:
- Remove the 5 page pre-lookup on V1 search and simply return that there is one more page available, until there isn't. While technically not correct, it is much more efficient, and no one should be using pagination with V1 search anyway.
- Remove the lookup for repos without entries in the RAC table. Instead, we now add a new RAC entry when the repository is created for *the day before*, with count 0, so that it is immediately searchable
- Remove lookup of results with a matching namespace; these aren't very relevant anyway, and it overly complicates sorting
2017-02-27 22:56:44 +00:00
|
|
|
from util.itertoolrecipes import take
|
2015-07-15 21:25:41 +00:00
|
|
|
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Enumeration of the repository fields a search can be restricted to; the member *names*
# ("name", "description") are what the search helpers in this module pass around.
SEARCH_FIELDS = Enum("SearchFields", ["name", "description"])
|
2015-07-15 21:25:41 +00:00
|
|
|
|
2017-03-22 18:30:33 +00:00
|
|
|
|
|
|
|
def get_repo_kind_name(repo):
  """ Returns the kind name that `Repository.kind` resolves for the given repository's
      `kind_id`.
  """
  kind_field = Repository.kind
  return kind_field.get_name(repo.kind_id)
|
2017-03-22 18:30:33 +00:00
|
|
|
|
|
|
|
|
2016-09-12 20:19:19 +00:00
|
|
|
def get_repository_count():
  """ Returns the count produced by an unfiltered select over the Repository table. """
  all_repositories = Repository.select()
  return all_repositories.count()
|
|
|
|
|
2015-07-15 21:25:41 +00:00
|
|
|
|
2015-10-13 16:55:40 +00:00
|
|
|
def get_public_repo_visibility():
  """ Returns whatever `_basequery.get_public_repo_visibility()` returns; this is a thin
      pass-through kept so callers can reach it from this module.
  """
  public_visibility = _basequery.get_public_repo_visibility()
  return public_visibility
|
|
|
|
|
|
|
|
|
2017-03-20 23:05:55 +00:00
|
|
|
def create_repository(namespace, name, creating_user, visibility='private', repo_kind='image'):
  """ Creates a repository called `name` under the namespace whose username is `namespace`
      and returns the new Repository row.

      namespace: username of the owning namespace (looked up via `User.get`).
      name: name of the repository to create.
      creating_user: the user creating the repository; may be None/falsy, in which case no
                     permissions are created or applied.
      visibility: name of the Visibility row to attach (defaults to 'private').
      repo_kind: name of the repository kind, resolved through `Repository.kind.get_id`.

      The lookups (`Visibility.get`, `User.get`, `Role.get`) raise their model's DoesNotExist
      when the named row is missing.
  """
  visibility_row = Visibility.get(name=visibility)
  namespace_user = User.get(username=namespace)
  repo = Repository.create(name=name, visibility=visibility_row, namespace_user=namespace_user,
                           kind=Repository.kind.get_id(repo_kind))
  admin = Role.get(name='admin')

  # Seed an action-count row dated *yesterday* with a zero count, plus a zero search score,
  # so the new repository has search bookkeeping rows from the moment it exists.
  yesterday = datetime.now() - timedelta(days=1)
  RepositoryActionCount.create(repository=repo, count=0, date=yesterday)
  RepositorySearchScore.create(repository=repo, score=0)

  # A real (non-organization) creating user becomes admin of the repository.
  if creating_user and not creating_user.organization:
    RepositoryPermission.create(user=creating_user, repository=repo, role=admin)

  # Guard against a missing creating user before touching its attributes: the check above
  # already tolerates a falsy `creating_user`, so this one must as well.
  if creating_user and creating_user.username != namespace:
    # Permission prototypes only work for orgs
    permission.apply_default_permissions(repo, creating_user)
  return repo
|
|
|
|
|
|
|
|
|
2017-03-22 18:30:13 +00:00
|
|
|
def get_repository(namespace_name, repository_name, kind_filter=None):
  """ Looks up the existing repository `namespace_name/repository_name`, optionally
      restricted by `kind_filter`. Returns the repository, or None when the lookup raises
      `Repository.DoesNotExist`.
  """
  try:
    found = _basequery.get_existing_repository(namespace_name, repository_name,
                                               kind_filter=kind_filter)
  except Repository.DoesNotExist:
    found = None

  return found
|
|
|
|
|
|
|
|
|
2017-03-23 01:51:55 +00:00
|
|
|
def get_or_create_repository(namespace, name, creating_user, visibility='private',
                             repo_kind='image'):
  """ Returns the repository `namespace/name` of kind `repo_kind` if `get_repository` finds
      one; otherwise creates it via `create_repository` with the given user and visibility
      and returns the newly created repository.
  """
  existing = get_repository(namespace, name, repo_kind)
  if existing is not None:
    return existing

  return create_repository(namespace, name, creating_user, visibility, repo_kind)
|
|
|
|
|
|
|
|
|
2016-08-26 18:48:39 +00:00
|
|
|
def purge_repository(namespace_name, repository_name):
  """ Removes every trace of the named repository: its tags, images, storage (through
      garbage collection) and finally the repository row and its dependents.

      Returns True when everything was removed and False when the repository could not be
      found or garbage collection did not complete. Garbage collection is incremental and
      repeatable, so callers are free to ignore the return value.
  """
  try:
    repo = _basequery.get_existing_repository(namespace_name, repository_name)
  except Repository.DoesNotExist:
    return False

  # Delete the repository of all OCI-referenced entries.
  # Note that new-model Tag's must be deleted in *two* passes, as they can reference parent tags,
  # and MySQL is... particular... about such relationships when deleting.
  linked_tags = Tag.delete().where(Tag.repository == repo, ~(Tag.linked_tag >> None))
  linked_tags.execute()
  remaining_tags = Tag.delete().where(Tag.repository == repo)
  remaining_tags.execute()

  # Delete all tags to allow gc to reclaim storage
  previously_referenced = tag.purge_all_tags(repo)

  # Collect the IDs of the repository's images that the purged tags did not reference.
  untagged_image_q = Image.select(Image.id).where(Image.repository == repo)
  if len(previously_referenced) > 0:
    untagged_image_q = untagged_image_q.where(~(Image.id << list(previously_referenced)))

  unreferenced_candidates = {row[0] for row in untagged_image_q.tuples()}

  # Gc to remove the images and storage
  all_repo_images = previously_referenced | unreferenced_candidates
  if not garbage_collect_repo(repo, all_repo_images, is_purge=True):
    return False

  # Delete the rest of the repository metadata
  try:
    fetched = _basequery.get_existing_repository(namespace_name, repository_name)
  except Repository.DoesNotExist:
    return False

  fetched.delete_instance(recursive=True, delete_nullable=False)

  # Run callbacks
  for cleanup_callback in config.repo_cleanup_callbacks:
    cleanup_callback(namespace_name, repository_name)

  return True
|
|
|
|
|
2015-07-15 21:25:41 +00:00
|
|
|
|
2016-08-01 22:22:38 +00:00
|
|
|
@ttl_cache(maxsize=1, ttl=600)
def _get_gc_expiration_policies():
  """ Returns a list of (at most 100) distinct `removed_tag_expiration_s` values found across
      namespaces. The result is memoized by `ttl_cache` (maxsize=1, ttl=600).
  """
  distinct_policies = Namespace.select(Namespace.removed_tag_expiration_s).distinct()

  # This sucks but it's the only way to limit memory
  policy_rows = distinct_policies.limit(100).tuples()

  return [row[0] for row in policy_rows]
|
|
|
|
|
|
|
|
|
|
|
|
def get_random_gc_policy():
  """ Picks, at random, one of the expiration policies returned by
      `_get_gc_expiration_policies()` for use when garbage collecting.
  """
  available_policies = _get_gc_expiration_policies()
  return random.choice(available_policies)
|
|
|
|
|
|
|
|
|
|
|
|
def find_repository_with_garbage(limit_to_gc_policy_s):
  """ Returns a randomly chosen repository that has at least one tag whose lifetime ended at
      or before (now - `limit_to_gc_policy_s`) and whose namespace uses exactly that
      expiration policy. Returns None when no such repository is found.
  """
  expiration_timestamp = get_epoch_timestamp() - limit_to_gc_policy_s

  try:
    # Sub-select of (up to 500, distinct) repositories holding expired tags.
    expired = (RepositoryTag
               .select(RepositoryTag.repository)
               .join(Repository)
               .join(Namespace, on=(Repository.namespace_user == Namespace.id))
               .where(~(RepositoryTag.lifetime_end_ts >> None),
                      (RepositoryTag.lifetime_end_ts <= expiration_timestamp),
                      (Namespace.removed_tag_expiration_s == limit_to_gc_policy_s))
               .limit(500)
               .distinct()
               .alias('candidates'))

    # Pick one of those candidates using the database's random ordering.
    chosen = (RepositoryTag
              .select(expired.c.repository_id)
              .from_(expired)
              .order_by(db_random_func())
              .get())

    if chosen is None:
      return None

    return Repository.get(Repository.id == chosen.repository_id)
  except (RepositoryTag.DoesNotExist, Repository.DoesNotExist):
    return None
|
|
|
|
|
|
|
|
|
2017-07-14 17:09:19 +00:00
|
|
|
def _all_images_for_gc(repo):
  """ Lists every image in the given repository (with its joined storage's id and uuid),
      for the purposes of GC.
  """
  selected_columns = (Image.id, Image.docker_image_id, ImageStorage.id, ImageStorage.uuid)
  repo_images = Image.select(*selected_columns).join(ImageStorage).where(Image.repository == repo)
  return list(repo_images)
|
|
|
|
|
|
|
|
|
|
|
|
def _filter_to_unreferenced(repo, candidates_orphans):
  """ Narrows the candidate orphan image IDs down to the images which no tag points at and
      which no other image in the repository uses as its parent. Returns those images as a
      list, each selected together with its storage's id and uuid.
  """
  repo_id = repo.id

  # Any image directly referenced by a tag that still exists, cannot be GCed.
  tagged = RepositoryTag.select(RepositoryTag.image).where(
    RepositoryTag.repository == repo_id, RepositoryTag.image << candidates_orphans)

  # Any image which is the parent of another image, cannot be GCed.
  parents = Image.select(Image.parent).where(
    Image.repository == repo_id, Image.parent << candidates_orphans)

  still_referenced = (tagged | parents)

  # Select the candidates (and their storages) that appear in neither referenced set.
  selected_columns = (Image.id, Image.docker_image_id, ImageStorage.id, ImageStorage.uuid)
  orphaned = (Image
              .select(*selected_columns)
              .join(ImageStorage)
              .where(Image.id << candidates_orphans,
                     ~(Image.id << still_referenced)))
  return list(orphaned)
|
|
|
|
|
|
|
|
|
|
|
|
def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False):
  """ Garbage collect the specified repository object. This will remove all
      images, derived images, and other associated metadata, for images which
      are no longer referenced by a tag or another image which is itself
      tagged. Returns True if garbage collection was completed without error
      and False otherwise. Retries are safe and work incrementally, so this
      return value does not need to be checked or handled.

      repo: the repository row to collect.
      extra_candidate_set: optional extra image IDs to add to the candidates produced by
                           `tag.garbage_collect_tags`.
      is_purge: when True, the tag-reference pruning is skipped and each pass selects every
                image in the repository (via `_all_images_for_gc`) for deletion.
  """
  logger.debug('Garbage collecting repository %s', repo.id)

  # NOTE(review): this initial value is always overwritten inside the loop before it is read.
  storage_id_whitelist = set()

  # Collecting expired tags yields the set of image IDs that may now be orphaned.
  candidate_orphan_image_set = tag.garbage_collect_tags(repo)
  if extra_candidate_set:
    candidate_orphan_image_set.update(extra_candidate_set)

  if not len(candidate_orphan_image_set):
    logger.debug('No candidate images for GC for repo: %s', repo.id)
    return True

  # Accumulators across all passes: removed image IDs, storage IDs to offer to storage GC,
  # and the image rows that were selected for removal.
  all_images_removed = set()
  all_storage_id_whitelist = set()
  all_unreferenced_candidates = set()

  if not is_purge:
    # Remove any images directly referenced by tags, to prune the working set.
    direct_referenced = (RepositoryTag.select(RepositoryTag.image).where(
      RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set)))
    candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced])

  # Iteratively try to remove images from the database. The only images we can remove are those
  # that are not referenced by tags AND not the parents of other images. We continue removing images
  # until no changes are found.
  iteration = 0
  making_progress = True
  while candidate_orphan_image_set and making_progress:
    iteration = iteration + 1
    logger.debug('Starting iteration #%s for GC of repository %s with candidates: %s', iteration,
                 repo.id, candidate_orphan_image_set)
    candidates_orphans = list(candidate_orphan_image_set)

    # Selection and deletion for one pass happen inside a single transaction.
    with db_transaction():
      # Find the images to delete.
      images_to_gc = (_all_images_for_gc(repo) if is_purge
                      else _filter_to_unreferenced(repo, candidates_orphans))

      # Make sure we are making progress.
      image_ids_to_remove = [candidate.id for candidate in images_to_gc]
      making_progress = bool(len(image_ids_to_remove))
      if len(image_ids_to_remove) == 0:
        # No more images to remove.
        break

      logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove)
      storage_id_whitelist = set([candidate.storage_id for candidate in images_to_gc])

      # Lookup any derived images for the images to remove.
      derived = DerivedStorageForImage.select().where(DerivedStorageForImage.source_image <<
                                                      image_ids_to_remove)

      # The derivative storages become storage-GC candidates as well.
      has_derived = False
      for derived_image in derived:
        has_derived = True
        storage_id_whitelist.add(derived_image.derivative_id)

      # Delete any derived images and the images themselves.
      if has_derived:
        try:
          (DerivedStorageForImage.delete()
           .where(DerivedStorageForImage.source_image << image_ids_to_remove).execute())
        except IntegrityError:
          # Something still references these rows; report failure so a later run can retry.
          logger.info('Could not GC derived images %s; will try again soon', image_ids_to_remove)
          return False

      try:
        Image.delete().where(Image.id << image_ids_to_remove).execute()
      except IntegrityError:
        logger.info('Could not GC images %s; will try again soon', image_ids_to_remove)
        return False

    # Add the images to the removed set and remove them from the candidate set.
    all_images_removed.update(image_ids_to_remove)
    all_storage_id_whitelist.update(storage_id_whitelist)
    all_unreferenced_candidates.update(images_to_gc)

    candidate_orphan_image_set.difference_update(image_ids_to_remove)

  # If any images were removed, GC any orphaned storages.
  if len(all_images_removed) > 0:
    logger.info('Garbage collecting storage for images: %s', all_images_removed)
    storage_ids_removed = set(storage.garbage_collect_storage(all_storage_id_whitelist))

    # If any storages were removed and cleanup callbacks are registered, call them with
    # the images+storages removed.
    if storage_ids_removed and config.image_cleanup_callbacks:
      image_storages_removed = [
        candidate for candidate in all_unreferenced_candidates
        if candidate.storage_id in storage_ids_removed
      ]
      for callback in config.image_cleanup_callbacks:
        callback(image_storages_removed)

  return True
|
|
|
|
|
2015-07-15 21:25:41 +00:00
|
|
|
|
|
|
|
def star_repository(user, repository):
  """ Stars a repository by inserting a Star row linking `user.id` to `repository.id`.

      Returns nothing. Any error raised by the insert propagates to the caller.
  """
  # Model.create() both builds and persists the row, so no follow-up save() is needed; the
  # extra save() only issued a second, redundant write for the same values.
  Star.create(user=user.id, repository=repository.id)
|
|
|
|
|
|
|
|
|
|
|
|
def unstar_repository(user, repository):
  """ Unstars a repository by deleting the Star row linking `user.id` to `repository.id`.

      Raises DataModelException('Star not found.') when there was no such star to delete.
  """
  # A delete query reports how many rows it removed rather than raising DoesNotExist, so the
  # "not found" case has to be detected from that count.
  deleted_count = (Star.delete()
                   .where(Star.repository == repository.id, Star.user == user.id)
                   .execute())
  if not deleted_count:
    raise DataModelException('Star not found.')
|
2017-04-21 21:21:17 +00:00
|
|
|
|
|
|
|
|
2017-04-15 12:26:33 +00:00
|
|
|
def set_trust(repo, trust_enabled):
  """ Sets the `trust_enabled` flag on the given repository and saves the repository. """
  repo.trust_enabled = trust_enabled
  repo.save()
|
2015-07-15 21:25:41 +00:00
|
|
|
|
|
|
|
|
2017-07-24 15:05:15 +00:00
|
|
|
def set_description(repo, description):
  """ Sets the description on the given repository and saves the repository. """
  repo.description = description
  repo.save()
|
|
|
|
|
|
|
|
|
2017-03-23 21:16:19 +00:00
|
|
|
def get_user_starred_repositories(user, kind_filter='image'):
  """ Retrieves all of the repositories a user has starred.

      user: the user whose stars are looked up.
      kind_filter: name of the repository kind to restrict the results to.

      Returns a (not yet executed) query selecting Repository, User and Visibility, with the
      repository ID additionally exposed under the alias 'rid'.
      Raises DataModelException when `kind_filter` does not name a known repository kind.
  """
  # RepositoryKind is not among this module's top-level imports, so naming it in the except
  # clause below would raise NameError instead of catching the lookup failure.
  # NOTE(review): assumes RepositoryKind is defined in data.database next to Repository.
  from data.database import RepositoryKind

  try:
    repo_kind = Repository.kind.get_id(kind_filter)
  except RepositoryKind.DoesNotExist:
    raise DataModelException('Unknown kind of repository')

  query = (Repository.select(Repository, User, Visibility, Repository.id.alias('rid')).join(Star)
           .switch(Repository).join(User).switch(Repository).join(Visibility)
           .where(Star.user == user, Repository.kind == repo_kind))

  return query
|
|
|
|
|
|
|
|
|
|
|
|
def repository_is_starred(user, repository):
  """ Returns True if a Star row exists for the given user and repository, False otherwise. """
  star_lookup = Star.select().where(Star.repository == repository.id, Star.user == user.id)

  try:
    star_lookup.get()
  except Star.DoesNotExist:
    return False

  return True
|
|
|
|
|
|
|
|
|
|
|
|
def get_when_last_modified(repository_ids):
  """ Builds a map from repository ID to the largest `lifetime_start_ts` among that
      repository's tags, for each of the given repository IDs that has tags. An empty (or
      missing) ID list yields an empty map without querying.
  """
  if not repository_ids:
    return {}

  latest_start = fn.Max(RepositoryTag.lifetime_start_ts)
  rows = (RepositoryTag
          .select(RepositoryTag.repository, latest_start)
          .where(RepositoryTag.repository << repository_ids)
          .group_by(RepositoryTag.repository)
          .tuples())

  return {row[0]: row[1] for row in rows}
|
|
|
|
|
|
|
|
|
2017-04-07 21:25:44 +00:00
|
|
|
def get_stars(repository_ids):
  """ Builds a map from repository ID to its number of Star rows, for each of the given
      repository IDs that has at least one star. An empty (or missing) ID list yields an
      empty map without querying.
  """
  if not repository_ids:
    return {}

  star_count = fn.Count(Star.id)
  rows = (Star
          .select(Star.repository, star_count)
          .where(Star.repository << repository_ids)
          .group_by(Star.repository)
          .tuples())

  return {row[0]: row[1] for row in rows}
|
|
|
|
|
|
|
|
|
2017-03-23 21:16:19 +00:00
|
|
|
def get_visible_repositories(username, namespace=None, kind_filter='image', include_public=False,
                             start_id=None, limit=None):
  """ Builds the query for the repositories visible to the given user (if any), filtered
      through `_basequery.filter_to_repos_for_user`. When `limit` is given, the query is
      limited and ordered by the 'rid' alias of the repository ID.
  """
  if not (include_public or username):
    # Short circuit by returning a query that will find no repositories. We need to return a query
    # here, as it will be modified by other queries later on.
    return Repository.select(Repository.id.alias('rid')).where(Repository.id == -1)

  selected = Repository.select(Repository.name,
                               Repository.id.alias('rid'), Repository.description,
                               Namespace.username, Repository.visibility, Repository.kind)
  query = selected.switch(Repository).join(Namespace, on=(Repository.namespace_user == Namespace.id))

  if username:
    # Note: We only need the permissions table if we will filter based on a user's permissions.
    query = query.switch(Repository).distinct().join(RepositoryPermission, JOIN_LEFT_OUTER)

  query = _basequery.filter_to_repos_for_user(query, username, namespace, kind_filter,
                                              include_public, start_id=start_id)

  if limit is None:
    return query

  return query.limit(limit).order_by(SQL('rid'))
|
|
|
|
|
|
|
|
|
2017-03-23 01:51:55 +00:00
|
|
|
def get_app_repository(namespace_name, repository_name):
  """ Looks up the existing repository of kind 'application' with the given namespace and
      name. Returns None when the lookup raises `Repository.DoesNotExist`.
  """
  try:
    app_repo = _basequery.get_existing_repository(namespace_name, repository_name,
                                                  kind_filter='application')
  except Repository.DoesNotExist:
    app_repo = None

  return app_repo
|
|
|
|
|
|
|
|
|
2017-03-23 14:46:04 +00:00
|
|
|
def get_app_search(lookup, search_fields=None, username=None, limit=50):
  """ Searches repositories of kind 'application' for `lookup`, starting at offset 0 and
      returning at most `limit` results. When `search_fields` is not given, only the
      repository name field is searched.
  """
  fields = set([SEARCH_FIELDS.name.name]) if search_fields is None else search_fields

  return get_filtered_matching_repositories(lookup, filter_username=username,
                                            search_fields=fields, repo_kind='application',
                                            offset=0, limit=limit)
|
2017-03-23 01:51:55 +00:00
|
|
|
|
|
|
|
|
2017-03-20 23:05:55 +00:00
|
|
|
def get_filtered_matching_repositories(lookup_value, filter_username=None, repo_kind='image',
                                       offset=0, limit=25, search_fields=None):
  """ Returns a list of the repositories matching the given lookup value, with optional
      filtering to a specific user. If the user is unspecified, only public repositories will
      be returned.

      lookup_value: the value to search for.
      filter_username: when not None, private repositories are included in the lookup and the
                       results are filtered to those visible to this username.
      repo_kind: the kind of repository to search.
      offset: number of leading results to skip.
      limit: maximum number of results to return.
      search_fields: names of the SEARCH_FIELDS members to match against; defaults to both
                     the name and the description.
  """
  if search_fields is None:
    search_fields = set([SEARCH_FIELDS.description.name, SEARCH_FIELDS.name.name])

  # Build the unfiltered search query.
  unfiltered_query = _get_sorted_matching_repositories(lookup_value, repo_kind=repo_kind,
                                                       search_fields=search_fields,
                                                       include_private=filter_username is not None)

  # Add a filter to the results, if necessary.
  if filter_username is not None:
    results = _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit,
                                                       repo_kind)
  else:
    results = unfiltered_query

  # Skipping `offset` items by discarding a take() only works on a single-pass iterator.
  # Handing take() a re-iterable object (such as the unfiltered query) restarts from the
  # first row on the next call, which silently ignored the offset. iter() is a no-op for
  # objects that are already iterators.
  # NOTE(review): assumes take(n, iterable) consumes n items from the iterable it is given.
  iterator = iter(results)

  if offset > 0:
    take(offset, iterator)

  # Return the results.
  return list(take(limit, iterator))
|
|
|
|
|
|
|
|
|
2017-03-23 15:35:17 +00:00
|
|
|
def _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit, repo_kind):
    """ Yields the repositories from the unfiltered query that are visible to the given user, in
        the order in which the unfiltered query returned them.

        The unfiltered query is walked in pages of `limit * 2` rows. The IDs found on each page are
        narrowed with a single query built via `_basequery.filter_to_repos_for_user`, and the
        surviving repositories are yielded lazily. At most 10 pages are examined.
    """
    encountered = set()
    chunk_count = limit * 2

    # Bound the number of pages examined, just to be safe.
    for unfiltered_page in range(1, 11):
        # Find the next chunk's worth of repository IDs, paginated by the chunk size.
        found_ids = [r.id for r in unfiltered_query.paginate(unfiltered_page, chunk_count)]

        # Make sure we haven't encountered these results before. This code is used to handle
        # the case where we've previously seen a result, as pagination is not necessarily
        # stable in SQL databases.
        new_unfiltered_ids = set(found_ids) - encountered
        if not new_unfiltered_ids:
            break

        encountered.update(new_unfiltered_ids)

        # Filter the repositories found to only those visible to the current user.
        query = (Repository
                 .select(Repository, Namespace)
                 .distinct()
                 .join(Namespace, on=(Namespace.id == Repository.namespace_user))
                 .switch(Repository)
                 .join(RepositoryPermission)
                 .where(Repository.id << list(new_unfiltered_ids)))

        filtered = _basequery.filter_to_repos_for_user(query, filter_username, repo_kind=repo_kind)

        # Map each ID to the first position at which it was found, so that sorting does not have
        # to rescan `found_ids` for every repository.
        positions = {}
        for position, repo_id in enumerate(found_ids):
            positions.setdefault(repo_id, position)

        # Yield the filtered repositories sorted by their initial order.
        for filtered_repo in sorted(filtered, key=lambda repo: positions[repo.id]):
            yield filtered_repo

        # If the number of found IDs is less than the chunk count, then we're done.
        if len(found_ids) < chunk_count:
            break
|
2015-07-15 21:25:41 +00:00
|
|
|
|
|
|
|
|
2017-03-23 14:46:04 +00:00
|
|
|
def _get_sorted_matching_repositories(lookup_value, repo_kind='image', include_private=False,
                                      search_fields=None):
    """ Builds and returns a query over the repositories matching the given lookup string, ordered
        by descending search score. Private repositories are only included when `include_private`
        is set. Note that the results are *not* filtered by what any particular user may see.
    """
    if search_fields is None:
        search_fields = {SEARCH_FIELDS.description.name, SEARCH_FIELDS.name.name}

    # By default, rank purely by the stored search score.
    score_column = RepositorySearchScore.score.alias('score')

    if not lookup_value:
        # Nothing to look up: match every repository.
        match_clause = (Repository.id >= 0)
    else:
        # The name is always searched.
        match_clause = Repository.name.match(lookup_value)

        # When the description is searched as well, compute a synthetic score in which a name match
        # counts 100 times as much, so that description-only matches are discounted.
        if SEARCH_FIELDS.description.name in search_fields:
            match_clause = Repository.description.match(lookup_value) | match_clause
            name_boost = [(Repository.name.match(lookup_value), 100 * RepositorySearchScore.score)]
            score_column = case(None, name_boost, RepositorySearchScore.score).alias('score')

    repo_query = Repository.select(Repository, Namespace, score_column)
    repo_query = repo_query.join(Namespace, on=(Namespace.id == Repository.namespace_user))
    repo_query = repo_query.where(match_clause)
    repo_query = repo_query.group_by(Repository.id, Namespace.id)

    if repo_kind is not None:
        repo_query = repo_query.where(Repository.kind == Repository.kind.get_id(repo_kind))

    if not include_private:
        public_visibility = _basequery.get_public_repo_visibility()
        repo_query = repo_query.where(Repository.visibility == public_visibility)

    repo_query = repo_query.switch(Repository).join(RepositorySearchScore)
    repo_query = repo_query.group_by(Repository, Namespace, RepositorySearchScore)
    return repo_query.order_by(SQL('score').desc())
|
2015-07-15 21:25:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
def lookup_repository(repo_id):
    """ Returns the repository with the given database ID, or None if there is no such row. """
    try:
        found = Repository.get(Repository.id == repo_id)
    except Repository.DoesNotExist:
        found = None

    return found
|
|
|
|
|
|
|
|
|
|
|
|
def is_repository_public(repository):
    """ Returns whether the given repository's visibility ID is that of the public visibility. """
    repo_visibility_id = repository.visibility_id
    public_visibility = _basequery.get_public_repo_visibility()
    return repo_visibility_id == public_visibility.id
|
2015-07-15 21:25:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
def repository_is_public(namespace_name, repository_name):
    """ Returns whether a repository with the given namespace and name exists and has the
        'public' visibility.
    """
    public_match = (Repository
                    .select()
                    .join(Namespace, on=(Repository.namespace_user == Namespace.id))
                    .switch(Repository)
                    .join(Visibility)
                    .where(Namespace.username == namespace_name,
                           Repository.name == repository_name,
                           Visibility.name == 'public'))

    try:
        public_match.get()
    except Repository.DoesNotExist:
        return False
    else:
        return True
|
|
|
|
|
|
|
|
|
|
|
|
def set_repository_visibility(repo, visibility):
    """ Looks up the visibility row with the given name and, when one is found, assigns it to the
        repository and saves the repository.
    """
    # NOTE(review): peewee's `Model.get` normally raises DoesNotExist rather than returning a falsy
    # value, so the guard below is presumably never what handles an unknown name - confirm.
    new_visibility = Visibility.get(name=visibility)
    if new_visibility:
        repo.visibility = new_visibility
        repo.save()
|
|
|
|
|
|
|
|
|
|
|
|
def get_email_authorized_for_repo(namespace, repository, email):
    """ Returns the email authorization record for the given e-mail address on the repository
        identified by namespace and name, or None if no such record exists.
    """
    authorization = (RepositoryAuthorizedEmail
                     .select(RepositoryAuthorizedEmail, Repository, Namespace)
                     .join(Repository)
                     .join(Namespace, on=(Repository.namespace_user == Namespace.id))
                     .where(Namespace.username == namespace,
                            Repository.name == repository,
                            RepositoryAuthorizedEmail.email == email))

    try:
        return authorization.get()
    except RepositoryAuthorizedEmail.DoesNotExist:
        return None
|
|
|
|
|
|
|
|
|
|
|
|
def create_email_authorization_for_repo(namespace_name, repository_name, email):
    """ Creates and returns a new, unconfirmed email authorization for the given e-mail address on
        the named repository. Raises a DataModelException if the repository does not exist.
    """
    try:
        target_repo = _basequery.get_existing_repository(namespace_name, repository_name)
    except Repository.DoesNotExist:
        raise DataModelException('Invalid repository %s/%s' % (namespace_name, repository_name))
    else:
        return RepositoryAuthorizedEmail.create(repository=target_repo, email=email,
                                                confirmed=False)
|
|
|
|
|
|
|
|
|
|
|
|
def confirm_email_authorization_for_repo(code):
    """ Marks the email authorization carrying the given confirmation code as confirmed, saves it
        and returns it. Raises a DataModelException if no authorization has that code.
    """
    by_code = (RepositoryAuthorizedEmail
               .select(RepositoryAuthorizedEmail, Repository, Namespace)
               .join(Repository)
               .join(Namespace, on=(Repository.namespace_user == Namespace.id))
               .where(RepositoryAuthorizedEmail.code == code))

    try:
        authorization = by_code.get()
    except RepositoryAuthorizedEmail.DoesNotExist:
        raise DataModelException('Invalid confirmation code.')

    authorization.confirmed = True
    authorization.save()
    return authorization
|
|
|
|
|
|
|
|
|
2017-03-20 23:05:55 +00:00
|
|
|
def list_popular_public_repos(action_count_threshold, time_span, repo_kind='image'):
    """ Returns a tuples query of (namespace username, repository name) for the public repositories
        of the given kind whose action counts, summed over the entries dated no earlier than
        `time_span` before now, reach at least `action_count_threshold`.
    """
    cutoff = datetime.now() - time_span

    recent_public = (Repository
                     .select(Namespace.username, Repository.name)
                     .join(Namespace, on=(Repository.namespace_user == Namespace.id))
                     .switch(Repository)
                     .join(RepositoryActionCount)
                     .where(RepositoryActionCount.date >= cutoff,
                            Repository.visibility == get_public_repo_visibility(),
                            Repository.kind == Repository.kind.get_id(repo_kind)))

    # Sum the action counts per repository and keep only those reaching the threshold.
    per_repo = recent_public.group_by(RepositoryActionCount.repository, Repository.name,
                                      Namespace.username)
    popular = per_repo.having(fn.Sum(RepositoryActionCount.count) >= action_count_threshold)
    return popular.tuples()
|
2017-06-07 19:05:29 +00:00
|
|
|
|
|
|
|
|
|
|
|
def is_empty(namespace_name, repository_name):
    """ Returns whether the repository with the given namespace and name has no tags. A repository
        that doesn't exist is reported as empty (True).
    """
    try:
        first_tag_query = tag.list_repository_tags(namespace_name, repository_name).limit(1)
        first_tag_query.get()
    except RepositoryTag.DoesNotExist:
        return True
    else:
        return False
|