import logging
from datetime import timedelta, datetime

from peewee import JOIN_LEFT_OUTER, fn

from data.model import (DataModelException, tag, db_transaction, storage, image, permission,
                        _basequery, config)
from data.database import (Repository, Namespace, RepositoryTag, Star, Image, ImageStorage, User,
                           Visibility, RepositoryPermission, TupleSelector,
                           RepositoryActionCount, Role, RepositoryAuthorizedEmail, db_for_update,
                           get_epoch_timestamp, db_random_func)


logger = logging.getLogger(__name__)


def create_repository(namespace, name, creating_user, visibility='private'):
    """ Creates the repository `name` under the namespace `namespace` and returns it.

        If `creating_user` is given and is not an organization, it is granted the `admin` role
        on the new repository. If `creating_user` is given and its username differs from
        `namespace`, the default permissions are applied to the repository as well.
    """
    visibility_obj = Visibility.get(name=visibility)
    namespace_user = User.get(username=namespace)
    repo = Repository.create(name=name, visibility=visibility_obj,
                             namespace_user=namespace_user)
    admin = Role.get(name='admin')

    if creating_user and not creating_user.organization:
        RepositoryPermission.create(user=creating_user, repository=repo, role=admin)

    # The check above allows for a missing creating user, so guard the attribute access here
    # as well rather than failing with an AttributeError after the repository was created.
    if creating_user and creating_user.username != namespace:
        # Permission prototypes only work for orgs
        permission.apply_default_permissions(repo, creating_user)

    return repo


def get_repository(namespace_name, repository_name):
    """ Returns the repository with the given namespace and name, or None if it does not
        exist.
    """
    try:
        return _basequery.get_existing_repository(namespace_name, repository_name)
    except Repository.DoesNotExist:
        return None


def _purge_all_repository_tags(namespace_name, repository_name):
    """ Immediately purge all repository tags without respecting the lifeline procedure.

        Raises a DataModelException if the repository does not exist.
    """
    try:
        repo = _basequery.get_existing_repository(namespace_name, repository_name)
    except Repository.DoesNotExist:
        raise DataModelException('Invalid repository \'%s/%s\'' %
                                 (namespace_name, repository_name))

    RepositoryTag.delete().where(RepositoryTag.repository == repo.id).execute()


def purge_repository(namespace_name, repository_name):
    """ Deletes the given repository: its tags first, then (via garbage collection) its images
        and storage, and finally the repository row and the rows depending on it.

        Raises a DataModelException if the repository does not exist.
    """
    # Delete all tags to allow gc to reclaim storage
    _purge_all_repository_tags(namespace_name, repository_name)

    # Gc to remove the images and storage
    garbage_collect_repository(namespace_name, repository_name)

    # Delete the rest of the repository metadata
    fetched = _basequery.get_existing_repository(namespace_name, repository_name)
    fetched.delete_instance(recursive=True, delete_nullable=False)


def find_repository_with_garbage(filter_list=None):
    """ Returns a randomly selected repository that has an expired tag, or None if none was
        found.

        A tag counts as expired when its `lifetime_end_ts` is set and is at least the
        namespace's `removed_tag_expiration_s` in the past. At most 500 candidate tags are
        considered. If `filter_list` is given, only namespaces whose username is in the list
        are considered; an empty (non-None) list therefore always yields None.
    """
    # TODO(jschorr): Remove the filter once we have turned the experiment on for everyone.
    if filter_list is not None and not filter_list:
        return None

    epoch_timestamp = get_epoch_timestamp()

    try:
        candidates = (RepositoryTag
                      .select(RepositoryTag.repository)
                      .join(Repository)
                      .join(Namespace, on=(Repository.namespace_user == Namespace.id))
                      .where(~(RepositoryTag.lifetime_end_ts >> None),
                             (RepositoryTag.lifetime_end_ts <=
                              (epoch_timestamp - Namespace.removed_tag_expiration_s)))
                      .limit(500)
                      .alias('candidates'))

        if filter_list:
            candidates = candidates.where(Namespace.username << filter_list)

        found = (RepositoryTag
                 .select(candidates.c.repository_id)
                 .from_(candidates)
                 .order_by(db_random_func())
                 .get())
        if found is None:
            return None

        return Repository.get(Repository.id == found.repository_id)
    except (RepositoryTag.DoesNotExist, Repository.DoesNotExist):
        return None


def garbage_collect_repository(namespace_name, repository_name):
    """ Garbage collects the named repository, if it exists.

        Nothing is done when the namespace is listed under the `EXP_ASYNC_GARBAGE_COLLECTION`
        app config key.
    """
    # If the namespace is the async experiment, don't perform garbage collection here.
    # TODO(jschorr): Remove this check once we have turned the experiment on for everyone.
    if namespace_name in config.app_config.get('EXP_ASYNC_GARBAGE_COLLECTION', []):
        return

    repo = get_repository(namespace_name, repository_name)
    if repo is not None:
        garbage_collect_repo(repo)


def garbage_collect_repo(repo):
    """ Garbage collects the tags of the given repository, deletes every image of the
        repository that is neither tagged nor an ancestor of a tagged image, and then garbage
        collects the storages those deleted images pointed at.
    """
    logger.debug('Garbage collecting repository %s', repo.id)

    storage_id_whitelist = set()
    tag.garbage_collect_tags(repo)

    with db_transaction():
        # Get a list of all images used by tags in the repository
        tagged_images = (Image
                         .select(Image.id, Image.ancestors)
                         .join(RepositoryTag)
                         .where(Image.repository == repo))

        def gen_referenced_ancestors():
            for tagged_image in tagged_images:
                # The ancestor list is in the format '/1/2/3/', extract just the ids
                ancestor_id_strings = tagged_image.ancestors.split('/')[1:-1]
                for img_id_str in ancestor_id_strings:
                    yield int(img_id_str)
                yield tagged_image.id

        referenced_ancestors = set(gen_referenced_ancestors())

        # We desire two pieces of information from the database from the following
        # query: all of the image ids which are associated with this repository,
        # and the storages which are associated with those images. In order to
        # fetch just this information, and bypass all of the peewee model parsing
        # code, which is overkill for just two fields, we use a tuple query, and
        # feed that directly to the dictionary tuple constructor which takes an
        # iterable of tuples containing [(k, v), (k, v), ...]
        all_repo_images = (Image
                           .select(Image.id, Image.storage)
                           .where(Image.repository == repo)
                           .tuples())
        images_to_storages = dict(all_repo_images)
        to_remove = set(images_to_storages.keys()).difference(referenced_ancestors)

        if len(to_remove) > 0:
            logger.info('Cleaning up unreferenced images: %s', to_remove)
            storage_id_whitelist = {images_to_storages[to_remove_id]
                                    for to_remove_id in to_remove}
            Image.delete().where(Image.id << list(to_remove)).execute()

    # The storage collection runs after the transaction block above has been exited.
    if len(to_remove) > 0:
        logger.info('Garbage collecting storage for images: %s', to_remove)
        storage.garbage_collect_storage(storage_id_whitelist)


def star_repository(user, repository):
    """ Stars a repository. """
    # Model.create() already persists the new row, so no additional save() is needed.
    Star.create(user=user.id, repository=repository.id)


def unstar_repository(user, repository):
    """ Unstars a repository.

        Raises a DataModelException if the user had not starred the repository.
    """
    # A delete query never raises DoesNotExist; it reports the number of rows it removed,
    # so that count is what tells us whether there was a star to remove.
    deleted = (Star
               .delete()
               .where(Star.repository == repository.id, Star.user == user.id)
               .execute())
    if not deleted:
        raise DataModelException('Star not found.')


def get_user_starred_repositories(user, limit=None, page=None):
    """ Retrieves all of the repositories a user has starred.

        Returns a query. When both `page` and `limit` are given the query is paginated; when
        only `limit` is given the query is limited to that many rows.
    """
    query = (Repository
             .select(Repository, User, Visibility)
             .join(Star)
             .switch(Repository)
             .join(User)
             .switch(Repository)
             .join(Visibility)
             .where(Star.user == user))

    if page and limit:
        query = query.paginate(page, limit)
    elif limit:
        query = query.limit(limit)

    return query


def repository_is_starred(user, repository):
    """ Determines whether a user has starred a repository or not. """
    try:
        (Star
         .select()
         .where(Star.repository == repository.id, Star.user == user.id)
         .get())
        return True
    except Star.DoesNotExist:
        return False


def get_when_last_modified(repository_ids):
    """ Returns a dict mapping each of the given repository ids that has tags to the maximum
        `lifetime_start_ts` of its tags. Returns an empty dict if no ids are given.
    """
    if not repository_ids:
        return {}

    tuples = (RepositoryTag
              .select(RepositoryTag.repository, fn.Max(RepositoryTag.lifetime_start_ts))
              .where(RepositoryTag.repository << repository_ids)
              .group_by(RepositoryTag.repository)
              .tuples())

    return {record[0]: record[1] for record in tuples}


def get_action_counts(repository_ids):
    """ Returns a dict mapping each of the given repository ids that has action count entries
        within the last week to the sum of those counts. Returns an empty dict if no ids are
        given.
    """
    if not repository_ids:
        return {}

    # Filter the join to recent entries only.
    last_week = datetime.now() - timedelta(weeks=1)
    tuples = (RepositoryActionCount
              .select(RepositoryActionCount.repository, fn.Sum(RepositoryActionCount.count))
              .where(RepositoryActionCount.repository << repository_ids)
              .where(RepositoryActionCount.date >= last_week)
              .group_by(RepositoryActionCount.repository)
              .tuples())

    return {record[0]: record[1] for record in tuples}


def get_visible_repositories(username, namespace=None, page=None, limit=None,
                             include_public=False):
    """ Returns the repositories visible to the given user (if any).

        Returns an empty list when there is no username and public repositories are not
        included; otherwise returns a query selecting the repository name, id and description,
        the visibility name and the namespace username.
    """
    if not include_public and not username:
        return []

    fields = [Repository.name, Repository.id, Repository.description, Visibility.name,
              Namespace.username]

    # The limit (and pagination) is applied by _visible_repository_query itself.
    query = _visible_repository_query(username=username, page=page,
                                      limit=limit, namespace=namespace,
                                      include_public=include_public,
                                      select_models=fields)

    if namespace:
        query = query.where(Namespace.username == namespace)

    return query


def _visible_repository_query(username=None, include_public=True, limit=None, page=None,
                              namespace=None, select_models=None):
    """ Builds the distinct query over repositories (joined to their visibility, namespace and,
        via a left outer join, permissions) which is then filtered by
        `_basequery.filter_to_repos_for_user`.

        `select_models` lists what to select; None selects with no explicit arguments. The
        query is paginated when both `page` and `limit` are given, and limited when only
        `limit` is given.
    """
    if select_models is None:
        select_models = []

    query = (Repository
             .select(*select_models)  # MySQL/RDS complains if there are selected models for counts.
             .distinct()
             .join(Visibility)
             .switch(Repository)
             .join(Namespace, on=(Repository.namespace_user == Namespace.id))
             .switch(Repository)
             .join(RepositoryPermission, JOIN_LEFT_OUTER))

    query = _basequery.filter_to_repos_for_user(query, username, namespace, include_public)

    # Pagination needs a page size; without a limit there is nothing to paginate by.
    if page and limit:
        query = query.paginate(page, limit)
    elif limit:
        query = query.limit(limit)

    return query


def get_sorted_matching_repositories(prefix, only_public, checker, limit=10):
    """ Returns repositories matching the given prefix string and passing the given checker
        function.

        At most `limit` repositories are returned. Each returned repository has an `is_public`
        attribute and a `count` attribute set on it; `count` is 0 for results that were found
        by a search pass that did not sum the action counts.
    """
    last_week = datetime.now() - timedelta(weeks=1)
    results = []
    existing_ids = []

    def get_search_results(search_clause, with_count=False):
        """ Appends the repositories matching `search_clause` that are not already present to
            `results`, until `limit` is reached.
        """
        if len(results) >= limit:
            return

        select_items = [Repository, Namespace]
        if with_count:
            select_items.append(fn.Sum(RepositoryActionCount.count).alias('count'))

        query = (Repository
                 .select(*select_items)
                 .join(Namespace, JOIN_LEFT_OUTER,
                       on=(Namespace.id == Repository.namespace_user))
                 .switch(Repository)
                 .where(search_clause)
                 .group_by(Repository, Namespace))

        if only_public:
            query = query.where(Repository.visibility ==
                                _basequery.get_public_repo_visibility())

        if existing_ids:
            query = query.where(~(Repository.id << existing_ids))

        if with_count:
            query = (query
                     .switch(Repository)
                     .join(RepositoryActionCount)
                     .where(RepositoryActionCount.date >= last_week)
                     .order_by(fn.Sum(RepositoryActionCount.count).desc()))

        for result in query:
            if len(results) >= limit:
                return

            # Note: We compare IDs here, instead of objects, because calling .visibility on the
            # Repository will kick off a new SQL query to retrieve that visibility enum value.
            # We don't join the visibility table in SQL, as well, because it is ungodly slow in
            # MySQL :-/
            result.is_public = (result.visibility_id ==
                                _basequery.get_public_repo_visibility().id)
            result.count = result.count if with_count else 0

            if not checker(result):
                continue

            results.append(result)
            existing_ids.append(result.id)

    # For performance reasons, we conduct the repo name and repo namespace searches on their
    # own. This also affords us the ability to give higher precedence to repository names
    # matching over namespaces, which is semantically correct. For each field, the pass with
    # action counts runs before the pass without them.
    for search_field in (Repository.name, Namespace.username):
        for with_count in (True, False):
            get_search_results(search_field ** (prefix + '%'), with_count=with_count)

    return results


def lookup_repository(repo_id):
    """ Returns the repository with the given id, or None if it does not exist. """
    try:
        return Repository.get(Repository.id == repo_id)
    except Repository.DoesNotExist:
        return None


def is_repository_public(repository):
    """ Returns whether the visibility of the given repository is the public visibility. """
    return repository.visibility == _basequery.get_public_repo_visibility()


def repository_is_public(namespace_name, repository_name):
    """ Returns whether a repository with the given namespace and name exists and has the
        visibility named 'public'.
    """
    try:
        (Repository
         .select()
         .join(Namespace, on=(Repository.namespace_user == Namespace.id))
         .switch(Repository)
         .join(Visibility)
         .where(Namespace.username == namespace_name,
                Repository.name == repository_name,
                Visibility.name == 'public')
         .get())
        return True
    except Repository.DoesNotExist:
        return False


def set_repository_visibility(repo, visibility):
    """ Sets the visibility of the given repository to the visibility named `visibility` and
        saves the repository.
    """
    visibility_obj = Visibility.get(name=visibility)
    # NOTE(review): the other `.get()` lookups in this file are handled via DoesNotExist, so
    # an unknown name presumably raises here instead of reaching this guard -- confirm.
    if not visibility_obj:
        return

    repo.visibility = visibility_obj
    repo.save()


def get_email_authorized_for_repo(namespace, repository, email):
    """ Returns the email authorization record for the given email address on the given
        repository, or None if there is none.
    """
    try:
        return (RepositoryAuthorizedEmail
                .select(RepositoryAuthorizedEmail, Repository, Namespace)
                .join(Repository)
                .join(Namespace, on=(Repository.namespace_user == Namespace.id))
                .where(Namespace.username == namespace,
                       Repository.name == repository,
                       RepositoryAuthorizedEmail.email == email)
                .get())
    except RepositoryAuthorizedEmail.DoesNotExist:
        return None


def create_email_authorization_for_repo(namespace_name, repository_name, email):
    """ Creates and returns an unconfirmed email authorization record for the given email
        address on the given repository.

        Raises a DataModelException if the repository does not exist.
    """
    try:
        repo = _basequery.get_existing_repository(namespace_name, repository_name)
    except Repository.DoesNotExist:
        raise DataModelException('Invalid repository %s/%s' %
                                 (namespace_name, repository_name))

    return RepositoryAuthorizedEmail.create(repository=repo, email=email, confirmed=False)


def confirm_email_authorization_for_repo(code):
    """ Marks the email authorization record with the given code as confirmed, saves it and
        returns it.

        Raises a DataModelException if no record has the given code.
    """
    try:
        found = (RepositoryAuthorizedEmail
                 .select(RepositoryAuthorizedEmail, Repository, Namespace)
                 .join(Repository)
                 .join(Namespace, on=(Repository.namespace_user == Namespace.id))
                 .where(RepositoryAuthorizedEmail.code == code)
                 .get())
    except RepositoryAuthorizedEmail.DoesNotExist:
        raise DataModelException('Invalid confirmation code.')

    found.confirmed = True
    found.save()

    return found