initial import for Open Source 🎉

Jimmy Zelinskie 2019-11-12 11:09:47 -05:00
parent 1898c361f3
commit 9c0dd3b722
2048 changed files with 218743 additions and 0 deletions

util/migrate/__init__.py Normal file

@@ -0,0 +1,38 @@
import logging
from sqlalchemy.types import TypeDecorator, Text, String
from sqlalchemy.dialects.mysql import TEXT as MySQLText, LONGTEXT, VARCHAR as MySQLString
logger = logging.getLogger(__name__)
class UTF8LongText(TypeDecorator):
""" Platform-independent UTF-8 LONGTEXT type.
Uses MySQL's LongText with charset utf8mb4, otherwise uses TEXT, because
other engines default to UTF-8 and have longer TEXT fields.
"""
impl = Text
def load_dialect_impl(self, dialect):
if dialect.name == 'mysql':
return dialect.type_descriptor(LONGTEXT(charset='utf8mb4', collation='utf8mb4_unicode_ci'))
else:
return dialect.type_descriptor(Text())
class UTF8CharField(TypeDecorator):
""" Platform-independent UTF-8 Char type.
Uses MySQL's VARCHAR with charset utf8mb4, otherwise uses String, because
other engines default to UTF-8.
"""
impl = String
def load_dialect_impl(self, dialect):
if dialect.name == 'mysql':
return dialect.type_descriptor(MySQLString(charset='utf8mb4', collation='utf8mb4_unicode_ci',
length=self.impl.length))
else:
return dialect.type_descriptor(String(length=self.impl.length))
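
A minimal usage sketch, not part of this commit, showing how these decorators might be attached to a SQLAlchemy model; the Manifest table and its column names are hypothetical.

from sqlalchemy import Column, Integer
from sqlalchemy.ext.declarative import declarative_base

from util.migrate import UTF8CharField, UTF8LongText

Base = declarative_base()

class Manifest(Base):
  __tablename__ = 'manifest'

  id = Column(Integer, primary_key=True)
  # Renders as VARCHAR(255) with charset utf8mb4 on MySQL, plain VARCHAR(255) elsewhere.
  digest = Column(UTF8CharField(255))
  # Renders as LONGTEXT with charset utf8mb4 on MySQL, plain TEXT elsewhere.
  manifest_bytes = Column(UTF8LongText)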

util/migrate/allocator.py Normal file

@@ -0,0 +1,175 @@
import logging
import random
from bintrees import RBTree
from threading import Event
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class NoAvailableKeysError(ValueError):
pass
class CompletedKeys(object):
def __init__(self, max_index, min_index=0):
self._max_index = max_index
self._min_index = min_index
self.num_remaining = max_index - min_index
    self._slabs = RBTree()  # maps each completed slab's start index to its run length
def _get_previous_or_none(self, index):
try:
return self._slabs.floor_item(index)
except KeyError:
return None
  def is_available(self, index):
    """ Returns True if the given index is within range and not yet marked completed. """
logger.debug('Testing index %s', index)
if index >= self._max_index or index < self._min_index:
logger.debug('Index out of range')
return False
try:
prev_start, prev_length = self._slabs.floor_item(index)
logger.debug('Prev range: %s-%s', prev_start, prev_start + prev_length)
return (prev_start + prev_length) <= index
except KeyError:
return True
  def mark_completed(self, start_index, past_last_index):
    """ Marks the half-open key range [start_index, past_last_index) as completed,
        merging it with any overlapping or adjacent completed slabs.
    """
logger.debug('Marking the range completed: %s-%s', start_index, past_last_index)
num_completed = min(past_last_index, self._max_index) - max(start_index, self._min_index)
# Find the item directly before this and see if there is overlap
to_discard = set()
try:
prev_start, prev_length = self._slabs.floor_item(start_index)
max_prev_completed = prev_start + prev_length
if max_prev_completed >= start_index:
# we are going to merge with the range before us
logger.debug('Merging with the prev range: %s-%s', prev_start, prev_start + prev_length)
to_discard.add(prev_start)
num_completed = max(num_completed - (max_prev_completed - start_index), 0)
start_index = prev_start
past_last_index = max(past_last_index, prev_start + prev_length)
except KeyError:
pass
# Find all keys between the start and last index and merge them into one block
for merge_start, merge_length in self._slabs.iter_items(start_index, past_last_index + 1):
if merge_start in to_discard:
logger.debug('Already merged with block %s-%s', merge_start, merge_start + merge_length)
continue
candidate_next_index = merge_start + merge_length
logger.debug('Merging with block %s-%s', merge_start, candidate_next_index)
num_completed -= merge_length - max(candidate_next_index - past_last_index, 0)
to_discard.add(merge_start)
past_last_index = max(past_last_index, candidate_next_index)
# write the new block which is fully merged
discard = False
if past_last_index >= self._max_index:
logger.debug('Discarding block and setting new max to: %s', start_index)
self._max_index = start_index
discard = True
if start_index <= self._min_index:
logger.debug('Discarding block and setting new min to: %s', past_last_index)
self._min_index = past_last_index
discard = True
if to_discard:
logger.debug('Discarding %s obsolete blocks', len(to_discard))
self._slabs.remove_items(to_discard)
if not discard:
logger.debug('Writing new block with range: %s-%s', start_index, past_last_index)
self._slabs.insert(start_index, past_last_index - start_index)
# Update the number of remaining items with the adjustments we've made
assert num_completed >= 0
self.num_remaining -= num_completed
logger.debug('Total blocks: %s', len(self._slabs))
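
  # Worked example (mirrors test_merge_blocks_operations below): with CompletedKeys(10),
  # mark_completed(1, 5) stores the slab {1: length 4}; mark_completed(5, 6) merges it
  # into {1: length 5}; mark_completed(3, 8) then wraps and extends that slab to
  # {1: length 7}, reducing num_remaining only by the two newly covered keys (6 and 7).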
  def get_block_start_index(self, block_size_estimate):
    """ Picks a random hole between completed slabs and returns a random start index
        within it, leaving room for roughly block_size_estimate keys when possible.
    """
logger.debug('Total range: %s-%s', self._min_index, self._max_index)
if self._max_index <= self._min_index:
raise NoAvailableKeysError('All indexes have been marked completed')
num_holes = len(self._slabs) + 1
random_hole = random.randint(0, num_holes - 1)
logger.debug('Selected random hole %s with %s total holes', random_hole, num_holes)
hole_start = self._min_index
past_hole_end = self._max_index
# Now that we have picked a hole, we need to define the bounds
if random_hole > 0:
# There will be a slab before this hole, find where it ends
bound_entries = self._slabs.nsmallest(random_hole + 1)[-2:]
left_index, left_len = bound_entries[0]
logger.debug('Left range %s-%s', left_index, left_index + left_len)
hole_start = left_index + left_len
if len(bound_entries) > 1:
        right_index, right_len = bound_entries[1]
        logger.debug('Right range %s-%s', right_index, right_index + right_len)
        past_hole_end = right_index
elif not self._slabs.is_empty():
      right_index, right_len = self._slabs.nsmallest(1)[0]
      logger.debug('Right range %s-%s', right_index, right_index + right_len)
      past_hole_end = right_index
# Now that we have our hole bounds, select a random block from [0:len - block_size_estimate]
logger.debug('Selecting from hole range: %s-%s', hole_start, past_hole_end)
rand_max_bound = max(hole_start, past_hole_end - block_size_estimate)
logger.debug('Rand max bound: %s', rand_max_bound)
return random.randint(hole_start, rand_max_bound)
def yield_random_entries(batch_query, primary_key_field, batch_size, max_id, min_id=0):
""" This method will yield items from random blocks in the database. We will track metadata
about which keys are available for work, and we will complete the backfill when there is no
more work to be done. The method yields tuples of (candidate, Event), and if the work was
already done by another worker, the caller should set the event. Batch candidates must have
an "id" field which can be inspected.
"""
min_id = max(min_id, 0)
max_id = max(max_id, 1)
allocator = CompletedKeys(max_id + 1, min_id)
try:
while True:
start_index = allocator.get_block_start_index(batch_size)
end_index = min(start_index + batch_size, max_id + 1)
all_candidates = list(batch_query()
.where(primary_key_field >= start_index,
primary_key_field < end_index)
.order_by(primary_key_field))
if len(all_candidates) == 0:
logger.info('No candidates, marking entire block completed %s-%s', start_index, end_index)
allocator.mark_completed(start_index, end_index)
continue
logger.info('Found %s candidates, processing block', len(all_candidates))
      batch_completed = 0
      completed_through = start_index  # guard: stays defined even if the first candidate aborts
for candidate in all_candidates:
abort_early = Event()
yield candidate, abort_early, allocator.num_remaining - batch_completed
batch_completed += 1
if abort_early.is_set():
logger.info('Overlap with another worker, aborting')
break
completed_through = candidate.id + 1
logger.info('Marking id range as completed: %s-%s', start_index, completed_through)
allocator.mark_completed(start_index, completed_through)
except NoAvailableKeysError:
logger.info('No more work')
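
A minimal usage sketch, assuming a peewee model Repository with an integer id primary key; already_backfilled and backfill_one are hypothetical stand-ins for real worker logic.

from peewee import fn

from data.database import Repository
from util.migrate.allocator import yield_random_entries

max_id = Repository.select(fn.Max(Repository.id)).scalar() or 0

for repo, abort_early, num_remaining in yield_random_entries(Repository.select, Repository.id,
                                                             1000, max_id):
  if already_backfilled(repo):  # another worker beat us to this row
    abort_early.set()           # tells the generator to skip the rest of this block
    continue

  backfill_one(repo)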

util/migrate/cleanup_old_robots.py Normal file

@@ -0,0 +1,54 @@
import logging
from app import app
from data.database import User
from util.names import parse_robot_username
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def cleanup_old_robots(page_size=50, force=False):
""" Deletes any robots that live under namespaces that no longer exist. """
if not force and not app.config.get('SETUP_COMPLETE', False):
return
# Collect the robot accounts to delete.
page_number = 1
to_delete = []
encountered_namespaces = {}
while True:
found_bots = False
for robot in list(User.select().where(User.robot == True).paginate(page_number, page_size)):
found_bots = True
logger.info("Checking robot %s (page %s)", robot.username, page_number)
parsed = parse_robot_username(robot.username)
if parsed is None:
continue
namespace, _ = parsed
if namespace in encountered_namespaces:
if not encountered_namespaces[namespace]:
logger.info('Marking %s to be deleted', robot.username)
to_delete.append(robot)
else:
try:
User.get(username=namespace)
encountered_namespaces[namespace] = True
except User.DoesNotExist:
# Save the robot account for deletion.
logger.info('Marking %s to be deleted', robot.username)
to_delete.append(robot)
encountered_namespaces[namespace] = False
if not found_bots:
break
page_number = page_number + 1
# Cleanup any robot accounts whose corresponding namespace doesn't exist.
logger.info('Found %s robots to delete', len(to_delete))
for index, robot in enumerate(to_delete):
logger.info('Deleting robot %s of %s (%s)', index, len(to_delete), robot.username)
robot.delete_instance(recursive=True, delete_nullable=True)
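
For reference, robot usernames take the form namespace+shortname (the test below creates names like doesnotexist+a1). A small sketch of the parse_robot_username contract as this code assumes it; the exact behavior lives in util.names.

from util.names import parse_robot_username

# Assumed behavior, inferred from the calling code above:
assert parse_robot_username('doesnotexist+a1') == ('doesnotexist', 'a1')
assert parse_robot_username('plainuser') is None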


@@ -0,0 +1,48 @@
import logging
import time
from datetime import datetime, timedelta
from data.database import RepositoryBuild, AccessToken
from app import app
logger = logging.getLogger(__name__)
BATCH_SIZE = 1000
def delete_temporary_access_tokens(older_than):
  """ Deletes temporary access tokens created before the given datetime, in batches,
      skipping any token that is still referenced by a repository build.
  """
  # Find the highest ID up to which we should delete
  try:
    up_to_id = (AccessToken
                .select(AccessToken.id)
                .where(AccessToken.created < older_than)
                .limit(1)
                .order_by(AccessToken.id.desc())
                .get().id)
  except AccessToken.DoesNotExist:
    logger.debug('No temporary access tokens older than %s', older_than)
    return
logger.debug('Deleting temporary access tokens with ids lower than: %s', up_to_id)
access_tokens_in_builds = (RepositoryBuild.select(RepositoryBuild.access_token).distinct())
while up_to_id > 0:
starting_at_id = max(up_to_id - BATCH_SIZE, 0)
logger.debug('Deleting tokens with ids between %s and %s', starting_at_id, up_to_id)
start_time = datetime.utcnow()
(AccessToken
.delete()
.where(AccessToken.id >= starting_at_id,
AccessToken.id < up_to_id,
AccessToken.temporary == True,
~(AccessToken.id << access_tokens_in_builds))
.execute())
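
    # Self-throttle: sleep for as long as the delete itself took, bounding the
    # extra load this cleanup places on the database.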
time_to_delete = datetime.utcnow() - start_time
up_to_id -= BATCH_SIZE
logger.debug('Sleeping for %s seconds', time_to_delete.total_seconds())
time.sleep(time_to_delete.total_seconds())
if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
delete_temporary_access_tokens(datetime.utcnow() - timedelta(days=2))

util/migrate/table_ops.py Normal file

@@ -0,0 +1,13 @@
def copy_table_contents(source_table, destination_table, conn):
  """ Copies every row from source_table into destination_table, then advances the
      destination's id sequence (PostgreSQL) or AUTO_INCREMENT counter (MySQL) past
      the highest copied id so that subsequent inserts do not collide.
  """
  if conn.engine.name == 'postgresql':
    conn.execute('INSERT INTO "%s" SELECT * FROM "%s"' % (destination_table, source_table))
    result = list(conn.execute('SELECT MAX(id) FROM "%s"' % destination_table))[0]
    if result[0] is not None:
      new_start_id = result[0] + 1
      conn.execute('ALTER SEQUENCE "%s_id_seq" RESTART WITH %s' % (destination_table, new_start_id))
  else:
    conn.execute("INSERT INTO `%s` SELECT * FROM `%s` WHERE 1" % (destination_table, source_table))
    result = list(conn.execute('SELECT MAX(id) FROM `%s`' % destination_table))[0]
    if result[0] is not None:
      new_start_id = result[0] + 1
      conn.execute("ALTER TABLE `%s` AUTO_INCREMENT = %s" % (destination_table, new_start_id))


@@ -0,0 +1,176 @@
import random
import pytest
from datetime import datetime, timedelta
from util.migrate.allocator import CompletedKeys, NoAvailableKeysError, yield_random_entries
def test_merge_blocks_operations():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
candidates.mark_completed(1, 5)
assert candidates.is_available(5)
assert candidates.is_available(0)
assert not candidates.is_available(1)
assert not candidates.is_available(4)
assert not candidates.is_available(11)
assert not candidates.is_available(10)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 6
candidates.mark_completed(5, 6)
assert not candidates.is_available(5)
assert candidates.is_available(6)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 5
candidates.mark_completed(3, 8)
assert candidates.is_available(9)
assert candidates.is_available(8)
assert not candidates.is_available(7)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 3
def test_adjust_max():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
assert len(candidates._slabs) == 0
assert candidates.is_available(9)
candidates.mark_completed(5, 12)
assert len(candidates._slabs) == 0
assert candidates.num_remaining == 5
assert not candidates.is_available(9)
assert candidates.is_available(4)
def test_adjust_min():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
assert len(candidates._slabs) == 0
assert candidates.is_available(2)
candidates.mark_completed(0, 3)
assert len(candidates._slabs) == 0
assert candidates.num_remaining == 7
assert not candidates.is_available(2)
assert candidates.is_available(4)
def test_inside_block():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
candidates.mark_completed(1, 8)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 3
candidates.mark_completed(2, 5)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 3
assert not candidates.is_available(1)
assert not candidates.is_available(5)
def test_wrap_block():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
candidates.mark_completed(2, 5)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 7
candidates.mark_completed(1, 8)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 3
assert not candidates.is_available(1)
assert not candidates.is_available(5)
def test_non_contiguous():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
candidates.mark_completed(1, 5)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 6
assert candidates.is_available(5)
assert candidates.is_available(6)
candidates.mark_completed(6, 8)
assert len(candidates._slabs) == 2
assert candidates.num_remaining == 4
assert candidates.is_available(5)
assert not candidates.is_available(6)
def test_big_merge():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
candidates.mark_completed(1, 5)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 6
candidates.mark_completed(6, 8)
assert len(candidates._slabs) == 2
assert candidates.num_remaining == 4
candidates.mark_completed(5, 6)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 3
def test_range_limits():
candidates = CompletedKeys(10)
assert not candidates.is_available(-1)
assert not candidates.is_available(10)
assert candidates.is_available(9)
assert candidates.is_available(0)
def test_random_saturation():
candidates = CompletedKeys(100)
with pytest.raises(NoAvailableKeysError):
for _ in range(101):
start = candidates.get_block_start_index(10)
assert candidates.is_available(start)
candidates.mark_completed(start, start + 10)
assert candidates.num_remaining == 0
def test_huge_dataset():
candidates = CompletedKeys(1024 * 1024)
start_time = datetime.now()
iterations = 0
with pytest.raises(NoAvailableKeysError):
while (datetime.now() - start_time) < timedelta(seconds=10):
start = candidates.get_block_start_index(1024)
assert candidates.is_available(start)
candidates.mark_completed(start, start + random.randint(512, 1024))
iterations += 1
assert iterations > 1024
assert candidates.num_remaining == 0
class FakeQuery(object):
def __init__(self, result_list):
self._result_list = result_list
def limit(self, *args, **kwargs):
return self
def where(self, *args, **kwargs):
return self
def order_by(self, *args, **kwargs):
return self
def __iter__(self):
return self._result_list.__iter__()
FAKE_PK_FIELD = 10 # Must be able to compare to integers
def test_no_work():
def create_empty_query():
return FakeQuery([])
for _ in yield_random_entries(create_empty_query, FAKE_PK_FIELD, 1, 10):
assert False, 'There should never be any actual work!'


@@ -0,0 +1,43 @@
import pytest
from data.database import User
from util.migrate.cleanup_old_robots import cleanup_old_robots
from test.fixtures import *
def test_cleanup_old_robots(initialized_db):
before_robot_count = User.select().where(User.robot == True).count()
before_user_count = User.select().count()
# Run the cleanup once, and ensure it does nothing.
cleanup_old_robots(force=True)
after_robot_count = User.select().where(User.robot == True).count()
after_user_count = User.select().count()
assert before_robot_count == after_robot_count
assert before_user_count == after_user_count
# Create some orphan robots.
created = set()
for index in range(0, 50):
created.add('doesnotexist+a%s' % index)
created.add('anothernamespace+b%s' % index)
User.create(username='doesnotexist+a%s' % index, robot=True)
User.create(username='anothernamespace+b%s' % index, robot=True)
before_robot_count = User.select().where(User.robot == True).count()
before_user_count = User.select().count()
cleanup_old_robots(page_size=10, force=True)
after_robot_count = User.select().where(User.robot == True).count()
after_user_count = User.select().count()
assert before_robot_count == after_robot_count + len(created)
assert before_user_count == after_user_count + len(created)
for name in created:
with pytest.raises(User.DoesNotExist):
User.get(username=name)