initial import for Open Source 🎉
parent 1898c361f3
commit 9c0dd3b722

2048 changed files with 218743 additions and 0 deletions

38  util/migrate/__init__.py  Normal file

@@ -0,0 +1,38 @@
import logging

from sqlalchemy.types import TypeDecorator, Text, String
from sqlalchemy.dialects.mysql import TEXT as MySQLText, LONGTEXT, VARCHAR as MySQLString


logger = logging.getLogger(__name__)


class UTF8LongText(TypeDecorator):
  """ Platform-independent UTF-8 LONGTEXT type.

  Uses MySQL's LONGTEXT with charset utf8mb4; otherwise uses TEXT, because
  other engines default to UTF-8 and have longer TEXT fields.
  """
  impl = Text

  def load_dialect_impl(self, dialect):
    if dialect.name == 'mysql':
      return dialect.type_descriptor(LONGTEXT(charset='utf8mb4', collation='utf8mb4_unicode_ci'))
    else:
      return dialect.type_descriptor(Text())


class UTF8CharField(TypeDecorator):
  """ Platform-independent UTF-8 Char type.

  Uses MySQL's VARCHAR with charset utf8mb4; otherwise uses String, because
  other engines default to UTF-8.
  """
  impl = String

  def load_dialect_impl(self, dialect):
    if dialect.name == 'mysql':
      return dialect.type_descriptor(MySQLString(charset='utf8mb4', collation='utf8mb4_unicode_ci',
                                                 length=self.impl.length))
    else:
      return dialect.type_descriptor(String(length=self.impl.length))
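
For orientation, a minimal usage sketch (not part of this commit; the table and column names are hypothetical). The decorators drop in anywhere SQLAlchemy expects a column type, for example in an Alembic migration:

import sqlalchemy as sa
from alembic import op

from util.migrate import UTF8CharField, UTF8LongText


def upgrade():
  # LONGTEXT with utf8mb4 on MySQL, plain TEXT on other engines.
  op.add_column('repository', sa.Column('description', UTF8LongText()))
  # utf8mb4 VARCHAR(255) on MySQL, String(255) elsewhere.
  op.add_column('repository', sa.Column('tagline', UTF8CharField(length=255)))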

175  util/migrate/allocator.py  Normal file

@@ -0,0 +1,175 @@
import logging
import random

from bintrees import RBTree
from threading import Event


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class NoAvailableKeysError(ValueError):
  pass


class CompletedKeys(object):
  def __init__(self, max_index, min_index=0):
    self._max_index = max_index
    self._min_index = min_index
    self.num_remaining = max_index - min_index
    self._slabs = RBTree()

  def _get_previous_or_none(self, index):
    try:
      return self._slabs.floor_item(index)
    except KeyError:
      return None

  def is_available(self, index):
    logger.debug('Testing index %s', index)
    if index >= self._max_index or index < self._min_index:
      logger.debug('Index out of range')
      return False

    try:
      prev_start, prev_length = self._slabs.floor_item(index)
      logger.debug('Prev range: %s-%s', prev_start, prev_start + prev_length)
      return (prev_start + prev_length) <= index
    except KeyError:
      return True

  def mark_completed(self, start_index, past_last_index):
    logger.debug('Marking the range completed: %s-%s', start_index, past_last_index)
    num_completed = min(past_last_index, self._max_index) - max(start_index, self._min_index)

    # Find the item directly before this and see if there is overlap
    to_discard = set()
    try:
      prev_start, prev_length = self._slabs.floor_item(start_index)
      max_prev_completed = prev_start + prev_length
      if max_prev_completed >= start_index:
        # we are going to merge with the range before us
        logger.debug('Merging with the prev range: %s-%s', prev_start, prev_start + prev_length)
        to_discard.add(prev_start)
        num_completed = max(num_completed - (max_prev_completed - start_index), 0)
        start_index = prev_start
        past_last_index = max(past_last_index, prev_start + prev_length)
    except KeyError:
      pass

    # Find all keys between the start and last index and merge them into one block
    for merge_start, merge_length in self._slabs.iter_items(start_index, past_last_index + 1):
      if merge_start in to_discard:
        logger.debug('Already merged with block %s-%s', merge_start, merge_start + merge_length)
        continue

      candidate_next_index = merge_start + merge_length
      logger.debug('Merging with block %s-%s', merge_start, candidate_next_index)
      num_completed -= merge_length - max(candidate_next_index - past_last_index, 0)
      to_discard.add(merge_start)
      past_last_index = max(past_last_index, candidate_next_index)

    # write the new block which is fully merged
    discard = False
    if past_last_index >= self._max_index:
      logger.debug('Discarding block and setting new max to: %s', start_index)
      self._max_index = start_index
      discard = True

    if start_index <= self._min_index:
      logger.debug('Discarding block and setting new min to: %s', past_last_index)
      self._min_index = past_last_index
      discard = True

    if to_discard:
      logger.debug('Discarding %s obsolete blocks', len(to_discard))
      self._slabs.remove_items(to_discard)

    if not discard:
      logger.debug('Writing new block with range: %s-%s', start_index, past_last_index)
      self._slabs.insert(start_index, past_last_index - start_index)

    # Update the number of remaining items with the adjustments we've made
    assert num_completed >= 0
    self.num_remaining -= num_completed
    logger.debug('Total blocks: %s', len(self._slabs))

  def get_block_start_index(self, block_size_estimate):
    logger.debug('Total range: %s-%s', self._min_index, self._max_index)
    if self._max_index <= self._min_index:
      raise NoAvailableKeysError('All indexes have been marked completed')

    num_holes = len(self._slabs) + 1
    random_hole = random.randint(0, num_holes - 1)
    logger.debug('Selected random hole %s with %s total holes', random_hole, num_holes)

    hole_start = self._min_index
    past_hole_end = self._max_index

    # Now that we have picked a hole, we need to define the bounds
    if random_hole > 0:
      # There will be a slab before this hole, find where it ends
      bound_entries = self._slabs.nsmallest(random_hole + 1)[-2:]
      left_index, left_len = bound_entries[0]
      logger.debug('Left range %s-%s', left_index, left_index + left_len)
      hole_start = left_index + left_len

      if len(bound_entries) > 1:
        right_index, right_len = bound_entries[1]
        logger.debug('Right range %s-%s', right_index, right_index + right_len)
        past_hole_end, _ = bound_entries[1]
    elif not self._slabs.is_empty():
      right_index, right_len = self._slabs.nsmallest(1)[0]
      logger.debug('Right range %s-%s', right_index, right_index + right_len)
      past_hole_end, _ = self._slabs.nsmallest(1)[0]

    # Now that we have our hole bounds, select a random block from [0:len - block_size_estimate]
    logger.debug('Selecting from hole range: %s-%s', hole_start, past_hole_end)
    rand_max_bound = max(hole_start, past_hole_end - block_size_estimate)
    logger.debug('Rand max bound: %s', rand_max_bound)
    return random.randint(hole_start, rand_max_bound)


def yield_random_entries(batch_query, primary_key_field, batch_size, max_id, min_id=0):
  """ This method will yield items from random blocks in the database. We will track metadata
  about which keys are available for work, and we will complete the backfill when there is no
  more work to be done. The method yields tuples of (candidate, Event, num_remaining), and if
  the work was already done by another worker, the caller should set the event. Batch
  candidates must have an "id" field which can be inspected.
  """

  min_id = max(min_id, 0)
  max_id = max(max_id, 1)
  allocator = CompletedKeys(max_id + 1, min_id)

  try:
    while True:
      start_index = allocator.get_block_start_index(batch_size)
      end_index = min(start_index + batch_size, max_id + 1)
      all_candidates = list(batch_query()
                            .where(primary_key_field >= start_index,
                                   primary_key_field < end_index)
                            .order_by(primary_key_field))

      if len(all_candidates) == 0:
        logger.info('No candidates, marking entire block completed %s-%s', start_index, end_index)
        allocator.mark_completed(start_index, end_index)
        continue

      logger.info('Found %s candidates, processing block', len(all_candidates))
      batch_completed = 0
      for candidate in all_candidates:
        abort_early = Event()
        yield candidate, abort_early, allocator.num_remaining - batch_completed
        batch_completed += 1
        if abort_early.is_set():
          logger.info('Overlap with another worker, aborting')
          break

      completed_through = candidate.id + 1
      logger.info('Marking id range as completed: %s-%s', start_index, completed_through)
      allocator.mark_completed(start_index, completed_through)

  except NoAvailableKeysError:
    logger.info('No more work')
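
A hedged sketch of how a backfill worker would consume this generator (not from the commit; the `Image` model and `backfill_one` helper are stand-ins for whatever table is being backfilled). Setting the event when a row turns out to be already handled makes the generator abandon the rest of that block:

import logging

from peewee import fn

from data.database import Image  # stand-in model being backfilled (assumption)
from util.migrate.allocator import yield_random_entries

logger = logging.getLogger(__name__)


def run_backfill():
  max_id = Image.select(fn.Max(Image.id)).scalar() or 0
  for candidate, abort_signal, num_remaining in yield_random_entries(
      Image.select, Image.id, 100, max_id):
    logger.debug('Roughly %s rows remaining', num_remaining)
    if not backfill_one(candidate):  # hypothetical per-row work function
      # Overlap with another worker: abandon the rest of this block.
      abort_signal.set()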

54  util/migrate/cleanup_old_robots.py  Normal file

@@ -0,0 +1,54 @@
import logging

from app import app
from data.database import User
from util.names import parse_robot_username

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def cleanup_old_robots(page_size=50, force=False):
  """ Deletes any robots that live under namespaces that no longer exist. """
  if not force and not app.config.get('SETUP_COMPLETE', False):
    return

  # Collect the robot accounts to delete.
  page_number = 1
  to_delete = []
  encountered_namespaces = {}

  while True:
    found_bots = False
    for robot in list(User.select().where(User.robot == True).paginate(page_number, page_size)):
      found_bots = True
      logger.info("Checking robot %s (page %s)", robot.username, page_number)
      parsed = parse_robot_username(robot.username)
      if parsed is None:
        continue

      namespace, _ = parsed
      if namespace in encountered_namespaces:
        if not encountered_namespaces[namespace]:
          logger.info('Marking %s to be deleted', robot.username)
          to_delete.append(robot)
      else:
        try:
          User.get(username=namespace)
          encountered_namespaces[namespace] = True
        except User.DoesNotExist:
          # Save the robot account for deletion.
          logger.info('Marking %s to be deleted', robot.username)
          to_delete.append(robot)
          encountered_namespaces[namespace] = False

    if not found_bots:
      break

    page_number = page_number + 1

  # Cleanup any robot accounts whose corresponding namespace doesn't exist.
  logger.info('Found %s robots to delete', len(to_delete))
  for index, robot in enumerate(to_delete):
    logger.info('Deleting robot %s of %s (%s)', index, len(to_delete), robot.username)
    robot.delete_instance(recursive=True, delete_nullable=True)
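
A minimal invocation sketch (assumes a configured app and an initialized database, e.g. a one-off maintenance script); `force=True` bypasses the SETUP_COMPLETE guard, as the test below also does:

import logging

from util.migrate.cleanup_old_robots import cleanup_old_robots

if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  cleanup_old_robots(page_size=100, force=True)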

48  util/migrate/delete_access_tokens.py  Normal file

@@ -0,0 +1,48 @@
import logging
import time

from datetime import datetime, timedelta

from data.database import RepositoryBuild, AccessToken
from app import app

logger = logging.getLogger(__name__)

BATCH_SIZE = 1000


def delete_temporary_access_tokens(older_than):
  # Find the highest ID up to which we should delete
  up_to_id = (AccessToken
              .select(AccessToken.id)
              .where(AccessToken.created < older_than)
              .limit(1)
              .order_by(AccessToken.id.desc())
              .get().id)
  logger.debug('Deleting temporary access tokens with ids lower than: %s', up_to_id)

  # Tokens still referenced by a build must never be deleted.
  access_tokens_in_builds = (RepositoryBuild.select(RepositoryBuild.access_token).distinct())

  while up_to_id > 0:
    starting_at_id = max(up_to_id - BATCH_SIZE, 0)
    logger.debug('Deleting tokens with ids between %s and %s', starting_at_id, up_to_id)
    start_time = datetime.utcnow()
    (AccessToken
     .delete()
     .where(AccessToken.id >= starting_at_id,
            AccessToken.id < up_to_id,
            AccessToken.temporary == True,
            ~(AccessToken.id << access_tokens_in_builds))  # peewee: id NOT IN subquery
     .execute())

    time_to_delete = datetime.utcnow() - start_time

    up_to_id -= BATCH_SIZE

    # Self-throttle: sleep for as long as the delete took before the next batch.
    logger.debug('Sleeping for %s seconds', time_to_delete.total_seconds())
    time.sleep(time_to_delete.total_seconds())


if __name__ == '__main__':
  logging.basicConfig(level=logging.DEBUG)
  delete_temporary_access_tokens(datetime.utcnow() - timedelta(days=2))

13  util/migrate/table_ops.py  Normal file

@@ -0,0 +1,13 @@
def copy_table_contents(source_table, destination_table, conn):
  if conn.engine.name == 'postgresql':
    conn.execute('INSERT INTO "%s" SELECT * FROM "%s"' % (destination_table, source_table))
    result = list(conn.execute('SELECT MAX(id) FROM "%s"' % destination_table))[0]
    if result[0] is not None:
      new_start_id = result[0] + 1
      conn.execute('ALTER SEQUENCE "%s_id_seq" RESTART WITH %s' % (destination_table, new_start_id))
  else:
    conn.execute("INSERT INTO `%s` SELECT * FROM `%s` WHERE 1" % (destination_table, source_table))
    result = list(conn.execute('SELECT MAX(id) FROM `%s` WHERE 1' % destination_table))[0]
    if result[0] is not None:
      new_start_id = result[0] + 1
      conn.execute("ALTER TABLE `%s` AUTO_INCREMENT = %s" % (destination_table, new_start_id))
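
A hedged usage sketch (the table names are hypothetical): this helper fits rename-by-copy migrations, where a replacement table already exists with the desired schema and both the rows and the id counter need to carry over:

from alembic import op

from util.migrate.table_ops import copy_table_contents


def upgrade():
  # Copies all rows, then advances the destination's sequence (PostgreSQL)
  # or AUTO_INCREMENT (MySQL) past the highest copied id.
  copy_table_contents('accesstoken', 'accesstoken_new', op.get_bind())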

176  util/migrate/test/test_backfill_allocator.py  Normal file

@@ -0,0 +1,176 @@
import random

import pytest

from datetime import datetime, timedelta
from util.migrate.allocator import CompletedKeys, NoAvailableKeysError, yield_random_entries


def test_merge_blocks_operations():
  candidates = CompletedKeys(10)
  assert candidates.num_remaining == 10
  candidates.mark_completed(1, 5)

  assert candidates.is_available(5)
  assert candidates.is_available(0)
  assert not candidates.is_available(1)
  assert not candidates.is_available(4)
  assert not candidates.is_available(11)
  assert not candidates.is_available(10)
  assert len(candidates._slabs) == 1
  assert candidates.num_remaining == 6

  candidates.mark_completed(5, 6)
  assert not candidates.is_available(5)
  assert candidates.is_available(6)
  assert len(candidates._slabs) == 1
  assert candidates.num_remaining == 5

  candidates.mark_completed(3, 8)
  assert candidates.is_available(9)
  assert candidates.is_available(8)
  assert not candidates.is_available(7)
  assert len(candidates._slabs) == 1
  assert candidates.num_remaining == 3


def test_adjust_max():
  candidates = CompletedKeys(10)
  assert candidates.num_remaining == 10
  assert len(candidates._slabs) == 0

  assert candidates.is_available(9)
  candidates.mark_completed(5, 12)
  assert len(candidates._slabs) == 0
  assert candidates.num_remaining == 5

  assert not candidates.is_available(9)
  assert candidates.is_available(4)


def test_adjust_min():
  candidates = CompletedKeys(10)
  assert candidates.num_remaining == 10
  assert len(candidates._slabs) == 0

  assert candidates.is_available(2)
  candidates.mark_completed(0, 3)
  assert len(candidates._slabs) == 0
  assert candidates.num_remaining == 7

  assert not candidates.is_available(2)
  assert candidates.is_available(4)


def test_inside_block():
  candidates = CompletedKeys(10)
  assert candidates.num_remaining == 10
  candidates.mark_completed(1, 8)
  assert len(candidates._slabs) == 1
  assert candidates.num_remaining == 3

  candidates.mark_completed(2, 5)
  assert len(candidates._slabs) == 1
  assert candidates.num_remaining == 3
  assert not candidates.is_available(1)
  assert not candidates.is_available(5)


def test_wrap_block():
  candidates = CompletedKeys(10)
  assert candidates.num_remaining == 10
  candidates.mark_completed(2, 5)
  assert len(candidates._slabs) == 1
  assert candidates.num_remaining == 7

  candidates.mark_completed(1, 8)
  assert len(candidates._slabs) == 1
  assert candidates.num_remaining == 3
  assert not candidates.is_available(1)
  assert not candidates.is_available(5)


def test_non_contiguous():
  candidates = CompletedKeys(10)
  assert candidates.num_remaining == 10

  candidates.mark_completed(1, 5)
  assert len(candidates._slabs) == 1
  assert candidates.num_remaining == 6
  assert candidates.is_available(5)
  assert candidates.is_available(6)

  candidates.mark_completed(6, 8)
  assert len(candidates._slabs) == 2
  assert candidates.num_remaining == 4
  assert candidates.is_available(5)
  assert not candidates.is_available(6)


def test_big_merge():
  candidates = CompletedKeys(10)
  assert candidates.num_remaining == 10

  candidates.mark_completed(1, 5)
  assert len(candidates._slabs) == 1
  assert candidates.num_remaining == 6

  candidates.mark_completed(6, 8)
  assert len(candidates._slabs) == 2
  assert candidates.num_remaining == 4

  candidates.mark_completed(5, 6)
  assert len(candidates._slabs) == 1
  assert candidates.num_remaining == 3


def test_range_limits():
  candidates = CompletedKeys(10)
  assert not candidates.is_available(-1)
  assert not candidates.is_available(10)

  assert candidates.is_available(9)
  assert candidates.is_available(0)


def test_random_saturation():
  candidates = CompletedKeys(100)
  with pytest.raises(NoAvailableKeysError):
    for _ in range(101):
      start = candidates.get_block_start_index(10)
      assert candidates.is_available(start)
      candidates.mark_completed(start, start + 10)

  assert candidates.num_remaining == 0


def test_huge_dataset():
  candidates = CompletedKeys(1024 * 1024)
  start_time = datetime.now()
  iterations = 0
  with pytest.raises(NoAvailableKeysError):
    while (datetime.now() - start_time) < timedelta(seconds=10):
      start = candidates.get_block_start_index(1024)
      assert candidates.is_available(start)
      candidates.mark_completed(start, start + random.randint(512, 1024))
      iterations += 1

  assert iterations > 1024
  assert candidates.num_remaining == 0


class FakeQuery(object):
  def __init__(self, result_list):
    self._result_list = result_list

  def limit(self, *args, **kwargs):
    return self

  def where(self, *args, **kwargs):
    return self

  def order_by(self, *args, **kwargs):
    return self

  def __iter__(self):
    return self._result_list.__iter__()


FAKE_PK_FIELD = 10  # Must be able to compare to integers


def test_no_work():
  def create_empty_query():
    return FakeQuery([])

  for _ in yield_random_entries(create_empty_query, FAKE_PK_FIELD, 1, 10):
    assert False, 'There should never be any actual work!'

43  util/migrate/test/test_cleanup_old_robots.py  Normal file

@@ -0,0 +1,43 @@
import pytest

from data.database import User
from util.migrate.cleanup_old_robots import cleanup_old_robots

from test.fixtures import *


def test_cleanup_old_robots(initialized_db):
  before_robot_count = User.select().where(User.robot == True).count()
  before_user_count = User.select().count()

  # Run the cleanup once, and ensure it does nothing.
  cleanup_old_robots(force=True)

  after_robot_count = User.select().where(User.robot == True).count()
  after_user_count = User.select().count()

  assert before_robot_count == after_robot_count
  assert before_user_count == after_user_count

  # Create some orphan robots.
  created = set()
  for index in range(0, 50):
    created.add('doesnotexist+a%s' % index)
    created.add('anothernamespace+b%s' % index)

    User.create(username='doesnotexist+a%s' % index, robot=True)
    User.create(username='anothernamespace+b%s' % index, robot=True)

  before_robot_count = User.select().where(User.robot == True).count()
  before_user_count = User.select().count()

  cleanup_old_robots(page_size=10, force=True)

  after_robot_count = User.select().where(User.robot == True).count()
  after_user_count = User.select().count()

  assert before_robot_count == after_robot_count + len(created)
  assert before_user_count == after_user_count + len(created)

  for name in created:
    with pytest.raises(User.DoesNotExist):
      User.get(username=name)