initial import for Open Source 🎉

Jimmy Zelinskie 2019-11-12 11:09:47 -05:00
parent 1898c361f3
commit 9c0dd3b722
2048 changed files with 218743 additions and 0 deletions

util/migrate/__init__.py Normal file

@@ -0,0 +1,38 @@
import logging
from sqlalchemy.types import TypeDecorator, Text, String
from sqlalchemy.dialects.mysql import TEXT as MySQLText, LONGTEXT, VARCHAR as MySQLString
logger = logging.getLogger(__name__)
class UTF8LongText(TypeDecorator):
""" Platform-independent UTF-8 LONGTEXT type.
Uses MySQL's LongText with charset utf8mb4, otherwise uses TEXT, because
other engines default to UTF-8 and have longer TEXT fields.
"""
impl = Text
def load_dialect_impl(self, dialect):
if dialect.name == 'mysql':
return dialect.type_descriptor(LONGTEXT(charset='utf8mb4', collation='utf8mb4_unicode_ci'))
else:
return dialect.type_descriptor(Text())
class UTF8CharField(TypeDecorator):
""" Platform-independent UTF-8 Char type.
Uses MySQL's VARCHAR with charset utf8mb4, otherwise uses String, because
other engines default to UTF-8.
"""
impl = String
def load_dialect_impl(self, dialect):
if dialect.name == 'mysql':
return dialect.type_descriptor(MySQLString(charset='utf8mb4', collation='utf8mb4_unicode_ci',
length=self.impl.length))
else:
return dialect.type_descriptor(String(length=self.impl.length))
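
A minimal usage sketch, not part of this commit, showing how these decorators might be attached to a SQLAlchemy model; the Manifest table and its column names are hypothetical.

from sqlalchemy import Column, Integer
from sqlalchemy.ext.declarative import declarative_base

from util.migrate import UTF8CharField, UTF8LongText

Base = declarative_base()

class Manifest(Base):
  __tablename__ = 'manifest'

  id = Column(Integer, primary_key=True)
  # Renders as VARCHAR(255) with charset utf8mb4 on MySQL, plain VARCHAR(255) elsewhere.
  digest = Column(UTF8CharField(255))
  # Renders as LONGTEXT with charset utf8mb4 on MySQL, plain TEXT elsewhere.
  manifest_bytes = Column(UTF8LongText)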

util/migrate/allocator.py Normal file

@@ -0,0 +1,175 @@
import logging
import random
from bintrees import RBTree
from threading import Event
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class NoAvailableKeysError(ValueError):
pass
class CompletedKeys(object):
def __init__(self, max_index, min_index=0):
self._max_index = max_index
self._min_index = min_index
self.num_remaining = max_index - min_index
    self._slabs = RBTree()  # maps each completed slab's start index to its run length
def _get_previous_or_none(self, index):
try:
return self._slabs.floor_item(index)
except KeyError:
return None
  def is_available(self, index):
    """ Returns True if the given index is within range and not yet marked completed. """
logger.debug('Testing index %s', index)
if index >= self._max_index or index < self._min_index:
logger.debug('Index out of range')
return False
try:
prev_start, prev_length = self._slabs.floor_item(index)
logger.debug('Prev range: %s-%s', prev_start, prev_start + prev_length)
return (prev_start + prev_length) <= index
except KeyError:
return True
  def mark_completed(self, start_index, past_last_index):
    """ Marks the half-open key range [start_index, past_last_index) as completed,
        merging it with any overlapping or adjacent completed slabs.
    """
logger.debug('Marking the range completed: %s-%s', start_index, past_last_index)
num_completed = min(past_last_index, self._max_index) - max(start_index, self._min_index)
# Find the item directly before this and see if there is overlap
to_discard = set()
try:
prev_start, prev_length = self._slabs.floor_item(start_index)
max_prev_completed = prev_start + prev_length
if max_prev_completed >= start_index:
# we are going to merge with the range before us
logger.debug('Merging with the prev range: %s-%s', prev_start, prev_start + prev_length)
to_discard.add(prev_start)
num_completed = max(num_completed - (max_prev_completed - start_index), 0)
start_index = prev_start
past_last_index = max(past_last_index, prev_start + prev_length)
except KeyError:
pass
# Find all keys between the start and last index and merge them into one block
for merge_start, merge_length in self._slabs.iter_items(start_index, past_last_index + 1):
if merge_start in to_discard:
logger.debug('Already merged with block %s-%s', merge_start, merge_start + merge_length)
continue
candidate_next_index = merge_start + merge_length
logger.debug('Merging with block %s-%s', merge_start, candidate_next_index)
num_completed -= merge_length - max(candidate_next_index - past_last_index, 0)
to_discard.add(merge_start)
past_last_index = max(past_last_index, candidate_next_index)
# write the new block which is fully merged
discard = False
if past_last_index >= self._max_index:
logger.debug('Discarding block and setting new max to: %s', start_index)
self._max_index = start_index
discard = True
if start_index <= self._min_index:
logger.debug('Discarding block and setting new min to: %s', past_last_index)
self._min_index = past_last_index
discard = True
if to_discard:
logger.debug('Discarding %s obsolete blocks', len(to_discard))
self._slabs.remove_items(to_discard)
if not discard:
logger.debug('Writing new block with range: %s-%s', start_index, past_last_index)
self._slabs.insert(start_index, past_last_index - start_index)
# Update the number of remaining items with the adjustments we've made
assert num_completed >= 0
self.num_remaining -= num_completed
logger.debug('Total blocks: %s', len(self._slabs))
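
  # Worked example (mirrors test_merge_blocks_operations below): with CompletedKeys(10),
  # mark_completed(1, 5) stores the slab {1: length 4}; mark_completed(5, 6) merges it
  # into {1: length 5}; mark_completed(3, 8) then wraps and extends that slab to
  # {1: length 7}, reducing num_remaining only by the two newly covered keys (6 and 7).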
  def get_block_start_index(self, block_size_estimate):
    """ Picks a random hole between completed slabs and returns a random start index
        within it, leaving room for roughly block_size_estimate keys when possible.
    """
logger.debug('Total range: %s-%s', self._min_index, self._max_index)
if self._max_index <= self._min_index:
raise NoAvailableKeysError('All indexes have been marked completed')
num_holes = len(self._slabs) + 1
random_hole = random.randint(0, num_holes - 1)
logger.debug('Selected random hole %s with %s total holes', random_hole, num_holes)
hole_start = self._min_index
past_hole_end = self._max_index
# Now that we have picked a hole, we need to define the bounds
if random_hole > 0:
# There will be a slab before this hole, find where it ends
bound_entries = self._slabs.nsmallest(random_hole + 1)[-2:]
left_index, left_len = bound_entries[0]
logger.debug('Left range %s-%s', left_index, left_index + left_len)
hole_start = left_index + left_len
if len(bound_entries) > 1:
        right_index, right_len = bound_entries[1]
        logger.debug('Right range %s-%s', right_index, right_index + right_len)
        past_hole_end = right_index
elif not self._slabs.is_empty():
      right_index, right_len = self._slabs.nsmallest(1)[0]
      logger.debug('Right range %s-%s', right_index, right_index + right_len)
      past_hole_end = right_index
# Now that we have our hole bounds, select a random block from [0:len - block_size_estimate]
logger.debug('Selecting from hole range: %s-%s', hole_start, past_hole_end)
rand_max_bound = max(hole_start, past_hole_end - block_size_estimate)
logger.debug('Rand max bound: %s', rand_max_bound)
return random.randint(hole_start, rand_max_bound)
def yield_random_entries(batch_query, primary_key_field, batch_size, max_id, min_id=0):
""" This method will yield items from random blocks in the database. We will track metadata
about which keys are available for work, and we will complete the backfill when there is no
more work to be done. The method yields tuples of (candidate, Event), and if the work was
already done by another worker, the caller should set the event. Batch candidates must have
an "id" field which can be inspected.
"""
min_id = max(min_id, 0)
max_id = max(max_id, 1)
allocator = CompletedKeys(max_id + 1, min_id)
try:
while True:
start_index = allocator.get_block_start_index(batch_size)
end_index = min(start_index + batch_size, max_id + 1)
all_candidates = list(batch_query()
.where(primary_key_field >= start_index,
primary_key_field < end_index)
.order_by(primary_key_field))
if len(all_candidates) == 0:
logger.info('No candidates, marking entire block completed %s-%s', start_index, end_index)
allocator.mark_completed(start_index, end_index)
continue
logger.info('Found %s candidates, processing block', len(all_candidates))
      batch_completed = 0
      completed_through = start_index  # guard: stays defined even if the first candidate aborts
for candidate in all_candidates:
abort_early = Event()
yield candidate, abort_early, allocator.num_remaining - batch_completed
batch_completed += 1
if abort_early.is_set():
logger.info('Overlap with another worker, aborting')
break
completed_through = candidate.id + 1
logger.info('Marking id range as completed: %s-%s', start_index, completed_through)
allocator.mark_completed(start_index, completed_through)
except NoAvailableKeysError:
logger.info('No more work')
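
A minimal usage sketch, assuming a peewee model Repository with an integer id primary key; already_backfilled and backfill_one are hypothetical stand-ins for real worker logic.

from peewee import fn

from data.database import Repository
from util.migrate.allocator import yield_random_entries

max_id = Repository.select(fn.Max(Repository.id)).scalar() or 0

for repo, abort_early, num_remaining in yield_random_entries(Repository.select, Repository.id,
                                                             1000, max_id):
  if already_backfilled(repo):  # another worker beat us to this row
    abort_early.set()           # tells the generator to skip the rest of this block
    continue

  backfill_one(repo)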

util/migrate/cleanup_old_robots.py Normal file

@@ -0,0 +1,54 @@
import logging
from app import app
from data.database import User
from util.names import parse_robot_username
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def cleanup_old_robots(page_size=50, force=False):
""" Deletes any robots that live under namespaces that no longer exist. """
if not force and not app.config.get('SETUP_COMPLETE', False):
return
# Collect the robot accounts to delete.
page_number = 1
to_delete = []
encountered_namespaces = {}
while True:
found_bots = False
for robot in list(User.select().where(User.robot == True).paginate(page_number, page_size)):
found_bots = True
logger.info("Checking robot %s (page %s)", robot.username, page_number)
parsed = parse_robot_username(robot.username)
if parsed is None:
continue
namespace, _ = parsed
if namespace in encountered_namespaces:
if not encountered_namespaces[namespace]:
logger.info('Marking %s to be deleted', robot.username)
to_delete.append(robot)
else:
try:
User.get(username=namespace)
encountered_namespaces[namespace] = True
except User.DoesNotExist:
# Save the robot account for deletion.
logger.info('Marking %s to be deleted', robot.username)
to_delete.append(robot)
encountered_namespaces[namespace] = False
if not found_bots:
break
page_number = page_number + 1
# Cleanup any robot accounts whose corresponding namespace doesn't exist.
logger.info('Found %s robots to delete', len(to_delete))
for index, robot in enumerate(to_delete):
logger.info('Deleting robot %s of %s (%s)', index, len(to_delete), robot.username)
robot.delete_instance(recursive=True, delete_nullable=True)
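
For reference, robot usernames take the form namespace+shortname (the test below creates names like doesnotexist+a1). A small sketch of the parse_robot_username contract as this code assumes it; the exact behavior lives in util.names.

from util.names import parse_robot_username

# Assumed behavior, inferred from the calling code above:
assert parse_robot_username('doesnotexist+a1') == ('doesnotexist', 'a1')
assert parse_robot_username('plainuser') is None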


@@ -0,0 +1,48 @@
import logging
import time
from datetime import datetime, timedelta
from data.database import RepositoryBuild, AccessToken
from app import app
logger = logging.getLogger(__name__)
BATCH_SIZE = 1000
def delete_temporary_access_tokens(older_than):
  """ Deletes temporary access tokens created before the given datetime, in batches,
      skipping any token that is still referenced by a repository build.
  """
  # Find the highest ID up to which we should delete
  try:
    up_to_id = (AccessToken
                .select(AccessToken.id)
                .where(AccessToken.created < older_than)
                .limit(1)
                .order_by(AccessToken.id.desc())
                .get().id)
  except AccessToken.DoesNotExist:
    logger.debug('No temporary access tokens older than %s', older_than)
    return
logger.debug('Deleting temporary access tokens with ids lower than: %s', up_to_id)
access_tokens_in_builds = (RepositoryBuild.select(RepositoryBuild.access_token).distinct())
while up_to_id > 0:
starting_at_id = max(up_to_id - BATCH_SIZE, 0)
logger.debug('Deleting tokens with ids between %s and %s', starting_at_id, up_to_id)
start_time = datetime.utcnow()
(AccessToken
.delete()
.where(AccessToken.id >= starting_at_id,
AccessToken.id < up_to_id,
AccessToken.temporary == True,
~(AccessToken.id << access_tokens_in_builds))
.execute())
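
    # Self-throttle: sleep for as long as the delete itself took, bounding the
    # extra load this cleanup places on the database.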
time_to_delete = datetime.utcnow() - start_time
up_to_id -= BATCH_SIZE
logger.debug('Sleeping for %s seconds', time_to_delete.total_seconds())
time.sleep(time_to_delete.total_seconds())
if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
delete_temporary_access_tokens(datetime.utcnow() - timedelta(days=2))

util/migrate/table_ops.py Normal file

@@ -0,0 +1,13 @@
def copy_table_contents(source_table, destination_table, conn):
  """ Copies every row from source_table into destination_table, then advances the
      destination's id sequence (PostgreSQL) or AUTO_INCREMENT counter (MySQL) past
      the highest copied id so that subsequent inserts do not collide.
  """
  if conn.engine.name == 'postgresql':
    conn.execute('INSERT INTO "%s" SELECT * FROM "%s"' % (destination_table, source_table))
    result = list(conn.execute('SELECT MAX(id) FROM "%s"' % destination_table))[0]
    if result[0] is not None:
      new_start_id = result[0] + 1
      conn.execute('ALTER SEQUENCE "%s_id_seq" RESTART WITH %s' % (destination_table, new_start_id))
  else:
    conn.execute("INSERT INTO `%s` SELECT * FROM `%s` WHERE 1" % (destination_table, source_table))
    result = list(conn.execute('SELECT MAX(id) FROM `%s`' % destination_table))[0]
    if result[0] is not None:
      new_start_id = result[0] + 1
      conn.execute("ALTER TABLE `%s` AUTO_INCREMENT = %s" % (destination_table, new_start_id))


@@ -0,0 +1,176 @@
import random
import pytest
from datetime import datetime, timedelta
from util.migrate.allocator import CompletedKeys, NoAvailableKeysError, yield_random_entries
def test_merge_blocks_operations():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
candidates.mark_completed(1, 5)
assert candidates.is_available(5)
assert candidates.is_available(0)
assert not candidates.is_available(1)
assert not candidates.is_available(4)
assert not candidates.is_available(11)
assert not candidates.is_available(10)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 6
candidates.mark_completed(5, 6)
assert not candidates.is_available(5)
assert candidates.is_available(6)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 5
candidates.mark_completed(3, 8)
assert candidates.is_available(9)
assert candidates.is_available(8)
assert not candidates.is_available(7)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 3
def test_adjust_max():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
assert len(candidates._slabs) == 0
assert candidates.is_available(9)
candidates.mark_completed(5, 12)
assert len(candidates._slabs) == 0
assert candidates.num_remaining == 5
assert not candidates.is_available(9)
assert candidates.is_available(4)
def test_adjust_min():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
assert len(candidates._slabs) == 0
assert candidates.is_available(2)
candidates.mark_completed(0, 3)
assert len(candidates._slabs) == 0
assert candidates.num_remaining == 7
assert not candidates.is_available(2)
assert candidates.is_available(4)
def test_inside_block():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
candidates.mark_completed(1, 8)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 3
candidates.mark_completed(2, 5)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 3
assert not candidates.is_available(1)
assert not candidates.is_available(5)
def test_wrap_block():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
candidates.mark_completed(2, 5)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 7
candidates.mark_completed(1, 8)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 3
assert not candidates.is_available(1)
assert not candidates.is_available(5)
def test_non_contiguous():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
candidates.mark_completed(1, 5)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 6
assert candidates.is_available(5)
assert candidates.is_available(6)
candidates.mark_completed(6, 8)
assert len(candidates._slabs) == 2
assert candidates.num_remaining == 4
assert candidates.is_available(5)
assert not candidates.is_available(6)
def test_big_merge():
candidates = CompletedKeys(10)
assert candidates.num_remaining == 10
candidates.mark_completed(1, 5)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 6
candidates.mark_completed(6, 8)
assert len(candidates._slabs) == 2
assert candidates.num_remaining == 4
candidates.mark_completed(5, 6)
assert len(candidates._slabs) == 1
assert candidates.num_remaining == 3
def test_range_limits():
candidates = CompletedKeys(10)
assert not candidates.is_available(-1)
assert not candidates.is_available(10)
assert candidates.is_available(9)
assert candidates.is_available(0)
def test_random_saturation():
candidates = CompletedKeys(100)
with pytest.raises(NoAvailableKeysError):
for _ in range(101):
start = candidates.get_block_start_index(10)
assert candidates.is_available(start)
candidates.mark_completed(start, start + 10)
assert candidates.num_remaining == 0
def test_huge_dataset():
candidates = CompletedKeys(1024 * 1024)
start_time = datetime.now()
iterations = 0
with pytest.raises(NoAvailableKeysError):
while (datetime.now() - start_time) < timedelta(seconds=10):
start = candidates.get_block_start_index(1024)
assert candidates.is_available(start)
candidates.mark_completed(start, start + random.randint(512, 1024))
iterations += 1
assert iterations > 1024
assert candidates.num_remaining == 0
class FakeQuery(object):
def __init__(self, result_list):
self._result_list = result_list
def limit(self, *args, **kwargs):
return self
def where(self, *args, **kwargs):
return self
def order_by(self, *args, **kwargs):
return self
def __iter__(self):
return self._result_list.__iter__()
FAKE_PK_FIELD = 10 # Must be able to compare to integers
def test_no_work():
def create_empty_query():
return FakeQuery([])
for _ in yield_random_entries(create_empty_query, FAKE_PK_FIELD, 1, 10):
assert False, 'There should never be any actual work!'


@@ -0,0 +1,43 @@
import pytest
from data.database import User
from util.migrate.cleanup_old_robots import cleanup_old_robots
from test.fixtures import *
def test_cleanup_old_robots(initialized_db):
before_robot_count = User.select().where(User.robot == True).count()
before_user_count = User.select().count()
# Run the cleanup once, and ensure it does nothing.
cleanup_old_robots(force=True)
after_robot_count = User.select().where(User.robot == True).count()
after_user_count = User.select().count()
assert before_robot_count == after_robot_count
assert before_user_count == after_user_count
# Create some orphan robots.
created = set()
for index in range(0, 50):
created.add('doesnotexist+a%s' % index)
created.add('anothernamespace+b%s' % index)
User.create(username='doesnotexist+a%s' % index, robot=True)
User.create(username='anothernamespace+b%s' % index, robot=True)
before_robot_count = User.select().where(User.robot == True).count()
before_user_count = User.select().count()
cleanup_old_robots(page_size=10, force=True)
after_robot_count = User.select().where(User.robot == True).count()
after_user_count = User.select().count()
assert before_robot_count == after_robot_count + len(created)
assert before_user_count == after_user_count + len(created)
for name in created:
with pytest.raises(User.DoesNotExist):
User.get(username=name)