Merge remote-tracking branch 'upstream/phase4-11-07-2015' into python-registry-v2

This commit is contained in:
Jake Moshenko 2015-11-06 18:18:29 -05:00
commit c2fcf8bead
177 changed files with 4354 additions and 1462 deletions

View file

@ -0,0 +1,76 @@
import logging
from sqlalchemy.types import TypeDecorator, Text
from sqlalchemy.dialects.mysql import TEXT as MySQLText, LONGTEXT
from random import shuffle
logger = logging.getLogger(__name__)


class UTF8LongText(TypeDecorator):
    """Platform-independent UTF-8 LONGTEXT column type.

    On MySQL this becomes LONGTEXT with the utf8mb4 charset (full Unicode,
    including 4-byte code points). Every other engine falls back to the
    generic TEXT type, since those default to UTF-8 and allow longer TEXT
    columns already.
    """
    impl = Text

    def load_dialect_impl(self, dialect):
        # Only MySQL needs the explicit charset/collation override.
        if dialect.name != 'mysql':
            return dialect.type_descriptor(Text())
        mysql_long_text = LONGTEXT(charset='utf8mb4', collation='utf8mb4_unicode_ci')
        return dialect.type_descriptor(mysql_long_text)
def _chance_duplication(pop_size, samples):
""" The chance of randomly selecting a duplicate when you choose the specified number of samples
from the specified population size.
"""
pairs = (samples * (samples - 1)) / 2.0
unique = (pop_size - 1.0)/pop_size
all_unique = pow(unique, pairs)
return 1 - all_unique
def _num_checks(pop_size, desired):
""" Binary search for the proper number of entries to use to get the specified collision
probability.
"""
s_max = pop_size
s_min = 0
last_test = -1
s_test = s_max
while s_max > s_min and last_test != s_test:
last_test = s_test
s_test = (s_max + s_min)/2
chance = _chance_duplication(pop_size, s_test)
if chance > desired:
s_max = s_test - 1
else:
s_min = s_test
return s_test
def yield_random_entries(batch_query, batch_size, collision_chance):
    """ This method will yield semi-random items from a query in a database friendly way until no
        more items match the base query modifier. It will pull batches of batch_size from the query
        and yield enough items from each batch so that concurrent workers have a reduced chance of
        selecting the same items. For example, if your batches return 10,000 entries, and you desire
        only a .03 collision_chance, we will only use 25 random entries before going back to the db
        for a new batch.
    """
    # Sentinel value guarantees the loop body runs at least once; it is
    # overwritten by the first real query before anything is yielded.
    current_batch = [1]
    while current_batch:
        current_batch = list(batch_query().limit(batch_size))
        shuffle(current_batch)

        # Cap how much of the batch we consume so that concurrent workers
        # (shuffling independently) rarely pick the same rows.
        sample_count = max(1, _num_checks(len(current_batch), collision_chance))
        logger.debug('Found %s/%s matching entries, processing %s', len(current_batch), batch_size,
                     sample_count)

        for entry in current_batch[:sample_count]:
            yield entry

View file

@ -0,0 +1,108 @@
import logging
from peewee import JOIN_LEFT_OUTER
from peewee import (CharField, BigIntegerField, BooleanField, ForeignKeyField, DateTimeField,
TextField)
from data.database import BaseModel, db, db_for_update, CloseForLongOperation
from app import app, storage
from digest import checksums
from util.migrate import yield_random_entries
logger = logging.getLogger(__name__)


# Stand-in for the real Repository table; only its primary key is needed to
# satisfy the foreign key on Image below.
class Repository(BaseModel):
    pass


# Vendor the information from tables we will be writing to at the time of this migration
class ImageStorage(BaseModel):
    # Identifier used below to build the layer path handed to storage.
    uuid = CharField(index=True, unique=True)
    checksum = CharField(null=True)
    image_size = BigIntegerField(null=True)
    uncompressed_size = BigIntegerField(null=True)
    uploading = BooleanField(default=True, null=True)
    cas_path = BooleanField(default=True)
    # Target column of this backfill: 'sha256:...' of the layer data (or
    # 'unknown:...' when hashing failed).
    content_checksum = CharField(null=True, index=True)


class Image(BaseModel):
    docker_image_id = CharField(index=True)
    repository = ForeignKeyField(Repository)
    # '/'-separated list of ancestor ids, defaulting to the root marker '/'.
    ancestors = CharField(index=True, default='/', max_length=64535, null=True)
    storage = ForeignKeyField(ImageStorage, index=True, null=True)
    created = DateTimeField(null=True)
    comment = TextField(null=True)
    command = TextField(null=True)
    aggregate_size = BigIntegerField(null=True)
    v1_json_metadata = TextField(null=True)
    v1_checksum = CharField(null=True)


# Named storage location; placements below map each storage to its locations.
class ImageStorageLocation(BaseModel):
    name = CharField(unique=True, index=True)


class ImageStoragePlacement(BaseModel):
    storage = ForeignKeyField(ImageStorage)
    location = ForeignKeyField(ImageStorageLocation)
def _get_image_storage_locations(storage_id):
    """Return the set of location names in which the given storage is placed."""
    placements = (ImageStoragePlacement
                  .select(ImageStoragePlacement, ImageStorageLocation)
                  .join(ImageStorageLocation)
                  .switch(ImageStoragePlacement)
                  .join(ImageStorage, JOIN_LEFT_OUTER)
                  .where(ImageStorage.id == storage_id))
    return {placement.location.name for placement in placements}
def backfill_content_checksums():
    """ Copies metadata from image storages to their images. """
    # (More precisely: fills ImageStorage.content_checksum by streaming and
    # hashing each storage's layer blob.)
    logger.debug('Image content checksum backfill: Began execution')

    def batch_query():
        # Storages that still lack a content checksum and are fully uploaded.
        return (ImageStorage
                .select(ImageStorage.id, ImageStorage.uuid)
                .where(ImageStorage.content_checksum >> None, ImageStorage.uploading == False))

    for candidate_storage in yield_random_entries(batch_query, 10000, 0.1):
        logger.debug('Computing content checksum for storage: %s', candidate_storage.uuid)
        locations = _get_image_storage_locations(candidate_storage.id)

        checksum = None
        # Give up the DB connection while streaming/hashing the blob, which can
        # take a long time for large layers.
        with CloseForLongOperation(app.config):
            try:
                # Compute the checksum
                layer_path = storage.image_layer_path(candidate_storage.uuid)
                with storage.stream_read_file(locations, layer_path) as layer_data_handle:
                    checksum = 'sha256:{0}'.format(checksums.sha256_file(layer_data_handle))
            except Exception as exc:
                # Best-effort: record the failure kind instead of aborting the
                # whole backfill on one bad/missing blob.
                logger.warning('Unable to compute checksum for storage: %s', candidate_storage.uuid)
                checksum = 'unknown:{0}'.format(exc.__class__.__name__)

        # Now update the ImageStorage with the checksum
        with app.config['DB_TRANSACTION_FACTORY'](db):
            # Re-read inside the transaction (db_for_update presumably row-locks;
            # verify against data.database) so a concurrent worker's write is seen.
            to_update = db_for_update(ImageStorage.get(ImageStorage.id == candidate_storage.id))
            if to_update.content_checksum is not None:
                logger.info('Another worker filled in the checksum: %s', candidate_storage.uuid)
            else:
                logger.debug('Setting content checksum to %s for %s', checksum, candidate_storage.uuid)
                to_update.content_checksum = checksum
                to_update.save()
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # logging.getLogger('peewee').setLevel(logging.CRITICAL)
    # NOTE(review): sibling backfill scripts silence peewee's per-query logging
    # with the line above; here it is commented out — confirm this is intentional.
    backfill_content_checksums()

View file

@ -0,0 +1,41 @@
import logging
from data.database import Image, ImageStorage, db, db_for_update
from app import app
from util.migrate import yield_random_entries
logger = logging.getLogger(__name__)


def backfill_parent_id():
    """ Fills in Image.parent from the penultimate segment of each image's
        ancestors string (e.g. '/1/2/6/' yields parent id '6').
    """
    logger.setLevel(logging.DEBUG)

    logger.debug('backfill_parent_id: Starting')
    logger.debug('backfill_parent_id: This can be a LONG RUNNING OPERATION. Please wait!')

    def fetch_batch():
        # Images with no parent set, a non-root ancestry, and a finished upload.
        return (Image
                .select(Image.id, Image.ancestors)
                .join(ImageStorage)
                .where(Image.parent >> None, Image.ancestors != '/',
                       ImageStorage.uploading == False))

    for to_backfill in yield_random_entries(fetch_batch, 10000, 0.3):
        with app.config['DB_TRANSACTION_FACTORY'](db):
            try:
                # Re-select the row inside the transaction (db_for_update
                # presumably locks it — verify) so concurrent workers
                # don't clobber each other.
                image = db_for_update(Image
                                      .select()
                                      .where(Image.id == to_backfill.id)).get()
                # ancestors is '/'-delimited with a trailing '/', so index -2
                # is the immediate parent's id.
                image.parent = to_backfill.ancestors.split('/')[-2]
                image.save()
            except Image.DoesNotExist:
                # Row was deleted since the batch was fetched; skip it.
                pass

    logger.debug('backfill_parent_id: Completed')
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # Silence peewee's per-query DEBUG logging for this long-running backfill.
    logging.getLogger('peewee').setLevel(logging.CRITICAL)
    backfill_parent_id()

View file

@ -0,0 +1,70 @@
import logging
from peewee import (CharField, BigIntegerField, BooleanField, ForeignKeyField, DateTimeField,
TextField)
from data.database import BaseModel, db, db_for_update
from util.migrate import yield_random_entries
from app import app
logger = logging.getLogger(__name__)


# Stand-in for the real Repository table; only its primary key is needed to
# satisfy the foreign key on Image below.
class Repository(BaseModel):
    pass


# Vendor the information from tables we will be writing to at the time of this migration
class ImageStorage(BaseModel):
    uuid = CharField(index=True, unique=True)
    # Source column of this backfill: copied onto Image.v1_checksum.
    checksum = CharField(null=True)
    image_size = BigIntegerField(null=True)
    uncompressed_size = BigIntegerField(null=True)
    uploading = BooleanField(default=True, null=True)
    cas_path = BooleanField(default=True)
    content_checksum = CharField(null=True, index=True)


class Image(BaseModel):
    docker_image_id = CharField(index=True)
    repository = ForeignKeyField(Repository)
    # '/'-separated list of ancestor ids, defaulting to the root marker '/'.
    ancestors = CharField(index=True, default='/', max_length=64535, null=True)
    storage = ForeignKeyField(ImageStorage, index=True, null=True)
    created = DateTimeField(null=True)
    comment = TextField(null=True)
    command = TextField(null=True)
    aggregate_size = BigIntegerField(null=True)
    v1_json_metadata = TextField(null=True)
    # Target column of this backfill.
    v1_checksum = CharField(null=True)
def backfill_checksums():
    """ Copies checksums from image storages to their images. """
    logger.debug('Image v1 checksum backfill: Began execution')

    def batch_query():
        # Images with no v1 checksum whose storage has finished uploading and
        # actually has a checksum to copy.
        return (Image
                .select(Image.id)
                .join(ImageStorage)
                .where(Image.v1_checksum >> None, ImageStorage.uploading == False,
                       ~(ImageStorage.checksum >> None)))

    for candidate_image in yield_random_entries(batch_query, 10000, 0.1):
        # Fixed log message: the previous text ('Computing content checksum for
        # storage: %s') was copied from the content-checksum backfill and was
        # wrong here — this loop copies a checksum onto an image, and logs an
        # image id, not a storage uuid.
        logger.debug('Copying v1 checksum for image: %s', candidate_image.id)

        with app.config['DB_TRANSACTION_FACTORY'](db):
            try:
                # Re-select with the storage joined, inside the transaction
                # (db_for_update presumably row-locks — verify), so the copy is
                # atomic per image.
                image = db_for_update(Image
                                      .select(Image, ImageStorage)
                                      .join(ImageStorage)
                                      .where(Image.id == candidate_image.id)).get()
                image.v1_checksum = image.storage.checksum
                image.save()
            except Image.DoesNotExist:
                # Image was deleted since the batch was fetched; skip it.
                pass
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # Silence peewee's per-query DEBUG logging for this long-running backfill.
    logging.getLogger('peewee').setLevel(logging.CRITICAL)
    backfill_checksums()

View file

@ -2,8 +2,10 @@ import logging
from peewee import JOIN_LEFT_OUTER
from data.database import (Image, ImageStorage, ImageStoragePlacement, ImageStorageLocation, db,
db_for_update)
from peewee import (CharField, BigIntegerField, BooleanField, ForeignKeyField, DateTimeField,
TextField)
from data.database import BaseModel, db, db_for_update
from app import app, storage
from data import model
@ -11,6 +13,48 @@ from data import model
logger = logging.getLogger(__name__)


# Stand-in for the real Repository table; only its primary key is needed to
# satisfy the foreign key on Image below.
class Repository(BaseModel):
    pass


# Vendor the information from tables we will be writing to at the time of this migration
class ImageStorage(BaseModel):
    uuid = CharField(index=True, unique=True)
    checksum = CharField(null=True)
    image_size = BigIntegerField(null=True)
    uncompressed_size = BigIntegerField(null=True)
    uploading = BooleanField(default=True, null=True)


class Image(BaseModel):
    # This class is intentionally denormalized. Even though images are supposed
    # to be globally unique we can't treat them as such for permissions and
    # security reasons. So rather than Repository <-> Image being many to many
    # each image now belongs to exactly one repository.
    docker_image_id = CharField(index=True)
    repository = ForeignKeyField(Repository)

    # '/' separated list of ancestry ids, e.g. /1/2/6/7/10/
    ancestors = CharField(index=True, default='/', max_length=64535, null=True)
    storage = ForeignKeyField(ImageStorage, index=True, null=True)
    created = DateTimeField(null=True)
    comment = TextField(null=True)
    command = TextField(null=True)
    aggregate_size = BigIntegerField(null=True)
    v1_json_metadata = TextField(null=True)


# Named storage location; placements below map each storage to its locations.
class ImageStorageLocation(BaseModel):
    name = CharField(unique=True, index=True)


class ImageStoragePlacement(BaseModel):
    storage = ForeignKeyField(ImageStorage)
    location = ForeignKeyField(ImageStorageLocation)
def image_json_path(storage_uuid):
    """Return the storage path of the v1 JSON metadata file for the given storage uuid."""
    # The metadata path is the image's base storage path with 'json' appended.
    return '{0}json'.format(storage.image_path(storage_uuid))
@ -19,6 +63,7 @@ def image_json_path(storage_uuid):
def backfill_v1_metadata():
""" Copies metadata from image storages to their images. """
logger.debug('Image v1 metadata backfill: Began execution')
while True:
batch_image_ids = list(Image
.select(Image.id)

View file

@ -2,7 +2,8 @@ import logging
import logging.config
import json
from data.database import RepositoryBuildTrigger, BuildTriggerService, db, db_for_update
from data.database import (db, db_for_update, BaseModel, CharField, ForeignKeyField,
TextField, BooleanField)
from app import app
from buildtrigger.basehandler import BuildTriggerHandler
from util.security.ssh import generate_ssh_keypair
@ -10,6 +11,32 @@ from github import GithubException
logger = logging.getLogger(__name__)


# Vendored copies of the trigger-related tables this migration touches, so the
# script does not depend on the live model definitions.
class BuildTriggerService(BaseModel):
    name = CharField(index=True, unique=True)


# The following stand-ins exist only to satisfy the foreign keys below.
class Repository(BaseModel):
    pass


class User(BaseModel):
    pass


class AccessToken(BaseModel):
    pass


class RepositoryBuildTrigger(BaseModel):
    uuid = CharField()
    service = ForeignKeyField(BuildTriggerService, index=True)
    repository = ForeignKeyField(Repository, index=True)
    connected_user = ForeignKeyField(User)
    auth_token = CharField(null=True)
    # Target column of this backfill: the generated SSH deploy key.
    private_key = TextField(null=True)
    config = TextField(default='{}')
    write_token = ForeignKeyField(AccessToken, null=True)
    pull_robot = ForeignKeyField(User, null=True, related_name='triggerpullrobot')
    used_legacy_github = BooleanField(null=True, default=False)
def backfill_github_deploykeys():
""" Generates and saves private deploy keys for any GitHub build triggers still relying on
the old buildpack behavior. """