Implement some new methods on the storage engines.

This commit is contained in:
Jake Moshenko 2015-08-26 17:08:42 -04:00
parent 4fa37a46d1
commit 398202e6fc
10 changed files with 211 additions and 98 deletions

View file

@ -11,6 +11,7 @@ from random import SystemRandom
from datetime import datetime
from peewee import *
from data.read_slave import ReadSlaveModel
from data.fields import ResumableSHAField, JSONField
from sqlalchemy.engine.url import make_url
from collections import defaultdict
@ -750,35 +751,13 @@ class RepositoryAuthorizedEmail(BaseModel):
)
class ResumableSHAField(TextField):
def db_value(self, value):
sha_state = value.state()
# One of the fields is a byte string, let's base64 encode it to make sure
# we can store and fetch it regardless of default collocation
sha_state[3] = base64.b64encode(sha_state[3])
return json.dumps(sha_state)
def python_value(self, value):
to_resume = resumablehashlib.sha256()
if value is None:
return to_resume
sha_state = json.loads(value)
# We need to base64 decode the data bytestring
sha_state[3] = base64.b64decode(sha_state[3])
to_resume.set_state(sha_state)
return to_resume
class BlobUpload(BaseModel):
repository = ForeignKeyField(Repository, index=True)
uuid = CharField(index=True, unique=True)
byte_count = IntegerField(default=0)
sha_state = ResumableSHAField(null=True, default=resumablehashlib.sha256)
location = ForeignKeyField(ImageStorageLocation)
storage_metadata = JSONField(null=True, default={})
class Meta:
database = db

38
data/fields.py Normal file
View file

@ -0,0 +1,38 @@
import base64
import resumablehashlib
import json
from peewee import TextField
class ResumableSHAField(TextField):
def db_value(self, value):
sha_state = value.state()
# One of the fields is a byte string, let's base64 encode it to make sure
# we can store and fetch it regardless of default collocation.
sha_state[3] = base64.b64encode(sha_state[3])
return json.dumps(sha_state)
def python_value(self, value):
to_resume = resumablehashlib.sha256()
if value is None:
return to_resume
sha_state = json.loads(value)
# We need to base64 decode the data bytestring.
sha_state[3] = base64.b64decode(sha_state[3])
to_resume.set_state(sha_state)
return to_resume
class JSONField(TextField):
def db_value(self, value):
return json.dumps(value)
def python_value(self, value):
if value is None or value == "":
return {}
return json.loads(value)

View file

@ -67,7 +67,8 @@ def get_blob_upload(namespace, repo_name, upload_uuid):
raise InvalidBlobUpload()
def initiate_upload(namespace, repo_name, uuid, location_name):
def initiate_upload(namespace, repo_name, uuid, location_name, storage_metadata):
repo = _basequery.get_existing_repository(namespace, repo_name)
location = ImageStorageLocation.get(name=location_name)
return BlobUpload.create(repository=repo, location=location, uuid=uuid)
return BlobUpload.create(repository=repo, location=location, uuid=uuid,
storage_metadata=storage_metadata)

View file

@ -119,8 +119,8 @@ def _render_range(num_uploaded_bytes, with_bytes_prefix=True):
@anon_protect
def start_blob_upload(namespace, repo_name):
location_name = storage.preferred_locations[0]
new_upload_uuid = storage.initiate_chunked_upload(location_name)
model.blob.initiate_upload(namespace, repo_name, new_upload_uuid, location_name)
new_upload_uuid, upload_metadata = storage.initiate_chunked_upload(location_name)
model.blob.initiate_upload(namespace, repo_name, new_upload_uuid, location_name, upload_metadata)
digest = request.args.get('digest', None)
if digest is None:
@ -205,11 +205,13 @@ def _upload_chunk(namespace, repo_name, upload_uuid):
input_fp = wrap_with_handler(input_fp, found.sha_state.update)
try:
length_written = storage.stream_upload_chunk({found.location.name}, upload_uuid, start_offset,
length, input_fp)
length_written, new_metadata = storage.stream_upload_chunk({found.location.name}, upload_uuid,
start_offset, length, input_fp,
found.storage_metadata)
except InvalidChunkException:
_range_not_satisfiable(found.byte_count)
found.storage_metadata = new_metadata
found.byte_count += length_written
return found
@ -222,7 +224,8 @@ def _finish_upload(namespace, repo_name, upload_obj, expected_digest):
# Mark the blob as uploaded.
final_blob_location = digest_tools.content_path(expected_digest)
storage.complete_chunked_upload({upload_obj.location.name}, upload_obj.uuid, final_blob_location)
storage.complete_chunked_upload({upload_obj.location.name}, upload_obj.uuid, final_blob_location,
upload_obj.storage_metadata)
model.blob.store_blob_record_and_temp_link(namespace, repo_name, expected_digest,
upload_obj.location, upload_obj.byte_count,
app.config['PUSH_TEMP_TAG_EXPIRATION_SEC'])
@ -278,6 +281,6 @@ def cancel_upload(namespace, repo_name, upload_uuid):
# We delete the record for the upload first, since if the partial upload in
# storage fails to delete, it doesn't break anything
found.delete_instance()
storage.cancel_chunked_upload({found.location.name}, found.uuid)
storage.cancel_chunked_upload({found.location.name}, found.uuid, found.storage_metadata)
return make_response('', 204)

View file

@ -42,18 +42,9 @@ class StoragePaths(object):
class BaseStorage(StoragePaths):
"""Storage is organized as follow:
$ROOT/images/<image_id>/json
$ROOT/images/<image_id>/layer
$ROOT/repositories/<namespace>/<repository_name>/<tag_name>
"""
# Useful if we want to change those locations later without rewriting
# the code which uses Storage
repositories = 'repositories'
images = 'images'
# Set the IO buffer to 64kB
buffer_size = 64 * 1024
def __init__(self):
# Set the IO buffer to 64kB
self.buffer_size = 64 * 1024
def setup(self):
""" Called to perform any storage system setup. """
@ -99,31 +90,55 @@ class BaseStorage(StoragePaths):
def get_checksum(self, path):
raise NotImplementedError
def stream_write_to_fp(self, in_fp, out_fp, num_bytes=-1):
""" Copy the specified number of bytes from the input file stream to the output stream. If
num_bytes < 0 copy until the stream ends.
"""
bytes_copied = 0
while bytes_copied < num_bytes or num_bytes == -1:
size_to_read = min(num_bytes - bytes_copied, self.buffer_size)
if size_to_read < 0:
size_to_read = self.buffer_size
try:
buf = in_fp.read(size_to_read)
if not buf:
break
out_fp.write(buf)
bytes_copied += len(buf)
except IOError:
break
return bytes_copied
class InvalidChunkException(RuntimeError):
pass
class BaseStorageV2(BaseStorage):
def initiate_chunked_upload(self, upload_uuid):
""" Start a new chunked upload
def initiate_chunked_upload(self):
""" Start a new chunked upload, returning the uuid and any associated storage metadata
"""
raise NotImplementedError
def stream_upload_chunk(self, uuid, offset, length, in_fp, hash_obj):
def stream_upload_chunk(self, uuid, offset, length, in_fp, storage_metadata):
""" Upload the specified amount of data from the given file pointer to the chunked destination
specified, starting at the given offset. Raises InvalidChunkException if the offset or
length can not be accepted.
specified, starting at the given offset. Returns the number of bytes uploaded, and a new
version of the storage_metadata. Raises InvalidChunkException if the offset or length can
not be accepted.
"""
raise NotImplementedError
def complete_chunked_upload(self, uuid, final_path):
def complete_chunked_upload(self, uuid, final_path, storage_metadata):
""" Complete the chunked upload and store the final results in the path indicated.
Returns nothing.
"""
raise NotImplementedError
def cancel_chunked_upload(self, uuid):
def cancel_chunked_upload(self, uuid, storage_metadata):
""" Cancel the chunked upload and clean up any outstanding partially uploaded data.
Returns nothing.
"""
raise NotImplementedError

View file

@ -3,18 +3,25 @@ import os
import logging
import boto.s3.connection
import boto.s3.multipart
import boto.gs.connection
import boto.s3.key
import boto.gs.key
from io import BufferedIOBase
from uuid import uuid4
from storage.basestorage import BaseStorage
from storage.basestorage import BaseStorageV2, InvalidChunkException
logger = logging.getLogger(__name__)
_MULTIPART_UPLOAD_ID_KEY = 'upload_id'
_LAST_PART_KEY = 'last_part_num'
_LAST_CHUNK_ENCOUNTERED = 'last_chunk_encountered'
class StreamReadKeyAsFile(BufferedIOBase):
def __init__(self, key):
self._key = key
@ -37,9 +44,13 @@ class StreamReadKeyAsFile(BufferedIOBase):
self._key.close(fast=True)
class _CloudStorage(BaseStorage):
class _CloudStorage(BaseStorageV2):
def __init__(self, connection_class, key_class, connect_kwargs, upload_params, storage_path,
access_key, secret_key, bucket_name):
super(_CloudStorage, self).__init__()
self.upload_chunk_size = 5 * 1024 * 1024
self._initialized = False
self._bucket_name = bucket_name
self._access_key = access_key
@ -135,12 +146,9 @@ class _CloudStorage(BaseStorage):
raise IOError('No such key: \'{0}\''.format(path))
return StreamReadKeyAsFile(key)
def stream_write(self, path, fp, content_type=None, content_encoding=None):
def __initiate_multipart_upload(self, path, content_type, content_encoding):
# Minimum size of upload part size on S3 is 5MB
self._initialize_cloud_conn()
buffer_size = 5 * 1024 * 1024
if self.buffer_size > buffer_size:
buffer_size = self.buffer_size
path = self._init_path(path)
metadata = {}
@ -150,16 +158,20 @@ class _CloudStorage(BaseStorage):
if content_encoding is not None:
metadata['Content-Encoding'] = content_encoding
mp = self._cloud_bucket.initiate_multipart_upload(path, metadata=metadata,
**self._upload_params)
return self._cloud_bucket.initiate_multipart_upload(path, metadata=metadata,
**self._upload_params)
def stream_write(self, path, fp, content_type=None, content_encoding=None):
mp = self.__initiate_multipart_upload(path, content_type, content_encoding)
num_part = 1
while True:
try:
buf = fp.read(buffer_size)
if not buf:
buf = StringIO.StringIO()
bytes_written = self.stream_write_to_fp(fp, buf, self.upload_chunk_size)
if bytes_written == 0:
break
io = StringIO.StringIO(buf)
mp.upload_part_from_file(io, num_part)
mp.upload_part_from_file(buf, num_part)
num_part += 1
io.close()
except IOError:
@ -217,6 +229,57 @@ class _CloudStorage(BaseStorage):
return k.etag[1:-1][:7]
def _rel_upload_path(self, uuid):
return 'uploads/{0}'.format(uuid)
def initiate_chunked_upload(self):
self._initialize_cloud_conn()
random_uuid = str(uuid4())
path = self._init_path(self._rel_upload_path(random_uuid))
mpu = self.__initiate_multipart_upload(path, content_type=None, content_encoding=None)
metadata = {
_MULTIPART_UPLOAD_ID_KEY: mpu.id,
_LAST_PART_KEY: 0,
_LAST_CHUNK_ENCOUNTERED: False,
}
return mpu.id, metadata
def _get_multipart_upload_key(self, uuid, storage_metadata):
mpu = boto.s3.multipart.MultiPartUpload(self._cloud_bucket)
mpu.id = storage_metadata[_MULTIPART_UPLOAD_ID_KEY]
mpu.key = self._init_path(self._rel_upload_path(uuid))
return mpu
def stream_upload_chunk(self, uuid, offset, length, in_fp, storage_metadata):
self._initialize_cloud_conn()
mpu = self._get_multipart_upload_key(uuid, storage_metadata)
last_part_num = storage_metadata[_LAST_PART_KEY]
if storage_metadata[_LAST_CHUNK_ENCOUNTERED] and length != 0:
msg = 'Length must be at least the the upload chunk size: %s' % self.upload_chunk_size
raise InvalidChunkException(msg)
part_num = last_part_num + 1
mpu.upload_part_from_file(in_fp, part_num, length)
new_metadata = {
_MULTIPART_UPLOAD_ID_KEY: mpu.id,
_LAST_PART_KEY: part_num,
_LAST_CHUNK_ENCOUNTERED: True,
}
return length, new_metadata
def complete_chunked_upload(self, uuid, final_path, storage_metadata):
mpu = self._get_multipart_upload_key(uuid, storage_metadata)
mpu.complete_upload()
def cancel_chunked_upload(self, uuid, storage_metadata):
mpu = self._get_multipart_upload_key(uuid, storage_metadata)
mpu.cancel_multipart_upload()
class S3Storage(_CloudStorage):
def __init__(self, storage_path, s3_access_key, s3_secret_key, s3_bucket):

View file

@ -1,8 +1,14 @@
from storage.basestorage import BaseStorage
import cStringIO as StringIO
import hashlib
_FAKE_STORAGE_MAP = {}
from collections import defaultdict
from uuid import uuid4
class FakeStorage(BaseStorage):
from storage.basestorage import BaseStorageV2
_FAKE_STORAGE_MAP = defaultdict(StringIO.StringIO)
class FakeStorage(BaseStorageV2):
def _init_path(self, path=None, create=False):
return path
@ -10,16 +16,26 @@ class FakeStorage(BaseStorage):
if not path in _FAKE_STORAGE_MAP:
raise IOError('Fake file %s not found' % path)
return _FAKE_STORAGE_MAP.get(path)
_FAKE_STORAGE_MAP.get(path).seek(0)
return _FAKE_STORAGE_MAP.get(path).read()
def put_content(self, path, content):
_FAKE_STORAGE_MAP[path] = content
_FAKE_STORAGE_MAP.pop(path, None)
_FAKE_STORAGE_MAP[path].write(content)
def stream_read(self, path):
yield _FAKE_STORAGE_MAP[path]
io_obj = _FAKE_STORAGE_MAP[path]
io_obj.seek(0)
while True:
buf = io_obj.read(self.buffer_size)
if not buf:
break
yield buf
def stream_write(self, path, fp, content_type=None, content_encoding=None):
_FAKE_STORAGE_MAP[path] = fp.read()
out_fp = _FAKE_STORAGE_MAP[path]
out_fp.seek(0)
self.stream_write_to_fp(fp, out_fp)
def remove(self, path):
_FAKE_STORAGE_MAP.pop(path, None)
@ -28,4 +44,21 @@ class FakeStorage(BaseStorage):
return path in _FAKE_STORAGE_MAP
def get_checksum(self, path):
return path
return hashlib.sha256(_FAKE_STORAGE_MAP[path].read()).hexdigest()[:7]
def initiate_chunked_upload(self):
new_uuid = str(uuid4())
_FAKE_STORAGE_MAP[new_uuid].seek(0)
return new_uuid, {}
def stream_upload_chunk(self, uuid, offset, length, in_fp, _):
upload_storage = _FAKE_STORAGE_MAP[uuid]
upload_storage.seek(offset)
return self.stream_write_to_fp(in_fp, upload_storage, length)
def complete_chunked_upload(self, uuid, final_path, _):
_FAKE_STORAGE_MAP[final_path] = _FAKE_STORAGE_MAP[uuid]
_FAKE_STORAGE_MAP.pop(uuid, None)
def cancel_chunked_upload(self, uuid, _):
_FAKE_STORAGE_MAP.pop(uuid, None)

View file

@ -14,8 +14,8 @@ logger = logging.getLogger(__name__)
class LocalStorage(BaseStorageV2):
def __init__(self, storage_path):
super(LocalStorage, self).__init__()
self._root_path = storage_path
def _init_path(self, path=None, create=False):
@ -54,28 +54,7 @@ class LocalStorage(BaseStorageV2):
# Size is mandatory
path = self._init_path(path, create=True)
with open(path, mode='wb') as out_fp:
self._stream_write_to_fp(fp, out_fp)
def _stream_write_to_fp(self, in_fp, out_fp, num_bytes=-1):
""" Copy the specified number of bytes from the input file stream to the output stream. If
num_bytes < 0 copy until the stream ends.
"""
bytes_copied = 0
while bytes_copied < num_bytes or num_bytes == -1:
size_to_read = min(num_bytes - bytes_copied, self.buffer_size)
if size_to_read < 0:
size_to_read = self.buffer_size
try:
buf = in_fp.read(size_to_read)
if not buf:
break
out_fp.write(buf)
bytes_copied += len(buf)
except IOError:
break
return bytes_copied
self.stream_write_to_fp(fp, out_fp)
def list_directory(self, path=None):
path = self._init_path(path)
@ -124,14 +103,14 @@ class LocalStorage(BaseStorageV2):
with open(self._init_path(self._rel_upload_path(new_uuid), create=True), 'w'):
pass
return new_uuid
return new_uuid, {}
def stream_upload_chunk(self, uuid, offset, length, in_fp):
def stream_upload_chunk(self, uuid, offset, length, in_fp, _):
with open(self._init_path(self._rel_upload_path(uuid)), 'r+b') as upload_storage:
upload_storage.seek(offset)
return self._stream_write_to_fp(in_fp, upload_storage, length)
return self.stream_write_to_fp(in_fp, upload_storage, length), {}
def complete_chunked_upload(self, uuid, final_path):
def complete_chunked_upload(self, uuid, final_path, _):
content_path = self._rel_upload_path(uuid)
final_path_abs = self._init_path(final_path, create=True)
if not self.exists(final_path_abs):
@ -140,7 +119,7 @@ class LocalStorage(BaseStorageV2):
else:
logger.debug('Content already exists at path: %s', final_path_abs)
def cancel_chunked_upload(self, uuid):
def cancel_chunked_upload(self, uuid, _):
content_path = self._init_path(self._rel_upload_path(uuid))
os.remove(content_path)

View file

@ -13,6 +13,8 @@ logger = logging.getLogger(__name__)
class SwiftStorage(BaseStorage):
def __init__(self, swift_container, storage_path, auth_url, swift_user,
swift_password, auth_version=None, os_options=None, ca_cert_path=None):
super(SwiftStorage, self).__init__()
self._swift_container = swift_container
self._storage_path = storage_path

Binary file not shown.