This repository has been archived on 2020-03-24. You can view files and clone it, but cannot push or open issues or pull requests.
quay/storage/cloud.py

547 lines
20 KiB
Python
Raw Normal View History

2013-09-25 21:50:03 +00:00
import cStringIO as StringIO
import os
import logging
import copy
2013-09-25 21:50:03 +00:00
from boto.exception import S3ResponseError
2013-09-25 21:50:03 +00:00
import boto.s3.connection
import boto.s3.multipart
2014-08-12 06:06:44 +00:00
import boto.gs.connection
2013-09-25 21:50:03 +00:00
import boto.s3.key
2014-08-12 06:06:44 +00:00
import boto.gs.key
2013-09-25 21:50:03 +00:00
from io import BufferedIOBase
from uuid import uuid4
from collections import namedtuple
from util.registry import filelike
from storage.basestorage import BaseStorageV2
2013-09-25 21:50:03 +00:00
logger = logging.getLogger(__name__)
_PartUploadMetadata = namedtuple('_PartUploadMetadata', ['path', 'offset', 'length'])
_CHUNKS_KEY = 'chunks'
class StreamReadKeyAsFile(BufferedIOBase):
def __init__(self, key):
self._key = key
def read(self, amt=None):
if self.closed:
return None
resp = self._key.read(amt)
return resp
def readable(self):
return True
@property
def closed(self):
return self._key.closed
def close(self):
self._key.close(fast=True)
class _CloudStorage(BaseStorageV2):
2016-01-15 16:15:40 +00:00
def __init__(self, metric_queue, connection_class, key_class, connect_kwargs, upload_params,
storage_path, bucket_name, access_key=None, secret_key=None):
super(_CloudStorage, self).__init__()
self.automatic_chunk_size = 5 * 1024 * 1024
self._initialized = False
2014-08-12 06:06:44 +00:00
self._bucket_name = bucket_name
self._access_key = access_key
self._secret_key = secret_key
self._root_path = storage_path
2014-08-12 06:06:44 +00:00
self._connection_class = connection_class
self._key_class = key_class
self._upload_params = upload_params
self._connect_kwargs = connect_kwargs
2014-08-12 06:06:44 +00:00
self._cloud_conn = None
self._cloud_bucket = None
2016-01-15 16:15:40 +00:00
self._metric_queue = metric_queue
2014-08-12 06:06:44 +00:00
def _initialize_cloud_conn(self):
if not self._initialized:
self._cloud_conn = self._connection_class(self._access_key, self._secret_key,
**self._connect_kwargs)
2014-08-12 06:06:44 +00:00
self._cloud_bucket = self._cloud_conn.get_bucket(self._bucket_name)
self._initialized = True
def _debug_key(self, key):
"""Used for debugging only."""
orig_meth = key.bucket.connection.make_request
def new_meth(*args, **kwargs):
print '#' * 16
print args
print kwargs
print '#' * 16
return orig_meth(*args, **kwargs)
key.bucket.connection.make_request = new_meth
def _init_path(self, path=None):
path = os.path.join(self._root_path, path) if path else self._root_path
if path and path[0] == '/':
return path[1:]
return path
def get_cloud_conn(self):
self._initialize_cloud_conn()
return self._cloud_conn
def get_cloud_bucket(self):
return self._cloud_bucket
def get_content(self, path):
2014-08-12 06:06:44 +00:00
self._initialize_cloud_conn()
path = self._init_path(path)
2014-08-12 06:06:44 +00:00
key = self._key_class(self._cloud_bucket, path)
if not key.exists():
raise IOError('No such key: \'{0}\''.format(path))
return key.get_contents_as_string()
def put_content(self, path, content):
2014-08-12 06:06:44 +00:00
self._initialize_cloud_conn()
path = self._init_path(path)
2014-08-12 06:06:44 +00:00
key = self._key_class(self._cloud_bucket, path)
key.set_contents_from_string(content, **self._upload_params)
return path
def get_supports_resumable_downloads(self):
return True
2016-02-11 22:00:38 +00:00
def get_direct_download_url(self, path, expires_in=60, requires_cors=False, head=False):
2014-08-12 06:06:44 +00:00
self._initialize_cloud_conn()
path = self._init_path(path)
2014-08-12 06:06:44 +00:00
k = self._key_class(self._cloud_bucket, path)
2016-02-11 22:00:38 +00:00
if head:
return k.generate_url(expires_in, 'HEAD')
return k.generate_url(expires_in)
def get_direct_upload_url(self, path, mime_type, requires_cors=True):
self._initialize_cloud_conn()
path = self._init_path(path)
key = self._key_class(self._cloud_bucket, path)
url = key.generate_url(300, 'PUT', headers={'Content-Type': mime_type}, encrypt_key=True)
return url
def stream_read(self, path):
2014-08-12 06:06:44 +00:00
self._initialize_cloud_conn()
path = self._init_path(path)
2014-08-12 06:06:44 +00:00
key = self._key_class(self._cloud_bucket, path)
if not key.exists():
raise IOError('No such key: \'{0}\''.format(path))
while True:
buf = key.read(self.buffer_size)
if not buf:
break
yield buf
def stream_read_file(self, path):
2014-08-12 06:06:44 +00:00
self._initialize_cloud_conn()
path = self._init_path(path)
2014-08-12 06:06:44 +00:00
key = self._key_class(self._cloud_bucket, path)
if not key.exists():
raise IOError('No such key: \'{0}\''.format(path))
return StreamReadKeyAsFile(key)
def __initiate_multipart_upload(self, path, content_type, content_encoding):
# Minimum size of upload part size on S3 is 5MB
2014-08-12 06:06:44 +00:00
self._initialize_cloud_conn()
path = self._init_path(path)
metadata = {}
if content_type is not None:
metadata['Content-Type'] = content_type
if content_encoding is not None:
metadata['Content-Encoding'] = content_encoding
if self._metric_queue is not None:
self._metric_queue.put_deprecated('MultipartUploadStart', 1)
self._metric_queue.multipart_upload_start.Inc()
return self._cloud_bucket.initiate_multipart_upload(path, metadata=metadata,
**self._upload_params)
def stream_write(self, path, fp, content_type=None, content_encoding=None):
2015-12-10 04:16:33 +00:00
self._stream_write_internal(path, fp, content_type, content_encoding)
def _stream_write_internal(self, path, fp, content_type=None, content_encoding=None,
cancel_on_error=True, size=filelike.READ_UNTIL_END):
write_error = None
mp = self.__initiate_multipart_upload(path, content_type, content_encoding)
# We are going to reuse this but be VERY careful to only read the number of bytes written to it
buf = StringIO.StringIO()
num_part = 1
total_bytes_written = 0
while size == filelike.READ_UNTIL_END or total_bytes_written < size:
bytes_to_copy = self.automatic_chunk_size
if size != filelike.READ_UNTIL_END:
# We never want to ask for more bytes than our caller has indicated to copy
bytes_to_copy = min(bytes_to_copy, size - total_bytes_written)
buf.seek(0)
try:
# Stage the bytes into the buffer for use with the multipart upload file API
bytes_staged = self.stream_write_to_fp(fp, buf, bytes_to_copy)
if bytes_staged == 0:
break
buf.seek(0)
mp.upload_part_from_file(buf, num_part, size=bytes_staged)
total_bytes_written += bytes_staged
num_part += 1
except IOError as e:
logger.warn('Error when writing to stream in stream_write_internal at path %s: %s', path, e)
write_error = e
if self._metric_queue is not None:
self._metric_queue.put_deprecated('MultipartUploadFailure', 1)
self._metric_queue.multipart_upload_end.Inc(labelvalues=['failure'])
if cancel_on_error:
mp.cancel_upload()
return 0, write_error
2015-12-10 04:16:33 +00:00
else:
break
if total_bytes_written > 0:
if self._metric_queue is not None:
self._metric_queue.put_deprecated('MultipartUploadSuccess', 1)
self._metric_queue.multipart_upload_end.Inc(labelvalues=['success'])
2015-12-10 04:16:33 +00:00
mp.complete_upload()
return total_bytes_written, write_error
def exists(self, path):
2014-08-12 06:06:44 +00:00
self._initialize_cloud_conn()
path = self._init_path(path)
2014-08-12 06:06:44 +00:00
key = self._key_class(self._cloud_bucket, path)
return key.exists()
def remove(self, path):
2014-08-12 06:06:44 +00:00
self._initialize_cloud_conn()
path = self._init_path(path)
2014-08-12 06:06:44 +00:00
key = self._key_class(self._cloud_bucket, path)
if key.exists():
# It's a file
key.delete()
return
# We assume it's a directory
if not path.endswith('/'):
path += '/'
2014-08-12 06:06:44 +00:00
for key in self._cloud_bucket.list(prefix=path):
key.delete()
2014-08-12 06:06:44 +00:00
def get_checksum(self, path):
self._initialize_cloud_conn()
path = self._init_path(path)
key = self._key_class(self._cloud_bucket, path)
k = self._cloud_bucket.lookup(key)
if k is None:
raise IOError('No such key: \'{0}\''.format(path))
return k.etag[1:-1][:7]
def copy_to(self, destination, path):
2015-09-08 20:55:47 +00:00
self._initialize_cloud_conn()
# First try to copy directly via boto, but only if the storages are the
# same type, with the same access information.
if (self.__class__ == destination.__class__ and
self._access_key and self._secret_key and
self._access_key == destination._access_key and
self._secret_key == destination._secret_key):
logger.debug('Copying file from %s to %s via a direct boto copy', self._cloud_bucket,
destination._cloud_bucket)
source_path = self._init_path(path)
source_key = self._key_class(self._cloud_bucket, source_path)
2015-09-08 20:55:47 +00:00
destination._initialize_cloud_conn()
dest_path = destination._init_path(path)
source_key.copy(destination._cloud_bucket, dest_path)
return
# Fallback to a slower, default copy.
logger.debug('Copying file from %s to %s via a streamed copy', self._cloud_bucket,
destination)
with self.stream_read_file(path) as fp:
destination.stream_write(path, fp)
def _rel_upload_path(self, uuid):
return 'uploads/{0}'.format(uuid)
def initiate_chunked_upload(self):
self._initialize_cloud_conn()
random_uuid = str(uuid4())
metadata = {
_CHUNKS_KEY: [],
}
return random_uuid, metadata
2015-11-30 20:45:45 +00:00
def stream_upload_chunk(self, uuid, offset, length, in_fp, storage_metadata, content_type=None):
self._initialize_cloud_conn()
# We are going to upload each chunk to a separate key
chunk_path = self._rel_upload_path(str(uuid4()))
bytes_written, write_error = self._stream_write_internal(chunk_path, in_fp,
cancel_on_error=False, size=length,
content_type=content_type)
new_metadata = copy.deepcopy(storage_metadata)
# We are only going to track keys to which data was confirmed written
if bytes_written > 0:
new_metadata[_CHUNKS_KEY].append(_PartUploadMetadata(chunk_path, offset, bytes_written))
return bytes_written, new_metadata, write_error
def _chunk_generator(self, chunk_list):
for chunk in chunk_list:
yield filelike.StreamSlice(self.stream_read_file(chunk.path), 0, chunk.length)
@staticmethod
def _chunk_list_from_metadata(storage_metadata):
return [_PartUploadMetadata(*chunk_args) for chunk_args in storage_metadata[_CHUNKS_KEY]]
2015-09-30 21:46:22 +00:00
def _client_side_chunk_join(self, final_path, chunk_list):
# If there's only one chunk, just "move" (copy and delete) the key and call it a day.
if len(chunk_list) == 1:
chunk_path = self._init_path(chunk_list[0].path)
abs_final_path = self._init_path(final_path)
2015-09-30 21:46:22 +00:00
# Let the copy raise an exception if it fails.
self._cloud_bucket.copy_key(abs_final_path, self._bucket_name, chunk_path)
2015-09-30 21:46:22 +00:00
# Attempt to clean up the old chunk.
try:
self._cloud_bucket.delete_key(chunk_path)
except IOError:
# We failed to delete a chunk. This sucks, but we shouldn't fail the push.
msg = 'Failed to clean up chunk %s for move of %s'
logger.exception(msg, chunk_path, abs_final_path)
2015-09-30 21:46:22 +00:00
else:
# Concatenate and write all the chunks as one key.
concatenated = filelike.FilelikeStreamConcat(self._chunk_generator(chunk_list))
self.stream_write(final_path, concatenated)
# Attempt to clean up all the chunks.
for chunk in chunk_list:
try:
self._cloud_bucket.delete_key(self._init_path(chunk.path))
2015-09-30 21:46:22 +00:00
except IOError:
# We failed to delete a chunk. This sucks, but we shouldn't fail the push.
msg = 'Failed to clean up chunk %s for reupload of %s'
logger.exception(msg, chunk.path, final_path)
def complete_chunked_upload(self, uuid, final_path, storage_metadata, force_client_side=False):
self._initialize_cloud_conn()
chunk_list = self._chunk_list_from_metadata(storage_metadata)
# Here is where things get interesting: we are going to try to assemble this server side
# In order to be a candidate all parts (after offsets have been computed) must be at least 5MB
server_side_assembly = False
if not force_client_side:
server_side_assembly = True
for chunk_offset, chunk in enumerate(chunk_list):
# If the chunk is both too small, and not the last chunk, we rule out server side assembly
if chunk.length < self.automatic_chunk_size and (chunk_offset + 1) < len(chunk_list):
server_side_assembly = False
break
if server_side_assembly:
logger.debug('Performing server side assembly of multi-part upload for: %s', final_path)
try:
# Awesome, we can do this completely server side, now we have to start a new multipart
# upload and use copy_part_from_key to set all of the chunks.
mpu = self.__initiate_multipart_upload(final_path, content_type=None, content_encoding=None)
for chunk_offset, chunk in enumerate(chunk_list):
abs_chunk_path = self._init_path(chunk.path)
part_num = chunk_offset + 1
chunk_end_offset_inclusive = chunk.length - 1
mpu.copy_part_from_key(self.get_cloud_bucket().name, abs_chunk_path, part_num,
start=0, end=chunk_end_offset_inclusive)
# Note: Sometimes Amazon S3 simply raises an internal error when trying to complete a
# multipart upload. The recommendation is to simply try calling complete_upload again.
for remaining_retries in range(3, -1, -1):
try:
mpu.complete_upload()
break
except S3ResponseError as s3re:
if remaining_retries and s3re.status == 200 and s3re.error_code == 'InternalError':
# Weird internal error case. Retry.
continue
# Otherwise, raise it.
logger.exception('Exception trying to complete multipart upload for: %s', final_path)
raise s3re
except IOError as ioe:
# Something bad happened, log it and then give up
msg = 'Exception when attempting server-side assembly for: %s'
logger.exception(msg, final_path)
mpu.cancel_upload()
raise ioe
else:
# We are going to turn all of the server side objects into a single file-like stream, and
# pass that to stream_write to chunk and upload the final object.
2015-09-30 21:46:22 +00:00
self._client_side_chunk_join(final_path, chunk_list)
def cancel_chunked_upload(self, uuid, storage_metadata):
self._initialize_cloud_conn()
# We have to go through and delete all of the uploaded chunks
for chunk in self._chunk_list_from_metadata(storage_metadata):
self.remove(chunk.path)
2014-08-12 06:06:44 +00:00
class S3Storage(_CloudStorage):
def __init__(self, metric_queue, storage_path, s3_bucket, s3_access_key=None, s3_secret_key=None,
2016-01-15 16:15:40 +00:00
host=None):
2014-08-12 06:06:44 +00:00
upload_params = {
'encrypt_key': True,
}
connect_kwargs = {}
if host:
if host.startswith('http:') or host.startswith('https:'):
raise ValueError('host name must not start with http:// or https://')
connect_kwargs['host'] = host
2016-01-15 16:15:40 +00:00
super(S3Storage, self).__init__(metric_queue, boto.s3.connection.S3Connection, boto.s3.key.Key,
connect_kwargs, upload_params, storage_path, s3_bucket,
access_key=s3_access_key or None,
secret_key=s3_secret_key or None)
2014-08-12 06:06:44 +00:00
def setup(self):
self.get_cloud_bucket().set_cors_xml("""<?xml version="1.0" encoding="UTF-8"?>
<CORSConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
<CORSRule>
<AllowedOrigin>*</AllowedOrigin>
<AllowedMethod>GET</AllowedMethod>
<MaxAgeSeconds>3000</MaxAgeSeconds>
<AllowedHeader>Authorization</AllowedHeader>
</CORSRule>
<CORSRule>
<AllowedOrigin>*</AllowedOrigin>
<AllowedMethod>PUT</AllowedMethod>
<MaxAgeSeconds>3000</MaxAgeSeconds>
<AllowedHeader>Content-Type</AllowedHeader>
<AllowedHeader>x-amz-acl</AllowedHeader>
<AllowedHeader>origin</AllowedHeader>
</CORSRule>
</CORSConfiguration>""")
2014-08-12 06:06:44 +00:00
class GoogleCloudStorage(_CloudStorage):
2016-01-15 16:15:40 +00:00
def __init__(self, metric_queue, storage_path, access_key, secret_key, bucket_name):
upload_params = {}
connect_kwargs = {}
2016-01-15 16:15:40 +00:00
super(GoogleCloudStorage, self).__init__(metric_queue, boto.gs.connection.GSConnection,
boto.gs.key.Key, connect_kwargs, upload_params,
storage_path, bucket_name, access_key, secret_key)
2014-08-12 06:06:44 +00:00
def setup(self):
self.get_cloud_bucket().set_cors_xml("""<?xml version="1.0" encoding="UTF-8"?>
<CorsConfig>
<Cors>
<Origins>
<Origin>*</Origin>
</Origins>
<Methods>
<Method>GET</Method>
<Method>PUT</Method>
</Methods>
<ResponseHeaders>
<ResponseHeader>Content-Type</ResponseHeader>
</ResponseHeaders>
<MaxAgeSec>3000</MaxAgeSec>
</Cors>
</CorsConfig>""")
def _stream_write_internal(self, path, fp, content_type=None, content_encoding=None,
cancel_on_error=True, size=filelike.READ_UNTIL_END):
2014-08-12 06:06:44 +00:00
# Minimum size of upload part size on S3 is 5MB
self._initialize_cloud_conn()
path = self._init_path(path)
key = self._key_class(self._cloud_bucket, path)
if content_type is not None:
key.set_metadata('Content-Type', content_type)
if content_encoding is not None:
key.set_metadata('Content-Encoding', content_encoding)
if size != filelike.READ_UNTIL_END:
fp = filelike.StreamSlice(fp, 0, size)
# TODO figure out how to handle cancel_on_error=False
2015-12-10 04:16:33 +00:00
try:
key.set_contents_from_stream(fp)
except IOError as ex:
return 0, ex
2015-12-10 04:16:33 +00:00
return key.size, None
def complete_chunked_upload(self, uuid, final_path, storage_metadata):
self._initialize_cloud_conn()
# Boto does not support GCS's multipart upload API because it differs from S3, so
# we are forced to join it all locally and then reupload.
# See https://github.com/boto/boto/issues/3355
chunk_list = self._chunk_list_from_metadata(storage_metadata)
self._client_side_chunk_join(final_path, chunk_list)
class RadosGWStorage(_CloudStorage):
2016-01-15 16:15:40 +00:00
def __init__(self, metric_queue, hostname, is_secure, storage_path, access_key, secret_key,
bucket_name):
upload_params = {}
connect_kwargs = {
'host': hostname,
'is_secure': is_secure,
'calling_format': boto.s3.connection.OrdinaryCallingFormat(),
}
2016-01-15 16:15:40 +00:00
super(RadosGWStorage, self).__init__(metric_queue, boto.s3.connection.S3Connection,
boto.s3.key.Key, connect_kwargs, upload_params,
storage_path, bucket_name, access_key, secret_key)
# TODO remove when radosgw supports cors: http://tracker.ceph.com/issues/8718#change-38624
def get_direct_download_url(self, path, expires_in=60, requires_cors=False):
if requires_cors:
return None
return super(RadosGWStorage, self).get_direct_download_url(path, expires_in, requires_cors)
# TODO remove when radosgw supports cors: http://tracker.ceph.com/issues/8718#change-38624
def get_direct_upload_url(self, path, mime_type, requires_cors=True):
if requires_cors:
return None
return super(RadosGWStorage, self).get_direct_upload_url(path, mime_type, requires_cors)
def complete_chunked_upload(self, uuid, final_path, storage_metadata):
self._initialize_cloud_conn()
2015-09-30 21:46:22 +00:00
# RadosGW does not support multipart copying from keys, so we are forced to join
# it all locally and then reupload.
# See https://github.com/ceph/ceph/pull/5139
chunk_list = self._chunk_list_from_metadata(storage_metadata)
2015-09-30 21:46:22 +00:00
self._client_side_chunk_join(final_path, chunk_list)