Make sure we don't generate chunk sizes larger than 5 GB.

Amazon S3 does not allow chunk sizes larger than 5 GB; we currently don't handle that case at all, which is why large uploads are failing. This change ensures that if a storage engine specifies a *maximum* chunk size, we write multiple chunks, each no larger than that size.
Joseph Schorr 2016-09-27 12:23:32 +02:00
parent a74e94fb67
commit bfe2646a50
2 changed files with 112 additions and 8 deletions
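For reference, the splitting strategy is a recursive halving of any chunk that exceeds the engine's limit. Below is a minimal standalone sketch of that idea; the Part namedtuple, the rechunk helper, and the 12 GB example are illustrative stand-ins for the real _PartUploadMetadata and _rechunk shown in the diff, not the actual implementation.

from collections import namedtuple
from itertools import chain

# Illustrative stand-in for _PartUploadMetadata: a part of an uploaded blob,
# identified by the key it lives under plus an (offset, length) window.
Part = namedtuple('Part', ['path', 'offset', 'length'])

def rechunk(part, max_chunk_size):
  """ Recursively halve a part until every piece is no larger than max_chunk_size. """
  if max_chunk_size is None or part.length <= max_chunk_size:
    yield part
  else:
    half = part.length // 2
    first = Part(part.path, part.offset, half)
    second = Part(part.path, part.offset + half, part.length - half)
    for piece in chain(rechunk(first, max_chunk_size),
                       rechunk(second, max_chunk_size)):
      yield piece

GB = 1024 * 1024 * 1024
# A hypothetical 12 GB chunk split against S3's 5 GB part limit yields four
# 3 GB pieces at offsets 0, 3, 6 and 9 GB, all pointing at the same key.
print(list(rechunk(Part('uploads/some-chunk', 0, 12 * GB), 5 * GB)))

Each piece still refers to the same stored key; only the offset/length window changes, which is what lets complete_chunked_upload hand the pieces to copy_part_from_key via its start and end arguments instead of re-uploading any data.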


@@ -3,6 +3,8 @@ import os
import logging
import copy
from itertools import chain
from boto.exception import S3ResponseError
import boto.s3.connection
import boto.s3.multipart
@@ -51,7 +53,8 @@ class _CloudStorage(BaseStorageV2):
               storage_path, bucket_name, access_key=None, secret_key=None):
    super(_CloudStorage, self).__init__()

    self.automatic_chunk_size = 5 * 1024 * 1024
    self.minimum_chunk_size = 5 * 1024 * 1024
    self.maximum_chunk_size = None

    self._initialized = False
    self._bucket_name = bucket_name
@@ -184,7 +187,7 @@ class _CloudStorage(BaseStorageV2):
    num_part = 1
    total_bytes_written = 0
    while size == filelike.READ_UNTIL_END or total_bytes_written < size:
      bytes_to_copy = self.automatic_chunk_size
      bytes_to_copy = self.minimum_chunk_size
      if size != filelike.READ_UNTIL_END:
        # We never want to ask for more bytes than our caller has indicated to copy
        bytes_to_copy = min(bytes_to_copy, size - total_bytes_written)
@@ -364,6 +367,22 @@ class _CloudStorage(BaseStorageV2):
        logger.exception('Exception trying to perform action %s', action)
        raise s3re

  @staticmethod
  def _rechunk(chunk, max_chunk_size):
    """ Rechunks the chunk list to meet maximum chunk size restrictions for the storage engine. """
    if max_chunk_size is None or chunk.length <= max_chunk_size:
      yield chunk
    else:
      newchunk_length = chunk.length / 2
      first_subchunk = _PartUploadMetadata(chunk.path, chunk.offset, newchunk_length)
      second_subchunk = _PartUploadMetadata(chunk.path,
                                            chunk.offset + newchunk_length,
                                            chunk.length - newchunk_length)

      for subchunk in chain(_CloudStorage._rechunk(first_subchunk, max_chunk_size),
                            _CloudStorage._rechunk(second_subchunk, max_chunk_size)):
        yield subchunk

  def complete_chunked_upload(self, uuid, final_path, storage_metadata, force_client_side=False):
    self._initialize_cloud_conn()
    chunk_list = self._chunk_list_from_metadata(storage_metadata)
@@ -375,7 +394,7 @@ class _CloudStorage(BaseStorageV2):
      server_side_assembly = True
      for chunk_offset, chunk in enumerate(chunk_list):
        # If the chunk is both too small, and not the last chunk, we rule out server side assembly
        if chunk.length < self.automatic_chunk_size and (chunk_offset + 1) < len(chunk_list):
        if chunk.length < self.minimum_chunk_size and (chunk_offset + 1) < len(chunk_list):
          server_side_assembly = False
          break
@@ -385,14 +404,14 @@ class _CloudStorage(BaseStorageV2):
        # Awesome, we can do this completely server side, now we have to start a new multipart
        # upload and use copy_part_from_key to set all of the chunks.
        mpu = self.__initiate_multipart_upload(final_path, content_type=None, content_encoding=None)
        updated_chunks = chain.from_iterable([_CloudStorage._rechunk(c, self.maximum_chunk_size)
                                              for c in chunk_list])

        for chunk_offset, chunk in enumerate(chunk_list):
        for index, chunk in enumerate(updated_chunks):
          abs_chunk_path = self._init_path(chunk.path)
          part_num = chunk_offset + 1
          chunk_end_offset_inclusive = chunk.length - 1
          self._perform_action_with_retry(mpu.copy_part_from_key, self.get_cloud_bucket().name,
                                          abs_chunk_path, part_num, start=0,
                                          end=chunk_end_offset_inclusive)
                                          abs_chunk_path, index + 1, start=chunk.offset,
                                          end=chunk.length + chunk.offset - 1)

        self._perform_action_with_retry(mpu.complete_upload)
      except IOError as ioe:
@@ -433,6 +452,8 @@ class S3Storage(_CloudStorage):
                                    access_key=s3_access_key or None,
                                    secret_key=s3_secret_key or None)

    self.maximum_chunk_size = 5 * 1024 * 1024 * 1024 # 5GB.

  def setup(self):
    self.get_cloud_bucket().set_cors_xml("""<?xml version="1.0" encoding="UTF-8"?>
      <CORSConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">