Make sure we don't generate chunk sizes larger than 5 GB.

Amazon S3 does not allow chunk sizes larger than 5 GB; we currently don't handle that case at all, which is why large uploads are failing. This change ensures that if a storage engine specifies a *maximum* chunk size, we write multiple chunks, each no larger than that size.
Joseph Schorr 2016-09-27 12:23:32 +02:00
parent a74e94fb67
commit bfe2646a50
2 changed files with 112 additions and 8 deletions
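For reference, the splitting strategy is a recursive halving of any chunk that exceeds the engine's limit. Below is a minimal standalone sketch of that idea; the Part namedtuple, the rechunk helper, and the 12 GB example are illustrative stand-ins for the real _PartUploadMetadata and _rechunk shown in the diff, not the actual implementation.

from collections import namedtuple
from itertools import chain

# Illustrative stand-in for _PartUploadMetadata: a part of an uploaded blob,
# identified by the key it lives under plus an (offset, length) window.
Part = namedtuple('Part', ['path', 'offset', 'length'])

def rechunk(part, max_chunk_size):
  """ Recursively halve a part until every piece is no larger than max_chunk_size. """
  if max_chunk_size is None or part.length <= max_chunk_size:
    yield part
  else:
    half = part.length // 2
    first = Part(part.path, part.offset, half)
    second = Part(part.path, part.offset + half, part.length - half)
    for piece in chain(rechunk(first, max_chunk_size),
                       rechunk(second, max_chunk_size)):
      yield piece

GB = 1024 * 1024 * 1024
# A hypothetical 12 GB chunk split against S3's 5 GB part limit yields four
# 3 GB pieces at offsets 0, 3, 6 and 9 GB, all pointing at the same key.
print(list(rechunk(Part('uploads/some-chunk', 0, 12 * GB), 5 * GB)))

Each piece still refers to the same stored key; only the offset/length window changes, which is what lets complete_chunked_upload hand the pieces to copy_part_from_key via its start and end arguments instead of re-uploading any data.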


@@ -3,6 +3,8 @@ import os
import logging
import copy
from itertools import chain
from boto.exception import S3ResponseError
import boto.s3.connection
import boto.s3.multipart
@@ -51,7 +53,8 @@ class _CloudStorage(BaseStorageV2):
               storage_path, bucket_name, access_key=None, secret_key=None):
    super(_CloudStorage, self).__init__()

    self.automatic_chunk_size = 5 * 1024 * 1024
    self.minimum_chunk_size = 5 * 1024 * 1024
    self.maximum_chunk_size = None

    self._initialized = False
    self._bucket_name = bucket_name
@@ -184,7 +187,7 @@ class _CloudStorage(BaseStorageV2):
    num_part = 1
    total_bytes_written = 0
    while size == filelike.READ_UNTIL_END or total_bytes_written < size:
      bytes_to_copy = self.automatic_chunk_size
      bytes_to_copy = self.minimum_chunk_size
      if size != filelike.READ_UNTIL_END:
        # We never want to ask for more bytes than our caller has indicated to copy
        bytes_to_copy = min(bytes_to_copy, size - total_bytes_written)
@@ -364,6 +367,22 @@ class _CloudStorage(BaseStorageV2):
        logger.exception('Exception trying to perform action %s', action)
        raise s3re

  @staticmethod
  def _rechunk(chunk, max_chunk_size):
    """ Rechunks the chunk list to meet maximum chunk size restrictions for the storage engine. """
    if max_chunk_size is None or chunk.length <= max_chunk_size:
      yield chunk
    else:
      newchunk_length = chunk.length / 2
      first_subchunk = _PartUploadMetadata(chunk.path, chunk.offset, newchunk_length)
      second_subchunk = _PartUploadMetadata(chunk.path,
                                            chunk.offset + newchunk_length,
                                            chunk.length - newchunk_length)

      for subchunk in chain(_CloudStorage._rechunk(first_subchunk, max_chunk_size),
                            _CloudStorage._rechunk(second_subchunk, max_chunk_size)):
        yield subchunk

  def complete_chunked_upload(self, uuid, final_path, storage_metadata, force_client_side=False):
    self._initialize_cloud_conn()
    chunk_list = self._chunk_list_from_metadata(storage_metadata)
@@ -375,7 +394,7 @@ class _CloudStorage(BaseStorageV2):
      server_side_assembly = True
      for chunk_offset, chunk in enumerate(chunk_list):
        # If the chunk is both too small, and not the last chunk, we rule out server side assembly
        if chunk.length < self.automatic_chunk_size and (chunk_offset + 1) < len(chunk_list):
        if chunk.length < self.minimum_chunk_size and (chunk_offset + 1) < len(chunk_list):
          server_side_assembly = False
          break
@@ -385,14 +404,14 @@ class _CloudStorage(BaseStorageV2):
        # Awesome, we can do this completely server side, now we have to start a new multipart
        # upload and use copy_part_from_key to set all of the chunks.
        mpu = self.__initiate_multipart_upload(final_path, content_type=None, content_encoding=None)
        updated_chunks = chain.from_iterable([_CloudStorage._rechunk(c, self.maximum_chunk_size)
                                              for c in chunk_list])

        for chunk_offset, chunk in enumerate(chunk_list):
        for index, chunk in enumerate(updated_chunks):
          abs_chunk_path = self._init_path(chunk.path)
          part_num = chunk_offset + 1
          chunk_end_offset_inclusive = chunk.length - 1
          self._perform_action_with_retry(mpu.copy_part_from_key, self.get_cloud_bucket().name,
                                          abs_chunk_path, part_num, start=0,
                                          end=chunk_end_offset_inclusive)
                                          abs_chunk_path, index + 1, start=chunk.offset,
                                          end=chunk.length + chunk.offset - 1)

        self._perform_action_with_retry(mpu.complete_upload)
      except IOError as ioe:
@@ -433,6 +452,8 @@ class S3Storage(_CloudStorage):
                                    access_key=s3_access_key or None,
                                    secret_key=s3_secret_key or None)

    self.maximum_chunk_size = 5 * 1024 * 1024 * 1024 # 5GB.

  def setup(self):
    self.get_cloud_bucket().set_cors_xml("""<?xml version="1.0" encoding="UTF-8"?>
      <CORSConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">