Have the layer PUT method calculate the uncompressed size in real time, since trusting the size reported in the uploaded JSON is fraught with complications

This commit is contained in:
Joseph Schorr 2014-09-29 17:00:47 -04:00
parent 7fd3c7d31b
commit 474add0fb1
3 changed files with 48 additions and 5 deletions

View file

@ -1251,7 +1251,7 @@ def set_image_size(docker_image_id, namespace_name, repository_name,
def set_image_metadata(docker_image_id, namespace_name, repository_name, created_date_str, comment,
command, uncompressed_size, parent=None):
command, parent=None):
with config.app_config['DB_TRANSACTION_FACTORY'](db):
query = (Image
.select(Image, ImageStorage)
@ -1272,7 +1272,6 @@ def set_image_metadata(docker_image_id, namespace_name, repository_name, created
fetched.storage.created = dateutil.parser.parse(created_date_str).replace(tzinfo=None)
fetched.storage.comment = comment
fetched.storage.command = command
fetched.storage.uncompressed_size = uncompressed_size
if parent:
fetched.ancestors = '%s%s/' % (parent.ancestors, parent.id)

View file

@ -14,6 +14,7 @@ from util.http import abort, exact_abort
from auth.permissions import (ReadRepositoryPermission,
ModifyRepositoryPermission)
from data import model
from util import gzipstream
registry = Blueprint('registry', __name__)
@ -193,14 +194,26 @@ def put_image_layer(namespace, repository, image_id):
# encoding (Gunicorn)
input_stream = request.environ['wsgi.input']
# compute checksums
csums = []
# Create a socket reader to read the input stream containing the layer data.
sr = SocketReader(input_stream)
# Add a handler that store the data in storage.
tmp, store_hndlr = store.temp_store_handler()
sr.add_handler(store_hndlr)
# Add a handler to compute the uncompressed size of the layer.
uncompressed_size_info, size_hndlr = gzipstream.calculate_size_handler()
sr.add_handler(size_hndlr)
# Add a handler which computes the checksum.
h, sum_hndlr = checksums.simple_checksum_handler(json_data)
sr.add_handler(sum_hndlr)
# Stream write the data to storage.
store.stream_write(repo_image.storage.locations, layer_path, sr)
# Append the computed checksum.
csums = []
csums.append('sha256:{0}'.format(h.hexdigest()))
try:
@ -216,6 +229,12 @@ def put_image_layer(namespace, repository, image_id):
logger.debug('put_image_layer: Error when computing tarsum '
'{0}'.format(e))
# Write the uncompressed image size, if any.
if uncompressed_size_info['size'] > 0:
profile.debug('Storing uncompressed layer size: %s' % uncompressed_size_info['size'])
repo_image.storage.uncompressed_size = uncompressed_size_info['size']
repo_image.storage.save()
if repo_image.storage.checksum is None:
# We don't have a checksum stored yet, that's fine skipping the check.
# Not removing the mark though, image is not downloadable yet.
@ -460,7 +479,7 @@ def put_image_json(namespace, repository, image_id):
profile.debug('Setting image metadata')
model.set_image_metadata(image_id, namespace, repository,
data.get('created'), data.get('comment'), command,
data.get('Size'), parent_image)
parent_image)
profile.debug('Putting json path')
store.put_content(repo_image.storage.locations, json_path, request.data)

25
util/gzipstream.py Normal file
View file

@ -0,0 +1,25 @@
"""
Defines utility methods for working with gzip streams.
"""
import zlib
# Window size for decompressing GZIP streams.
# This results in ZLIB automatically detecting the GZIP headers.
# http://stackoverflow.com/questions/3122145/zlib-error-error-3-while-decompressing-incorrect-header-check/22310760#22310760
ZLIB_GZIP_WINDOW = zlib.MAX_WBITS | 32
def calculate_size_handler():
  """ Returns an object and a SocketReader handler. The handler will gunzip the data it receives,
      adding the size found to the object.
  """
  # Mutable accumulator shared with the caller: the running total is read out
  # of this dict after the stream has been fully consumed by the handler.
  size_info = {'size': 0}

  # MAX_WBITS | 32 tells zlib to auto-detect (and skip) the gzip header, so
  # the handler can be fed the raw gzip layer stream chunk by chunk.
  gunzipper = zlib.decompressobj(zlib.MAX_WBITS | 32)

  def handle_chunk(buf):
    # Inflate this chunk and fold the decompressed byte count into the total.
    decompressed = gunzipper.decompress(buf)
    size_info['size'] += len(decompressed)

  return size_info, handle_chunk