Ensure we encode the config in manifest schema 2 via the canonical JSON format

2018-12-18 14:52:19 -05:00 · 2018-12-18 14:52:19 -05:00 · feee49be9e
commit feee49be9e
parent 48e584905a
4 changed files with 48 additions and 5 deletions
--- a/image/docker/schema1.py
+++ b/image/docker/schema1.py
@ -23,7 +23,7 @@ from image.docker import ManifestException
 from image.docker.types import ManifestImageLayer
 from image.docker.interfaces import ManifestInterface
 from image.docker.v1 import DockerV1Metadata
-from image.docker.schemautil import ensure_utf8
+from image.docker.schemautil import ensure_utf8, to_canonical_json

 logger = logging.getLogger(__name__)

@ -375,7 +375,7 @@ class DockerSchema1Manifest(ManifestInterface):

      v1_metadata = json.loads(metadata_string)
      command_list = v1_metadata.get('container_config', {}).get('Cmd', None)
-      command = json.dumps(command_list) if command_list else None
+      command = to_canonical_json(command_list) if command_list else None

      if not 'id' in v1_metadata:
        raise MalformedSchema1Manifest('id field missing from v1Compatibility JSON')
@ -597,4 +597,4 @@ def _updated_v1_metadata(v1_metadata_json, updated_id_map):
    if existing_image in updated_id_map:
      parsed['container_config']['image'] = updated_id_map[existing_image]

-  return json.dumps(parsed)
+  return to_canonical_json(parsed)
--- a/image/docker/schema2/manifest.py
+++ b/image/docker/schema2/manifest.py
@ -446,4 +446,4 @@ class DockerSchema2ManifestBuilder(object):
        _build_layer(layer) for layer in self.filesystem_layers
      ],
    }
-    return DockerSchema2Manifest(json.dumps(manifest_dict, indent=3, ensure_ascii=ensure_ascii))
+    return DockerSchema2Manifest(json.dumps(manifest_dict, ensure_ascii=ensure_ascii, indent=3))
--- a/image/docker/schemautil.py
+++ b/image/docker/schemautil.py
@ -25,8 +25,28 @@ class ContentRetrieverForTesting(ContentRetriever):


 def ensure_utf8(unicode_or_str):
-  """ Ensures the given string is utf-8 encoded and not unicode. """
+  """ Ensures the given string is a utf-8 encoded str and not a unicode type. """
  if isinstance(unicode_or_str, unicode):
    return unicode_or_str.encode('utf-8')

  return unicode_or_str
+
+
+class _CustomEncoder(json.JSONEncoder):
+  """ JSON encoder that escapes <, > and & as \\u003c, \\u003e and \\u0026 in its output. """
+  def encode(self, o):
+    # encode() is invoked once, on the top-level value. In encoded JSON these characters can
+    # only occur inside string literals, so escape them across the whole document; this also
+    # covers strings (and keys) nested inside objects and arrays, not just a bare string.
+    encoded = super(_CustomEncoder, self).encode(o)
+    return encoded.replace('<', '\\u003c').replace('>', '\\u003e').replace('&', '\\u0026')
+
+
+def to_canonical_json(value, ensure_ascii=True, indent=None):
+  """ Returns the canonical JSON string form of the given value: keys are sorted and the
+      compact (',', ':') separators are used, as per the guidelines in
+      https://github.com/docker/distribution/blob/master/docs/spec/json.md.
+      `indent` is forwarded to json.dumps and is allowed only for the purposes of debugging.
+  """
+  return json.dumps(value, ensure_ascii=ensure_ascii, sort_keys=True, separators=(',', ':'),
+                    cls=_CustomEncoder, indent=indent)
--- a/image/docker/test/test_schemautil.py
+++ b/image/docker/test/test_schemautil.py
@ -0,0 +1,23 @@
+import pytest
+
+from image.docker.schemautil import to_canonical_json
+
+@pytest.mark.parametrize('input, expected_output', [
+  pytest.param({}, '{}', id='empty object'),
+  pytest.param({'b': 2, 'a': 1}, '{"a":1,"b":2}', id='object with sorted keys'),
+  pytest.param('hello world', '"hello world"', id='basic string'),
+  pytest.param('hey & hi', '"hey \\u0026 hi"', id='string with &'),
+  pytest.param('<hey>', '"\\u003chey\\u003e"', id='string with brackets'),
+  pytest.param({
+    "zxcv": [{}, True, 1000000000, 'tyui'],
+    "asdf": 1,
+    "qwer": [],
+  }, '{"asdf":1,"qwer":[],"zxcv":[{},true,1000000000,"tyui"]}', id='example canonical'),
+])
+def test_to_canonical_json(input, expected_output):
+  result = to_canonical_json(input)
+  assert result == expected_output
+
+  # Ensure the result is a `str` (not `unicode`) that decodes cleanly as utf-8.
+  assert isinstance(result, str)
+  result.decode('utf-8')