diff --git a/endpoints/v2/manifest.py b/endpoints/v2/manifest.py index 9f6eb38e0..6588d32eb 100644 --- a/endpoints/v2/manifest.py +++ b/endpoints/v2/manifest.py @@ -124,7 +124,7 @@ class SignedManifest(object): @property def layers(self): """ Returns a generator of objects that have the blobSum and v1Compatibility keys in them, - starting from the leaf image and working toward the base node. + starting from the base image and working toward the leaf node. """ for blob_sum_obj, history_obj in reversed(zip(self._parsed[_FS_LAYERS_KEY], self._parsed[_HISTORY_KEY])): @@ -152,15 +152,9 @@ class SignedManifest(object): @property def payload(self): protected = str(self._signatures[0][_PROTECTED_KEY]) - parsed_protected = json.loads(jwt.utils.base64url_decode(protected)) - logger.debug('parsed_protected: %s', parsed_protected) - signed_content_head = self._bytes[:parsed_protected[_FORMAT_LENGTH_KEY]] - logger.debug('signed content head: %s', signed_content_head) - signed_content_tail = jwt.utils.base64url_decode(str(parsed_protected[_FORMAT_TAIL_KEY])) - logger.debug('signed content tail: %s', signed_content_tail) return signed_content_head + signed_content_tail @@ -189,6 +183,7 @@ class SignedManifestBuilder(object): self._history.append({ _V1_COMPAT_KEY: v1_json_metadata, }) + return self def build(self, json_web_key): @@ -354,7 +349,7 @@ def _updated_v1_metadata(v1_metadata_json, updated_id_map): return json.dumps(parsed) -def _write_manifest(namespace_name, repo_name, manifest): +def _write_manifest_itself(namespace_name, repo_name, manifest): # Ensure that the manifest is for this repository. If the manifest's namespace is empty, then # it is for the library namespace and we need an extra check. if (manifest.namespace == '' and features.LIBRARY_SUPPORT and @@ -396,11 +391,20 @@ def _write_manifest(namespace_name, repo_name, manifest): has_rewritten_ids = False updated_id_map = {} + # Synthesized image id hash. 
Can be used to pull a "content addressable" image id out of thin air. + digest_history = hashlib.sha256() + for mdata in layers: digest_str = str(mdata.digest) v1_mdata = mdata.v1_metadata working_docker_id = v1_mdata.docker_id + # Update our digest_history hash for the new layer data. + digest_history.update(digest_str) + digest_history.update("@") + digest_history.update(mdata.v1_metadata_str.encode('utf-8')) + digest_history.update("|") + # Ensure that all blobs exist. blob_storage = storage_map.get(digest_str) if blob_storage is None: @@ -413,10 +417,9 @@ def _write_manifest(namespace_name, repo_name, manifest): images_map[v1_mdata.docker_id].storage.content_checksum != digest_str) or has_rewritten_ids): - v1_metadata_str = mdata.v1_metadata_str.encode('utf-8') - working_docker_id = hashlib.sha256(v1_metadata_str + '@' + digest_str).hexdigest() - logger.debug('Rewriting docker_id %s/%s %s -> %s', namespace_name, repo_name, - v1_mdata.docker_id, working_docker_id) + working_docker_id = digest_history.hexdigest() + logger.warning('Rewriting docker_id %s/%s %s -> %s', namespace_name, repo_name, + v1_mdata.docker_id, working_docker_id) has_rewritten_ids = True # Store the new docker id in the map @@ -431,9 +434,6 @@ def _write_manifest(namespace_name, repo_name, manifest): raise ManifestInvalid(detail={'message': msg}) # Synthesize and store the v1 metadata in the db. 
- digest_str = str(mdata.digest) - blob_storage = storage_map[digest_str] - v1_metadata_json = mdata.v1_metadata_str if has_rewritten_ids: v1_metadata_json = _updated_v1_metadata(mdata.v1_metadata_str, updated_id_map) @@ -453,6 +453,11 @@ def _write_manifest(namespace_name, repo_name, manifest): leaf_layer_id = images_map[layers[-1].v1_metadata.docker_id].docker_image_id model.tag.store_tag_manifest(namespace_name, repo_name, tag_name, leaf_layer_id, manifest_digest, manifest.bytes) + return (repo, tag_name, manifest_digest) + + +def _write_manifest(namespace_name, repo_name, manifest): + (repo, tag_name, manifest_digest) = _write_manifest_itself(namespace_name, repo_name, manifest) # Spawn the repo_push event. event_data = { diff --git a/test/test_manifests.py b/test/test_manifests.py new file mode 100644 index 000000000..d8a4574f9 --- /dev/null +++ b/test/test_manifests.py @@ -0,0 +1,145 @@ +import unittest +import time +import hashlib + +from app import app, storage, docker_v2_signing_key +from initdb import setup_database_for_testing, finished_database_for_testing +from data import model, database +from endpoints.v2.manifest import _write_manifest_itself, SignedManifestBuilder + + +ADMIN_ACCESS_USER = 'devtable' +REPO = 'simple' +FIRST_TAG = 'first' +SECOND_TAG = 'second' +THIRD_TAG = 'third' + + +class TestManifests(unittest.TestCase): + @staticmethod + def _set_tag_expiration_policy(namespace, expiration_s): + namespace_user = model.user.get_user(namespace) + model.user.change_user_tag_expiration(namespace_user, expiration_s) + + def setUp(self): + setup_database_for_testing(self) + + self._set_tag_expiration_policy(ADMIN_ACCESS_USER, 0) + + self.app = app.test_client() + self.ctx = app.test_request_context() + self.ctx.__enter__() + + def tearDown(self): + finished_database_for_testing(self) + self.ctx.__exit__(True, None, None) + + def _perform_cleanup(self): + database.RepositoryTag.delete().where(database.RepositoryTag.hidden == True).execute() + 
model.repository.garbage_collect_repository(ADMIN_ACCESS_USER, REPO) + + def test_missing_link(self): + """ Tests for a corner case that could result in missing a link to a blob referenced by a + manifest. The test exercises the case as follows: + + 1) Push a manifest of a single layer with a Docker ID `FIRST_ID`, pointing + to blob `FIRST_BLOB`. The database should contain the tag referencing the layer, with + no changed ID and the blob not being GCed. + + 2) Push a manifest of two layers: + + Layer 1: `FIRST_ID` with blob `SECOND_BLOB`: Will result in a new synthesized ID + Layer 2: `SECOND_ID` with blob `THIRD_BLOB`: Will result in `SECOND_ID` pointing to the + `THIRD_BLOB`, with a parent pointing to the new synthesized ID's layer. + + 3) Push a manifest of two layers: + + Layer 1: `THIRD_ID` with blob `FOURTH_BLOB`: Will result in a new `THIRD_ID` layer + Layer 2: `FIRST_ID` with blob `THIRD_BLOB`: Since `FIRST_ID` already points to `SECOND_BLOB`, + this will synthesize a new ID. With the current bug, the synthesized ID will match + that of `SECOND_ID`, leaving `THIRD_ID` unlinked and therefore, after a GC, missing + `FOURTH_BLOB`. + """ + location_name = storage.preferred_locations[0] + location = database.ImageStorageLocation.get(name=location_name) + + # Create first blob. + first_blob_sha = 'sha256:' + hashlib.sha256("FIRST").hexdigest() + model.blob.store_blob_record_and_temp_link(ADMIN_ACCESS_USER, REPO, first_blob_sha, location, 0, 0, 0) + + # Push the first manifest. + first_manifest = (SignedManifestBuilder(ADMIN_ACCESS_USER, REPO, FIRST_TAG) + .add_layer(first_blob_sha, '{"id": "first"}') + .build(docker_v2_signing_key)) + + _write_manifest_itself(ADMIN_ACCESS_USER, REPO, first_manifest) + + # Delete all temp tags and perform GC. + self._perform_cleanup() + + # Ensure that the first blob still exists, along with the first tag. 
+ self.assertIsNotNone(model.blob.get_repo_blob_by_digest(ADMIN_ACCESS_USER, REPO, first_blob_sha)) + self.assertIsNotNone(model.tag.load_tag_manifest(ADMIN_ACCESS_USER, REPO, FIRST_TAG)) + self.assertEquals("first", model.tag.get_tag_image(ADMIN_ACCESS_USER, REPO, FIRST_TAG).docker_image_id) + + # Create the second and third blobs. + second_blob_sha = 'sha256:' + hashlib.sha256("SECOND").hexdigest() + third_blob_sha = 'sha256:' + hashlib.sha256("THIRD").hexdigest() + + model.blob.store_blob_record_and_temp_link(ADMIN_ACCESS_USER, REPO, second_blob_sha, location, 0, 0, 0) + model.blob.store_blob_record_and_temp_link(ADMIN_ACCESS_USER, REPO, third_blob_sha, location, 0, 0, 0) + + # Push the second manifest. + second_manifest = (SignedManifestBuilder(ADMIN_ACCESS_USER, REPO, SECOND_TAG) + .add_layer(third_blob_sha, '{"id": "second", "parent": "first"}') + .add_layer(second_blob_sha, '{"id": "first"}') + .build(docker_v2_signing_key)) + + _write_manifest_itself(ADMIN_ACCESS_USER, REPO, second_manifest) + + # Delete all temp tags and perform GC. + self._perform_cleanup() + + # Ensure that the first, second and third blobs still exist, along with the first and second tags. + self.assertIsNotNone(model.blob.get_repo_blob_by_digest(ADMIN_ACCESS_USER, REPO, first_blob_sha)) + self.assertIsNotNone(model.blob.get_repo_blob_by_digest(ADMIN_ACCESS_USER, REPO, second_blob_sha)) + self.assertIsNotNone(model.blob.get_repo_blob_by_digest(ADMIN_ACCESS_USER, REPO, third_blob_sha)) + + self.assertIsNotNone(model.tag.load_tag_manifest(ADMIN_ACCESS_USER, REPO, FIRST_TAG)) + self.assertIsNotNone(model.tag.load_tag_manifest(ADMIN_ACCESS_USER, REPO, SECOND_TAG)) + + self.assertEquals("first", model.tag.get_tag_image(ADMIN_ACCESS_USER, REPO, FIRST_TAG).docker_image_id) + + # Ensure the IDs have changed. 
+ self.assertNotEquals("first", model.tag.get_tag_image(ADMIN_ACCESS_USER, REPO, SECOND_TAG).parent.docker_image_id) + self.assertNotEquals("second", model.tag.get_tag_image(ADMIN_ACCESS_USER, REPO, SECOND_TAG).docker_image_id) + + # Create the fourth blob. + fourth_blob_sha = 'sha256:' + hashlib.sha256("FOURTH").hexdigest() + model.blob.store_blob_record_and_temp_link(ADMIN_ACCESS_USER, REPO, fourth_blob_sha, location, 0, 0, 0) + + # Push the third manifest. + third_manifest = (SignedManifestBuilder(ADMIN_ACCESS_USER, REPO, THIRD_TAG) + .add_layer(third_blob_sha, '{"id": "second", "parent": "first"}') + .add_layer(fourth_blob_sha, '{"id": "first"}') # Note the change in BLOB from the second manifest. + .build(docker_v2_signing_key)) + + _write_manifest_itself(ADMIN_ACCESS_USER, REPO, third_manifest) + + # Delete all temp tags and perform GC. + self._perform_cleanup() + + # Ensure all blobs are present. + self.assertIsNotNone(model.blob.get_repo_blob_by_digest(ADMIN_ACCESS_USER, REPO, first_blob_sha)) + self.assertIsNotNone(model.blob.get_repo_blob_by_digest(ADMIN_ACCESS_USER, REPO, second_blob_sha)) + self.assertIsNotNone(model.blob.get_repo_blob_by_digest(ADMIN_ACCESS_USER, REPO, third_blob_sha)) + self.assertIsNotNone(model.blob.get_repo_blob_by_digest(ADMIN_ACCESS_USER, REPO, fourth_blob_sha)) + + # Ensure new synthesized IDs were created. + self.assertNotEquals( + model.tag.get_tag_image(ADMIN_ACCESS_USER, REPO, SECOND_TAG).docker_image_id, + model.tag.get_tag_image(ADMIN_ACCESS_USER, REPO, THIRD_TAG).docker_image_id) + + +if __name__ == '__main__': + unittest.main()