Move v1 checksums to image and track v2 separately

Jake Moshenko 2015-11-04 16:18:53 -05:00 committed by Jimmy Zelinskie
parent 2b3633b107
commit 3d0bcbaaeb
8 changed files with 65 additions and 17 deletions

View file

@@ -484,11 +484,12 @@ class EmailConfirmation(BaseModel):
class ImageStorage(BaseModel):
  uuid = CharField(default=uuid_generator, index=True, unique=True)
  checksum = CharField(null=True)
  checksum = CharField(null=True)  # TODO remove when all checksums have been moved back to Image
  image_size = BigIntegerField(null=True)
  uncompressed_size = BigIntegerField(null=True)
  uploading = BooleanField(default=True, null=True)
  cas_path = BooleanField(default=True)
  content_checksum = CharField(null=True, index=True)

class ImageStorageTransformation(BaseModel):

@@ -570,6 +571,7 @@ class Image(BaseModel):
  command = TextField(null=True)
  aggregate_size = BigIntegerField(null=True)
  v1_json_metadata = TextField(null=True)
  v1_checksum = CharField(null=True)

  class Meta:
    database = db
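To make the intent of the split concrete, here is a minimal read-path sketch (not part of this commit; the data.database import path and the fallback behaviour are assumptions): V2 blob lookups key off the newly indexed ImageStorage.content_checksum, while the Docker V1 checksum is read from Image, falling back to the legacy storage column until old rows are backfilled.

from data.database import Image, ImageStorage

def blob_storage_for(digest):
  # Content-addressable (V2) lookup uses the new indexed column.
  return ImageStorage.get(ImageStorage.content_checksum == digest)

def v1_checksum_for(image):
  # Prefer the new per-image column; fall back to the legacy storage field
  # until the TODO above is resolved and existing rows are backfilled.
  return image.v1_checksum or image.storage.checksum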

View file

@@ -0,0 +1,30 @@
"""Separate v1 and v2 checksums.

Revision ID: 2827d36939e4
Revises: 73669db7e12
Create Date: 2015-11-04 16:29:48.905775

"""

# revision identifiers, used by Alembic.
revision = '2827d36939e4'
down_revision = '73669db7e12'

from alembic import op
import sqlalchemy as sa


def upgrade(tables):
  ### commands auto generated by Alembic - please adjust! ###
  op.add_column('image', sa.Column('v1_checksum', sa.String(length=255), nullable=True))
  op.add_column('imagestorage', sa.Column('content_checksum', sa.String(length=255), nullable=True))
  op.create_index('imagestorage_content_checksum', 'imagestorage', ['content_checksum'], unique=False)
  ### end Alembic commands ###


def downgrade(tables):
  ### commands auto generated by Alembic - please adjust! ###
  op.drop_index('imagestorage_content_checksum', table_name='imagestorage')
  op.drop_column('imagestorage', 'content_checksum')
  op.drop_column('image', 'v1_checksum')
  ### end Alembic commands ###
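The TODO on ImageStorage.checksum implies a follow-up backfill before that column can be dropped. A rough sketch of such a data migration (not part of this commit; MySQL-flavored SQL, and the image.storage_id foreign-key column name is assumed from the peewee models):

from alembic import op

def upgrade(tables):
  # Copy the legacy per-storage V1 checksum onto each image row that has not
  # yet been written with the new column.
  op.get_bind().execute("""
    UPDATE image
    JOIN imagestorage ON imagestorage.id = image.storage_id
    SET image.v1_checksum = imagestorage.checksum
    WHERE image.v1_checksum IS NULL
  """)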

View file

@@ -12,10 +12,6 @@ down_revision = '35f538da62'
from alembic import op
import sqlalchemy as sa
<<<<<<< HEAD
=======
from sqlalchemy.dialects import mysql
>>>>>>> Remove the used_legacy_github column
def upgrade(tables):
  ### commands auto generated by Alembic - please adjust! ###

View file

@@ -17,7 +17,7 @@ def get_repo_blob_by_digest(namespace, repo_name, blob_digest):
                    .join(Repository)
                    .join(Namespace)
                    .where(Repository.name == repo_name, Namespace.username == namespace,
                           ImageStorage.checksum == blob_digest))
                           ImageStorage.content_checksum == blob_digest))

  if not placements:
    raise BlobDoesNotExist('Blob does not exist with digest: {0}'.format(blob_digest))

@@ -35,11 +35,11 @@ def store_blob_record_and_temp_link(namespace, repo_name, blob_digest, location_
  repo = _basequery.get_existing_repository(namespace, repo_name)

  try:
    storage = ImageStorage.get(checksum=blob_digest)
    storage = ImageStorage.get(content_checksum=blob_digest)
    location = ImageStorageLocation.get(name=location_name)
    ImageStoragePlacement.get(storage=storage, location=location)
  except ImageStorage.DoesNotExist:
    storage = ImageStorage.create(checksum=blob_digest)
    storage = ImageStorage.create(content_checksum=blob_digest)
  except ImageStoragePlacement.DoesNotExist:
    ImageStoragePlacement.create(storage=storage, location=location)

View file

@@ -284,10 +284,7 @@ def set_image_metadata(docker_image_id, namespace_name, repository_name, created
  except Image.DoesNotExist:
    raise DataModelException('No image with specified id and repository')

  # We cleanup any old checksum in case it's a retry after a fail
  fetched.storage.checksum = None

  fetched.created = datetime.now()
  if created_date_str is not None:
    try:
      fetched.created = dateutil.parser.parse(created_date_str).replace(tzinfo=None)

@@ -295,6 +292,11 @@ def set_image_metadata(docker_image_id, namespace_name, repository_name, created
      # parse raises different exceptions, so we cannot use a specific kind of handler here.
      pass

  # We cleanup any old checksum in case it's a retry after a fail
  fetched.v1_checksum = None
  fetched.storage.checksum = None  # TODO remove when storage checksums are no longer read
  fetched.storage.content_checksum = None

  fetched.comment = comment
  fetched.command = command
  fetched.v1_json_metadata = v1_json_metadata

View file

@@ -75,6 +75,14 @@ def simple_checksum_handler(json_data):
  return h, fn


def content_checksum_handler():
  h = hashlib.sha256()

  def fn(buf):
    h.update(buf)

  return h, fn


def compute_simple(fp, json_data):
  data = json_data + '\n'
  return 'sha256:{0}'.format(sha256_file(fp, data))
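The new handler mirrors simple_checksum_handler but hashes only the raw layer bytes, without the V1 JSON prefix. A minimal usage sketch, assuming this module is importable as digest.checksums (the file path is not shown in this view):

from digest import checksums

ch, handler = checksums.content_checksum_handler()
for chunk in (b'layer-bytes-part-1', b'layer-bytes-part-2'):
  handler(chunk)

# Same 'sha256:...' form that put_image_layer stores in the session below.
content_checksum = 'sha256:{0}'.format(ch.hexdigest())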

View file

@@ -249,6 +249,10 @@ def put_image_layer(namespace, repository, image_id):
  h, sum_hndlr = checksums.simple_checksum_handler(json_data)
  sr.add_handler(sum_hndlr)

  # Add a handler which computes the content checksum only
  ch, content_sum_hndlr = checksums.content_checksum_handler()
  sr.add_handler(content_sum_hndlr)

  # Stream write the data to storage.
  with database.CloseForLongOperation(app.config):
    try:

@@ -278,6 +282,7 @@ def put_image_layer(namespace, repository, image_id):
    # We don't have a checksum stored yet, that's fine skipping the check.
    # Not removing the mark though, image is not downloadable yet.
    session['checksum'] = csums
    session['content_checksum'] = 'sha256:{0}'.format(ch.hexdigest())
    return make_response('true', 200)

  checksum = repo_image.storage.checksum

@@ -339,8 +344,9 @@ def put_image_checksum(namespace, repository, image_id):
    abort(409, 'Cannot set checksum for image %(image_id)s',
          issue='image-write-error', image_id=image_id)

  logger.debug('Storing image checksum')
  err = store_checksum(repo_image.storage, checksum)
  logger.debug('Storing image and content checksums')
  content_checksum = session.get('content_checksum', None)
  err = store_checksum(repo_image, checksum, content_checksum)

  if err:
    abort(400, err)

@@ -429,14 +435,18 @@ def generate_ancestry(image_id, uuid, locations, parent_id=None, parent_uuid=Non
  store.put_content(locations, store.image_ancestry_path(uuid), json.dumps(data))


def store_checksum(image_storage, checksum):
def store_checksum(image_with_storage, checksum, content_checksum):
  checksum_parts = checksum.split(':')
  if len(checksum_parts) != 2:
    return 'Invalid checksum format'

  # We store the checksum
  image_storage.checksum = checksum
  image_storage.save()
  image_with_storage.storage.checksum = checksum  # TODO remove when v1 checksums are on image only
  image_with_storage.storage.content_checksum = content_checksum
  image_with_storage.storage.save()

  image_with_storage.v1_checksum = checksum
  image_with_storage.save()


@v1_bp.route('/images/<image_id>/json', methods=['PUT'])
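store_checksum keeps its error-string contract: it returns a message for a malformed checksum and None once both the image row and its storage row have been saved. A hypothetical caller sketch under that assumption (repo_image stands in for an Image instance with an attached storage):

def save_checksums_or_abort(repo_image, checksum, content_checksum):
  # None means success; any string is a client error worth surfacing.
  err = store_checksum(repo_image, checksum, content_checksum)
  if err is not None:
    abort(400, err)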

View file

@@ -82,7 +82,7 @@ def __create_subtree(repo, structure, creator_username, parent, tag_map):
  new_image_locations = new_image.storage.locations
  new_image.storage.uuid = __gen_image_uuid(repo, image_num)
  new_image.storage.uploading = False
  new_image.storage.checksum = checksum
  new_image.storage.content_checksum = checksum
  new_image.storage.save()

  # Write some data for the storage.