From 8f80d7064bb757aebe840d1f8d5e54c58d477c70 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 5 Jan 2016 12:14:52 -0500
Subject: [PATCH] Hash v1 uploads for torrent chunks

---
 data/model/_basequery.py | 32 +++++++++++++++++++--
 data/model/image.py      | 61 ++++------------------------------------
 data/model/storage.py    | 44 ++++++++++++++++++++++++++---
 endpoints/v1/registry.py | 12 ++++++--
 initdb.py                |  8 +++---
 util/registry/torrent.py | 10 +++++--
 6 files changed, 98 insertions(+), 69 deletions(-)

diff --git a/data/model/_basequery.py b/data/model/_basequery.py
index a04df60c3..0cb0f3fee 100644
--- a/data/model/_basequery.py
+++ b/data/model/_basequery.py
@@ -1,8 +1,9 @@
-from peewee import JOIN_LEFT_OUTER, Clause, SQL
+from peewee import Clause, SQL, fn
 from cachetools import lru_cache
 
+from data.model import DataModelException
 from data.database import (Repository, User, Team, TeamMember, RepositoryPermission, TeamRole,
-                           Namespace, Visibility, db_for_update)
+                           Namespace, Visibility, ImageStorage, Image, db_for_update)
 
 
 def prefix_search(field, prefix_query):
@@ -97,3 +98,30 @@ def get_user_organizations(username):
           .join(TeamMember)
           .join(UserAlias, on=(UserAlias.id == TeamMember.user))
           .where(User.organization == True, UserAlias.username == username))
+
+
+def calculate_image_aggregate_size(ancestors_str, image_size, parent_image):
+  ancestors = ancestors_str.split('/')[1:-1]
+  if not ancestors:
+    return image_size
+
+  if parent_image is None:
+    raise DataModelException('Could not load parent image')
+
+  ancestor_size = parent_image.aggregate_size
+  if ancestor_size is not None:
+    return ancestor_size + image_size
+
+  # Fallback to a slower path if the parent doesn't have an aggregate size saved.
+  # TODO: remove this code if/when we do a full backfill.
+  ancestor_size = (ImageStorage
+                   .select(fn.Sum(ImageStorage.image_size))
+                   .join(Image)
+                   .where(Image.id << ancestors)
+                   .scalar())
+  if ancestor_size is None:
+    return None
+
+  return ancestor_size + image_size
+
+
diff --git a/data/model/image.py b/data/model/image.py
index e8227dfaa..bbe9201ad 100644
--- a/data/model/image.py
+++ b/data/model/image.py
@@ -2,7 +2,7 @@ import logging
 import dateutil.parser
 import random
 
-from peewee import JOIN_LEFT_OUTER, fn, SQL
+from peewee import JOIN_LEFT_OUTER, SQL
 from datetime import datetime
 
 from data.model import (DataModelException, db_transaction, _basequery, storage,
@@ -296,6 +296,8 @@ def find_create_or_link_image(docker_image_id, repo_obj, username, translations,
 
 def set_image_metadata(docker_image_id, namespace_name, repository_name, created_date_str,
                        comment, command, v1_json_metadata, parent=None):
+  """ Sets metadata that is specific to how a binary piece of storage fits into the layer tree.
+  """
   with db_transaction():
     query = (Image
              .select(Image, ImageStorage)
@@ -322,6 +324,7 @@ def set_image_metadata(docker_image_id, namespace_name, repository_name, created
     # We cleanup any old checksum in case it's a retry after a fail
     fetched.v1_checksum = None
     fetched.storage.content_checksum = None
+    fetched.storage.save()
 
     fetched.comment = comment
     fetched.command = command
@@ -335,59 +338,6 @@ def set_image_metadata(docker_image_id, namespace_name, repository_name, created
   return fetched
 
 
-def set_image_size(docker_image_id, namespace_name, repository_name, image_size, uncompressed_size):
-  if image_size is None:
-    raise DataModelException('Empty image size field')
-
-  try:
-    image = (Image
-             .select(Image, ImageStorage)
-             .join(Repository)
-             .join(Namespace, on=(Repository.namespace_user == Namespace.id))
-             .switch(Image)
-             .join(ImageStorage, JOIN_LEFT_OUTER)
-             .where(Repository.name == repository_name, Namespace.username == namespace_name,
-                    Image.docker_image_id == docker_image_id)
-             .get())
-  except Image.DoesNotExist:
-    raise DataModelException('No image with specified id and repository')
-
-  image.storage.image_size = image_size
-  image.storage.uncompressed_size = uncompressed_size
-  image.storage.save()
-
-  image.aggregate_size = calculate_image_aggregate_size(image.ancestors, image.storage,
-                                                        image.parent)
-  image.save()
-
-  return image
-
-
-def calculate_image_aggregate_size(ancestors_str, image_storage, parent_image):
-  ancestors = ancestors_str.split('/')[1:-1]
-  if not ancestors:
-    return image_storage.image_size
-
-  if parent_image is None:
-    raise DataModelException('Could not load parent image')
-
-  ancestor_size = parent_image.aggregate_size
-  if ancestor_size is not None:
-    return ancestor_size + image_storage.image_size
-
-  # Fallback to a slower path if the parent doesn't have an aggregate size saved.
-  # TODO: remove this code if/when we do a full backfill.
-  ancestor_size = (ImageStorage
-                   .select(fn.Sum(ImageStorage.image_size))
-                   .join(Image)
-                   .where(Image.id << ancestors)
-                   .scalar())
-  if ancestor_size is None:
-    return None
-
-  return ancestor_size + image_storage.image_size
-
-
 def get_image(repo, docker_image_id):
   try:
     return Image.get(Image.docker_image_id == docker_image_id, Image.repository == repo)
@@ -452,7 +402,8 @@ def synthesize_v1_image(repo, image_storage, docker_image_id, created_date_str,
     pass
 
   # Get the aggregate size for the image.
-  aggregate_size = calculate_image_aggregate_size(ancestors, image_storage, parent_image)
+  aggregate_size = _basequery.calculate_image_aggregate_size(ancestors, image_storage.image_size,
+                                                             parent_image)
 
   return Image.create(docker_image_id=docker_image_id, ancestors=ancestors, comment=comment,
                       command=command, v1_json_metadata=v1_json_metadata, created=created,
diff --git a/data/model/storage.py b/data/model/storage.py
index d0122d9a8..b66b20c49 100644
--- a/data/model/storage.py
+++ b/data/model/storage.py
@@ -2,10 +2,12 @@ import logging
 
 from peewee import JOIN_LEFT_OUTER, fn, SQL
 
-from data.model import config, db_transaction, InvalidImageException, TorrentInfoDoesNotExist
-from data.database import (ImageStorage, Image, DerivedStorageForImage, ImageStoragePlacement,
-                           ImageStorageLocation, ImageStorageTransformation, ImageStorageSignature,
-                           ImageStorageSignatureKind, Repository, Namespace, TorrentInfo)
+from data.model import (config, db_transaction, InvalidImageException, TorrentInfoDoesNotExist,
+                        DataModelException, _basequery)
+from data.database import (ImageStorage, Image, ImageStoragePlacement, ImageStorageLocation,
+                           ImageStorageTransformation, ImageStorageSignature,
+                           ImageStorageSignatureKind, Repository, Namespace, TorrentInfo,
+                           db_for_update)
 
 
 logger = logging.getLogger(__name__)
@@ -203,6 +205,40 @@ def _reduce_as_tree(queries_to_reduce):
   return to_reduce_left.union_all(to_reduce_right)
 
 
+def set_image_storage_metadata(docker_image_id, namespace_name, repository_name, image_size,
+                               uncompressed_size):
+  """ Sets metadata that is specific to the binary storage of the data, irrespective of how it
+      is used in the layer tree.
+  """
+  if image_size is None:
+    raise DataModelException('Empty image size field')
+
+  query = (Image
+           .select(Image, ImageStorage)
+           .join(Repository)
+           .join(Namespace, on=(Repository.namespace_user == Namespace.id))
+           .switch(Image)
+           .join(ImageStorage, JOIN_LEFT_OUTER)
+           .where(Repository.name == repository_name, Namespace.username == namespace_name,
+                  Image.docker_image_id == docker_image_id))
+
+  try:
+    image = db_for_update(query).get()
+  except ImageStorage.DoesNotExist:
+    raise InvalidImageException('No image with specified id and repository')
+
+  # We MUST do this here, it can't be done in the corresponding image call because the storage
+  # has not yet been pushed
+  image.aggregate_size = _basequery.calculate_image_aggregate_size(image.ancestors, image_size,
+                                                                   image.parent)
+  image.save()
+
+  image.storage.image_size = image_size
+  image.storage.uncompressed_size = uncompressed_size
+  image.storage.save()
+  return image.storage
+
+
 def get_storage_locations(uuid):
   query = (ImageStoragePlacement
            .select()
diff --git a/endpoints/v1/registry.py b/endpoints/v1/registry.py
index e7557468c..d4d33c630 100644
--- a/endpoints/v1/registry.py
+++ b/endpoints/v1/registry.py
@@ -18,6 +18,7 @@ from auth.permissions import (ReadRepositoryPermission,
                               ModifyRepositoryPermission)
 from data import model, database
 from util.registry import gzipstream
+from util.registry.torrent import PieceHasher
 from endpoints.v1 import v1_bp
 from endpoints.decorators import anon_protect
 
@@ -214,6 +215,10 @@ def put_image_layer(namespace, repository, image_id):
   size_info, size_hndlr = gzipstream.calculate_size_handler()
   sr.add_handler(size_hndlr)
 
+  # Add a handler to hash the chunks of the upload for torrenting
+  piece_hasher = PieceHasher(app.config['TORRENT_PIECE_SIZE'])
+  sr.add_handler(piece_hasher.update)
+
   # Add a handler which computes the checksum.
   h, sum_hndlr = checksums.simple_checksum_handler(json_data)
   sr.add_handler(sum_hndlr)
@@ -231,8 +236,11 @@ def put_image_layer(namespace, repository, image_id):
     abort(520, 'Image %(image_id)s could not be written. Please try again.', image_id=image_id)
 
   # Save the size of the image.
-  model.image.set_image_size(image_id, namespace, repository, size_info.compressed_size,
-                             size_info.uncompressed_size)
+  updated_storage = model.storage.set_image_storage_metadata(image_id, namespace, repository,
+                                                             size_info.compressed_size,
+                                                             size_info.uncompressed_size)
+  pieces_bytes = piece_hasher.piece_hashes + piece_hasher.hash_fragment.digest()
+  model.storage.save_torrent_info(updated_storage, app.config['TORRENT_PIECE_SIZE'], pieces_bytes)
 
   # Append the computed checksum.
   csums = []
diff --git a/initdb.py b/initdb.py
index b37ec9dcd..73f5902dc 100644
--- a/initdb.py
+++ b/initdb.py
@@ -81,10 +81,8 @@ def __create_subtree(repo, structure, creator_username, parent, tag_map):
   checksum = __gen_checksum(docker_image_id)
 
   new_image = model.image.find_create_or_link_image(docker_image_id, repo, None, {}, 'local_us')
-  new_image_locations = new_image.storage.locations
   new_image.storage.uuid = __gen_image_uuid(repo, image_num)
   new_image.storage.uploading = False
-  new_image.storage.content_checksum = checksum
   new_image.storage.save()
 
   # Write some data for the storage.
@@ -113,10 +111,12 @@ def __create_subtree(repo, structure, creator_username, parent, tag_map):
   new_image = model.image.set_image_metadata(docker_image_id, repo.namespace_user.username,
                                              repo.name, str(creation_time), 'no comment', command,
                                              json.dumps(v1_metadata), parent)
+  new_image.storage.content_checksum = checksum
+  new_image.storage.save()
 
   compressed_size = random.randrange(1, 1024 * 1024 * 1024)
-  model.image.set_image_size(docker_image_id, repo.namespace_user.username, repo.name,
-                             compressed_size, int(compressed_size * 1.4))
+  model.storage.set_image_storage_metadata(docker_image_id, repo.namespace_user.username,
+                                           repo.name, compressed_size, int(compressed_size * 1.4))
 
   parent = new_image
 
diff --git a/util/registry/torrent.py b/util/registry/torrent.py
index ea92410dd..0cc9cb349 100644
--- a/util/registry/torrent.py
+++ b/util/registry/torrent.py
@@ -35,7 +35,8 @@ def make_torrent(name, webseed, length, piece_length, pieces):
 
 
 class PieceHasher(object):
-  def __init__(self, piece_size, starting_offset, starting_piece_hash_bytes, hash_fragment_to_resume):
+  def __init__(self, piece_size, starting_offset=0, starting_piece_hash_bytes='',
+               hash_fragment_to_resume=None):
     if not isinstance(starting_offset, (int, long)):
       raise TypeError('starting_offset must be an integer')
     elif not isinstance(piece_size, (int, long)):
@@ -43,9 +44,14 @@ class PieceHasher(object):
 
     self._current_offset = starting_offset
     self._piece_size = piece_size
-    self._hash_fragment = hash_fragment_to_resume
     self._piece_hashes = bytearray(starting_piece_hash_bytes)
 
+    if hash_fragment_to_resume is None:
+      self._hash_fragment = resumablehashlib.sha1()
+    else:
+      self._hash_fragment = hash_fragment_to_resume
+
+
   def update(self, buf):
     buf_offset = 0
     while buf_offset < len(buf):
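
Note: the final hunk above truncates the body of PieceHasher.update, so the hashing loop itself is not visible in this patch. As a rough, standalone sketch of the technique it relies on (splitting an uploaded stream into fixed-size pieces, recording one SHA-1 digest per completed piece, and appending the digest of the trailing partial piece, mirroring how put_image_layer combines piece_hasher.piece_hashes with piece_hasher.hash_fragment.digest()), something like the following could be used. The helper name hash_pieces and the example inputs are hypothetical; this is an illustration, not the code from util/registry/torrent.py.

# Hypothetical sketch of fixed-size piece hashing; not the PieceHasher implementation.
import hashlib


def hash_pieces(chunks, piece_size):
  """Concatenate the SHA-1 digest of every piece_size-byte piece of the streamed chunks."""
  piece_hashes = bytearray()
  hash_fragment = hashlib.sha1()
  filled = 0  # bytes already fed into the current, incomplete piece

  for buf in chunks:
    buf_offset = 0
    while buf_offset < len(buf):
      # Feed at most the number of bytes still missing from the current piece.
      to_take = min(piece_size - filled, len(buf) - buf_offset)
      hash_fragment.update(buf[buf_offset:buf_offset + to_take])
      filled += to_take
      buf_offset += to_take

      if filled == piece_size:
        # The piece is complete: record its digest and start hashing the next one.
        piece_hashes += hash_fragment.digest()
        hash_fragment = hashlib.sha1()
        filled = 0

  # A trailing partial piece is hashed as the final piece.
  if filled:
    piece_hashes += hash_fragment.digest()
  return bytes(piece_hashes)


# Example: 14 bytes in two chunks with a 4-byte piece size yields 3 full pieces plus 1 partial
# piece, so 4 SHA-1 digests (80 bytes) in total.
assert len(hash_pieces([b'abcdefg', b'hijklmn'], 4)) == 4 * 20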