quay/endpoints/v2/blob.py

import logging
import re
import time

from flask import url_for, request, redirect, Response, abort as flask_abort

import bitmath
import resumablehashlib

from app import storage, app, get_app_url, metric_queue, model_cache
from auth.registry_jwt_auth import process_registry_jwt_auth
from auth.permissions import ReadRepositoryPermission
from data import database
from data.cache import cache_key
from digest import digest_tools
from endpoints.decorators import anon_protect, parse_repository_name
from endpoints.v2 import v2_bp, require_repo_read, require_repo_write, get_input_stream
from endpoints.v2.errors import (
  BlobUnknown, BlobUploadInvalid, BlobUploadUnknown, Unsupported, NameUnknown, LayerTooLarge,
  InvalidRequest)
from endpoints.v2.models_interface import Blob
from endpoints.v2.models_pre_oci import data_model as model
from util.cache import cache_control
from util.names import parse_namespace_repository
from util.registry.filelike import wrap_with_handler, StreamSlice
from util.registry.gzipstream import calculate_size_handler
from util.registry.torrent import PieceHasher

logger = logging.getLogger(__name__)

BASE_BLOB_ROUTE = '/<repopath:repository>/blobs/<regex("{0}"):digest>'
BLOB_DIGEST_ROUTE = BASE_BLOB_ROUTE.format(digest_tools.DIGEST_PATTERN)
RANGE_HEADER_REGEX = re.compile(r'^bytes=([0-9]+)-([0-9]+)$')
BLOB_CONTENT_TYPE = 'application/octet-stream'


class _InvalidRangeHeader(Exception):
  pass


def _get_repository_blob(namespace_name, repo_name, digest):
  """ Returns the blob with the given digest under the repository with the given
      name. If one does not exist (or it is still uploading), returns None.
      Automatically handles caching.
  """
  def load_blob():
    blob = model.get_blob_by_digest(namespace_name, repo_name, digest)
    if blob is None:
      return None

    return blob._asdict()

  blob_cache_key = cache_key.for_repository_blob(namespace_name, repo_name, digest)
  blob_dict = model_cache.retrieve(blob_cache_key, load_blob)
  return Blob(**blob_dict) if blob_dict is not None else None


@v2_bp.route(BLOB_DIGEST_ROUTE, methods=['HEAD'])
@parse_repository_name()
@process_registry_jwt_auth(scopes=['pull'])
@require_repo_read
@anon_protect
@cache_control(max_age=31436000)
def check_blob_exists(namespace_name, repo_name, digest):
  # Find the blob.
  blob = _get_repository_blob(namespace_name, repo_name, digest)
  if blob is None:
    raise BlobUnknown()

  # Build the response headers.
  headers = {
    'Docker-Content-Digest': digest,
    'Content-Length': blob.size,
    'Content-Type': BLOB_CONTENT_TYPE,
  }

  # If our storage supports range requests, let the client know.
  if storage.get_supports_resumable_downloads(blob.locations):
    headers['Accept-Ranges'] = 'bytes'

  # Write the response to the client.
  return Response(headers=headers)


@v2_bp.route(BLOB_DIGEST_ROUTE, methods=['GET'])
@parse_repository_name()
@process_registry_jwt_auth(scopes=['pull'])
@require_repo_read
@anon_protect
@cache_control(max_age=31536000)
def download_blob(namespace_name, repo_name, digest):
  # Find the blob.
  blob = _get_repository_blob(namespace_name, repo_name, digest)
  if blob is None:
    raise BlobUnknown()

  # Build the response headers.
  headers = {'Docker-Content-Digest': digest}

  # If our storage supports range requests, let the client know.
  if storage.get_supports_resumable_downloads(blob.locations):
    headers['Accept-Ranges'] = 'bytes'

  # Find the storage path for the blob.
  path = model.get_blob_path(blob)

  # Short-circuit by redirecting if the storage supports it.
  logger.debug('Looking up the direct download URL for path: %s', path)
  direct_download_url = storage.get_direct_download_url(blob.locations, path, request.remote_addr)
  if direct_download_url:
    logger.debug('Returning direct download URL')
    resp = redirect(direct_download_url)
    resp.headers.extend(headers)
    return resp

  # Close the database connection before we stream the download.
  logger.debug('Closing database connection before streaming layer data')
  with database.CloseForLongOperation(app.config):
    # Stream the response to the client.
    return Response(
      storage.stream_read(blob.locations, path),
      headers=headers.update({
        'Content-Length': blob.size,
        'Content-Type': BLOB_CONTENT_TYPE,}),)


def _try_to_mount_blob(namespace_name, repo_name, mount_blob_digest):
  """ Attempts to mount a blob requested by the user from another repository. """
  logger.debug('Got mount request for blob `%s` into `%s/%s`', mount_blob_digest, namespace_name,
               repo_name)
  from_repo = request.args.get('from', None)
  if from_repo is None:
    raise InvalidRequest

  # Ensure the user has access to the repository.
  logger.debug('Got mount request for blob `%s` under repository `%s` into `%s/%s`',
               mount_blob_digest, from_repo, namespace_name, repo_name)
  from_namespace, from_repo_name = parse_namespace_repository(from_repo,
                                                              app.config['LIBRARY_NAMESPACE'],
                                                              include_tag=False)

  # First check permission. This does not hit the DB so we do it first.
  read_permission = ReadRepositoryPermission(from_namespace, from_repo_name).can()
  if not read_permission:
    # If no direct permission, check if the repostory is public.
    if not model.is_repository_public(from_namespace, from_repo_name):
      logger.debug('No permission to mount blob `%s` under repository `%s` into `%s/%s`',
                   mount_blob_digest, from_repo, namespace_name, repo_name)
      return None

  # Lookup if the mount blob's digest exists in the repository.
  mount_blob = model.get_blob_by_digest(from_namespace, from_repo_name, mount_blob_digest)
  if mount_blob is None:
    logger.debug('Blob `%s` under repository `%s` not found', mount_blob_digest, from_repo)
    return None

  logger.debug('Mounting blob `%s` under repository `%s` into `%s/%s`', mount_blob_digest,
               from_repo, namespace_name, repo_name)

  # Mount the blob into the current repository and return that we've completed the operation.
  expiration_sec = app.config['PUSH_TEMP_TAG_EXPIRATION_SEC']
  if not model.mount_blob_and_temp_tag(namespace_name, repo_name, mount_blob, expiration_sec):
    logger.debug('Could not mount blob `%s` under repository `%s` not found', mount_blob_digest,
                 from_repo)
    return

  # Return the response for the blob indicating that it was mounted, and including its content
  # digest.
  logger.debug('Mounted blob `%s` under repository `%s` into `%s/%s`', mount_blob_digest,
               from_repo, namespace_name, repo_name)
  return Response(
    status=201,
    headers={
      'Docker-Content-Digest': mount_blob_digest,
      'Location':
        get_app_url() + url_for('v2.download_blob', repository='%s/%s' %
                                (namespace_name, repo_name), digest=mount_blob_digest),},)


@v2_bp.route('/<repopath:repository>/blobs/uploads/', methods=['POST'])
@parse_repository_name()
@process_registry_jwt_auth(scopes=['pull', 'push'])
@require_repo_write
@anon_protect
def start_blob_upload(namespace_name, repo_name):
  # Begin the blob upload process in the database and storage.
  location_name = storage.preferred_locations[0]
  new_upload_uuid, upload_metadata = storage.initiate_chunked_upload(location_name)
  repository_exists = model.create_blob_upload(namespace_name, repo_name, new_upload_uuid,
                                               location_name, upload_metadata)
  if not repository_exists:
    raise NameUnknown()

  # Check for mounting of a blob from another repository.
  mount_blob_digest = request.args.get('mount', None)
  if mount_blob_digest is not None:
    response = _try_to_mount_blob(namespace_name, repo_name, mount_blob_digest)
    if response is not None:
      return response

  # Check for a normal blob upload.
  digest = request.args.get('digest', None)
  if digest is None:
    # Short-circuit because the user will send the blob data in another request.
    return Response(
      status=202,
      headers={
        'Docker-Upload-UUID':
          new_upload_uuid,
        'Range':
          _render_range(0),
        'Location':
          get_app_url() + url_for('v2.upload_chunk', repository='%s/%s' %
                                  (namespace_name, repo_name), upload_uuid=new_upload_uuid)},)

  # The user plans to send us the entire body right now.
  # Find the upload.
  blob_upload = model.blob_upload_by_uuid(namespace_name, repo_name, new_upload_uuid)
  if blob_upload is None:
    raise BlobUploadUnknown()

  # Upload the chunk to storage while calculating some metadata and updating
  # the upload state.
  updated_blob_upload = _upload_chunk(blob_upload, request.headers.get('range'))
  if updated_blob_upload is None:
    _abort_range_not_satisfiable(blob_upload.byte_count, new_upload_uuid)

  # Save the upload state to the database.
  model.update_blob_upload(updated_blob_upload)

  # Finalize the upload process in the database and storage.
  _finish_upload(namespace_name, repo_name, updated_blob_upload, digest)

  # Write the response to the client.
  return Response(
    status=201,
    headers={
      'Docker-Content-Digest':
        digest,
      'Location':
        get_app_url() + url_for('v2.download_blob', repository='%s/%s' %
                                (namespace_name, repo_name), digest=digest),},)


@v2_bp.route('/<repopath:repository>/blobs/uploads/<upload_uuid>', methods=['GET'])
@parse_repository_name()
@process_registry_jwt_auth(scopes=['pull'])
@require_repo_write
@anon_protect
def fetch_existing_upload(namespace_name, repo_name, upload_uuid):
  blob_upload = model.blob_upload_by_uuid(namespace_name, repo_name, upload_uuid)
  if blob_upload is None:
    raise BlobUploadUnknown()

  return Response(
    status=204,
    headers={
      'Docker-Upload-UUID': upload_uuid,
      'Range': _render_range(blob_upload.byte_count + 1),  # byte ranges are exclusive
    },)


@v2_bp.route('/<repopath:repository>/blobs/uploads/<upload_uuid>', methods=['PATCH'])
@parse_repository_name()
@process_registry_jwt_auth(scopes=['pull', 'push'])
@require_repo_write
@anon_protect
def upload_chunk(namespace_name, repo_name, upload_uuid):
  # Find the upload.
  blob_upload = model.blob_upload_by_uuid(namespace_name, repo_name, upload_uuid)
  if blob_upload is None:
    raise BlobUploadUnknown()

  # Upload the chunk to storage while calculating some metadata and updating
  # the upload state.
  updated_blob_upload = _upload_chunk(blob_upload, request.headers.get('range'))
  if updated_blob_upload is None:
    _abort_range_not_satisfiable(blob_upload.byte_count, upload_uuid)

  # Save the upload state to the database.
  model.update_blob_upload(updated_blob_upload)

  # Write the response to the client.
  return Response(
    status=204,
    headers={
      'Location': _current_request_url(),
      'Range': _render_range(updated_blob_upload.byte_count, with_bytes_prefix=False),
      'Docker-Upload-UUID': upload_uuid,},)


@v2_bp.route('/<repopath:repository>/blobs/uploads/<upload_uuid>', methods=['PUT'])
@parse_repository_name()
@process_registry_jwt_auth(scopes=['pull', 'push'])
@require_repo_write
@anon_protect
def monolithic_upload_or_last_chunk(namespace_name, repo_name, upload_uuid):
  # Ensure the digest is present before proceeding.
  digest = request.args.get('digest', None)
  if digest is None:
    raise BlobUploadInvalid(detail={'reason': 'Missing digest arg on monolithic upload'})

  # Find the upload.
  blob_upload = model.blob_upload_by_uuid(namespace_name, repo_name, upload_uuid)
  if blob_upload is None:
    raise BlobUploadUnknown()

  # Upload the chunk to storage while calculating some metadata and updating
  # the upload state.
  updated_blob_upload = _upload_chunk(blob_upload, request.headers.get('range'))
  if updated_blob_upload is None:
    _abort_range_not_satisfiable(blob_upload.byte_count, upload_uuid)

  # Finalize the upload process in the database and storage.
  _finish_upload(namespace_name, repo_name, updated_blob_upload, digest)

  # Write the response to the client.
  return Response(status=201, headers={
    'Docker-Content-Digest':
      digest,
    'Location':
      get_app_url() + url_for('v2.download_blob', repository='%s/%s' %
                              (namespace_name, repo_name), digest=digest),})


@v2_bp.route('/<repopath:repository>/blobs/uploads/<upload_uuid>', methods=['DELETE'])
@parse_repository_name()
@process_registry_jwt_auth(scopes=['pull', 'push'])
@require_repo_write
@anon_protect
def cancel_upload(namespace_name, repo_name, upload_uuid):
  blob_upload = model.blob_upload_by_uuid(namespace_name, repo_name, upload_uuid)
  if blob_upload is None:
    raise BlobUploadUnknown()

  # We delete the record for the upload first, since if the partial upload in
  # storage fails to delete, it doesn't break anything.
  model.delete_blob_upload(namespace_name, repo_name, upload_uuid)
  storage.cancel_chunked_upload({blob_upload.location_name}, blob_upload.uuid,
                                blob_upload.storage_metadata)

  return Response(status=204)


@v2_bp.route('/<repopath:repository>/blobs/<digest>', methods=['DELETE'])
@parse_repository_name()
@process_registry_jwt_auth(scopes=['pull', 'push'])
@require_repo_write
@anon_protect
def delete_digest(namespace_name, repo_name, upload_uuid):
  # We do not support deleting arbitrary digests, as they break repo images.
  raise Unsupported()


def _render_range(num_uploaded_bytes, with_bytes_prefix=True):
  """
  Returns a string formatted to be used in the Range header.
  """
  return '{0}0-{1}'.format('bytes=' if with_bytes_prefix else '', num_uploaded_bytes - 1)


def _current_request_url():
  return '{0}{1}{2}'.format(get_app_url(), request.script_root, request.path)


def _abort_range_not_satisfiable(valid_end, upload_uuid):
  """
  Writes a failure response for scenarios where the registry cannot function
  with the provided range.

  TODO(jzelinskie): Unify this with the V2RegistryException class.
  """
  flask_abort(
    Response(status=416, headers={
      'Location': _current_request_url(),
      'Range': '0-{0}'.format(valid_end),
      'Docker-Upload-UUID': upload_uuid}))


def _parse_range_header(range_header_text):
  """
  Parses the range header.

  Returns a tuple of the start offset and the length.
  If the parse fails, raises _InvalidRangeHeader.
  """
  found = RANGE_HEADER_REGEX.match(range_header_text)
  if found is None:
    raise _InvalidRangeHeader()

  start = int(found.group(1))
  length = int(found.group(2)) - start

  if length <= 0:
    raise _InvalidRangeHeader()

  return (start, length)


def _start_offset_and_length(range_header):
  """
  Returns a tuple of the start offset and the length.
  If the range header doesn't exist, defaults to (0, -1).
  If parsing fails, returns (None, None).
  """
  start_offset, length = 0, -1
  if range_header is not None:
    try:
      start_offset, length = _parse_range_header(range_header)
    except _InvalidRangeHeader:
      return None, None

  return start_offset, length


def _upload_chunk(blob_upload, range_header):
  """
  Calculates metadata while uploading a chunk to storage.

  Returns a BlobUpload object or None if there was a failure.
  """
  max_layer_size = bitmath.parse_string_unsafe(app.config['MAXIMUM_LAYER_SIZE'])

  # Get the offset and length of the current chunk.
  start_offset, length = _start_offset_and_length(range_header)
  if blob_upload is None or None in {start_offset, length}:
    logger.error('Invalid arguments provided to _upload_chunk')
    return None

  if start_offset > 0 and start_offset > blob_upload.byte_count:
    logger.error('start_offset provided to _upload_chunk greater than blob.upload.byte_count')
    return None

  # Check if we should raise 413 before accepting the data.
  uploaded = bitmath.Byte(length + start_offset)
  if length > -1 and uploaded > max_layer_size:
    raise LayerTooLarge(uploaded=uploaded.bytes, max_allowed=max_layer_size.bytes)

  location_set = {blob_upload.location_name}

  upload_error = None
  with database.CloseForLongOperation(app.config):
    input_fp = get_input_stream(request)

    if start_offset > 0 and start_offset < blob_upload.byte_count:
      # Skip the bytes which were received on a previous push, which are already stored and
      # included in the sha calculation
      overlap_size = blob_upload.byte_count - start_offset
      input_fp = StreamSlice(input_fp, overlap_size)

      # Update our upload bounds to reflect the skipped portion of the overlap
      start_offset = blob_upload.byte_count
      length = max(length - overlap_size, 0)

    # We use this to escape early in case we have already processed all of the bytes the user
    # wants to upload
    if length == 0:
      return blob_upload

    input_fp = wrap_with_handler(input_fp, blob_upload.sha_state.update)

    # Add a hasher for calculating SHA1s for torrents if this is the first chunk and/or we have
    # already calculated hash data for the previous chunk(s).
    piece_hasher = None
    if blob_upload.chunk_count == 0 or blob_upload.piece_sha_state:
      initial_sha1_value = blob_upload.piece_sha_state or resumablehashlib.sha1()
      initial_sha1_pieces_value = blob_upload.piece_hashes or ''

      piece_hasher = PieceHasher(app.config['BITTORRENT_PIECE_SIZE'], start_offset,
                                 initial_sha1_pieces_value, initial_sha1_value)

      input_fp = wrap_with_handler(input_fp, piece_hasher.update)

    # If this is the first chunk and we're starting at the 0 offset, add a handler to gunzip the
    # stream so we can determine the uncompressed size. We'll throw out this data if another chunk
    # comes in, but in the common case the docker client only sends one chunk.
    size_info = None
    if start_offset == 0 and blob_upload.chunk_count == 0:
      size_info, fn = calculate_size_handler()
      input_fp = wrap_with_handler(input_fp, fn)

    start_time = time.time()
    length_written, new_metadata, upload_error = storage.stream_upload_chunk(
      location_set,
      blob_upload.uuid,
      start_offset,
      length,
      input_fp,
      blob_upload.storage_metadata,
      content_type=BLOB_CONTENT_TYPE,)

    if upload_error is not None:
      logger.error('storage.stream_upload_chunk returned error %s', upload_error)
      return None

    # Update the chunk upload time metric.
    metric_queue.chunk_upload_time.Observe(time.time() - start_time, labelvalues=[
      length_written, list(location_set)[0]])

  # If we determined an uncompressed size and this is the first chunk, add it to the blob.
  # Otherwise, we clear the size from the blob as it was uploaded in multiple chunks.
  if size_info is not None and blob_upload.chunk_count == 0 and size_info.is_valid:
    blob_upload.uncompressed_byte_count = size_info.uncompressed_size
  elif length_written > 0:
    # Otherwise, if we wrote some bytes and the above conditions were not met, then we don't
    # know the uncompressed size.
    blob_upload.uncompressed_byte_count = None

  if piece_hasher is not None:
    blob_upload.piece_hashes = piece_hasher.piece_hashes
    blob_upload.piece_sha_state = piece_hasher.hash_fragment

  blob_upload.storage_metadata = new_metadata
  blob_upload.byte_count += length_written
  blob_upload.chunk_count += 1

  # Ensure we have not gone beyond the max layer size.
  upload_size = bitmath.Byte(blob_upload.byte_count)
  if upload_size > max_layer_size:
    raise LayerTooLarge(uploaded=upload_size.bytes, max_allowed=max_layer_size.bytes)

  return blob_upload


def _validate_digest(blob_upload, expected_digest):
  """
  Verifies that the digest's SHA matches that of the uploaded data.
  """
  computed_digest = digest_tools.sha256_digest_from_hashlib(blob_upload.sha_state)
  if not digest_tools.digests_equal(computed_digest, expected_digest):
    logger.error('Digest mismatch for upload %s: Expected digest %s, found digest %s',
                 blob_upload.uuid, expected_digest, computed_digest)
    raise BlobUploadInvalid(detail={'reason': 'Digest mismatch on uploaded blob'})


def _finalize_blob_storage(blob_upload, expected_digest):
  """
  When an upload is successful, this ends the uploading process from the
  storage's perspective.

  Returns True if the blob already existed.
  """
  final_blob_location = digest_tools.content_path(expected_digest)

  # Move the storage into place, or if this was a re-upload, cancel it
  with database.CloseForLongOperation(app.config):
    already_existed = storage.exists({blob_upload.location_name}, final_blob_location)
    if already_existed:
      # It already existed, clean up our upload which served as proof that the
      # uploader had the blob.
      storage.cancel_chunked_upload({blob_upload.location_name}, blob_upload.uuid,
                                    blob_upload.storage_metadata)

    else:
      # We were the first ones to upload this image (at least to this location)
      # Let's copy it into place
      storage.complete_chunked_upload({blob_upload.location_name}, blob_upload.uuid,
                                      final_blob_location, blob_upload.storage_metadata)
  return already_existed


def _finalize_blob_database(namespace_name, repo_name, blob_upload, digest, already_existed):
  """
  When an upload is successful, this ends the uploading process from the
  database's perspective.
  """
  # Create the blob and temporarily tag it.
  blob_storage = model.create_blob_and_temp_tag(
    namespace_name,
    repo_name,
    digest,
    blob_upload,
    app.config['PUSH_TEMP_TAG_EXPIRATION_SEC'],)

  # If it doesn't already exist, create the BitTorrent pieces for the blob.
  if blob_upload.piece_sha_state is not None and not already_existed:
    piece_bytes = blob_upload.piece_hashes + blob_upload.piece_sha_state.digest()
    model.save_bittorrent_pieces(blob_storage, app.config['BITTORRENT_PIECE_SIZE'], piece_bytes)

  # Delete the blob upload.
  model.delete_blob_upload(namespace_name, repo_name, blob_upload.uuid)


def _finish_upload(namespace_name, repo_name, blob_upload, digest):
  """
  When an upload is successful, this ends the uploading process.
  """
  _validate_digest(blob_upload, digest)
  _finalize_blob_database(
    namespace_name,
    repo_name,
    blob_upload,
    digest,
    _finalize_blob_storage(blob_upload, digest),)