quay/util/streamlayerformat.py

import marisa_trie
import os
import tarfile
import StringIO
import traceback

# Prefix of AUFS bookkeeping entries (e.g. hardlink tables, opaque markers);
# these never describe real image content.
AUFS_METADATA = u'.wh..wh.'

# Prefix of an AUFS whiteout entry: `dir/.wh.name` marks `dir/name` as deleted.
AUFS_WHITEOUT = u'.wh.'
AUFS_WHITEOUT_PREFIX_LENGTH = len(AUFS_WHITEOUT)

class StreamLayerMerger(object):
  """ Class which creates a generator of the combined TAR data for a set of Docker layers.

      `layer_iterator` is a callable returning an iterable of file-like objects, each
      holding the TAR data of a single layer. A path is emitted the first time it is
      seen only, so earlier layers take precedence over later ones.
      NOTE(review): this presumably means layers must be supplied newest-first; confirm
      against the caller.
  """
  # TAR archives are made up of 512 byte records.
  TAR_BLOCK_SIZE = 512

  def __init__(self, layer_iterator):
    self.layer_iterator = layer_iterator

    # Paths emitted so far. `encountered` is the running list; `trie` is the lookup
    # snapshot, rebuilt after each layer so that entries in a layer are only ever
    # compared against the layers processed before it.
    self.trie = marisa_trie.Trie()
    self.encountered = []

    # Paths removed by whiteout entries, kept apart from the emitted paths because a
    # whiteout also hides everything *beneath* it, while an emitted directory must not
    # hide its children.
    self.deleted_trie = marisa_trie.Trie()
    self.deleted_prefixes = []

  def get_generator(self):
    """ Yields the merged TAR stream as a series of byte strings: for each surviving
        entry its header, followed (for regular files) by its contents and padding,
        and finally the two empty records which terminate a TAR archive.
    """
    block_size = self.TAR_BLOCK_SIZE

    # 9MB (+ padding below) so that it matches the 10MB expected by Gzip.
    chunk_size = 1024 * 1024 * 9

    for current_layer in self.layer_iterator():
      # Read the current layer as TAR. If it is empty, we just continue
      # to the next layer.
      try:
        tar_file = tarfile.open(mode='r|*', fileobj=current_layer)
      except tarfile.ReadError:
        continue

      # The finally blocks also run when the consumer abandons the generator early,
      # so the layer stream is never left open.
      try:
        # For each of the tar entries, yield them IF and ONLY IF we have not
        # encountered the path before.
        for tar_info in tar_file:
          result = self.process_tar_info(tar_info)
          if not result:
            continue

          (tarinfo, has_contents) = result
          yield tarinfo.tobuf()

          if not has_contents:
            continue

          length = 0
          file_stream = tar_file.extractfile(tarinfo)
          try:
            while True:
              current_block = file_stream.read(chunk_size)
              if not current_block:
                break

              yield current_block
              length += len(current_block)
          finally:
            file_stream.close()

          # Files must be padded to 512 byte multiples.
          remainder = length % block_size
          if remainder:
            yield b'\0' * (block_size - remainder)
      finally:
        # Close the layer stream now that we're done with it.
        tar_file.close()

      # Update the tries with the entries found in this layer.
      self.trie = marisa_trie.Trie(self.encountered)
      self.deleted_trie = marisa_trie.Trie(self.deleted_prefixes)

    # Last two records are empty in TAR spec.
    yield b'\0' * block_size
    yield b'\0' * block_size


  def process_tar_info(self, tar_info):
    """ Decides whether the given TAR entry belongs in the merged output.

        Returns None if the entry must be skipped (AUFS metadata, a whiteout marker, a
        path already emitted by a previous layer, or a path removed by a previous
        layer's whiteout). Otherwise returns a tuple (tar_info, has_contents), where
        has_contents is True only for regular files, whose data must follow the header.
    """
    # Member names are byte strings on Python 2; normalize to text either way.
    name = tar_info.name
    if isinstance(name, bytes):
      name = name.decode('utf-8')

    # Despite the variable name this is the path relative to the layer root,
    # without any leading './'.
    absolute = os.path.relpath(name, './')
    dirname = os.path.dirname(absolute)
    filename = os.path.basename(absolute)

    # Skip AUFS metadata.
    if filename.startswith(AUFS_METADATA) or absolute.startswith(AUFS_METADATA):
      return None

    # A whiteout removes the sibling path from all remaining layers. It is stored in
    # the same root-relative form as the paths it is later compared against.
    if filename.startswith(AUFS_WHITEOUT):
      removed_filename = filename[AUFS_WHITEOUT_PREFIX_LENGTH:]
      self.deleted_prefixes.append(os.path.join(dirname, removed_filename))
      return None

    # Check if this file has already been encountered somewhere, or was deleted
    # by a previous layer. If so, skip it.
    if absolute in self.trie or self._is_deleted(absolute):
      return None

    self.encountered.append(absolute)

    # Only regular files carry data; directories, links, devices and fifos consist
    # of a header alone.
    return (tar_info, tar_info.isfile())


  def _is_deleted(self, path):
    """ Returns whether the path, or any directory containing it, was whited out by a
        previously processed layer.
    """
    current = path
    while current:
      if current in self.deleted_trie:
        return True

      parent = os.path.dirname(current)
      if parent == current:
        # Reached a root which has no further parent.
        break

      current = parent

    return False
Work in progress. This is currently broken! 2014-09-16 00:18:57 -04:00			`import marisa_trie`
			`import os`
			`import tarfile`
			`import StringIO`
			`import traceback`

			`AUFS_METADATA = u'.wh..wh.'`

			`AUFS_WHITEOUT = u'.wh.'`
			`AUFS_WHITEOUT_PREFIX_LENGTH = len(AUFS_WHITEOUT)`

			`class StreamLayerMerger(object):`
			`""" Class which creates a generator of the combined TAR data for a set of Docker layers. """`
			`def __init__(self, layer_iterator):`
			`self.trie = marisa_trie.Trie()`
			`self.layer_iterator = layer_iterator`
			`self.encountered = []`

			`def get_generator(self):`
			`for current_layer in self.layer_iterator():`
			`# Read the current layer as TAR. If it is empty, we just continue`
			`# to the next layer.`
			`try:`
			`tar_file = tarfile.open(mode='r\|*', fileobj=current_layer)`
			`except tarfile.ReadError as re:`
			`continue`

			`# For each of the tar entries, yield them IF and ONLY IF we have not`
			`# encountered the path before.`

			`# 9MB (+ padding below) so that it matches the 10MB expected by Gzip.`
			`chunk_size = 1024 * 1024 * 9`

			`for tar_info in tar_file:`
			`result = self.process_tar_info(tar_info)`
			`if not result:`
			`continue`

			`(tarinfo, filebuf) = result`

			`yield tarinfo.tobuf()`

			`if filebuf:`
			`length = 0`
			`file_stream = tar_file.extractfile(tarinfo)`
			`while True:`
			`current_block = file_stream.read(chunk_size)`
			`if not len(current_block):`
			`break`

			`yield current_block`
			`length += len(current_block)`

			`file_stream.close()`

			`# Files must be padding to 512 byte multiples.`
			`if length % 512 != 0:`
			`yield '\0' * (512 - (length % 512))`

			`# Close the layer stream now that we're done with it.`
			`tar_file.close()`

			`# Update the trie with the new encountered entries.`
			`self.trie = marisa_trie.Trie(self.encountered)`

			`# Last two records are empty in TAR spec.`
			`yield '\0' * 512`
			`yield '\0' * 512`


			`def process_tar_info(self, tar_info):`
			`absolute = os.path.relpath(tar_info.name.decode('utf-8'), './')`
			`dirname = os.path.dirname(absolute)`
			`filename = os.path.basename(absolute)`

			`# Skip directories and metadata`
			`if (filename.startswith(AUFS_METADATA) or`
			`absolute.startswith(AUFS_METADATA)):`
			`# Skip`
			`return None`

			`elif filename.startswith(AUFS_WHITEOUT):`
			`removed_filename = filename[AUFS_WHITEOUT_PREFIX_LENGTH:]`
			`removed_prefix = os.path.join('/', dirname, removed_filename)`
			`self.encountered.append(removed_prefix)`
			`return None`

			`# Check if this file has already been encountered somewhere. If so,`
			`# skip it.`
			`if unicode(absolute) in self.trie:`
			`return None`

			`self.encountered.append(absolute)`

			`if tar_info.isdir() or tar_info.issym() or tar_info.islnk():`
			`return (tar_info, False)`

			`elif tar_info.isfile():`
			`return (tar_info, True)`