import os
import StringIO
import tarfile
import traceback

import marisa_trie

# Prefix of AUFS bookkeeping entries (e.g. opaque-directory markers); these
# are never emitted into the merged stream.
AUFS_METADATA = u'.wh..wh.'

# Prefix of an AUFS whiteout entry: `.wh.<name>` marks `<name>` as deleted in
# the layers processed after the one containing the whiteout.
AUFS_WHITEOUT = u'.wh.'
AUFS_WHITEOUT_PREFIX_LENGTH = len(AUFS_WHITEOUT)


class StreamLayerMerger(object):
    """ Class which creates a generator of the combined TAR data for a set of
        Docker layers.

        `layer_iterator` is a callable returning an iterable of file-like
        objects, one per layer. A path is emitted only the first time it is
        seen, so layers yielded earlier take precedence over later ones
        (presumably newest layer first -- confirm against the caller).
    """

    def __init__(self, layer_iterator):
        # Paths recorded by fully processed layers; rebuilt after every layer.
        self.trie = marisa_trie.Trie()
        self.layer_iterator = layer_iterator

        # Every path seen so far (emitted or whited-out), in encounter order.
        self.encountered = []

        # Whited-out paths from fully processed layers. Kept separately from
        # `encountered` so that only *deleted* directories hide their children.
        self._whiteouts = set()

        # Whiteouts found in the layer currently being processed. They only
        # take effect for subsequent layers, mirroring how `trie` is updated.
        self._pending_whiteouts = []

    def get_generator(self):
        """ Yields the merged TAR stream as a sequence of byte strings: a
            header block per entry, the file contents in chunks (padded to a
            512 byte boundary) and finally the two empty end-of-archive
            records.
        """
        # 9MB (+ padding below) so that it matches the 10MB expected by Gzip.
        chunk_size = 1024 * 1024 * 9

        for current_layer in self.layer_iterator():
            # Read the current layer as TAR. If it is empty, we just continue
            # to the next layer.
            try:
                tar_file = tarfile.open(mode='r|*', fileobj=current_layer)
            except tarfile.ReadError:
                continue

            # For each of the tar entries, yield them IF and ONLY IF we have
            # not encountered the path before.
            try:
                for tar_info in tar_file:
                    result = self.process_tar_info(tar_info)
                    if not result:
                        continue

                    (tarinfo, has_data) = result
                    yield tarinfo.tobuf()

                    if has_data:
                        blocks = self._file_blocks(tar_file, tarinfo,
                                                   chunk_size)
                        for block in blocks:
                            yield block
            finally:
                # Close the layer stream now that we're done with it, even if
                # the consumer abandoned the generator or reading failed.
                tar_file.close()

            # Update the lookup structures with the entries of this layer so
            # that they apply to the layers that follow.
            self.trie = marisa_trie.Trie(self.encountered)
            self._whiteouts.update(self._pending_whiteouts)
            self._pending_whiteouts = []

        # Last two records are empty in TAR spec.
        yield '\0' * 512
        yield '\0' * 512

    @staticmethod
    def _file_blocks(tar_file, tar_info, chunk_size):
        """ Yields the contents of `tar_info` in chunks of at most
            `chunk_size` bytes, followed by the NUL padding TAR requires.
        """
        length = 0
        file_stream = tar_file.extractfile(tar_info)
        try:
            while True:
                current_block = file_stream.read(chunk_size)
                if not current_block:
                    break

                length += len(current_block)
                yield current_block
        finally:
            file_stream.close()

        # Files must be padded to 512 byte multiples.
        remainder = length % 512
        if remainder:
            yield '\0' * (512 - remainder)

    def _is_whited_out(self, path):
        """ Returns True if any parent directory of `path` was deleted by a
            whiteout in a previously processed layer.
        """
        parent = os.path.dirname(path)

        # `parent != path` guards against paths whose dirname is themselves.
        while parent and parent != path:
            if parent in self._whiteouts:
                return True

            path, parent = parent, os.path.dirname(parent)

        return False

    def process_tar_info(self, tar_info):
        """ Decides whether `tar_info` belongs in the merged stream.

            Returns None if the entry must be skipped, otherwise a tuple
            `(tar_info, has_data)` where `has_data` tells the caller whether
            file contents follow the header (True for regular files; False
            for directories, symlinks and hard links). Entries of any other
            type are recorded as encountered but skipped.
        """
        absolute = os.path.relpath(tar_info.name.decode('utf-8'), './')
        dirname = os.path.dirname(absolute)
        filename = os.path.basename(absolute)

        # Skip AUFS metadata.
        if (filename.startswith(AUFS_METADATA) or
                absolute.startswith(AUFS_METADATA)):
            return None

        # A whiteout is never emitted itself; it records the path it removes.
        if filename.startswith(AUFS_WHITEOUT):
            removed_filename = filename[AUFS_WHITEOUT_PREFIX_LENGTH:]

            # Build the removed path in the same relative form as `absolute`
            # (no leading slash); otherwise the lookups below never match it.
            removed_path = os.path.join(dirname, removed_filename)
            self.encountered.append(removed_path)
            self._pending_whiteouts.append(removed_path)
            return None

        # Check if this file has already been encountered somewhere, or lives
        # under a directory that was deleted. If so, skip it.
        if unicode(absolute) in self.trie or self._is_whited_out(absolute):
            return None

        self.encountered.append(absolute)

        if tar_info.isdir() or tar_info.issym() or tar_info.islnk():
            return (tar_info, False)

        if tar_info.isfile():
            return (tar_info, True)

        # Any other entry type (devices, FIFOs, ...) is not emitted.
        return None