188 lines
6.5 KiB
Python
188 lines
6.5 KiB
Python
|
import os
|
||
|
import tarfile
|
||
|
import copy
|
||
|
|
||
|
from abc import ABCMeta, abstractmethod
|
||
|
from collections import defaultdict
|
||
|
from six import add_metaclass
|
||
|
|
||
|
from util.abchelpers import nooper
|
||
|
|
||
|
class TarLayerReadException(Exception):
    """ Raised to signal that a TAR layer could not be read. """
|
||
|
|
||
|
|
||
|
# Read size of 9MB; together with the padding added below, this matches the
# 10MB expected by Gzip.
CHUNK_SIZE = 9 * 1024 * 1024
|
||
|
|
||
|
@add_metaclass(ABCMeta)
class TarLayerFormatterReporter(object):
    """ Abstract interface for objects that are told about formatting passes. """

    @abstractmethod
    def report_pass(self, stream_count):
        """ Reports a formatting pass. """
|
||
|
|
||
|
|
||
|
@nooper
class NoopReporter(TarLayerFormatterReporter):
    """ Reporter used when no explicit reporter is supplied.

        The abstract methods are left to the `nooper` decorator (presumably filled in
        as no-ops; see util.abchelpers to confirm).
    """
|
||
|
|
||
|
|
||
|
@add_metaclass(ABCMeta)
class TarLayerFormat(object):
    """ Class which creates a generator of the combined TAR data.

        Subclasses decide which entries are kept by implementing `should_append_file` and
        `is_skipped_file`, and may perform per-layer work in `after_tar_layer`.
    """

    def __init__(self, tar_stream_getter_iterator, path_prefix=None, reporter=None):
        """ Creates a new formatter.

            tar_stream_getter_iterator: callable returning an iterable of "stream getters". Each
                stream getter is itself a callable returning a file-like object holding one TAR
                layer. A getter is called a second time for the same layer when that layer
                contains dangling hard links.
            path_prefix: optional prefix joined (via os.path.join) onto every emitted entry name
                and hard link target. Defaults to the empty string.
            reporter: optional object whose `report_pass` is called once per layer. Defaults to a
                NoopReporter.
        """
        self.tar_stream_getter_iterator = tar_stream_getter_iterator
        self.path_prefix = path_prefix or ''
        self.reporter = reporter or NoopReporter()

    def get_generator(self):
        """ Yields the raw blocks (headers, file contents and padding) of the combined TAR.

            Each layer is read once; if it contains hard links whose target is a skipped file
            ("dangling" hard links), the layer is re-opened and read a second time to re-emit
            the target's contents. After each layer, `after_tar_layer` is invoked and the number
            of passes (1 or 2) is reported to the reporter.

            Raises TarLayerReadException if a layer cannot be read, re-read, or decoded.
        """
        for stream_getter in self.tar_stream_getter_iterator():
            # Read the current TAR. If it is empty, we just continue to the next one.
            tar_file = TarLayerFormat._tar_file_from_stream(stream_getter())
            if not tar_file:
                continue

            # Maps a skipped link target name -> the hard link entries which point at it.
            dangling_hard_links = defaultdict(list)
            try:
                for chunk in self._emit_layer_entries(tar_file, dangling_hard_links):
                    yield chunk
            except UnicodeDecodeError as ude:
                raise TarLayerReadException('Decode error: %s' % ude)
            finally:
                # Close the layer stream even if reading failed or the consumer stopped
                # iterating early.
                tar_file.close()

            # If there are any dangling hard links, open a new stream and retarget the dangling
            # hard links to a new copy of the contents, which will be placed under the *first*
            # dangling hard link's name.
            if dangling_hard_links:
                tar_file = TarLayerFormat._tar_file_from_stream(stream_getter())
                if not tar_file:
                    raise TarLayerReadException('Could not re-read tar layer')

                try:
                    for chunk in self._emit_dangling_hard_links(tar_file, dangling_hard_links):
                        yield chunk
                finally:
                    tar_file.close()

            # Conduct any post-tar work.
            self.after_tar_layer()
            self.reporter.report_pass(2 if dangling_hard_links else 1)

        # Last two records are empty in TAR spec. These are byte strings so that they match the
        # type of the headers and file data on Python 3 (on Python 2, b'' is a plain str).
        yield b'\0' * 512
        yield b'\0' * 512

    def _emit_layer_entries(self, tar_file, dangling_hard_links):
        """ First pass: yields the header and contents of every entry that should be appended.

            Hard links whose target is a skipped file are not emitted; they are recorded in
            `dangling_hard_links` (keyed by link target) for the second pass.
        """
        for tar_info in tar_file:
            if not self.should_append_file(tar_info.name):
                continue

            # Note: We use a copy here because we need to make sure we copy over all the
            # internal data of the tar header. We cannot use frombuf(tobuf()), however, because
            # it doesn't properly handle large filenames.
            clone = copy.deepcopy(tar_info)
            clone.name = os.path.join(self.path_prefix, clone.name)

            # If the entry is a *hard* link, then prefix it as well. Soft links are relative.
            if clone.linkname and clone.type == tarfile.LNKTYPE:
                # If the entry is a dangling hard link, we skip here. Dangling hard links will
                # be handled in a second pass.
                if self.is_skipped_file(tar_info.linkname):
                    dangling_hard_links[tar_info.linkname].append(tar_info)
                    continue

                clone.linkname = os.path.join(self.path_prefix, clone.linkname)

            # Yield the tar header.
            yield clone.tobuf()

            # Try to extract any file contents for the tar. If found, we yield them as well.
            if tar_info.isreg():
                for block in TarLayerFormat._emit_file(tar_file, tar_info):
                    yield block

    def _emit_dangling_hard_links(self, tar_file, dangling_hard_links):
        """ Second pass: re-emits the data behind dangling hard links and retargets the rest.

            The entry holding the data is emitted under the name of the first dangling hard
            link recorded for it; every other dangling hard link to the same target is emitted
            as a hard link pointing at that first name.
        """
        for tar_info in tar_file:
            if tar_info.name in dangling_hard_links:
                # This entry holds the data for a dangling link: emit it under the name of the
                # first dangling hard link.
                first_dangling = dangling_hard_links[tar_info.name][0]

                # Copy the first dangling hard link, change it to a normal file,
                # and emit the deleted file's contents for it.
                clone = copy.deepcopy(first_dangling)
                clone.name = os.path.join(self.path_prefix, first_dangling.name)
                clone.type = tar_info.type
                clone.size = tar_info.size
                clone.pax_headers = tar_info.pax_headers
                yield clone.tobuf()

                for block in TarLayerFormat._emit_file(tar_file, tar_info):
                    yield block

            elif (tar_info.type == tarfile.LNKTYPE and
                  tar_info.linkname in dangling_hard_links and
                  not self.is_skipped_file(tar_info.name)):
                # Retarget if necessary. All dangling hard links (but the first) will
                # need to be retargeted.
                first_dangling = dangling_hard_links[tar_info.linkname][0]
                if tar_info.name == first_dangling.name:
                    # Skip; the first dangling is handled above.
                    continue

                # Retarget the hard link to the first dangling hard link.
                clone = copy.deepcopy(tar_info)
                clone.name = os.path.join(self.path_prefix, clone.name)
                clone.linkname = os.path.join(self.path_prefix, first_dangling.name)
                yield clone.tobuf()

    @abstractmethod
    def is_skipped_file(self, filename):
        """ Returns true if the file with the given name will be skipped during append.
        """

    @abstractmethod
    def should_append_file(self, filename):
        """ Returns true if the file with the given name should be appended when producing
            the new TAR.
        """

    @abstractmethod
    def after_tar_layer(self):
        """ Invoked after a TAR layer is added, to do any post-add work. """

    @staticmethod
    def _tar_file_from_stream(stream):
        """ Opens the given stream as a streaming TAR file (compression auto-detected).

            Returns the opened TarFile, or None if tarfile reports the stream as an
            'empty file'. Raises TarLayerReadException for any other tarfile.ReadError.
        """
        try:
            return tarfile.open(mode='r|*', fileobj=stream)
        except tarfile.ReadError as read_error:
            if str(read_error) != 'empty file':
                raise TarLayerReadException('Could not read layer')

        return None

    @staticmethod
    def _emit_file(tar_file, tar_info):
        """ Yields the contents of the given entry in blocks of at most CHUNK_SIZE bytes,
            followed by the zero padding needed to reach a multiple of 512 bytes.

            Yields nothing if `extractfile` returns None for the entry.
        """
        file_stream = tar_file.extractfile(tar_info)
        if file_stream is None:
            return

        length = 0
        try:
            while True:
                current_block = file_stream.read(CHUNK_SIZE)
                if not current_block:
                    break

                yield current_block
                length += len(current_block)
        finally:
            # Release the member stream even when the read fails or the consumer stops early.
            file_stream.close()

        # Files must be padded to 512 byte multiples. A byte string is used so the padding
        # matches the type of the file data on Python 3.
        if length % 512 != 0:
            yield b'\0' * (512 - (length % 512))
|
||
|
|