quay/workers/manifestbackfillworker.py
Joseph Schorr fa58f3b1d2 Fix handling of manifests with unicode in the backfill
Also adds a bunch of tests around manifests to ensure we get the same information in and out
2018-08-15 11:41:15 -04:00


import logging
import logging.config

from peewee import JOIN, fn, IntegrityError

from app import app
from data.database import (UseThenDisconnect, TagManifest, TagManifestToManifest, Image,
                           db_transaction)
from data.model.image import get_parent_images
from data.model.tag import populate_manifest
from data.model.blob import get_repo_blob_by_digest, BlobDoesNotExist
from image.docker.schema1 import (DockerSchema1Manifest, ManifestException, ManifestInterface,
                                  DOCKER_SCHEMA1_SIGNED_MANIFEST_CONTENT_TYPE)
from workers.worker import Worker
from util.log import logfile_path
from util.migrate.allocator import yield_random_entries

logger = logging.getLogger(__name__)

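# How often (in seconds) the backfill operation is scheduled to run; handed to add_operation
# in ManifestBackfillWorker.__init__ below.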
WORKER_TIMEOUT = 600


class BrokenManifest(ManifestInterface):
""" Implementation of the ManifestInterface for "broken" manifests. This allows us to add the
new manifest row while not adding any additional rows for it.
"""
  def __init__(self, digest, payload):
    self._digest = digest
    self._payload = payload

  @property
  def digest(self):
    return self._digest

  @property
  def media_type(self):
    return DOCKER_SCHEMA1_SIGNED_MANIFEST_CONTENT_TYPE

  @property
  def manifest_dict(self):
    return {}

  @property
  def bytes(self):
    return self._payload

  @property
  def layers(self):
    return []

  @property
  def leaf_layer_v1_image_id(self):
    return None

  @property
  def blob_digests(self):
    return []
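
# For illustration only: a hypothetical unparsable payload still satisfies the interface
# while contributing no layer or blob data (the digest and payload below are made up):
#
#   broken = BrokenManifest('sha256:0123abcd', '{not valid json}')
#   assert broken.layers == [] and broken.blob_digests == []
#   assert broken.bytes == '{not valid json}'

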
class ManifestBackfillWorker(Worker):
def __init__(self):
super(ManifestBackfillWorker, self).__init__()
self.add_operation(self._backfill_manifests, WORKER_TIMEOUT)

  def _candidates_to_backfill(self):
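    """ Returns an iterator over TagManifest rows that do not yet have an associated
        TagManifestToManifest mapping row, yielded in randomized batches.
    """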
def missing_tmt_query():
return (TagManifest
.select()
.join(TagManifestToManifest, JOIN.LEFT_OUTER)
.where(TagManifestToManifest.id >> None))
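
    # Note: in peewee, `field >> None` renders as IS NULL, so the LEFT OUTER JOIN plus this
    # filter selects exactly the TagManifest rows with no mapping row.
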
min_id = (TagManifest
.select(fn.Min(TagManifest.id))
.join(TagManifestToManifest, JOIN.LEFT_OUTER)
.where(TagManifestToManifest.id >> None)
.scalar())
max_id = TagManifest.select(fn.Max(TagManifest.id)).scalar()
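
    # Scan the id range [min_id, max_id] in randomized batches of 100. As consumed in
    # _backfill_manifests below, each yielded entry is a (candidate, abort_event, remaining)
    # tuple; setting the event signals that the candidate was already handled elsewhere.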
iterator = yield_random_entries(
missing_tmt_query,
TagManifest.id,
100,
max_id,
min_id,
)

    return iterator

  def _backfill_manifests(self):
    """ Backfills the new-style manifest tables for any TagManifest rows that are missing
        their associated mapping rows.
    """
with UseThenDisconnect(app.config):
iterator = self._candidates_to_backfill()
if iterator is None:
        logger.debug('Found no additional manifests to backfill')
return None

      for candidate, abt, _ in iterator:
if not backfill_manifest(candidate):
logger.info('Another worker pre-empted us for manifest: %s', candidate.id)
abt.set()


def lookup_map_row(tag_manifest):
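  """ Returns True if a TagManifestToManifest mapping row already exists for the given tag
      manifest, or False if one does not.
  """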
try:
TagManifestToManifest.get(tag_manifest=tag_manifest)
return True
except TagManifestToManifest.DoesNotExist:
return False


def backfill_manifest(tag_manifest):
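  """ Backfills a single TagManifest row into the new-style manifest tables. Returns True if
      the row was handled, including the case where it was deleted in the meantime, and False
      if another worker preempted us.
  """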
logger.info('Backfilling manifest %s', tag_manifest.id)

  # Ensure that a mapping row doesn't already exist. If it does, we've been preempted.
if lookup_map_row(tag_manifest):
return False

  # Parse the manifest. If we cannot parse it, we treat the manifest as broken and simply emit
  # it without any additional rows or data, since it will eventually fall out of use.
is_broken = False
try:
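    # for_latin1_bytes round-trips the stored payload through latin-1 (a one-to-one mapping
    # between bytes and the first 256 code points), preserving manifests that contain unicode
    # rather than corrupting the signed payload.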
manifest = DockerSchema1Manifest.for_latin1_bytes(tag_manifest.json_data, validate=False)
except ManifestException:
logger.exception('Exception when trying to parse manifest %s', tag_manifest.id)
manifest = BrokenManifest(tag_manifest.digest, tag_manifest.json_data)
is_broken = True

  # Look up the storage IDs for the manifest's blob digests.
root_image = tag_manifest.tag.image
repository = tag_manifest.tag.repository
image_storage_id_map = {root_image.storage.content_checksum: root_image.storage.id}
parent_images = get_parent_images(repository.namespace_user.username, repository.name, root_image)
for parent_image in parent_images:
image_storage_id_map[parent_image.storage.content_checksum] = parent_image.storage.id

  # Ensure that all the expected blobs have been found. If not, we look up the blob under the
  # repo and add its storage ID. If the blob is still not found, we mark the manifest as broken.
storage_ids = set()
for blob_digest in manifest.blob_digests:
if blob_digest in image_storage_id_map:
storage_ids.add(image_storage_id_map[blob_digest])
else:
logger.debug('Blob `%s` not found in images for manifest `%s`; checking repo',
blob_digest, tag_manifest.id)
try:
blob_storage = get_repo_blob_by_digest(repository.namespace_user.username, repository.name,
blob_digest)
storage_ids.add(blob_storage.id)
except BlobDoesNotExist:
logger.debug('Blob `%s` not found in repo for manifest `%s`',
blob_digest, tag_manifest.id)
is_broken = True

  with db_transaction():
# Re-retrieve the tag manifest to ensure it still exists and we're pointing at the correct tag.
try:
tag_manifest = TagManifest.get(id=tag_manifest.id)
except TagManifest.DoesNotExist:
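      # The tag manifest was deleted in the meantime; there is nothing to backfill.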
return True

    # Create the new-style rows for the manifest.
manifest_row = populate_manifest(tag_manifest.tag.repository, manifest,
tag_manifest.tag.image, storage_ids)

    # Create the mapping row. If we find that another one was created for this tag manifest in
    # the meantime, then we've been preempted.
try:
TagManifestToManifest.create(tag_manifest=tag_manifest, manifest=manifest_row,
broken=is_broken)
return True
except IntegrityError:
return False


if __name__ == "__main__":
logging.config.fileConfig(logfile_path(debug=False), disable_existing_loggers=False)
worker = ManifestBackfillWorker()
worker.start()