Add a batch get_matching_tags_for_images method

This will be used in the security notification worker to retrieve the needed tags in a set of batch calls, rather than making multiple calls per image.
2017-05-02 15:38:25 -04:00 · 2017-05-02 15:38:25 -04:00 · 74dd0ef8e8
commit 74dd0ef8e8
parent e583be3914
4 changed files with 162 additions and 36 deletions
--- a/data/model/tag.py
+++ b/data/model/tag.py
@ -53,6 +53,73 @@ def _tag_alive(query, now_ts=None):
                     (RepositoryTag.lifetime_end_ts > now_ts))


+# Maximum number of per-image subqueries combined into a single sharded tag lookup query.
+_MAX_SUB_QUERIES = 100
+
+def get_matching_tags_for_images(image_pairs, filter_query=None, selections=None):
+  """ Returns all tags that contain the images with the given docker_image_id and storage_uuid,
+      as specified as an iterable of pairs. """
+  if not image_pairs:
+    return []
+
+  image_pairs = set(image_pairs)
+
+  # Find all possible matching image+storages.
+  ids = [image_pair[0] for image_pair in image_pairs]
+  uuids = [image_pair[1] for image_pair in image_pairs]
+  images_query = (Image
+                  .select(Image.id, Image.docker_image_id, Image.ancestors, ImageStorage.uuid)
+                  .join(ImageStorage)
+                  .where(Image.docker_image_id << ids, ImageStorage.uuid << uuids))
+
+  # Filter down to those images actually in the pairs set and build the set of queries to run.
+  individual_image_queries = []
+
+  for img in images_query:
+    # Make sure the actual image was requested.
+    pair = (img.docker_image_id, img.storage.uuid)
+    if pair not in image_pairs:
+      continue
+
+    # Remove the pair so we don't try it again.
+    image_pairs.remove(pair)
+
+    ancestors_str = '%s%s/%%' % (img.ancestors, img.id)
+    query = (Image
+             .select(Image.id)
+             .where((Image.id == img.id) | (Image.ancestors ** ancestors_str)))
+
+    individual_image_queries.append(query)
+
+  if not individual_image_queries:
+    return []
+
+  # Shard based on the max subquery count. This is used to prevent going over the DB's max query
+  # size, as well as to prevent the DB from locking up on a massive query.
+  sharded_queries = []
+  while individual_image_queries:
+    shard = individual_image_queries[0:_MAX_SUB_QUERIES]
+    sharded_queries.append(_basequery.reduce_as_tree(shard))
+    individual_image_queries = individual_image_queries[_MAX_SUB_QUERIES:]
+
+  # Collect IDs of the tags found for each query.
+  tags = {}
+  for query in sharded_queries:
+    tag_query = (_tag_alive(RepositoryTag
+                            .select(*(selections or []))
+                            .distinct()
+                            .join(Image)
+                            .where(RepositoryTag.hidden == False)
+                            .where(Image.id << query)))
+
+    if filter_query is not None:
+      tag_query = filter_query(tag_query)
+
+    for tag in tag_query:
+      tags[tag.id] = tag
+
+  return tags.values()
+
+
 def get_matching_tags(docker_image_id, storage_uuid, *args):
  """ Returns a query pointing to all tags that contain the image with the
      given docker_image_id and storage_uuid. """