Fix GC handling around CAS paths
Adds code to ensure we never GC CAS paths that are shared amongst multiple ImageStorage rows, as well as an associated pair of tests to catch the positive and negative cases.
This commit is contained in:
		
							parent
							
								
									16ccc946f3
								
							
						
					
					
						commit
						69e550d125
					
				
					 2 changed files with 129 additions and 5 deletions
				
			
		|  | @ -79,10 +79,33 @@ def garbage_collect_storage(storage_id_whitelist): | |||
|   if len(storage_id_whitelist) == 0: | ||||
|     return [] | ||||
| 
 | ||||
|   def placements_query_to_paths_set(placements_query): | ||||
|   def placements_to_filtered_paths_set(placements_list): | ||||
|     """ Returns the list of paths to remove from storage, filtered from the given placements | ||||
|         query by removing any CAS paths that are still referenced by storage(s) in the database. | ||||
|     """ | ||||
|     if not placements_list: | ||||
|       return set() | ||||
| 
 | ||||
|     # Find the content checksums not referenced by other storages. Any that are, we cannot | ||||
|     # remove. | ||||
|     content_checksums = set([placement.storage.content_checksum for placement in placements_list | ||||
|                              if placement.storage.cas_path]) | ||||
| 
 | ||||
|     unreferenced_checksums = set() | ||||
|     if content_checksums: | ||||
|       query = (ImageStorage | ||||
|                .select(ImageStorage.content_checksum) | ||||
|                .where(ImageStorage.content_checksum << list(content_checksums))) | ||||
|       referenced_checksums = set([image_storage.content_checksum for image_storage in query]) | ||||
|       unreferenced_checksums = content_checksums - referenced_checksums | ||||
| 
 | ||||
|     # Return all placements for all image storages found not at a CAS path or with a content | ||||
|     # checksum that is referenced. | ||||
|     return {(get_image_location_for_id(placement.location_id).name, | ||||
|              get_layer_path(placement.storage)) | ||||
|             for placement in placements_query} | ||||
|              for placement in placements_list | ||||
|              if not placement.storage.cas_path or | ||||
|                 placement.storage.content_checksum in unreferenced_checksums} | ||||
| 
 | ||||
|   # Note: Both of these deletes must occur in the same transaction (unfortunately) because a | ||||
|   # storage without any placement is invalid, and a placement cannot exist without a storage. | ||||
|  | @ -96,12 +119,10 @@ def garbage_collect_storage(storage_id_whitelist): | |||
|       return [] | ||||
| 
 | ||||
|     placements_to_remove = list(ImageStoragePlacement | ||||
|                                 .select() | ||||
|                                 .select(ImageStoragePlacement, ImageStorage) | ||||
|                                 .join(ImageStorage) | ||||
|                                 .where(ImageStorage.id << orphaned_storage_ids)) | ||||
| 
 | ||||
|     paths_to_remove = placements_query_to_paths_set(placements_to_remove) | ||||
| 
 | ||||
|     # Remove the placements for orphaned storages | ||||
|     if len(placements_to_remove) > 0: | ||||
|       placement_ids_to_remove = [placement.id for placement in placements_to_remove] | ||||
|  | @ -130,6 +151,11 @@ def garbage_collect_storage(storage_id_whitelist): | |||
|                         .execute()) | ||||
|     logger.debug('Removed %s image storage records', storages_removed) | ||||
| 
 | ||||
|     # Determine the paths to remove. We cannot simply remove all paths matching storages, as CAS | ||||
|     # can share the same path. We further filter these paths by checking for any storages still in | ||||
|     # the database with the same content checksum. | ||||
|     paths_to_remove = placements_to_filtered_paths_set(placements_to_remove) | ||||
| 
 | ||||
|   # We are going to make the conscious decision to not delete image storage blobs inside | ||||
|   # transactions. | ||||
|   # This may end up producing garbage in s3, trading off for higher availability in the database. | ||||
|  |  | |||
		Reference in a new issue