Harden the storage replication worker against failures by explicitly catching certain errors, using better backoff, and explicitly verifying that a layer has been replicated to a region before adding the placement to the database
Also adds tests for the various failure cases
This commit is contained in:
		
							parent
							
								
									088a301754
								
							
						
					
					
						commit
						6a9634dffb
					
				
					 3 changed files with 181 additions and 55 deletions
				
			
		|  | @ -3,10 +3,10 @@ import time | |||
| 
 | ||||
| import features | ||||
| 
 | ||||
| from app import app, storage, image_replication_queue | ||||
| from app import app, storage as app_storage, image_replication_queue | ||||
| from data.database import CloseForLongOperation | ||||
| from data import model | ||||
| from workers.queueworker import QueueWorker, WorkerUnhealthyException | ||||
| from workers.queueworker import QueueWorker, WorkerUnhealthyException, JobException | ||||
| from util.log import logfile_path | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
|  | @ -28,17 +28,28 @@ class StorageReplicationWorker(QueueWorker): | |||
|                        namespace_id, storage_uuid) | ||||
|       return | ||||
| 
 | ||||
|     succeeded = self.replicate_storage(namespace, storage_uuid) | ||||
|     logger.debug('Replication finished of image storage %s under namespace %s: %s', | ||||
|                  storage_uuid, namespace_id, succeeded) | ||||
|     if not succeeded: | ||||
|       raise WorkerUnhealthyException() | ||||
|     self.replicate_storage(namespace, storage_uuid, app_storage) | ||||
| 
 | ||||
|   def replicate_storage(self, namespace, storage_uuid): | ||||
|   def _backoff_check_exists(self, location, path, storage, backoff_check=True): | ||||
|     for retry in range(0, 4): | ||||
|       if storage.exists([location], path): | ||||
|         return True | ||||
| 
 | ||||
|       if not backoff_check: | ||||
|         return False | ||||
| 
 | ||||
|       seconds = pow(2, retry) * 2 | ||||
|       logger.debug('Cannot find path `%s` in location %s (try #%s). Sleeping for %s seconds', | ||||
|                    path, location, retry, seconds) | ||||
|       time.sleep(seconds) | ||||
| 
 | ||||
|     return False | ||||
| 
 | ||||
|   def replicate_storage(self, namespace, storage_uuid, storage, backoff_check=True): | ||||
|     # Lookup the namespace and its associated regions. | ||||
|     if not namespace: | ||||
|       logger.debug('Unknown namespace when trying to replicate storage %s', storage_uuid) | ||||
|       return True | ||||
|       return | ||||
| 
 | ||||
|     locations = model.user.get_region_locations(namespace) | ||||
| 
 | ||||
|  | @ -47,7 +58,7 @@ class StorageReplicationWorker(QueueWorker): | |||
|       partial_storage = model.storage.get_storage_by_uuid(storage_uuid) | ||||
|     except model.InvalidImageException: | ||||
|       logger.debug('Unknown storage: %s', storage_uuid) | ||||
|       return True | ||||
|       return | ||||
| 
 | ||||
|     # Check to see if the image is at all the required locations. | ||||
|     locations_required = locations | set(storage.default_locations) | ||||
|  | @ -59,26 +70,17 @@ class StorageReplicationWorker(QueueWorker): | |||
|     if not locations_missing: | ||||
|       logger.debug('No missing locations for storage %s under namespace %s. Required: %s', | ||||
|                    storage_uuid, namespace.username, locations_required) | ||||
|       return True | ||||
|       return | ||||
| 
 | ||||
|     # For any missing storage locations, initiate a copy. | ||||
|     existing_location = list(partial_storage.locations)[0] | ||||
|     path_to_copy = model.storage.get_layer_path(partial_storage) | ||||
| 
 | ||||
|     # Lookup the existing location. If not found, progressively sleep a few times to handle the case | ||||
|     # of not fully consistent storage. | ||||
|     for retry in range(0, 3): | ||||
|       if storage.exists([existing_location], path_to_copy): | ||||
|         break | ||||
| 
 | ||||
|       logger.debug('Cannot find image storage %s in existing location %s (try #%s)', | ||||
|                    storage_uuid, existing_location, retry) | ||||
|       time.sleep(pow(2, retry) * 5) | ||||
| 
 | ||||
|     if not storage.exists([existing_location], path_to_copy): | ||||
|     # Lookup and ensure the existing location exists. | ||||
|     if not self._backoff_check_exists(existing_location, path_to_copy, storage, backoff_check): | ||||
|       logger.warning('Cannot find image storage %s in existing location %s; stopping replication', | ||||
|                      storage_uuid, existing_location) | ||||
|       return False | ||||
|       raise JobException() | ||||
| 
 | ||||
|     # For each missing location, copy over the storage. | ||||
|     for location in locations_missing: | ||||
|  | @ -91,21 +93,32 @@ class StorageReplicationWorker(QueueWorker): | |||
|         with CloseForLongOperation(app.config): | ||||
|           storage.copy_between(path_to_copy, existing_location, location) | ||||
|           copied = True | ||||
|       except: | ||||
|         logger.exception('Exception when copying path %s of image storage %s to location %s', | ||||
|       except IOError: | ||||
|         logger.exception('Failed to copy path `%s` of image storage %s to location %s', | ||||
|                          path_to_copy, partial_storage.uuid, location) | ||||
|         return False | ||||
|         raise JobException() | ||||
|       except: | ||||
|         logger.exception('Unknown exception when copying path %s of image storage %s to loc %s', | ||||
|                          path_to_copy, partial_storage.uuid, location) | ||||
|         raise WorkerUnhealthyException() | ||||
| 
 | ||||
|       # Create the storage location record for the storage now that the copies have | ||||
|       # completed. | ||||
|       if copied: | ||||
|         # Verify the data was copied to the target storage, to ensure that there are no cases | ||||
|         # where we write the placement without knowing the data is present. | ||||
|         if not self._backoff_check_exists(location, path_to_copy, storage, backoff_check): | ||||
|           logger.warning('Failed to find path `%s` in location `%s` after copy', path_to_copy, | ||||
|                          location) | ||||
|           raise JobException() | ||||
| 
 | ||||
|         # Create the storage location record for the storage now that the copy has | ||||
|         # completed. | ||||
|         model.storage.add_storage_placement(partial_storage, location) | ||||
| 
 | ||||
|         logger.debug('Finished copy of image storage %s to location %s from %s', | ||||
|                      partial_storage.uuid, location, existing_location) | ||||
| 
 | ||||
|     logger.debug('Completed replication of image storage %s to locations %s from %s', | ||||
|                  partial_storage.uuid, locations_missing, existing_location) | ||||
|     return True | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|  |  | |||
		Reference in a new issue