Route shared storage RPC to evac dest at startup
If a compute is started while an evacuation of an instance from that host is
still in progress, the destroy_evacuated_instances call tries to check whether
the instance is on shared storage to decide if the local disk needs to be
deleted from the source node. However, this check uses instance.host to target
the RPC call. If the evacuation is still ongoing, instance.host may still
point to the source node, so during init_host the source node ends up calling
RPC on itself. This always times out, because the RPC server is only started
after init_host, and it is wrong anyway: the shared storage check RPC should
be sent to another host.

Moreover, when this misrouted RPC times out, the source compute logs the
exception, ignores it, and assumes the disk is on shared storage, so it does
not clean it up. As a result, a later evacuation of this VM targeting this
node fails because the instance directory is already present on the node.

The fix is simple: the destroy_evacuated_instances call should always send the
shared storage check RPC to the destination node of the evacuation, based on
the migration record. This is correct whether the evacuation is still in
progress or already finished.

Closes-Bug: #2085975
Change-Id: If5ad213649d68da995dad146f0a0c3cacc369309
commit 2c76fd3baf
parent 1d6c80bfe6
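For context (not part of the diff below), a paraphrased sketch of the routing
change: destroy_evacuated_instances already fetches the evacuation migration
records for this source host, so the destination can be read from the matching
record instead of letting the RPC client fall back to instance.host. This is
simplified and assumed from the surrounding nova code, not verbatim:

    # evacuations: the migration records fetched earlier for this source
    # host, keyed by instance uuid (simplified sketch).
    evacuations = {mig.instance_uuid: mig for mig in evacuations}

    for instance in evacuated_local_instances.values():
        evac = evacuations[instance.uuid]
        # Route the shared storage check to the evacuation destination,
        # never to this still-initializing source host.
        destroy_disks = not self._is_instance_storage_shared(
            context, instance, host=evac.dest_compute)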
@@ -826,8 +826,9 @@ class ComputeManager(manager.Manager):
                     context, instance)
                 bdi = self._get_instance_block_device_info(context,
                                                            instance)
+                evac = evacuations[instance.uuid]
                 destroy_disks = not (self._is_instance_storage_shared(
-                    context, instance))
+                    context, instance, host=evac.dest_compute))
             except exception.InstanceNotFound:
                 network_info = network_model.NetworkInfo()
                 bdi = {}
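The host kwarg matters because _is_instance_storage_shared forwards it to the
compute RPC API call that the unit test assertions further down check
(mock_check is the RPC call, mock_check_clean the cleanup). A paraphrased
sketch of that flow, not verbatim nova code; the local-check helper name is
assumed here:

    def _is_instance_storage_shared(self, context, instance, host=None):
        shared_storage = True
        data = self.driver.check_instance_shared_storage_local(
            context, instance)
        if data:
            # With host=None the RPC client falls back to instance.host,
            # which is exactly the misrouting this change fixes.
            shared_storage = self.compute_rpcapi.check_instance_shared_storage(
                context, data, instance=instance, host=host)
            self.driver.check_instance_shared_storage_cleanup(context, data)
        return shared_storage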
@@ -14,7 +14,6 @@ import fixtures
 import threading
 
 from oslo_log import log as logging
-from oslo_messaging import MessagingTimeout
 
 from nova.tests.functional.libvirt import base
 
@@ -128,12 +127,5 @@ class TestComputeStartupInProgressEvacuation(
         # finishes
         contd.set()
 
-        # This is bug 2085975 as the shared storage check fails with exception
-        # and therefore nova defaults to not cleaning up the local disk of the
-        # instance.
-        self.assertEqual(MessagingTimeout, type(rpc_exception))
-        # when host is not passed rpc defaults to instance.host
-        self.assertIsNone(rpc_target_host)
-        # when fixed it should not fail
-        # self.assertIsNone(rpc_exception)
-        # self.assertEqual('compute2', rpc_target_host)
+        self.assertIsNone(rpc_exception)
+        self.assertEqual('compute2', rpc_target_host)
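The rpc_exception and rpc_target_host values asserted above come from the
functional test's RPC interception, which is set up outside this hunk. For
illustration only, the general pattern for capturing where such a call is
routed, using unittest.mock and hypothetical names rather than the real
fixture code:

    from unittest import mock

    # Hypothetical illustration of capturing the host an RPC-style call is
    # routed to; the real functional test wires this up via its own fixtures.
    captured = {}

    def record_call(ctxt, data, instance=None, host=None):
        captured['host'] = host
        return True  # pretend the storage is shared

    rpcapi = mock.Mock()
    rpcapi.check_instance_shared_storage.side_effect = record_call

    rpcapi.check_instance_shared_storage(
        'ctxt', {'filename': 'tmpfilename'}, instance='inst', host='compute2')
    assert captured['host'] == 'compute2'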
@@ -7667,6 +7667,7 @@ class ComputeTestCase(BaseTestCase,
 
         migration = objects.Migration(instance_uuid=evacuated_instance.uuid)
         migration.source_node = NODENAME
+        migration.dest_compute = "dest-host"
         mock_get_filter.return_value = [migration]
         instances.append(evacuated_instance)
         mock_get_inst.return_value = instances
@@ -7694,7 +7695,8 @@ class ComputeTestCase(BaseTestCase,
         mock_get_inst.assert_called_once_with(fake_context)
         mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
         mock_get_blk.assert_called_once_with(fake_context, evacuated_instance)
-        mock_is_inst.assert_called_once_with(fake_context, evacuated_instance)
+        mock_is_inst.assert_called_once_with(
+            fake_context, evacuated_instance, host='dest-host')
         mock_destroy.assert_called_once_with(fake_context, evacuated_instance,
                                              'fake_network_info',
                                              'fake_bdi', False)
@@ -7739,6 +7741,7 @@ class ComputeTestCase(BaseTestCase,
 
         migration = objects.Migration(instance_uuid=evacuated_instance.uuid)
         migration.source_node = NODENAME
+        migration.dest_compute = 'dest-host'
         mock_get_filter.return_value = [migration]
         instances.append(evacuated_instance)
         mock_get_drv.return_value = instances
@@ -7765,7 +7768,7 @@ class ComputeTestCase(BaseTestCase,
         mock_check.assert_called_once_with(fake_context,
                                            {'filename': 'tmpfilename'},
                                            instance=evacuated_instance,
-                                           host=None)
+                                           host='dest-host')
         mock_check_clean.assert_called_once_with(fake_context,
                                                  {'filename': 'tmpfilename'})
         mock_destroy.assert_called_once_with(fake_context, evacuated_instance,
@@ -7811,6 +7814,7 @@ class ComputeTestCase(BaseTestCase,
 
         migration = objects.Migration(instance_uuid=evacuated_instance.uuid)
         migration.source_node = NODENAME
+        migration.dest_compute = 'dest-host'
         mock_get_filter.return_value = [migration]
         instances.append(evacuated_instance)
         mock_get_inst.return_value = instances
@@ -5086,6 +5086,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         # Consider the migration successful
         migration.status = 'done'
         migration.source_node = 'fake-node'
+        migration.dest_compute = 'dest-host'
 
         node_cache = {
             uuids.our_node_uuid:
@@ -5151,11 +5152,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         # compute was down
         migration_1.status = 'done'
         migration_1.source_node = 'deleted-node'
+        migration_1.dest_compute = 'dest-host'
 
         migration_2 = objects.Migration(instance_uuid=instance_2.uuid)
         # Consider the migration successful
         migration_2.status = 'done'
         migration_2.source_node = 'fake-node'
+        migration_2.dest_compute = 'dest-host'
 
         node_cache = {
             uuids.our_node_uuid:
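Since the fixed code dereferences migration.dest_compute for each evacuation
record, the Migration fixtures in these unit tests now have to populate that
field, which is all the hunks above add. A minimal assumed fixture shape
(field values mirror the tests above; not taken verbatim from them):

    from nova import objects
    from oslo_utils.fixture import uuidsentinel as uuids

    # In the real tests register_all() is handled by the test base class.
    objects.register_all()
    migration = objects.Migration(instance_uuid=uuids.instance)
    migration.status = 'done'
    migration.source_node = 'fake-node'
    migration.dest_compute = 'dest-host'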