From 2c76fd3bafc90b23ed9d9e6a7f84919082dc0076 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Wed, 30 Oct 2024 13:24:41 +0100
Subject: [PATCH] Route shared storage RPC to evac dest at startup

If a compute is started up while an evacuation of an instance from this
host is still in progress then the destroy_evacuated_instances call
will try to check if the instance is on shared storage to decide if the
local disk needs to be deleted from the source node or not.

However this call uses the instance.host to target the RPC call. If the
evacuation is still ongoing then the instance.host might still be set
to the source node. This means the source node during init_host tries
to call RPC on itself. This will always time out as the RPC server is
only started after init_host. Also it is wrong as the shared storage
check RPC should be called on another host.

Moreover when this wrongly routed RPC times out the source compute logs
the exception, ignores it, and assumes the disk is on shared storage,
so it won't clean it up. This means that a later evacuation of this VM
targeting this node will fail as the instance directory is already
present on the node.

The fix is simple: the destroy_evacuated_instances call should always
send the shared storage check RPC call to the destination node of the
evacuation based on the migration record. This is correct whether the
evacuation is still in progress or has already finished.

Closes-Bug: #2085975
Change-Id: If5ad213649d68da995dad146f0a0c3cacc369309
---
 nova/compute/manager.py                               |  3 ++-
 .../tests/functional/regressions/test_bug_2085975.py  | 12 ++----------
 nova/tests/unit/compute/test_compute.py               |  8 ++++++--
 nova/tests/unit/compute/test_compute_mgr.py           |  3 +++
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index d5434a15cd0b..64fc196bf03c 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -826,8 +826,9 @@ class ComputeManager(manager.Manager):
                     context, instance)
                 bdi = self._get_instance_block_device_info(context,
                                                            instance)
+                evac = evacuations[instance.uuid]
                 destroy_disks = not (self._is_instance_storage_shared(
-                    context, instance))
+                    context, instance, host=evac.dest_compute))
             except exception.InstanceNotFound:
                 network_info = network_model.NetworkInfo()
                 bdi = {}
diff --git a/nova/tests/functional/regressions/test_bug_2085975.py b/nova/tests/functional/regressions/test_bug_2085975.py
index 89fa009a0acc..a3e2424978a7 100644
--- a/nova/tests/functional/regressions/test_bug_2085975.py
+++ b/nova/tests/functional/regressions/test_bug_2085975.py
@@ -14,7 +14,6 @@
 import fixtures
 import threading
 
 from oslo_log import log as logging
-from oslo_messaging import MessagingTimeout
 
 from nova.tests.functional.libvirt import base
@@ -128,12 +127,5 @@ class TestComputeStartupInProgressEvacuation(
         # finishes
         contd.set()
 
-        # This is bug 2085975 as the shared storage check fails with exception
-        # and therefore nova defaults to not cleaning up the local disk of the
-        # instance.
-        self.assertEqual(MessagingTimeout, type(rpc_exception))
-        # when host is not passed rpc defaults to instance.host
-        self.assertIsNone(rpc_target_host)
-        # when fixed it should not fail
-        # self.assertIsNone(rpc_exception)
-        # self.assertEqual('compute2', rpc_target_host)
+        self.assertIsNone(rpc_exception)
+        self.assertEqual('compute2', rpc_target_host)
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index cecefc2c9c3d..1fc2c6d26672 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -7667,6 +7667,7 @@ class ComputeTestCase(BaseTestCase,
 
         migration = objects.Migration(instance_uuid=evacuated_instance.uuid)
         migration.source_node = NODENAME
+        migration.dest_compute = "dest-host"
         mock_get_filter.return_value = [migration]
         instances.append(evacuated_instance)
         mock_get_inst.return_value = instances
@@ -7694,7 +7695,8 @@ class ComputeTestCase(BaseTestCase,
         mock_get_inst.assert_called_once_with(fake_context)
         mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
         mock_get_blk.assert_called_once_with(fake_context, evacuated_instance)
-        mock_is_inst.assert_called_once_with(fake_context, evacuated_instance)
+        mock_is_inst.assert_called_once_with(
+            fake_context, evacuated_instance, host='dest-host')
         mock_destroy.assert_called_once_with(fake_context, evacuated_instance,
                                              'fake_network_info',
                                              'fake_bdi', False)
@@ -7739,6 +7741,7 @@ class ComputeTestCase(BaseTestCase,
 
         migration = objects.Migration(instance_uuid=evacuated_instance.uuid)
         migration.source_node = NODENAME
+        migration.dest_compute = 'dest-host'
         mock_get_filter.return_value = [migration]
         instances.append(evacuated_instance)
         mock_get_drv.return_value = instances
@@ -7765,7 +7768,7 @@ class ComputeTestCase(BaseTestCase,
         mock_check.assert_called_once_with(fake_context,
                                            {'filename': 'tmpfilename'},
                                            instance=evacuated_instance,
-                                           host=None)
+                                           host='dest-host')
         mock_check_clean.assert_called_once_with(fake_context,
                                                  {'filename': 'tmpfilename'})
         mock_destroy.assert_called_once_with(fake_context, evacuated_instance,
@@ -7811,6 +7814,7 @@ class ComputeTestCase(BaseTestCase,
 
         migration = objects.Migration(instance_uuid=evacuated_instance.uuid)
         migration.source_node = NODENAME
+        migration.dest_compute = 'dest-host'
        mock_get_filter.return_value = [migration]
         instances.append(evacuated_instance)
         mock_get_inst.return_value = instances
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 4b1b22f78009..5d9f10e26e53 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -5086,6 +5086,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         # Consider the migration successful
         migration.status = 'done'
         migration.source_node = 'fake-node'
+        migration.dest_compute = 'dest-host'
 
         node_cache = {
             uuids.our_node_uuid:
@@ -5151,11 +5152,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         # compute was down
         migration_1.status = 'done'
         migration_1.source_node = 'deleted-node'
+        migration_1.dest_compute = 'dest-host'
 
         migration_2 = objects.Migration(instance_uuid=instance_2.uuid)
         # Consider the migration successful
         migration_2.status = 'done'
         migration_2.source_node = 'fake-node'
+        migration_2.dest_compute = 'dest-host'
 
         node_cache = {
             uuids.our_node_uuid:
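
A minimal sketch of the routing idea described in the commit message,
kept outside the patch itself. The names below (FakeMigration,
is_shared_on_host, should_destroy_local_disks) are hypothetical
stand-ins, not Nova APIs; the actual change is only the
host=evac.dest_compute argument in the manager.py hunk above.

    # Illustrative sketch only; all names here are hypothetical stand-ins.
    class FakeMigration:
        """Stand-in for the migration record of an evacuation."""
        def __init__(self, source_compute, dest_compute):
            self.source_compute = source_compute
            self.dest_compute = dest_compute


    def should_destroy_local_disks(is_shared_on_host, context, instance,
                                   migration):
        # Route the shared storage check to the evacuation destination taken
        # from the migration record. Using instance.host instead can still
        # point at the source node while the evacuation is in progress, so
        # the RPC would target the compute that is just starting up and
        # always time out.
        shared = is_shared_on_host(context, instance,
                                   host=migration.dest_compute)
        # Only delete the local disks when the storage is not shared.
        return not shared


    # Example: during an in-progress evacuation instance.host may still be
    # 'compute1' (the source), but the check is sent to 'compute2':
    #   migration = FakeMigration('compute1', 'compute2')
    #   destroy = should_destroy_local_disks(rpc_check, ctxt, inst, migration)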