From 63ffba7496182f6f6f49a380f3c639fc3ded9772 Mon Sep 17 00:00:00 2001 From: "Erlon R. Cruz" Date: Tue, 7 Dec 2021 17:39:58 -0300 Subject: [PATCH] Fix pre_live_migration rollback During the pre live migration process, Nova performs most of the tasks related to the creation and operation of the VM in the destination host. That is done without interrupting any of the hardware in the source host. If the pre_live_migration fails, those same operations should be rolled back. Currently nova shares _rollback_live_migration for both live migration and pre_live_migration rollbacks, which causes the source host to try to re-attach network interfaces that were never actually detached. This patch fixes that by adding a conditional that lets nova take different paths for live migration and pre_live_migration rollbacks. Closes-bug: #1944619 Change-Id: I784190ac356695dd508e0ad8ec31d8eaa3ebee56 --- nova/compute/manager.py | 16 ++++++++++++---- .../functional/regressions/test_bug_1944619.py | 8 +------- nova/tests/unit/compute/test_compute_mgr.py | 6 ++++-- .../bug-1944619-fix-live-migration-rollback.yaml | 10 ++++++++++ 4 files changed, 27 insertions(+), 13 deletions(-) create mode 100644 releasenotes/notes/bug-1944619-fix-live-migration-rollback.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 44352909a2a8..4df1c4112c35 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -8413,7 +8413,8 @@ class ComputeManager(manager.Manager): migrate_data.migration = migration self._rollback_live_migration(context, instance, dest, migrate_data=migrate_data, - source_bdms=source_bdms) + source_bdms=source_bdms, + pre_live_migration=True) def _do_pre_live_migration_from_source(self, context, dest, instance, block_migration, migration, @@ -9167,7 +9168,8 @@ class ComputeManager(manager.Manager): def _rollback_live_migration(self, context, instance, dest, migrate_data=None, migration_status='failed', - source_bdms=None): + 
source_bdms=None, + pre_live_migration=False): """Recovers Instance/volume state from migrating -> running. :param context: security context @@ -9217,8 +9219,14 @@ class ComputeManager(manager.Manager): # for nova-network) # NOTE(mriedem): This is a no-op for neutron. self.network_api.setup_networks_on_host(context, instance, self.host) - self.driver.rollback_live_migration_at_source(context, instance, - migrate_data) + + # NOTE(erlon): We should make sure that rollback_live_migration_at_src + # is not called in the pre_live_migration rollback as that will trigger + # the src host to re-attach interfaces which were not detached + # previously. + if not pre_live_migration: + self.driver.rollback_live_migration_at_source(context, instance, + migrate_data) # NOTE(lyarwood): Fetch the current list of BDMs, disconnect any # connected volumes from the dest and delete any volume attachments diff --git a/nova/tests/functional/regressions/test_bug_1944619.py b/nova/tests/functional/regressions/test_bug_1944619.py index 3274ff5a158b..82b7475dca8f 100644 --- a/nova/tests/functional/regressions/test_bug_1944619.py +++ b/nova/tests/functional/regressions/test_bug_1944619.py @@ -72,11 +72,5 @@ class TestRollbackWithHWOffloadedOVS( self._live_migrate(self.server, migration_expected_state='failed', server_expected_state='MIGRATING') - # FIXME(erlon): In the current behavior, - # rollback_live_migration_at_source is called if an error happens - # during the pre_live_migration phase on the destination and therefore - # triggers the observed bug. rollback_live_migration_at_source should - # *not* be called for when errors happen during pre_live_migration - # phase. 
- mlpr.assert_called_once() + mlpr.assert_not_called() mlpp.assert_called_once() diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index cd1a9369c4a6..760ea79e8774 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -9539,7 +9539,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, self.assertEqual('error', self.migration.status) mock_rollback_live_mig.assert_called_once_with( self.context, self.instance, 'dest-host', - migrate_data=migrate_data, source_bdms=source_bdms) + migrate_data=migrate_data, source_bdms=source_bdms, + pre_live_migration=True) @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration') @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration') @@ -9574,7 +9575,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, self.assertEqual('error', self.migration.status) mock_rollback_live_mig.assert_called_once_with( self.context, self.instance, 'dest-host', - migrate_data=migrate_data, source_bdms=source_bdms) + migrate_data=migrate_data, source_bdms=source_bdms, + pre_live_migration=True) @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration') @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration') diff --git a/releasenotes/notes/bug-1944619-fix-live-migration-rollback.yaml b/releasenotes/notes/bug-1944619-fix-live-migration-rollback.yaml new file mode 100644 index 000000000000..b6c68ed49f2f --- /dev/null +++ b/releasenotes/notes/bug-1944619-fix-live-migration-rollback.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + Instances with hardware offloaded ovs ports no longer lose connectivity + after failed live migrations. The driver.rollback_live_migration_at_source + function is no longer called during pre_live_migration rollback + which previously resulted in connectivity loss following a failed live + migration. See `Bug 1944619`_ for more details. + + .. 
_Bug 1944619: https://bugs.launchpad.net/nova/+bug/1944619