diff --git a/lower-constraints.txt b/lower-constraints.txt index ca8ca7ce08d3..913076f87f65 100644 --- a/lower-constraints.txt +++ b/lower-constraints.txt @@ -66,7 +66,7 @@ os-brick==2.6.2 os-client-config==1.29.0 os-resource-classes==0.4.0 os-service-types==1.7.0 -os-traits==2.0.0 +os-traits==2.1.0 os-vif==1.14.0 os-win==3.0.0 os-xenapi==0.3.3 diff --git a/nova/compute/api.py b/nova/compute/api.py index d682933a9d93..d1ee5d18cc2a 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -25,6 +25,7 @@ import re import string from castellan import key_manager +import os_traits from oslo_log import log as logging from oslo_messaging import exceptions as oslo_exceptions from oslo_serialization import base64 as base64utils @@ -103,6 +104,7 @@ AGGREGATE_ACTION_ADD = 'Add' MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED = 38 MIN_COMPUTE_CROSS_CELL_RESIZE = 47 +MIN_COMPUTE_SAME_HOST_COLD_MIGRATE = 48 # FIXME(danms): Keep a global cache of the cells we find the # first time we look. This needs to be refreshed on a timer or @@ -3881,8 +3883,7 @@ class API(base.Base): validate_pci=True) filter_properties = {'ignore_hosts': []} - - if not CONF.allow_resize_to_same_host: + if not self._allow_resize_to_same_host(same_instance_type, instance): filter_properties['ignore_hosts'].append(instance.host) request_spec = objects.RequestSpec.get_by_instance_uuid( @@ -3931,6 +3932,57 @@ class API(base.Base): request_spec=request_spec, do_cast=True) + def _allow_resize_to_same_host(self, cold_migrate, instance): + """Contains logic for excluding the instance.host on resize/migrate. + + If performing a cold migration and the compute node resource provider + reports the COMPUTE_SAME_HOST_COLD_MIGRATE trait then same-host cold + migration is allowed otherwise it is not and the current instance.host + should be excluded as a scheduling candidate. + + :param cold_migrate: true if performing a cold migration, false + for resize + :param instance: Instance object being resized or cold migrated + :returns: True if same-host resize/cold migrate is allowed, False + otherwise + """ + if cold_migrate: + # Check to see if the compute node resource provider on which the + # instance is running has the COMPUTE_SAME_HOST_COLD_MIGRATE + # trait. + # Note that we check this here in the API since we cannot + # pre-filter allocation candidates in the scheduler using this + # trait as it would not work. For example, libvirt nodes will not + # report the trait but using it as a forbidden trait filter when + # getting allocation candidates would still return libvirt nodes + # which means we could attempt to cold migrate to the same libvirt + # node, which would fail. + ctxt = instance._context + cn = objects.ComputeNode.get_by_host_and_nodename( + ctxt, instance.host, instance.node) + traits = self.placementclient.get_provider_traits( + ctxt, cn.uuid).traits + # If the provider has the trait it is (1) new enough to report that + # trait and (2) supports cold migration on the same host. + if os_traits.COMPUTE_SAME_HOST_COLD_MIGRATE in traits: + allow_same_host = True + else: + # TODO(mriedem): Remove this compatibility code after one + # release. If the compute is old we will not know if it + # supports same-host cold migration so we fallback to config. + service = objects.Service.get_by_compute_host(ctxt, cn.host) + if service.version >= MIN_COMPUTE_SAME_HOST_COLD_MIGRATE: + # The compute is new enough to report the trait but does + # not so same-host cold migration is not allowed. + allow_same_host = False + else: + # The compute is not new enough to report the trait so we + # fallback to config. + allow_same_host = CONF.allow_resize_to_same_host + else: + allow_same_host = CONF.allow_resize_to_same_host + return allow_same_host + @check_instance_lock @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED, vm_states.PAUSED, vm_states.SUSPENDED]) diff --git a/nova/objects/service.py b/nova/objects/service.py index 413827f1b383..0ec10acae0ab 100644 --- a/nova/objects/service.py +++ b/nova/objects/service.py @@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__) # NOTE(danms): This is the global service version counter -SERVICE_VERSION = 47 +SERVICE_VERSION = 48 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any @@ -176,6 +176,8 @@ SERVICE_VERSION_HISTORY = ( # Version 47: Compute RPC v5.10: # finish_revert_snapshot_based_resize_at_source {'compute_rpc': '5.10'}, + # Version 48: Drivers report COMPUTE_SAME_HOST_COLD_MIGRATE trait. + {'compute_rpc': '5.10'}, ) diff --git a/nova/tests/functional/test_cold_migrate.py b/nova/tests/functional/test_cold_migrate.py new file mode 100644 index 000000000000..5b6aa5bacc4c --- /dev/null +++ b/nova/tests/functional/test_cold_migrate.py @@ -0,0 +1,122 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import mock + +from nova.compute import api as compute_api +from nova import context as nova_context +from nova import objects +from nova import test +from nova.tests.functional import integrated_helpers +from nova.tests.unit import fake_notifier + + +class ColdMigrationDisallowSameHost( + integrated_helpers.ProviderUsageBaseTestCase): + """Tests cold migrate where the source host does not have the + COMPUTE_SAME_HOST_COLD_MIGRATE trait. + """ + compute_driver = 'fake.MediumFakeDriver' + + def setUp(self): + super(ColdMigrationDisallowSameHost, self).setUp() + # Start one compute service which will use the fake virt driver + # which disallows cold migration to the same host. + self._start_compute('host1') + + def _wait_for_migrate_no_valid_host(self, error='NoValidHost'): + event = fake_notifier.wait_for_versioned_notifications( + 'compute_task.migrate_server.error')[0] + self.assertEqual(error, + event['payload']['nova_object.data']['reason'][ + 'nova_object.data']['exception']) + + def test_cold_migrate_same_host_not_supported(self): + """Simple test to show that you cannot cold-migrate to the same host + when the resource provider does not expose the + COMPUTE_SAME_HOST_COLD_MIGRATE trait. + """ + server = self._create_server(networks='none') + # The fake driver does not report COMPUTE_SAME_HOST_COLD_MIGRATE + # so cold migration should fail since we only have one host. + self.api.post_server_action(server['id'], {'migrate': None}) + self._wait_for_migrate_no_valid_host() + + def test_cold_migrate_same_host_old_compute_disallow(self): + """Upgrade compat test where the resource provider does not report + the COMPUTE_SAME_HOST_COLD_MIGRATE trait but the compute service is + old so the API falls back to the allow_resize_to_same_host config which + defaults to False. + """ + server = self._create_server(networks='none') + # Stub the compute service version check to make the compute service + # appear old. + fake_service = objects.Service() + fake_service.version = ( + compute_api.MIN_COMPUTE_SAME_HOST_COLD_MIGRATE - 1) + with mock.patch('nova.objects.Service.get_by_compute_host', + return_value=fake_service) as mock_get_service: + self.api.post_server_action(server['id'], {'migrate': None}) + mock_get_service.assert_called_once_with( + test.MatchType(nova_context.RequestContext), 'host1') + # Since allow_resize_to_same_host defaults to False scheduling failed + # since there are no other hosts. + self._wait_for_migrate_no_valid_host() + + def test_cold_migrate_same_host_old_compute_allow(self): + """Upgrade compat test where the resource provider does not report + the COMPUTE_SAME_HOST_COLD_MIGRATE trait but the compute service is + old so the API falls back to the allow_resize_to_same_host config which + in this test is set to True. + """ + self.flags(allow_resize_to_same_host=True) + server = self._create_server(networks='none') + # Stub the compute service version check to make the compute service + # appear old. + fake_service = objects.Service() + fake_service.version = ( + compute_api.MIN_COMPUTE_SAME_HOST_COLD_MIGRATE - 1) + with mock.patch('nova.objects.Service.get_by_compute_host', + return_value=fake_service) as mock_get_service: + self.api.post_server_action(server['id'], {'migrate': None}) + mock_get_service.assert_called_once_with( + test.MatchType(nova_context.RequestContext), 'host1') + # In this case the compute is old so the API falls back to checking + # allow_resize_to_same_host which says same-host cold migrate is + # allowed so the scheduler sends the request to the only compute + # available but the virt driver says same-host cold migrate is not + # supported and raises UnableToMigrateToSelf. A reschedule is sent + # to conductor which results in MaxRetriesExceeded since there are no + # alternative hosts. + self._wait_for_migrate_no_valid_host(error='MaxRetriesExceeded') + + +class ColdMigrationAllowSameHost( + integrated_helpers.ProviderUsageBaseTestCase): + """Tests cold migrate where the source host has the + COMPUTE_SAME_HOST_COLD_MIGRATE trait. + """ + compute_driver = 'fake.SameHostColdMigrateDriver' + + def setUp(self): + super(ColdMigrationAllowSameHost, self).setUp() + self._start_compute('host1') + + def test_cold_migrate_same_host_supported(self): + """Simple test to show that you can cold-migrate to the same host + when the resource provider exposes the COMPUTE_SAME_HOST_COLD_MIGRATE + trait. + """ + server = self._create_server(networks='none') + self.api.post_server_action(server['id'], {'migrate': None}) + server = self._wait_for_state_change(server, 'VERIFY_RESIZE') + self.assertEqual('host1', server['OS-EXT-SRV-ATTR:host']) diff --git a/nova/tests/unit/compute/test_compute_api.py b/nova/tests/unit/compute/test_compute_api.py index b6d3066abaae..aa5b4a9e7b97 100644 --- a/nova/tests/unit/compute/test_compute_api.py +++ b/nova/tests/unit/compute/test_compute_api.py @@ -1741,6 +1741,7 @@ class _ComputeAPIUnitTestMixIn(object): @mock.patch('nova.compute.api.API.get_instance_host_status', new=mock.Mock(return_value=fields_obj.HostStatus.UP)) + @mock.patch('nova.compute.api.API._allow_resize_to_same_host') @mock.patch('nova.compute.utils.is_volume_backed_instance', return_value=False) @mock.patch('nova.compute.api.API._validate_flavor_image_nostatus') @@ -1757,6 +1758,7 @@ class _ComputeAPIUnitTestMixIn(object): mock_get_by_instance_uuid, mock_get_flavor, mock_upsize, mock_inst_save, mock_count, mock_limit, mock_record, mock_migration, mock_validate, mock_is_vol_backed, + mock_allow_resize_to_same_host, flavor_id_passed=True, same_host=False, allow_same_host=False, project_id=None, @@ -1768,6 +1770,7 @@ class _ComputeAPIUnitTestMixIn(object): allow_cross_cell_resize=False): self.flags(allow_resize_to_same_host=allow_same_host) + mock_allow_resize_to_same_host.return_value = allow_same_host params = {} if project_id is not None: @@ -1871,6 +1874,7 @@ class _ComputeAPIUnitTestMixIn(object): self.assertEqual( allow_cross_cell_resize, fake_spec.requested_destination.allow_cross_cell_move) + mock_allow_resize_to_same_host.assert_called_once() if host_name: mock_get_all_by_host.assert_called_once_with( diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 755b4e0f756b..14a8991ec9ee 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -7705,6 +7705,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, super(ComputeManagerMigrationTestCase, self).setUp() fake_notifier.stub_notifier(self) self.addCleanup(fake_notifier.reset) + self.flags(compute_driver='fake.SameHostColdMigrateDriver') self.compute = manager.ComputeManager() self.context = context.RequestContext(fakes.FAKE_USER_ID, fakes.FAKE_PROJECT_ID) @@ -9658,7 +9659,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, @mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage') @mock.patch.object(objects.Migration, 'get_by_id') - @mock.patch.object(nova.virt.fake.SmallFakeDriver, 'live_migration_abort') + @mock.patch.object(nova.virt.fake.FakeDriver, 'live_migration_abort') @mock.patch('nova.compute.utils.notify_about_instance_action') def test_live_migration_abort(self, mock_notify_action, mock_driver, mock_get_migration, mock_notify): @@ -9711,7 +9712,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, @mock.patch.object(compute_utils, 'add_instance_fault_from_exc') @mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage') @mock.patch.object(objects.Migration, 'get_by_id') - @mock.patch.object(nova.virt.fake.SmallFakeDriver, 'live_migration_abort') + @mock.patch.object(nova.virt.fake.FakeDriver, 'live_migration_abort') @mock.patch('nova.compute.utils.notify_about_instance_action') def test_live_migration_abort_not_supported(self, mock_notify_action, mock_driver, diff --git a/nova/virt/driver.py b/nova/virt/driver.py index e6cf491ee7f9..f8f57ecab88b 100644 --- a/nova/virt/driver.py +++ b/nova/virt/driver.py @@ -124,6 +124,9 @@ CAPABILITY_TRAITS_MAP = { "supports_image_type_vmdk": os_traits.COMPUTE_IMAGE_TYPE_VMDK, # Added in os-traits 2.0.0 "supports_image_type_ploop": os_traits.COMPUTE_IMAGE_TYPE_PLOOP, + + # Added in os-traits 2.1.0. + "supports_migrate_to_same_host": os_traits.COMPUTE_SAME_HOST_COLD_MIGRATE, } diff --git a/nova/virt/fake.py b/nova/virt/fake.py index dbc58b0c1b95..3f6047307c55 100644 --- a/nova/virt/fake.py +++ b/nova/virt/fake.py @@ -104,7 +104,7 @@ class FakeDriver(driver.ComputeDriver): capabilities = { "has_imagecache": True, "supports_evacuate": True, - "supports_migrate_to_same_host": True, + "supports_migrate_to_same_host": False, "supports_attach_interface": True, "supports_device_tagging": True, "supports_tagged_attach_interface": True, @@ -702,6 +702,12 @@ class MediumFakeDriver(FakeDriver): local_gb = 1028 +class SameHostColdMigrateDriver(MediumFakeDriver): + """MediumFakeDriver variant that supports same-host cold migrate.""" + capabilities = dict(FakeDriver.capabilities, + supports_migrate_to_same_host=True) + + class PowerUpdateFakeDriver(SmallFakeDriver): # A specific fake driver for the power-update external event testing. diff --git a/releasenotes/notes/bug-1748697-COMPUTE_SAME_HOST_COLD_MIGRATE-19ed64bf48bb1fc7.yaml b/releasenotes/notes/bug-1748697-COMPUTE_SAME_HOST_COLD_MIGRATE-19ed64bf48bb1fc7.yaml new file mode 100644 index 000000000000..b2ea5aba5945 --- /dev/null +++ b/releasenotes/notes/bug-1748697-COMPUTE_SAME_HOST_COLD_MIGRATE-19ed64bf48bb1fc7.yaml @@ -0,0 +1,18 @@ +--- +fixes: + - | + This release contains a fix for `bug 1748697`_ which distinguishes between + resize and cold migrate such that the ``allow_resize_to_same_host`` config + option is no longer used to determine if a server can be cold migrated to + the same compute service host. Now when requesting a cold migration the + API will check if the source compute node resource provider has the + ``COMPUTE_SAME_HOST_COLD_MIGRATE`` trait and if so the scheduler can select + the source host. Note that the only in-tree compute driver that supports + cold migration to the same host is ``VMwareVCDriver``. + + To support rolling upgrades with N-1 computes if a node does not report the + trait and is old the API will fallback to the ``allow_resize_to_same_host`` + config option value. That compatibility will be removed in a subsequent + release. + + .. _bug 1748697: https://launchpad.net/bugs/1748697 diff --git a/requirements.txt b/requirements.txt index 5fd2e8b4e6b9..a0855a79cd52 100644 --- a/requirements.txt +++ b/requirements.txt @@ -55,7 +55,7 @@ psutil>=3.2.2 # BSD oslo.versionedobjects>=1.35.0 # Apache-2.0 os-brick>=2.6.2 # Apache-2.0 os-resource-classes>=0.4.0 # Apache-2.0 -os-traits>=2.0.0 # Apache-2.0 +os-traits>=2.1.0 # Apache-2.0 os-vif>=1.14.0 # Apache-2.0 os-win>=3.0.0 # Apache-2.0 castellan>=0.16.0 # Apache-2.0