Merge "Fail to live migration if instance has a NUMA topology"

2018-12-18 02:56:40 +00:00 · 2018-12-18 02:56:40 +00:00 · 5d748001c2
commit 5d748001c2
parent 2e9356374e ae2e5650d1
12 changed files with 179 additions and 13 deletions
--- a/doc/source/admin/adv-config.rst
+++ b/doc/source/admin/adv-config.rst
@ -20,6 +20,8 @@ vital. In these cases, instances can be expected to deliver near-native
 performance. The Compute service provides features to improve individual
 instance for these kind of workloads.
 .. include:: /common/numa-live-migration-warning.txt
 .. toctree::
   :maxdepth: 2
--- a/doc/source/admin/configuring-migrations.rst
+++ b/doc/source/admin/configuring-migrations.rst
@ -19,7 +19,8 @@ This document covers live migrations using the
 .. note::
   Not all Compute service hypervisor drivers support live-migration, or
-   support all live-migration features.
+   support all live-migration features. Similarly not all compute service
   features are supported.
   Consult :doc:`/user/support-matrix` to determine which hypervisors
   support live-migration.
--- a/doc/source/admin/cpu-topologies.rst
+++ b/doc/source/admin/cpu-topologies.rst
@ -7,6 +7,8 @@ control over how instances run on hypervisor CPUs and the topology of virtual
 CPUs available to instances. These features help minimize latency and maximize
 performance.
 .. include:: /common/numa-live-migration-warning.txt
 SMP, NUMA, and SMT
 ~~~~~~~~~~~~~~~~~~
--- a/doc/source/common/numa-live-migration-warning.txt
+++ b/doc/source/common/numa-live-migration-warning.txt
@ -0,0 +1,10 @@
 .. important::
   Unless :oslo.config:option:`specifically enabled
   <workarounds.enable_numa_live_migration>`, live migration is not currently
   possible for instances with a NUMA topology when using the libvirt driver.
   A NUMA topology may be specified explicitly or can be added implicitly due
   to the use of CPU pinning or huge pages. Refer to `bug #1289064`__ for more
   information.
   __ https://bugs.launchpad.net/nova/+bug/1289064
--- a/doc/source/user/feature-classification.rst
+++ b/doc/source/user/feature-classification.rst
@ -65,6 +65,8 @@ create a particular service. It is common for this workloads needing
 bare metal like performance, i.e. low latency and close to line speed
 performance.
 .. include:: /common/numa-live-migration-warning.txt
 .. feature_matrix:: feature-matrix-nfv.ini
 .. _matrix-hpc:
--- a/nova/api/openstack/compute/migrate_server.py
+++ b/nova/api/openstack/compute/migrate_server.py
@ -101,7 +101,11 @@ class MigrateServerController(wsgi.Controller):
            disk_over_commit = strutils.bool_from_string(disk_over_commit,
                                                         strict=True)
-        instance = common.get_instance(self.compute_api, context, id)
+        # NOTE(stephenfin): we need 'numa_topology' because of the
        # 'LiveMigrationTask._check_instance_has_no_numa' check in the
        # conductor
        instance = common.get_instance(self.compute_api, context, id,
                                       expected_attrs=['numa_topology'])
        try:
            self.compute_api.live_migrate(context, instance, block_migration,
                                          disk_over_commit, host, force,
--- a/nova/conductor/tasks/live_migrate.py
+++ b/nova/conductor/tasks/live_migrate.py
@ -23,6 +23,7 @@ from nova import exception
 from nova.i18n import _
 from nova import network
 from nova import objects
 from nova.objects import fields as obj_fields
 from nova.scheduler import utils as scheduler_utils
 from nova import utils
@ -65,6 +66,7 @@ class LiveMigrationTask(base.TaskBase):
    def _execute(self):
        self._check_instance_is_active()
        self._check_instance_has_no_numa()
        self._check_host_is_up(self.source)
        self._source_cn, self._held_allocations = (
@ -151,6 +153,30 @@ class LiveMigrationTask(base.TaskBase):
                    state=self.instance.power_state,
                    method='live migrate')
    def _check_instance_has_no_numa(self):
        """Prevent live migrations of instances with NUMA topologies."""
        if not self.instance.numa_topology:
            return
        # Only KVM (libvirt) supports NUMA topologies with CPU pinning;
        # HyperV's vNUMA feature doesn't allow specific pinning
        hypervisor_type = objects.ComputeNode.get_by_host_and_nodename(
            self.context, self.source, self.instance.node).hypervisor_type
        if hypervisor_type != obj_fields.HVType.KVM:
            return
        msg = ('Instance has an associated NUMA topology. '
               'Instance NUMA topologies, including related attributes '
               'such as CPU pinning, huge page and emulator thread '
               'pinning information, are not currently recalculated on '
               'live migration. See bug #1289064 for more information.'
               )
        if CONF.workarounds.enable_numa_live_migration:
            LOG.warning(msg, instance=self.instance)
        else:
            raise exception.MigrationPreCheckError(reason=msg)
    def _check_host_is_up(self, host):
        service = objects.Service.get_by_compute_host(self.context, host)
--- a/nova/conf/workarounds.py
+++ b/nova/conf/workarounds.py
@ -184,6 +184,34 @@ deploying the new code during an upgrade.
 Related options:
 * ``[consoleauth]/token_ttl``
 """),
    cfg.BoolOpt(
        'enable_numa_live_migration',
        default=False,
        help="""
 Enable live migration of instances with NUMA topologies.
 Live migration of instances with NUMA topologies is disabled by default
 when using the libvirt driver. This includes live migration of instances with
 CPU pinning or hugepages. CPU pinning and huge page information for such
 instances is not currently re-calculated, as noted in `bug #1289064`_.  This
 means that if instances were already present on the destination host, the
 migrated instance could be placed on the same dedicated cores as these
 instances or use hugepages allocated for another instance. Alternately, if the
 host platforms were not homogeneous, the instance could be assigned to
 non-existent cores or be inadvertently split across host NUMA nodes.
 Despite these known issues, there may be cases where live migration is
 necessary. By enabling this option, operators that are aware of the issues and
 are willing to manually work around them can enable live migration support for
 these instances.
 Related options:
 * ``compute_driver``: Only the libvirt driver is affected.
 .. _bug #1289064: https://bugs.launchpad.net/nova/+bug/1289064
 """),
 ]
--- a/nova/tests/unit/api/openstack/compute/admin_only_action_common.py
+++ b/nova/tests/unit/api/openstack/compute/admin_only_action_common.py
@ -48,6 +48,10 @@ class CommonMixin(object):
        # Reset the mock.
        self.mock_get.reset_mock()
        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']
        uuid = uuidutils.generate_uuid()
        self.mock_get.side_effect = exception.InstanceNotFound(
            instance_id=uuid)
@ -57,7 +61,7 @@ class CommonMixin(object):
                          controller_function,
                          self.req, uuid, body=body_map)
        self.mock_get.assert_called_once_with(self.context, uuid,
-                                              expected_attrs=None,
+                                              expected_attrs=expected_attrs,
                                              cell_down_support=False)
    def _test_action(self, action, body=None, method=None,
@ -65,6 +69,10 @@ class CommonMixin(object):
        # Reset the mock.
        self.mock_get.reset_mock()
        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']
        if method is None:
            method = action.replace('_', '')
        compute_api_args_map = compute_api_args_map or {}
@ -85,13 +93,17 @@ class CommonMixin(object):
            mock_method.assert_called_once_with(self.context, instance, *args,
                                                **kwargs)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                              expected_attrs=None,
+                                              expected_attrs=expected_attrs,
                                              cell_down_support=False)
    def _test_not_implemented_state(self, action, method=None):
        # Reset the mock.
        self.mock_get.reset_mock()
        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']
        if method is None:
            method = action.replace('_', '')
@ -110,7 +122,7 @@ class CommonMixin(object):
            mock_method.assert_called_once_with(self.context, instance,
                                                *args, **kwargs)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                              expected_attrs=None,
+                                              expected_attrs=expected_attrs,
                                              cell_down_support=False)
    def _test_invalid_state(self, action, method=None, body_map=None,
@ -119,6 +131,10 @@ class CommonMixin(object):
        # Reset the mock.
        self.mock_get.reset_mock()
        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']
        if method is None:
            method = action.replace('_', '')
        if body_map is None:
@ -146,7 +162,7 @@ class CommonMixin(object):
            mock_method.assert_called_once_with(self.context, instance,
                                                *args, **kwargs)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                              expected_attrs=None,
+                                              expected_attrs=expected_attrs,
                                              cell_down_support=False)
    def _test_locked_instance(self, action, method=None, body=None,
@ -154,6 +170,10 @@ class CommonMixin(object):
        # Reset the mock.
        self.mock_get.reset_mock()
        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']
        if method is None:
            method = action.replace('_', '')
@ -173,7 +193,7 @@ class CommonMixin(object):
            mock_method.assert_called_once_with(self.context, instance,
                                                *args, **kwargs)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                              expected_attrs=None,
+                                              expected_attrs=expected_attrs,
                                              cell_down_support=False)
    def _test_instance_not_found_in_compute_api(self, action,
@ -181,6 +201,10 @@ class CommonMixin(object):
        # Reset the mock.
        self.mock_get.reset_mock()
        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']
        if method is None:
            method = action.replace('_', '')
        compute_api_args_map = compute_api_args_map or {}
@ -200,7 +224,7 @@ class CommonMixin(object):
            mock_method.assert_called_once_with(self.context, instance,
                                                *args, **kwargs)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                              expected_attrs=None,
+                                              expected_attrs=expected_attrs,
                                              cell_down_support=False)
--- a/nova/tests/unit/api/openstack/compute/test_migrate_server.py
+++ b/nova/tests/unit/api/openstack/compute/test_migrate_server.py
@ -150,7 +150,7 @@ class MigrateServerTestsV21(admin_only_action_common.CommonTests):
                'hostname', self.force, self.async_)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                              expected_attrs=None,
+                                              expected_attrs=['numa_topology'],
                                              cell_down_support=False)
    def test_migrate_live_enabled(self):
@ -228,7 +228,7 @@ class MigrateServerTestsV21(admin_only_action_common.CommonTests):
                self.context, instance, False, self.disk_over_commit,
                'hostname', self.force, self.async_)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                              expected_attrs=None,
+                                              expected_attrs=['numa_topology'],
                                              cell_down_support=False)
    def test_migrate_live_compute_service_unavailable(self):
@ -446,7 +446,7 @@ class MigrateServerTestsV234(MigrateServerTestsV230):
                self.context, instance, None, self.disk_over_commit,
                'hostname', self.force, self.async_)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                              expected_attrs=None,
+                                              expected_attrs=['numa_topology'],
                                              cell_down_support=False)
    def test_migrate_live_unexpected_error(self):
@ -465,7 +465,7 @@ class MigrateServerTestsV234(MigrateServerTestsV230):
                self.context, instance, None, self.disk_over_commit,
                'hostname', self.force, self.async_)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                              expected_attrs=None,
+                                              expected_attrs=['numa_topology'],
                                              cell_down_support=False)
--- a/nova/tests/unit/conductor/tasks/test_live_migrate.py
+++ b/nova/tests/unit/conductor/tasks/test_live_migrate.py
@ -19,6 +19,7 @@ from nova.compute import power_state
 from nova.compute import rpcapi as compute_rpcapi
 from nova.compute import vm_states
 from nova.conductor.tasks import live_migrate
 from nova import context as nova_context
 from nova import exception
 from nova.network import model as network_model
 from nova import objects
@ -39,7 +40,7 @@ fake_selection2 = objects.Selection(service_host="host2", nodename="node2",
 class LiveMigrationTaskTestCase(test.NoDBTestCase):
    def setUp(self):
        super(LiveMigrationTaskTestCase, self).setUp()
-        self.context = "context"
+        self.context = nova_context.get_admin_context()
        self.instance_host = "host"
        self.instance_uuid = uuids.instance
        self.instance_image = "image_ref"
@ -53,12 +54,14 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        self.instance = objects.Instance._from_db_object(
                self.context, objects.Instance(), db_instance)
        self.instance.system_metadata = {'image_hw_disk_bus': 'scsi'}
        self.instance.numa_topology = None
        self.destination = "destination"
        self.block_migration = "bm"
        self.disk_over_commit = "doc"
        self.migration = objects.Migration()
        self.fake_spec = objects.RequestSpec()
        self._generate_task()
        _p = mock.patch('nova.compute.utils.heal_reqspec_is_bfv')
        self.heal_reqspec_is_bfv_mock = _p.start()
        self.addCleanup(_p.stop)
@ -170,6 +173,45 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        self.assertRaises(exception.InstanceInvalidState,
                          self.task._check_instance_is_active)
    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
    def test_check_instance_has_no_numa_passes_no_numa(self, mock_get):
        self.flags(enable_numa_live_migration=False, group='workarounds')
        self.task.instance.numa_topology = None
        mock_get.return_value = objects.ComputeNode(
            uuid=uuids.cn1, hypervisor_type='kvm')
        self.task._check_instance_has_no_numa()
    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
    def test_check_instance_has_no_numa_passes_non_kvm(self, mock_get):
        self.flags(enable_numa_live_migration=False, group='workarounds')
        self.task.instance.numa_topology = objects.InstanceNUMATopology(
            cells=[objects.InstanceNUMACell(id=0, cpuset=set([0]),
                                            memory=1024)])
        mock_get.return_value = objects.ComputeNode(
            uuid=uuids.cn1, hypervisor_type='xen')
        self.task._check_instance_has_no_numa()
    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
    def test_check_instance_has_no_numa_passes_workaround(self, mock_get):
        self.flags(enable_numa_live_migration=True, group='workarounds')
        self.task.instance.numa_topology = objects.InstanceNUMATopology(
            cells=[objects.InstanceNUMACell(id=0, cpuset=set([0]),
                                            memory=1024)])
        mock_get.return_value = objects.ComputeNode(
            uuid=uuids.cn1, hypervisor_type='kvm')
        self.task._check_instance_has_no_numa()
    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
    def test_check_instance_has_no_numa_fails(self, mock_get):
        self.flags(enable_numa_live_migration=False, group='workarounds')
        mock_get.return_value = objects.ComputeNode(
            uuid=uuids.cn1, hypervisor_type='kvm')
        self.task.instance.numa_topology = objects.InstanceNUMATopology(
            cells=[objects.InstanceNUMACell(id=0, cpuset=set([0]),
                                            memory=1024)])
        self.assertRaises(exception.MigrationPreCheckError,
                          self.task._check_instance_has_no_numa)
    @mock.patch.object(objects.Service, 'get_by_compute_host')
    @mock.patch.object(servicegroup.API, 'service_is_up')
    def test_check_instance_host_is_up(self, mock_is_up, mock_get):
--- a/releasenotes/notes/disable-live-migration-with-numa-bc710a1bcde25957.yaml
+++ b/releasenotes/notes/disable-live-migration-with-numa-bc710a1bcde25957.yaml
@ -0,0 +1,25 @@
 ---
 upgrade:
  - |
    Live migration of instances with NUMA topologies is now disabled by default
    when using the libvirt driver. This includes live migration of instances
    with CPU pinning or hugepages. CPU pinning and huge page information for
    such instances is not currently re-calculated, as noted in `bug #1289064`_.
    This means that if instances were already present on the destination host,
    the migrated instance could be placed on the same dedicated cores as these
    instances or use hugepages allocated for another instance. Alternately, if
    the host platforms were not homogeneous, the instance could be assigned to
    non-existent cores or be inadvertently split across host NUMA nodes.
    The `long term solution`_ to these issues is to recalculate the XML on the
    destination node. When this work is completed, the restriction on live
    migration with NUMA topologies will be lifted.
    For operators that are aware of the issues and are able to manually work
    around them, the ``[workarounds] enable_numa_live_migration`` option can
    be used to allow the broken behavior.
    For more information, refer to `bug #1289064`_.
    .. _bug #1289064: https://bugs.launchpad.net/nova/+bug/1289064
    .. _long term solution: https://blueprints.launchpad.net/nova/+spec/numa-aware-live-migration