Merge "Fail to live migration if instance has a NUMA topology"
commit 5d748001c2
@@ -20,6 +20,8 @@ vital. In these cases, instances can be expected to deliver near-native
performance. The Compute service provides features to improve individual
instances for these kinds of workloads.

.. include:: /common/numa-live-migration-warning.txt

.. toctree::
   :maxdepth: 2

@@ -19,7 +19,8 @@ This document covers live migrations using the
.. note::

   Not all Compute service hypervisor drivers support live-migration, or
-   support all live-migration features.
+   support all live-migration features. Similarly, not all compute service
+   features are supported.

Consult :doc:`/user/support-matrix` to determine which hypervisors
support live-migration.

@@ -7,6 +7,8 @@ control over how instances run on hypervisor CPUs and the topology of virtual
CPUs available to instances. These features help minimize latency and maximize
performance.

.. include:: /common/numa-live-migration-warning.txt

SMP, NUMA, and SMT
~~~~~~~~~~~~~~~~~~

doc/source/common/numa-live-migration-warning.txt (new file, 10 lines)
@@ -0,0 +1,10 @@
.. important::

   Unless :oslo.config:option:`specifically enabled
   <workarounds.enable_numa_live_migration>`, live migration is not currently
   possible for instances with a NUMA topology when using the libvirt driver.
   A NUMA topology may be specified explicitly or can be added implicitly due
   to the use of CPU pinning or huge pages. Refer to `bug #1289064`__ for more
   information.

   __ https://bugs.launchpad.net/nova/+bug/1289064
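As the warning notes, a topology may arrive explicitly (``hw:numa_nodes``) or implicitly, via CPU pinning (``hw:cpu_policy=dedicated``) or huge pages (``hw:mem_page_size``). A minimal editorial sketch in Python (the helper below is hypothetical, not Nova code) of the flavor extra specs involved:

    # Hypothetical helper, not part of Nova: flavor extra specs that give
    # an instance a NUMA topology under the libvirt driver.
    IMPLICIT_NUMA_KEYS = ('hw:numa_nodes', 'hw:cpu_policy', 'hw:mem_page_size')

    def implies_numa_topology(extra_specs):
        """Rough check: does this flavor imply a NUMA topology?"""
        return any(key in extra_specs for key in IMPLICIT_NUMA_KEYS)

    # CPU pinning alone is enough to block live migration by default.
    assert implies_numa_topology({'hw:cpu_policy': 'dedicated'})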
@@ -65,6 +65,8 @@ create a particular service. It is common for these workloads to need
bare-metal-like performance, i.e. low latency and close to line-speed
performance.

.. include:: /common/numa-live-migration-warning.txt

.. feature_matrix:: feature-matrix-nfv.ini

.. _matrix-hpc:

@@ -101,7 +101,11 @@ class MigrateServerController(wsgi.Controller):
            disk_over_commit = strutils.bool_from_string(disk_over_commit,
                                                         strict=True)

-        instance = common.get_instance(self.compute_api, context, id)
+        # NOTE(stephenfin): we need 'numa_topology' because of the
+        # 'LiveMigrationTask._check_instance_has_no_numa' check in the
+        # conductor
+        instance = common.get_instance(self.compute_api, context, id,
+                                       expected_attrs=['numa_topology'])
        try:
            self.compute_api.live_migrate(context, instance, block_migration,
                                          disk_over_commit, host, force,
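The NOTE above concerns lazy loading: a field not listed in ``expected_attrs`` is fetched on first access instead, costing an extra round trip once the conductor's NUMA check reads it. A toy analogue (editorial sketch, not Nova's versioned-object machinery):

    # Toy analogue of eager vs. lazy field loading; not Nova code.
    class ToyInstance(object):
        def __init__(self, **eager_fields):
            self.__dict__.update(eager_fields)  # fields loaded up front

        def __getattr__(self, name):
            # Called only for fields that were not loaded eagerly.
            print('lazy-loading %r: extra round trip' % name)
            setattr(self, name, None)
            return None

    ToyInstance(numa_topology=None).numa_topology  # eager: no extra work
    ToyInstance().numa_topology                    # lazy: prints a message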
@@ -23,6 +23,7 @@ from nova import exception
from nova.i18n import _
from nova import network
from nova import objects
from nova.objects import fields as obj_fields
from nova.scheduler import utils as scheduler_utils
from nova import utils

@@ -65,6 +66,7 @@ class LiveMigrationTask(base.TaskBase):

    def _execute(self):
        self._check_instance_is_active()
        self._check_instance_has_no_numa()
        self._check_host_is_up(self.source)

        self._source_cn, self._held_allocations = (
@@ -151,6 +153,30 @@ class LiveMigrationTask(base.TaskBase):
                state=self.instance.power_state,
                method='live migrate')

    def _check_instance_has_no_numa(self):
        """Prevent live migrations of instances with NUMA topologies."""
        if not self.instance.numa_topology:
            return

        # Only KVM (libvirt) supports NUMA topologies with CPU pinning;
        # HyperV's vNUMA feature doesn't allow specific pinning
        hypervisor_type = objects.ComputeNode.get_by_host_and_nodename(
            self.context, self.source, self.instance.node).hypervisor_type
        if hypervisor_type != obj_fields.HVType.KVM:
            return

        msg = ('Instance has an associated NUMA topology. '
               'Instance NUMA topologies, including related attributes '
               'such as CPU pinning, huge page and emulator thread '
               'pinning information, are not currently recalculated on '
               'live migration. See bug #1289064 for more information.'
               )

        if CONF.workarounds.enable_numa_live_migration:
            LOG.warning(msg, instance=self.instance)
        else:
            raise exception.MigrationPreCheckError(reason=msg)

    def _check_host_is_up(self, host):
        service = objects.Service.get_by_compute_host(self.context, host)

@@ -184,6 +184,34 @@ deploying the new code during an upgrade.
Related options:

* ``[consoleauth]/token_ttl``
"""),

    cfg.BoolOpt(
        'enable_numa_live_migration',
        default=False,
        help="""
Enable live migration of instances with NUMA topologies.

Live migration of instances with NUMA topologies is disabled by default
when using the libvirt driver. This includes live migration of instances with
CPU pinning or hugepages. CPU pinning and huge page information for such
instances is not currently re-calculated, as noted in `bug #1289064`_. This
means that if instances were already present on the destination host, the
migrated instance could be placed on the same dedicated cores as these
instances or use hugepages allocated for another instance. Alternately, if the
host platforms were not homogeneous, the instance could be assigned to
non-existent cores or be inadvertently split across host NUMA nodes.

Despite these known issues, there may be cases where live migration is
necessary. By enabling this option, operators that are aware of the issues and
are willing to manually work around them can enable live migration support for
these instances.

Related options:

* ``compute_driver``: Only the libvirt driver is affected.

.. _bug #1289064: https://bugs.launchpad.net/nova/+bug/1289064
"""),
]
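For context, here is a self-contained sketch of registering and reading such an option with oslo.config; it mirrors, but is not, Nova's actual wiring:

    # Editorial sketch using oslo.config (assumed installed); mirrors the
    # option defined above.
    from oslo_config import cfg

    CONF = cfg.ConfigOpts()
    CONF.register_opts(
        [cfg.BoolOpt('enable_numa_live_migration', default=False)],
        group='workarounds')
    CONF(args=[])  # would normally parse nova.conf; defaults apply here

    # Operator-facing equivalent in nova.conf:
    #   [workarounds]
    #   enable_numa_live_migration = True
    print(CONF.workarounds.enable_numa_live_migration)  # False by default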
@@ -48,6 +48,10 @@ class CommonMixin(object):
        # Reset the mock.
        self.mock_get.reset_mock()

        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']

        uuid = uuidutils.generate_uuid()
        self.mock_get.side_effect = exception.InstanceNotFound(
            instance_id=uuid)
@@ -57,7 +61,7 @@ class CommonMixin(object):
                          controller_function,
                          self.req, uuid, body=body_map)
        self.mock_get.assert_called_once_with(self.context, uuid,
-                                             expected_attrs=None,
+                                             expected_attrs=expected_attrs,
                                              cell_down_support=False)

    def _test_action(self, action, body=None, method=None,
@@ -65,6 +69,10 @@ class CommonMixin(object):
        # Reset the mock.
        self.mock_get.reset_mock()

        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']

        if method is None:
            method = action.replace('_', '')
        compute_api_args_map = compute_api_args_map or {}
@@ -85,13 +93,17 @@ class CommonMixin(object):
        mock_method.assert_called_once_with(self.context, instance, *args,
                                            **kwargs)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                             expected_attrs=None,
+                                             expected_attrs=expected_attrs,
                                              cell_down_support=False)

    def _test_not_implemented_state(self, action, method=None):
        # Reset the mock.
        self.mock_get.reset_mock()

        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']

        if method is None:
            method = action.replace('_', '')

@@ -110,7 +122,7 @@ class CommonMixin(object):
        mock_method.assert_called_once_with(self.context, instance,
                                            *args, **kwargs)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                             expected_attrs=None,
+                                             expected_attrs=expected_attrs,
                                              cell_down_support=False)

    def _test_invalid_state(self, action, method=None, body_map=None,
@@ -119,6 +131,10 @@ class CommonMixin(object):
        # Reset the mock.
        self.mock_get.reset_mock()

        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']

        if method is None:
            method = action.replace('_', '')
        if body_map is None:
@@ -146,7 +162,7 @@ class CommonMixin(object):
        mock_method.assert_called_once_with(self.context, instance,
                                            *args, **kwargs)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                             expected_attrs=None,
+                                             expected_attrs=expected_attrs,
                                              cell_down_support=False)

    def _test_locked_instance(self, action, method=None, body=None,
@@ -154,6 +170,10 @@ class CommonMixin(object):
        # Reset the mock.
        self.mock_get.reset_mock()

        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']

        if method is None:
            method = action.replace('_', '')

@@ -173,7 +193,7 @@ class CommonMixin(object):
        mock_method.assert_called_once_with(self.context, instance,
                                            *args, **kwargs)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                             expected_attrs=None,
+                                             expected_attrs=expected_attrs,
                                              cell_down_support=False)

    def _test_instance_not_found_in_compute_api(self, action,
@@ -181,6 +201,10 @@ class CommonMixin(object):
        # Reset the mock.
        self.mock_get.reset_mock()

        expected_attrs = None
        if action == '_migrate_live':
            expected_attrs = ['numa_topology']

        if method is None:
            method = action.replace('_', '')
        compute_api_args_map = compute_api_args_map or {}
@@ -200,7 +224,7 @@ class CommonMixin(object):
        mock_method.assert_called_once_with(self.context, instance,
                                            *args, **kwargs)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                             expected_attrs=None,
+                                             expected_attrs=expected_attrs,
                                              cell_down_support=False)


@@ -150,7 +150,7 @@ class MigrateServerTestsV21(admin_only_action_common.CommonTests):
            'hostname', self.force, self.async_)

        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                             expected_attrs=None,
+                                             expected_attrs=['numa_topology'],
                                              cell_down_support=False)

    def test_migrate_live_enabled(self):
@@ -228,7 +228,7 @@ class MigrateServerTestsV21(admin_only_action_common.CommonTests):
            self.context, instance, False, self.disk_over_commit,
            'hostname', self.force, self.async_)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                             expected_attrs=None,
+                                             expected_attrs=['numa_topology'],
                                              cell_down_support=False)

    def test_migrate_live_compute_service_unavailable(self):
@@ -446,7 +446,7 @@ class MigrateServerTestsV234(MigrateServerTestsV230):
            self.context, instance, None, self.disk_over_commit,
            'hostname', self.force, self.async_)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                             expected_attrs=None,
+                                             expected_attrs=['numa_topology'],
                                              cell_down_support=False)

    def test_migrate_live_unexpected_error(self):
@@ -465,7 +465,7 @@ class MigrateServerTestsV234(MigrateServerTestsV230):
            self.context, instance, None, self.disk_over_commit,
            'hostname', self.force, self.async_)
        self.mock_get.assert_called_once_with(self.context, instance.uuid,
-                                             expected_attrs=None,
+                                             expected_attrs=['numa_topology'],
                                              cell_down_support=False)


@@ -19,6 +19,7 @@ from nova.compute import power_state
from nova.compute import rpcapi as compute_rpcapi
from nova.compute import vm_states
from nova.conductor.tasks import live_migrate
from nova import context as nova_context
from nova import exception
from nova.network import model as network_model
from nova import objects
@@ -39,7 +40,7 @@ fake_selection2 = objects.Selection(service_host="host2", nodename="node2",
class LiveMigrationTaskTestCase(test.NoDBTestCase):
    def setUp(self):
        super(LiveMigrationTaskTestCase, self).setUp()
-        self.context = "context"
+        self.context = nova_context.get_admin_context()
        self.instance_host = "host"
        self.instance_uuid = uuids.instance
        self.instance_image = "image_ref"
@@ -53,12 +54,14 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        self.instance = objects.Instance._from_db_object(
            self.context, objects.Instance(), db_instance)
        self.instance.system_metadata = {'image_hw_disk_bus': 'scsi'}
        self.instance.numa_topology = None
        self.destination = "destination"
        self.block_migration = "bm"
        self.disk_over_commit = "doc"
        self.migration = objects.Migration()
        self.fake_spec = objects.RequestSpec()
        self._generate_task()

        _p = mock.patch('nova.compute.utils.heal_reqspec_is_bfv')
        self.heal_reqspec_is_bfv_mock = _p.start()
        self.addCleanup(_p.stop)
@@ -170,6 +173,45 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        self.assertRaises(exception.InstanceInvalidState,
                          self.task._check_instance_is_active)

    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
    def test_check_instance_has_no_numa_passes_no_numa(self, mock_get):
        self.flags(enable_numa_live_migration=False, group='workarounds')
        self.task.instance.numa_topology = None
        mock_get.return_value = objects.ComputeNode(
            uuid=uuids.cn1, hypervisor_type='kvm')
        self.task._check_instance_has_no_numa()

    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
    def test_check_instance_has_no_numa_passes_non_kvm(self, mock_get):
        self.flags(enable_numa_live_migration=False, group='workarounds')
        self.task.instance.numa_topology = objects.InstanceNUMATopology(
            cells=[objects.InstanceNUMACell(id=0, cpuset=set([0]),
                                            memory=1024)])
        mock_get.return_value = objects.ComputeNode(
            uuid=uuids.cn1, hypervisor_type='xen')
        self.task._check_instance_has_no_numa()

    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
    def test_check_instance_has_no_numa_passes_workaround(self, mock_get):
        self.flags(enable_numa_live_migration=True, group='workarounds')
        self.task.instance.numa_topology = objects.InstanceNUMATopology(
            cells=[objects.InstanceNUMACell(id=0, cpuset=set([0]),
                                            memory=1024)])
        mock_get.return_value = objects.ComputeNode(
            uuid=uuids.cn1, hypervisor_type='kvm')
        self.task._check_instance_has_no_numa()

    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
    def test_check_instance_has_no_numa_fails(self, mock_get):
        self.flags(enable_numa_live_migration=False, group='workarounds')
        mock_get.return_value = objects.ComputeNode(
            uuid=uuids.cn1, hypervisor_type='kvm')
        self.task.instance.numa_topology = objects.InstanceNUMATopology(
            cells=[objects.InstanceNUMACell(id=0, cpuset=set([0]),
                                            memory=1024)])
        self.assertRaises(exception.MigrationPreCheckError,
                          self.task._check_instance_has_no_numa)

    @mock.patch.object(objects.Service, 'get_by_compute_host')
    @mock.patch.object(servicegroup.API, 'service_is_up')
    def test_check_instance_host_is_up(self, mock_is_up, mock_get):
@@ -0,0 +1,25 @@
---
upgrade:
  - |
    Live migration of instances with NUMA topologies is now disabled by default
    when using the libvirt driver. This includes live migration of instances
    with CPU pinning or hugepages. CPU pinning and huge page information for
    such instances is not currently re-calculated, as noted in `bug #1289064`_.
    This means that if instances were already present on the destination host,
    the migrated instance could be placed on the same dedicated cores as these
    instances or use hugepages allocated for another instance. Alternately, if
    the host platforms were not homogeneous, the instance could be assigned to
    non-existent cores or be inadvertently split across host NUMA nodes.

    The `long term solution`_ to these issues is to recalculate the XML on the
    destination node. When this work is completed, the restriction on live
    migration with NUMA topologies will be lifted.

    For operators that are aware of the issues and are able to manually work
    around them, the ``[workarounds] enable_numa_live_migration`` option can
    be used to allow the broken behavior.

    For more information, refer to `bug #1289064`_.

    .. _bug #1289064: https://bugs.launchpad.net/nova/+bug/1289064
    .. _long term solution: https://blueprints.launchpad.net/nova/+spec/numa-aware-live-migration
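Taken together, the resulting behavior can be summarized with a small editorial sketch (plain Python mirroring the conductor check added in this change, not Nova code itself):

    # Editorial sketch of LiveMigrationTask._check_instance_has_no_numa.
    def live_migration_allowed(has_numa, hypervisor_type, workaround_enabled):
        if not has_numa or hypervisor_type != 'kvm':
            return True  # the check applies only to KVM instances with NUMA
        # NUMA + KVM: allowed only if the operator opted in; a warning is
        # still logged in that case.
        return workaround_enabled

    assert live_migration_allowed(False, 'kvm', False)     # no NUMA topology
    assert live_migration_allowed(True, 'xen', False)      # non-KVM host
    assert not live_migration_allowed(True, 'kvm', False)  # blocked by default
    assert live_migration_allowed(True, 'kvm', True)       # workaround enabled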