Make compute auto-disable itself if builds are failing
This implements an auto-disable feature in nova-compute, where we
automatically set our service record to disabled if we consecutively
fail to build a certain number of instances. While this is a very
useful thing to do in general, disabling a failing compute becomes more
important in a future where scheduler retries due to unknown failures
may become either impossible or scoped to a single cell. Since a
compute that is consistently failing will look very attractive to the
scheduler, it may become a build magnet that, in the absence of
retries, would effectively kill all builds in a cloud until fixed.

Change-Id: I02b7cd87d399d487dd1d650540f503a70bc27749
parent 5532d6f404
commit f93f675a85
nova/compute/manager.py

@@ -528,6 +528,7 @@ class ComputeManager(manager.Manager):
                 CONF.max_concurrent_live_migrations)
         else:
             self._live_migration_semaphore = compute_utils.UnlimitedSemaphore()
+        self._failed_builds = 0

         super(ComputeManager, self).__init__(service_name="compute",
                                              *args, **kwargs)

@@ -1688,6 +1689,31 @@ class ComputeManager(manager.Manager):

         return block_device_info

+    def _build_failed(self):
+        self._failed_builds += 1
+        limit = CONF.compute.consecutive_build_service_disable_threshold
+        if limit and self._failed_builds >= limit:
+            # NOTE(danms): If we're doing a bunch of parallel builds,
+            # it is possible (although not likely) that we have already
+            # failed N-1 builds before this and we race with a successful
+            # build and disable ourselves here when we might've otherwise
+            # not.
+            LOG.error('Disabling service due to %(fails)i '
+                      'consecutive build failures',
+                      {'fails': self._failed_builds})
+            ctx = nova.context.get_admin_context()
+            service = objects.Service.get_by_compute_host(ctx, CONF.host)
+            service.disabled = True
+            service.disabled_reason = (
+                'Auto-disabled due to %i build failures' % self._failed_builds)
+            service.save()
+            # NOTE(danms): Reset our counter now so that when the admin
+            # re-enables us we can start fresh
+            self._failed_builds = 0
+        elif self._failed_builds > 1:
+            LOG.warning('%(fails)i consecutive build failures',
+                        {'fails': self._failed_builds})
+
     @wrap_exception()
     @reverts_task_state
     @wrap_instance_fault

@@ -1704,7 +1730,18 @@
         # for a while and we want to make sure that nothing else tries
         # to do anything with this instance while we wait.
         with self._build_semaphore:
-            self._do_build_and_run_instance(*args, **kwargs)
+            try:
+                result = self._do_build_and_run_instance(*args, **kwargs)
+            except Exception:
+                result = build_results.FAILED
+                raise
+            finally:
+                fails = (build_results.FAILED,
+                         build_results.RESCHEDULED)
+                if result in fails:
+                    self._build_failed()
+                else:
+                    self._failed_builds = 0

     # NOTE(danms): We spawn here to return the RPC worker thread back to
     # the pool. Since what follows could take a really long time, we don't
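The try/except/finally accounting in the last hunk has one subtlety
worth calling out: the except clause assigns result = FAILED before
re-raising, so the finally clause always has a value to inspect even
when the build raised an exception. Below is a minimal standalone
sketch of the same pattern; BuildTracker and its names are illustrative
only, not nova's API.

    FAILED, RESCHEDULED, ACTIVE = 'failed', 'rescheduled', 'active'

    class BuildTracker(object):
        def __init__(self, threshold=10):
            # threshold=0 turns the auto-disable behavior off entirely.
            self.threshold = threshold
            self.failed_builds = 0

        def disable_service(self):
            # Stand-in for flipping the service record to disabled.
            print('disabling service after %d failures' % self.failed_builds)

        def run(self, build):
            try:
                result = build()
            except Exception:
                # Count an unexpected exception as a failure, but let it
                # propagate to the caller unchanged.
                result = FAILED
                raise
            finally:
                if result in (FAILED, RESCHEDULED):
                    self.failed_builds += 1
                    if self.threshold and self.failed_builds >= self.threshold:
                        self.disable_service()
                        # Reset so a re-enabled service starts fresh.
                        self.failed_builds = 0
                else:
                    # Any success resets the consecutive count.
                    self.failed_builds = 0
            return result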
nova/conf/compute.py

@@ -23,6 +23,12 @@ from oslo_config import types

 from nova.conf import paths

+compute_group = cfg.OptGroup(
+    'compute',
+    title='Compute Manager Options',
+    help="""
+A collection of options specific to the nova-compute service.
+""")
 compute_opts = [
     cfg.StrOpt('compute_driver',
         help="""

@@ -637,6 +643,27 @@ Possible values:
 """)
 ]

+compute_group_opts = [
+    cfg.IntOpt('consecutive_build_service_disable_threshold',
+               default=10,
+               help="""
+Number of consecutive failed builds that result in disabling a compute service.
+
+This option will cause nova-compute to set itself to a disabled state
+if a certain number of consecutive build failures occur. This will
+prevent the scheduler from continuing to send builds to a compute node that is
+consistently failing. Note that all failures qualify and count towards this
+score, including reschedules that may have been due to racy scheduler behavior.
+Since the failures must be consecutive, it is unlikely that occasional expected
+reschedules will actually disable a compute node.
+
+Possible values:
+
+* Any positive integer representing a build failure count.
+* Zero to never auto-disable.
+"""),
+]
+
 interval_opts = [
     cfg.IntOpt('image_cache_manager_interval',
                default=2400,
@@ -1139,7 +1166,10 @@ ALL_OPTS = (compute_opts +

 def register_opts(conf):
     conf.register_opts(ALL_OPTS)
+    conf.register_group(compute_group)
+    conf.register_opts(compute_group_opts, group=compute_group)


 def list_opts():
-    return {'DEFAULT': ALL_OPTS}
+    return {'DEFAULT': ALL_OPTS,
+            'compute': compute_group_opts}
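For operators, the resulting option lives in the [compute] section of
nova.conf. A short sketch of how it might be tuned (10 is the default
shipped above; 0 switches the behavior off):

    [compute]
    # Auto-disable this compute service after 10 consecutive build failures.
    consecutive_build_service_disable_threshold = 10

    # ... or opt out of auto-disabling entirely:
    # consecutive_build_service_disable_threshold = 0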
nova/tests/unit/compute/test_compute_mgr.py

@@ -4040,6 +4040,95 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             set_error=True, cleanup_volumes=True,
             nil_out_host_and_node=True)

+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_build_failures_disable_service(self, mock_service, mock_dbari):
+        mock_dbari.return_value = build_results.FAILED
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertTrue(service.disabled)
+        self.assertEqual('Auto-disabled due to 10 build failures',
+                         service.disabled_reason)
+        service.save.assert_called_once_with()
+        self.assertEqual(0, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_build_failures_not_disable_service(self, mock_service,
+                                                mock_dbari):
+        self.flags(consecutive_build_service_disable_threshold=0,
+                   group='compute')
+        mock_dbari.return_value = build_results.FAILED
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertFalse(service.save.called)
+        self.assertEqual(10, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_transient_build_failures_no_disable_service(self, mock_service,
+                                                         mock_dbari):
+        results = [build_results.FAILED,
+                   build_results.ACTIVE,
+                   build_results.RESCHEDULED]
+
+        def _fake_build(*a, **k):
+            if results:
+                return results.pop(0)
+            else:
+                return build_results.ACTIVE
+
+        mock_dbari.side_effect = _fake_build
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertFalse(service.save.called)
+        self.assertEqual(0, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_build_reschedules_disable_service(self, mock_service, mock_dbari):
+        mock_dbari.return_value = build_results.RESCHEDULED
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertTrue(service.disabled)
+        self.assertEqual('Auto-disabled due to 10 build failures',
+                         service.disabled_reason)
+        service.save.assert_called_once_with()
+        self.assertEqual(0, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    @mock.patch('nova.exception_wrapper._emit_exception_notification')
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    def test_build_exceptions_disable_service(self, mock_if, mock_notify,
+                                              mock_service, mock_dbari):
+        mock_dbari.side_effect = test.TestingException()
+        instance = objects.Instance(uuid=uuids.instance,
+                                    task_state=None)
+        for i in range(0, 10):
+            self.assertRaises(test.TestingException,
+                              self.compute.build_and_run_instance,
+                              None, instance, None,
+                              None, None)
+        service = mock_service.return_value
+        self.assertTrue(service.disabled)
+        self.assertEqual('Auto-disabled due to 10 build failures',
+                         service.disabled_reason)
+        service.save.assert_called_once_with()
+        self.assertEqual(0, self.compute._failed_builds)
+
     @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
     @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
     @mock.patch.object(fake_driver.FakeDriver, 'spawn')
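To exercise just these new tests, something like the following should
work, assuming nova's standard tox setup (the exact selector syntax
depends on the test runner in use):

    tox -e py27 -- nova.tests.unit.compute.test_compute_mgr.ComputeManagerBuildInstanceTestCase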
releasenotes (new file)

@@ -0,0 +1,17 @@
+---
+features:
+  - |
+    The `nova-compute` worker can automatically disable itself in the
+    service database if consecutive build failures exceed a set threshold.
+    The ``[compute]/consecutive_build_service_disable_threshold``
+    configuration option allows setting the threshold for this behavior,
+    or disabling it entirely if desired.
+    The intent is that an admin will examine the issue before manually
+    re-enabling the service, which will avoid that compute node becoming a
+    black hole build magnet.
+upgrade:
+  - |
+    The new configuration option
+    ``[compute]/consecutive_build_service_disable_threshold``
+    defaults to a nonzero value, which means multiple failed builds will
+    result in a compute node auto-disabling itself.
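Once a compute has auto-disabled itself, the release note above expects
an admin to investigate before re-enabling it by hand. A sketch of that
manual step with the standard clients (the host name "compute1" is
illustrative):

    # python-openstackclient
    openstack compute service set --enable compute1 nova-compute

    # or the legacy novaclient CLI
    nova service-enable compute1 nova-compute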