Make compute auto-disable itself if builds are failing
This implements an auto-disable feature in nova-compute, where we
automatically set our service record to disabled if we consecutively
fail to build a certain number of instances. While this is a very
useful thing to do in general, disabling a failing compute becomes more
important in a future where scheduler retries due to unknown failures
may become either impossible or scoped to a single cell. Since a
compute that is consistently failing will look very attractive to the
scheduler, it may become a build magnet that, in the absence of
retries, would effectively kill all builds in a cloud until fixed.

Change-Id: I02b7cd87d399d487dd1d650540f503a70bc27749
parent 5532d6f404
commit f93f675a85
nova/compute/manager.py

@@ -528,6 +528,7 @@ class ComputeManager(manager.Manager):
                 CONF.max_concurrent_live_migrations)
         else:
             self._live_migration_semaphore = compute_utils.UnlimitedSemaphore()
+        self._failed_builds = 0

         super(ComputeManager, self).__init__(service_name="compute",
                                              *args, **kwargs)

@@ -1688,6 +1689,31 @@ class ComputeManager(manager.Manager):

         return block_device_info

+    def _build_failed(self):
+        self._failed_builds += 1
+        limit = CONF.compute.consecutive_build_service_disable_threshold
+        if limit and self._failed_builds >= limit:
+            # NOTE(danms): If we're doing a bunch of parallel builds,
+            # it is possible (although not likely) that we have already
+            # failed N-1 builds before this and we race with a successful
+            # build and disable ourselves here when we might've otherwise
+            # not.
+            LOG.error('Disabling service due to %(fails)i '
+                      'consecutive build failures',
+                      {'fails': self._failed_builds})
+            ctx = nova.context.get_admin_context()
+            service = objects.Service.get_by_compute_host(ctx, CONF.host)
+            service.disabled = True
+            service.disabled_reason = (
+                'Auto-disabled due to %i build failures' % self._failed_builds)
+            service.save()
+            # NOTE(danms): Reset our counter now so that when the admin
+            # re-enables us we can start fresh
+            self._failed_builds = 0
+        elif self._failed_builds > 1:
+            LOG.warning('%(fails)i consecutive build failures',
+                        {'fails': self._failed_builds})
+
     @wrap_exception()
     @reverts_task_state
     @wrap_instance_fault

@@ -1704,7 +1730,18 @@
         # for a while and we want to make sure that nothing else tries
         # to do anything with this instance while we wait.
         with self._build_semaphore:
-            self._do_build_and_run_instance(*args, **kwargs)
+            try:
+                result = self._do_build_and_run_instance(*args, **kwargs)
+            except Exception:
+                result = build_results.FAILED
+                raise
+            finally:
+                fails = (build_results.FAILED,
+                         build_results.RESCHEDULED)
+                if result in fails:
+                    self._build_failed()
+                else:
+                    self._failed_builds = 0

     # NOTE(danms): We spawn here to return the RPC worker thread back to
     # the pool. Since what follows could take a really long time, we don't
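The try/except/finally accounting in the last hunk has one subtlety
worth calling out: the except clause assigns result = FAILED before
re-raising, so the finally clause always has a value to inspect even
when the build raised an exception. Below is a minimal standalone
sketch of the same pattern; BuildTracker and its names are illustrative
only, not nova's API.

    FAILED, RESCHEDULED, ACTIVE = 'failed', 'rescheduled', 'active'

    class BuildTracker(object):
        def __init__(self, threshold=10):
            # threshold=0 turns the auto-disable behavior off entirely.
            self.threshold = threshold
            self.failed_builds = 0

        def disable_service(self):
            # Stand-in for flipping the service record to disabled.
            print('disabling service after %d failures' % self.failed_builds)

        def run(self, build):
            try:
                result = build()
            except Exception:
                # Count an unexpected exception as a failure, but let it
                # propagate to the caller unchanged.
                result = FAILED
                raise
            finally:
                if result in (FAILED, RESCHEDULED):
                    self.failed_builds += 1
                    if self.threshold and self.failed_builds >= self.threshold:
                        self.disable_service()
                        # Reset so a re-enabled service starts fresh.
                        self.failed_builds = 0
                else:
                    # Any success resets the consecutive count.
                    self.failed_builds = 0
            return result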
nova/conf/compute.py

@@ -23,6 +23,12 @@ from oslo_config import types

 from nova.conf import paths

+compute_group = cfg.OptGroup(
+    'compute',
+    title='Compute Manager Options',
+    help="""
+A collection of options specific to the nova-compute service.
+""")
 compute_opts = [
     cfg.StrOpt('compute_driver',
         help="""

@@ -637,6 +643,27 @@ Possible values:
 """)
 ]

+compute_group_opts = [
+    cfg.IntOpt('consecutive_build_service_disable_threshold',
+               default=10,
+               help="""
+Number of consecutive failed builds that result in disabling a compute service.
+
+This option will cause nova-compute to set itself to a disabled state
+if a certain number of consecutive build failures occur. This will
+prevent the scheduler from continuing to send builds to a compute node that is
+consistently failing. Note that all failures qualify and count towards this
+score, including reschedules that may have been due to racy scheduler behavior.
+Since the failures must be consecutive, it is unlikely that occasional expected
+reschedules will actually disable a compute node.
+
+Possible values:
+
+* Any positive integer representing a build failure count.
+* Zero to never auto-disable.
+"""),
+]
+
 interval_opts = [
     cfg.IntOpt('image_cache_manager_interval',
                default=2400,
@@ -1139,7 +1166,10 @@ ALL_OPTS = (compute_opts +

 def register_opts(conf):
     conf.register_opts(ALL_OPTS)
+    conf.register_group(compute_group)
+    conf.register_opts(compute_group_opts, group=compute_group)


 def list_opts():
-    return {'DEFAULT': ALL_OPTS}
+    return {'DEFAULT': ALL_OPTS,
+            'compute': compute_group_opts}
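For operators, the resulting option lives in the [compute] section of
nova.conf. A short sketch of how it might be tuned (10 is the default
shipped above; 0 switches the behavior off):

    [compute]
    # Auto-disable this compute service after 10 consecutive build failures.
    consecutive_build_service_disable_threshold = 10

    # ... or opt out of auto-disabling entirely:
    # consecutive_build_service_disable_threshold = 0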
nova/tests/unit/compute/test_compute_mgr.py

@@ -4040,6 +4040,95 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             set_error=True, cleanup_volumes=True,
             nil_out_host_and_node=True)

+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_build_failures_disable_service(self, mock_service, mock_dbari):
+        mock_dbari.return_value = build_results.FAILED
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertTrue(service.disabled)
+        self.assertEqual('Auto-disabled due to 10 build failures',
+                         service.disabled_reason)
+        service.save.assert_called_once_with()
+        self.assertEqual(0, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_build_failures_not_disable_service(self, mock_service,
+                                                mock_dbari):
+        self.flags(consecutive_build_service_disable_threshold=0,
+                   group='compute')
+        mock_dbari.return_value = build_results.FAILED
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertFalse(service.save.called)
+        self.assertEqual(10, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_transient_build_failures_no_disable_service(self, mock_service,
+                                                         mock_dbari):
+        results = [build_results.FAILED,
+                   build_results.ACTIVE,
+                   build_results.RESCHEDULED]
+
+        def _fake_build(*a, **k):
+            if results:
+                return results.pop(0)
+            else:
+                return build_results.ACTIVE
+
+        mock_dbari.side_effect = _fake_build
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertFalse(service.save.called)
+        self.assertEqual(0, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_build_reschedules_disable_service(self, mock_service, mock_dbari):
+        mock_dbari.return_value = build_results.RESCHEDULED
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertTrue(service.disabled)
+        self.assertEqual('Auto-disabled due to 10 build failures',
+                         service.disabled_reason)
+        service.save.assert_called_once_with()
+        self.assertEqual(0, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    @mock.patch('nova.exception_wrapper._emit_exception_notification')
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    def test_build_exceptions_disable_service(self, mock_if, mock_notify,
+                                              mock_service, mock_dbari):
+        mock_dbari.side_effect = test.TestingException()
+        instance = objects.Instance(uuid=uuids.instance,
+                                    task_state=None)
+        for i in range(0, 10):
+            self.assertRaises(test.TestingException,
+                              self.compute.build_and_run_instance,
+                              None, instance, None,
+                              None, None)
+        service = mock_service.return_value
+        self.assertTrue(service.disabled)
+        self.assertEqual('Auto-disabled due to 10 build failures',
+                         service.disabled_reason)
+        service.save.assert_called_once_with()
+        self.assertEqual(0, self.compute._failed_builds)
+
     @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
     @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
     @mock.patch.object(fake_driver.FakeDriver, 'spawn')
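To exercise just these new tests, something like the following should
work, assuming nova's standard tox setup (the exact selector syntax
depends on the test runner in use):

    tox -e py27 -- nova.tests.unit.compute.test_compute_mgr.ComputeManagerBuildInstanceTestCase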
releasenotes (new file)

@@ -0,0 +1,17 @@
+---
+features:
+  - |
+    The `nova-compute` worker can automatically disable itself in the
+    service database if consecutive build failures exceed a set threshold.
+    The ``[compute]/consecutive_build_service_disable_threshold``
+    configuration option allows setting the threshold for this behavior,
+    or disabling it entirely if desired.
+    The intent is that an admin will examine the issue before manually
+    re-enabling the service, which will avoid that compute node becoming a
+    black hole build magnet.
+upgrade:
+  - |
+    The new configuration option
+    ``[compute]/consecutive_build_service_disable_threshold``
+    defaults to a nonzero value, which means multiple failed builds will
+    result in a compute node auto-disabling itself.
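Once a compute has auto-disabled itself, the release note above expects
an admin to investigate before re-enabling it by hand. A sketch of that
manual step with the standard clients (the host name "compute1" is
illustrative):

    # python-openstackclient
    openstack compute service set --enable compute1 nova-compute

    # or the legacy novaclient CLI
    nova service-enable compute1 nova-compute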