From 91734bad9139555294fe088d2c2d77a9712652ab Mon Sep 17 00:00:00 2001 From: Vishvananda Ishaya Date: Mon, 17 Sep 2012 16:02:06 -0700 Subject: [PATCH] Fixes error handling during schedule_run_instance If there are not enough hosts available during a multi-instance launch, every failing instance should be updated to error state, instead of just the first instance. Currently only the first instance is set to Error and the rest stay in building. This patch makes a number of fixes to error handling during scheduling. * Moves instance faults into compute utils so they can be created from the scheduler. * Moves error handling into the driver so that each instance can be updated separately. * Sets an instance fault for failed scheduling * Sets task state back to none if there is a scheduling failure * Modifies chance scheduler to stop returning a list of instances as it is not used. * Modifies tests to check for these states. In addition to the included tests, the code was manually verified on a devstack install Fixes bug 1051066 Fixes bug 1019017 Change-Id: I49267ce4a21e2f7cc7a996fb2ed5d625f6794730 --- nova/compute/manager.py | 27 ++---------- nova/compute/utils.py | 22 ++++++++++ nova/scheduler/chance.py | 33 ++++++++------- nova/scheduler/driver.py | 35 +++++++++++++++- nova/scheduler/filter_scheduler.py | 33 ++++++++------- nova/tests/compute/test_compute.py | 11 ++--- nova/tests/scheduler/test_chance_scheduler.py | 25 ++++++----- nova/tests/scheduler/test_filter_scheduler.py | 42 +++++++++++++------ 8 files changed, 146 insertions(+), 82 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 2d5baafcf57d..a7e04b19c678 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -198,7 +198,7 @@ def wrap_instance_fault(function): raise except Exception, e: with excutils.save_and_reraise_exception(): - self._add_instance_fault_from_exc(context, + compute_utils.add_instance_fault_from_exc(context, kwargs['instance']['uuid'], e, sys.exc_info()) return decorated_function @@ -1099,8 +1099,8 @@ class ComputeManager(manager.SchedulerDependentManager): except Exception, exc: LOG.error(_('Cannot reboot instance: %(exc)s'), locals(), context=context, instance=instance) - self._add_instance_fault_from_exc(context, instance['uuid'], exc, - sys.exc_info()) + compute_utils.add_instance_fault_from_exc(context, + instance['uuid'], exc, sys.exc_info()) # Fall through and reset task_state to None current_power_state = self._get_power_state(context, instance) @@ -2710,27 +2710,6 @@ class ComputeManager(manager.SchedulerDependentManager): """ self.resource_tracker.update_available_resource(context) - def _add_instance_fault_from_exc(self, context, instance_uuid, fault, - exc_info=None): - """Adds the specified fault to the database.""" - - code = 500 - if hasattr(fault, "kwargs"): - code = fault.kwargs.get('code', 500) - - details = unicode(fault) - if exc_info and code == 500: - tb = exc_info[2] - details += '\n' + ''.join(traceback.format_tb(tb)) - - values = { - 'instance_uuid': instance_uuid, - 'code': code, - 'message': fault.__class__.__name__, - 'details': unicode(details), - } - self.db.instance_fault_create(context, values) - @manager.periodic_task( ticks_between_runs=FLAGS.running_deleted_instance_poll_interval) def _cleanup_running_deleted_instances(self, context): diff --git a/nova/compute/utils.py b/nova/compute/utils.py index 1eb595e80602..1f46d8019669 100644 --- a/nova/compute/utils.py +++ b/nova/compute/utils.py @@ -18,6 +18,7 @@ import re import string +import traceback from nova import block_device from nova import db @@ -33,6 +34,27 @@ FLAGS = flags.FLAGS LOG = log.getLogger(__name__) +def add_instance_fault_from_exc(context, instance_uuid, fault, exc_info=None): + """Adds the specified fault to the database.""" + + code = 500 + if hasattr(fault, "kwargs"): + code = fault.kwargs.get('code', 500) + + details = unicode(fault) + if exc_info and code == 500: + tb = exc_info[2] + details += '\n' + ''.join(traceback.format_tb(tb)) + + values = { + 'instance_uuid': instance_uuid, + 'code': code, + 'message': fault.__class__.__name__, + 'details': unicode(details), + } + db.instance_fault_create(context, values) + + def get_device_name_for_instance(context, instance, device): """Validates (or generates) a device name for instance. diff --git a/nova/scheduler/chance.py b/nova/scheduler/chance.py index 00042f43ad47..e44e9d7d2c1e 100644 --- a/nova/scheduler/chance.py +++ b/nova/scheduler/chance.py @@ -61,23 +61,28 @@ class ChanceScheduler(driver.Scheduler): requested_networks, is_first_time, filter_properties): """Create and run an instance or instances""" - instances = [] instance_uuids = request_spec.get('instance_uuids') for num, instance_uuid in enumerate(instance_uuids): - host = self._schedule(context, 'compute', request_spec, - filter_properties) request_spec['instance_properties']['launch_index'] = num - updated_instance = driver.instance_update_db(context, - instance_uuid, host) - self.compute_rpcapi.run_instance(context, - instance=updated_instance, host=host, - requested_networks=requested_networks, - injected_files=injected_files, - admin_password=admin_password, is_first_time=is_first_time, - request_spec=request_spec, - filter_properties=filter_properties) - instances.append(driver.encode_instance(updated_instance)) - return instances + try: + host = self._schedule(context, 'compute', request_spec, + filter_properties) + updated_instance = driver.instance_update_db(context, + instance_uuid, host) + self.compute_rpcapi.run_instance(context, + instance=updated_instance, host=host, + requested_networks=requested_networks, + injected_files=injected_files, + admin_password=admin_password, + is_first_time=is_first_time, + request_spec=request_spec, + filter_properties=filter_properties) + except Exception as ex: + # NOTE(vish): we don't reraise the exception here to make sure + # that all instances in the request get set to + # error properly + driver.handle_schedule_error(context, ex, instance_uuid, + request_spec) def schedule_prep_resize(self, context, image, request_spec, filter_properties, instance, instance_type, diff --git a/nova/scheduler/driver.py b/nova/scheduler/driver.py index 705cb8636214..ee7db02b94f3 100644 --- a/nova/scheduler/driver.py +++ b/nova/scheduler/driver.py @@ -21,10 +21,13 @@ Scheduler base class that all Schedulers should inherit from """ +import sys + from nova.compute import api as compute_api from nova.compute import power_state from nova.compute import rpcapi as compute_rpcapi -from nova.compute import task_states +from nova.compute import utils as compute_utils +from nova.compute import vm_states from nova import db from nova import exception from nova import flags @@ -32,9 +35,9 @@ from nova import notifications from nova.openstack.common import cfg from nova.openstack.common import importutils from nova.openstack.common import log as logging +from nova.openstack.common.notifier import api as notifier from nova.openstack.common import rpc from nova.openstack.common import timeutils -from nova import quota from nova import utils @@ -56,6 +59,34 @@ flags.DECLARE('instances_path', 'nova.compute.manager') flags.DECLARE('libvirt_type', 'nova.virt.libvirt.driver') +def handle_schedule_error(context, ex, instance_uuid, request_spec): + if not isinstance(ex, exception.NoValidHost): + LOG.exception(_("Exception during scheduler.run_instance")) + compute_utils.add_instance_fault_from_exc(context, + instance_uuid, ex, sys.exc_info()) + state = vm_states.ERROR.upper() + LOG.warning(_('Setting instance to %(state)s state.'), + locals(), instance_uuid=instance_uuid) + + # update instance state and notify on the transition + (old_ref, new_ref) = db.instance_update_and_get_original(context, + instance_uuid, {'vm_state': vm_states.ERROR, + 'task_state': None}) + notifications.send_update(context, old_ref, new_ref, + service="scheduler") + + properties = request_spec.get('instance_properties', {}) + payload = dict(request_spec=request_spec, + instance_properties=properties, + instance_id=instance_uuid, + state=vm_states.ERROR, + method='run_instance', + reason=ex) + + notifier.notify(context, notifier.publisher_id("scheduler"), + 'scheduler.run_instance', notifier.ERROR, payload) + + def cast_to_volume_host(context, host, method, **kwargs): """Cast request to a volume host queue""" diff --git a/nova/scheduler/filter_scheduler.py b/nova/scheduler/filter_scheduler.py index da4c4c0ddb51..a3e3e1354a9b 100644 --- a/nova/scheduler/filter_scheduler.py +++ b/nova/scheduler/filter_scheduler.py @@ -72,27 +72,32 @@ class FilterScheduler(driver.Scheduler): weighted_hosts = self._schedule(context, "compute", request_spec, filter_properties, instance_uuids) - if not weighted_hosts: - raise exception.NoValidHost(reason="") - # NOTE(comstud): Make sure we do not pass this through. It # contains an instance of RpcContext that cannot be serialized. filter_properties.pop('context', None) for num, instance_uuid in enumerate(instance_uuids): - if not weighted_hosts: - break - weighted_host = weighted_hosts.pop(0) - request_spec['instance_properties']['launch_index'] = num - self._provision_resource(elevated, weighted_host, - request_spec, - filter_properties, - requested_networks, - injected_files, admin_password, - is_first_time, - instance_uuid=instance_uuid) + try: + try: + weighted_host = weighted_hosts.pop(0) + except IndexError: + raise exception.NoValidHost(reason="") + + self._provision_resource(elevated, weighted_host, + request_spec, + filter_properties, + requested_networks, + injected_files, admin_password, + is_first_time, + instance_uuid=instance_uuid) + except Exception as ex: + # NOTE(vish): we don't reraise the exception here to make sure + # that all instances in the request get set to + # error properly + driver.handle_schedule_error(context, ex, instance_uuid, + request_spec) # scrub retry host list in case we're scheduling multiple # instances: retry = filter_properties.get('retry', {}) diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py index 45fffd9c41f0..de07d4d68995 100644 --- a/nova/tests/compute/test_compute.py +++ b/nova/tests/compute/test_compute.py @@ -35,6 +35,7 @@ from nova.compute import manager as compute_manager from nova.compute import power_state from nova.compute import rpcapi as compute_rpcapi from nova.compute import task_states +from nova.compute import utils as compute_utils from nova.compute import vm_states from nova import context from nova import db @@ -212,7 +213,7 @@ class ComputeTestCase(BaseTestCase): def did_it_add_fault(*args): called['fault_added'] = True - self.stubs.Set(self.compute, '_add_instance_fault_from_exc', + self.stubs.Set(compute_utils, 'add_instance_fault_from_exc', did_it_add_fault) @nova.compute.manager.wrap_instance_fault @@ -232,7 +233,7 @@ class ComputeTestCase(BaseTestCase): def did_it_add_fault(*args): called['fault_added'] = True - self.stubs.Set(self.compute, '_add_instance_fault_from_exc', + self.stubs.Set(compute_utils, 'add_instance_fault_from_exc', did_it_add_fault) @nova.compute.manager.wrap_instance_fault @@ -2131,7 +2132,7 @@ class ComputeTestCase(BaseTestCase): self.stubs.Set(nova.db, 'instance_fault_create', fake_db_fault_create) ctxt = context.get_admin_context() - self.compute._add_instance_fault_from_exc(ctxt, instance_uuid, + compute_utils.add_instance_fault_from_exc(ctxt, instance_uuid, NotImplementedError('test'), exc_info) @@ -2159,7 +2160,7 @@ class ComputeTestCase(BaseTestCase): self.stubs.Set(nova.db, 'instance_fault_create', fake_db_fault_create) ctxt = context.get_admin_context() - self.compute._add_instance_fault_from_exc(ctxt, instance_uuid, + compute_utils.add_instance_fault_from_exc(ctxt, instance_uuid, user_exc, exc_info) def test_add_instance_fault_no_exc_info(self): @@ -2177,7 +2178,7 @@ class ComputeTestCase(BaseTestCase): self.stubs.Set(nova.db, 'instance_fault_create', fake_db_fault_create) ctxt = context.get_admin_context() - self.compute._add_instance_fault_from_exc(ctxt, instance_uuid, + compute_utils.add_instance_fault_from_exc(ctxt, instance_uuid, NotImplementedError('test')) def test_cleanup_running_deleted_instances(self): diff --git a/nova/tests/scheduler/test_chance_scheduler.py b/nova/tests/scheduler/test_chance_scheduler.py index 7560f72f4c26..4fb9ab617cea 100644 --- a/nova/tests/scheduler/test_chance_scheduler.py +++ b/nova/tests/scheduler/test_chance_scheduler.py @@ -23,7 +23,10 @@ import random import mox from nova.compute import rpcapi as compute_rpcapi +from nova.compute import utils as compute_utils +from nova.compute import vm_states from nova import context +from nova import db from nova import exception from nova.scheduler import chance from nova.scheduler import driver @@ -79,7 +82,6 @@ class ChanceSchedulerTestCase(test_scheduler.SchedulerTestCase): self.mox.StubOutWithMock(ctxt, 'elevated') self.mox.StubOutWithMock(self.driver, 'hosts_up') self.mox.StubOutWithMock(random, 'random') - self.mox.StubOutWithMock(driver, 'encode_instance') self.mox.StubOutWithMock(driver, 'instance_update_db') self.mox.StubOutWithMock(compute_rpcapi.ComputeAPI, 'run_instance') @@ -94,7 +96,6 @@ class ChanceSchedulerTestCase(test_scheduler.SchedulerTestCase): instance=instance1, requested_networks=None, injected_files=None, admin_password=None, is_first_time=None, request_spec=request_spec, filter_properties={}) - driver.encode_instance(instance1).AndReturn(instance1_encoded) # instance 2 ctxt.elevated().AndReturn(ctxt_elevated) @@ -107,33 +108,37 @@ class ChanceSchedulerTestCase(test_scheduler.SchedulerTestCase): instance=instance2, requested_networks=None, injected_files=None, admin_password=None, is_first_time=None, request_spec=request_spec, filter_properties={}) - driver.encode_instance(instance2).AndReturn(instance2_encoded) self.mox.ReplayAll() - result = self.driver.schedule_run_instance(ctxt, request_spec, + self.driver.schedule_run_instance(ctxt, request_spec, None, None, None, None, {}) - expected = [instance1_encoded, instance2_encoded] - self.assertEqual(result, expected) def test_basic_schedule_run_instance_no_hosts(self): ctxt = context.RequestContext('fake', 'fake', False) ctxt_elevated = 'fake-context-elevated' fake_args = (1, 2, 3) + uuid = 'fake-uuid1' instance_opts = {'fake_opt1': 'meow', 'launch_index': -1} - request_spec = {'instance_uuids': ['fake-uuid1'], + request_spec = {'instance_uuids': [uuid], 'instance_properties': instance_opts} self.mox.StubOutWithMock(ctxt, 'elevated') self.mox.StubOutWithMock(self.driver, 'hosts_up') + self.mox.StubOutWithMock(compute_utils, 'add_instance_fault_from_exc') + self.mox.StubOutWithMock(db, 'instance_update_and_get_original') # instance 1 ctxt.elevated().AndReturn(ctxt_elevated) self.driver.hosts_up(ctxt_elevated, 'compute').AndReturn([]) + compute_utils.add_instance_fault_from_exc(ctxt, + uuid, mox.IsA(exception.NoValidHost), mox.IgnoreArg()) + db.instance_update_and_get_original(ctxt, uuid, + {'vm_state': vm_states.ERROR, + 'task_state': None}).AndReturn(({}, {})) self.mox.ReplayAll() - self.assertRaises(exception.NoValidHost, - self.driver.schedule_run_instance, ctxt, request_spec, - None, None, None, None, {}) + self.driver.schedule_run_instance( + ctxt, request_spec, None, None, None, None, {}) def test_schedule_prep_resize_doesnt_update_host(self): fake_context = context.RequestContext('user', 'project', diff --git a/nova/tests/scheduler/test_filter_scheduler.py b/nova/tests/scheduler/test_filter_scheduler.py index 909bad5fc2b7..130251a0f34a 100644 --- a/nova/tests/scheduler/test_filter_scheduler.py +++ b/nova/tests/scheduler/test_filter_scheduler.py @@ -18,7 +18,10 @@ Tests For Filter Scheduler. import mox +from nova.compute import utils as compute_utils +from nova.compute import vm_states from nova import context +from nova import db from nova import exception from nova.scheduler import driver from nova.scheduler import filter_scheduler @@ -38,26 +41,31 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase): driver_cls = filter_scheduler.FilterScheduler def test_run_instance_no_hosts(self): - """ - Ensure empty hosts & child_zones result in NoValidHosts exception. - """ + def _fake_empty_call_zone_method(*args, **kwargs): return [] sched = fakes.FakeFilterScheduler() + uuid = 'fake-uuid1' fake_context = context.RequestContext('user', 'project') request_spec = {'instance_type': {'memory_mb': 1, 'root_gb': 1, 'ephemeral_gb': 0}, 'instance_properties': {'project_id': 1}, - 'instance_uuids': ['fake-uuid1']} - self.assertRaises(exception.NoValidHost, sched.schedule_run_instance, - fake_context, request_spec, None, None, None, - None, {}) + 'instance_uuids': [uuid]} + + self.mox.StubOutWithMock(compute_utils, 'add_instance_fault_from_exc') + self.mox.StubOutWithMock(db, 'instance_update_and_get_original') + compute_utils.add_instance_fault_from_exc(fake_context, + uuid, mox.IsA(exception.NoValidHost), mox.IgnoreArg()) + db.instance_update_and_get_original(fake_context, uuid, + {'vm_state': vm_states.ERROR, + 'task_state': None}).AndReturn(({}, {})) + self.mox.ReplayAll() + sched.schedule_run_instance( + fake_context, request_spec, None, None, None, None, {}) def test_run_instance_non_admin(self): - """Test creating an instance locally using run_instance, passing - a non-admin context. DB actions should work.""" self.was_admin = False def fake_get(context, *args, **kwargs): @@ -71,12 +79,20 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase): fake_context = context.RequestContext('user', 'project') + uuid = 'fake-uuid1' request_spec = {'instance_type': {'memory_mb': 1, 'local_gb': 1}, 'instance_properties': {'project_id': 1}, - 'instance_uuids': ['fake-uuid1']} - self.assertRaises(exception.NoValidHost, sched.schedule_run_instance, - fake_context, request_spec, None, None, None, - None, {}) + 'instance_uuids': [uuid]} + self.mox.StubOutWithMock(compute_utils, 'add_instance_fault_from_exc') + self.mox.StubOutWithMock(db, 'instance_update_and_get_original') + compute_utils.add_instance_fault_from_exc(fake_context, + uuid, mox.IsA(exception.NoValidHost), mox.IgnoreArg()) + db.instance_update_and_get_original(fake_context, uuid, + {'vm_state': vm_states.ERROR, + 'task_state': None}).AndReturn(({}, {})) + self.mox.ReplayAll() + sched.schedule_run_instance( + fake_context, request_spec, None, None, None, None, {}) self.assertTrue(self.was_admin) def test_schedule_bad_topic(self):