Fix poll_rescued_instances periodic task

The poll_rescued_instances periodic task now checks the amount of time that an instance has been in the RESCUED state before timing out the rescue. It also now performs the unrescue through the compute API in order to make sure the database is left in a consistent state. The poll_rescued_instances method is no longer necessary in the virt driver interface and has been removed. It has also been removed from the different virt drivers, since it was just doing a 'pass' in each of them. bug 1088625 bug 1088627 Change-Id: I75f7dc188cc49e5f6e5c8a3cb256d1c42ff7d882
2012-12-11 13:48:11 -05:00 · 2012-12-11 13:48:11 -05:00 · c40fc8a4db
commit c40fc8a4db
parent dc48ce7fb1
9 changed files with 51 additions and 64 deletions
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@ -1592,6 +1592,7 @@ class ComputeManager(manager.SchedulerDependentManager):
                              vm_state=vm_states.RESCUED,
                              task_state=None,
                              power_state=current_power_state,
+                              launched_at=timeutils.utcnow(),
                              expected_task_state=task_states.RESCUING)

    @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@ -2818,7 +2819,22 @@ class ComputeManager(manager.SchedulerDependentManager):
    @manager.periodic_task
    def _poll_rescued_instances(self, context):
        if CONF.rescue_timeout > 0:
-            self.driver.poll_rescued_instances(CONF.rescue_timeout)
+            instances = self.conductor_api.instance_get_all_by_host(context,
+                                                                    self.host)
+
+            rescued_instances = []
+            for instance in instances:
+                if instance['vm_state'] == vm_states.RESCUED:
+                    rescued_instances.append(instance)
+
+            to_unrescue = []
+            for instance in rescued_instances:
+                if timeutils.is_older_than(instance['launched_at'],
+                                           CONF.rescue_timeout):
+                    to_unrescue.append(instance)
+
+            for instance in to_unrescue:
+                self.compute_api.unrescue(context, instance)

    @manager.periodic_task
    def _poll_unconfirmed_resizes(self, context):
--- a/nova/tests/compute/test_compute.py
+++ b/nova/tests/compute/test_compute.py
@ -2788,6 +2788,40 @@ class ComputeTestCase(BaseTestCase):
        self.assertEqual(call_info['get_by_uuid'], 3)
        self.assertEqual(call_info['get_nw_info'], 4)

+    def test_poll_rescued_instances(self):
+        timed_out_time = timeutils.utcnow() - datetime.timedelta(minutes=5)
+        not_timed_out_time = timeutils.utcnow()
+
+        instances = [{'uuid': 'fake_uuid1', 'vm_state': vm_states.RESCUED,
+                      'launched_at': timed_out_time},
+                     {'uuid': 'fake_uuid2', 'vm_state': vm_states.ACTIVE,
+                      'launched_at': timed_out_time},
+                     {'uuid': 'fake_uuid3', 'vm_state': vm_states.ACTIVE,
+                      'launched_at': not_timed_out_time},
+                     {'uuid': 'fake_uuid4', 'vm_state': vm_states.RESCUED,
+                      'launched_at': timed_out_time},
+                     {'uuid': 'fake_uuid5', 'vm_state': vm_states.RESCUED,
+                      'launched_at': not_timed_out_time}]
+        unrescued_instances = {'fake_uuid1': False, 'fake_uuid4': False}
+
+        def fake_instance_get_all_by_host(context, host):
+            return instances
+
+        def fake_unrescue(self, context, instance):
+            unrescued_instances[instance['uuid']] = True
+
+        self.stubs.Set(self.compute.conductor_api, 'instance_get_all_by_host',
+                       fake_instance_get_all_by_host)
+        self.stubs.Set(compute_api.API, 'unrescue', fake_unrescue)
+
+        self.flags(rescue_timeout=60)
+        ctxt = context.get_admin_context()
+
+        self.compute._poll_rescued_instances(ctxt)
+
+        for instance in unrescued_instances.values():
+            self.assertTrue(instance)
+
    def test_poll_unconfirmed_resizes(self):
        instances = [{'uuid': 'fake_uuid1', 'vm_state': vm_states.RESIZED,
                      'task_state': None},
--- a/nova/tests/test_virt_drivers.py
+++ b/nova/tests/test_virt_drivers.py
@ -282,10 +282,6 @@ class _VirtDriverTestCase(_FakeDriverBackendTestCase):
        instances = [self._get_running_instance()]
        self.connection.poll_rebooting_instances(10, instances)

-    @catch_notimplementederror
-    def test_poll_rescued_instances(self):
-        self.connection.poll_rescued_instances(10)
-
    @catch_notimplementederror
    def test_migrate_disk_and_power_off(self):
        instance_ref, network_info = self._get_running_instance()
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@ -635,11 +635,6 @@ class ComputeDriver(object):
        # TODO(Vek): Need to pass context in for access to auth_token
        raise NotImplementedError()

-    def poll_rescued_instances(self, timeout):
-        """Poll for rescued instances"""
-        # TODO(Vek): Need to pass context in for access to auth_token
-        raise NotImplementedError()
-
    def host_power_action(self, host, action):
        """Reboots, shuts down or powers up the host."""
        raise NotImplementedError()
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@ -154,9 +154,6 @@ class FakeDriver(driver.ComputeDriver):
    def poll_rebooting_instances(self, timeout, instances):
        pass

-    def poll_rescued_instances(self, timeout):
-        pass
-
    def migrate_disk_and_power_off(self, context, instance, dest,
                                   instance_type, network_info,
                                   block_device_info=None):
--- a/nova/virt/hyperv/driver.py
+++ b/nova/virt/hyperv/driver.py
@ -119,9 +119,6 @@ class HyperVDriver(driver.ComputeDriver):
    def get_volume_connector(self, instance):
        return self._volumeops.get_volume_connector(instance)

-    def poll_rescued_instances(self, timeout):
-        pass
-
    def get_available_resource(self, nodename):
        return self._hostops.get_available_resource()

--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@ -1083,10 +1083,6 @@ class LibvirtDriver(driver.ComputeDriver):
    def poll_rebooting_instances(self, timeout, instances):
        pass

-    @exception.wrap_exception()
-    def poll_rescued_instances(self, timeout):
-        pass
-
    def _enable_hairpin(self, xml):
        interfaces = self.get_interfaces(xml)
        for interface in interfaces:
--- a/nova/virt/xenapi/driver.py
+++ b/nova/virt/xenapi/driver.py
@ -290,10 +290,6 @@ class XenAPIDriver(driver.ComputeDriver):
        """Poll for rebooting instances"""
        self._vmops.poll_rebooting_instances(timeout, instances)

-    def poll_rescued_instances(self, timeout):
-        """Poll for rescued instances"""
-        self._vmops.poll_rescued_instances(timeout)
-
    def reset_network(self, instance):
        """reset networking for specified instance"""
        self._vmops.reset_network(instance)
--- a/nova/virt/xenapi/vmops.py
+++ b/nova/virt/xenapi/vmops.py
@ -147,7 +147,6 @@ class VMOps(object):
        self.compute_api = compute.API()
        self._session = session
        self._virtapi = virtapi
-        self.poll_rescue_last_ran = None
        self.firewall_driver = firewall.load_driver(
            DEFAULT_FIREWALL_DRIVER,
            self._virtapi,
@ -1217,45 +1216,6 @@ class VMOps(object):
            LOG.info(_("Automatically hard rebooting"), instance=instance)
            self.compute_api.reboot(ctxt, instance, "HARD")

-    def poll_rescued_instances(self, timeout):
-        """Look for expirable rescued instances.
-
-            - forcibly exit rescue mode for any instances that have been
-              in rescue mode for >= the provided timeout
-
-        """
-        last_ran = self.poll_rescue_last_ran
-        if not last_ran:
-            # We need a base time to start tracking.
-            self.poll_rescue_last_ran = timeutils.utcnow()
-            return
-
-        if not timeutils.is_older_than(last_ran, timeout):
-            # Do not run. Let's bail.
-            return
-
-        # Update the time tracker and proceed.
-        self.poll_rescue_last_ran = timeutils.utcnow()
-
-        rescue_vms = []
-        for instance in self.list_instances():
-            if instance.endswith("-rescue"):
-                rescue_vms.append(dict(name=instance,
-                                       vm_ref=vm_utils.lookup(self._session,
-                                                              instance)))
-
-        for vm in rescue_vms:
-            rescue_vm_ref = vm["vm_ref"]
-
-            original_name = vm["name"].split("-rescue", 1)[0]
-            original_vm_ref = vm_utils.lookup(self._session, original_name)
-
-            self._destroy_rescue_instance(rescue_vm_ref, original_vm_ref)
-
-            self._release_bootlock(original_vm_ref)
-            self._session.call_xenapi("VM.start", original_vm_ref, False,
-                                      False)
-
    def get_info(self, instance, vm_ref=None):
        """Return data about VM instance."""
        vm_ref = vm_ref or self._get_vm_opaque_ref(instance)