From f8b98390dc99f6cb0101c88223eb840e0d1c7124 Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Thu, 15 Aug 2024 13:06:39 +0200 Subject: [PATCH] Fix PCI passthrough cleanup on reschedule The resource tracker Claim object works on a copy of the instance object got from the compute manager. But the PCI claim logic does not use the copy but use the original instance object. However the abort claim logic including the abort PCI claim logic worked on the copy only. Therefore the claimed PCI devices are visible to the compute manager in the instance.pci_decives list even after the claim is aborted. There was another bug in the PCIDevice object where the instance object wasn't passed to the free() function and therefore the instance.pci_devices list wasn't updated when the device was freed. Closes-Bug: #1860555 Change-Id: Iff343d4d78996cd17a6a584fefa7071c81311673 --- doc/notification_samples/instance-create-error.json | 4 +++- nova/compute/claims.py | 3 ++- nova/pci/manager.py | 4 ++-- .../functional/libvirt/test_pci_sriov_servers.py | 7 ++----- nova/tests/unit/compute/test_shelve.py | 12 ++++-------- nova/tests/unit/pci/test_manager.py | 8 ++++++-- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/notification_samples/instance-create-error.json b/doc/notification_samples/instance-create-error.json index 095cfbf4f489..cd824eda2989 100644 --- a/doc/notification_samples/instance-create-error.json +++ b/doc/notification_samples/instance-create-error.json @@ -18,7 +18,9 @@ "ip_addresses": [], "launched_at": null, "power_state": "pending", - "state": "building" + "state": "building", + "host": null, + "node": null } }, "priority":"ERROR", diff --git a/nova/compute/claims.py b/nova/compute/claims.py index 490b418081e5..fa93d9ab1852 100644 --- a/nova/compute/claims.py +++ b/nova/compute/claims.py @@ -64,6 +64,7 @@ class Claim(NopClaim): super().__init__(migration=migration) # Stash a copy of the instance at the current point of time self.instance = instance.obj_clone() + self.instance_ref = instance self.nodename = nodename self.tracker = tracker self._pci_requests = pci_requests @@ -82,7 +83,7 @@ class Claim(NopClaim): been aborted. """ LOG.debug("Aborting claim: %s", self, instance=self.instance) - self.tracker.abort_instance_claim(self.context, self.instance, + self.tracker.abort_instance_claim(self.context, self.instance_ref, self.nodename) def _claim_test(self, compute_node, limits=None): diff --git a/nova/pci/manager.py b/nova/pci/manager.py index 304b4fa22058..6168bb7c79ef 100644 --- a/nova/pci/manager.py +++ b/nova/pci/manager.py @@ -407,7 +407,7 @@ class PciDevTracker(object): for dev in self.pci_devs: if (dev.status == fields.PciDeviceStatus.ALLOCATED and dev.instance_uuid == instance['uuid']): - self._free_device(dev) + self._free_device(dev, instance) def free_instance_claims( self, context: ctx.RequestContext, instance: 'objects.Instance', @@ -423,7 +423,7 @@ class PciDevTracker(object): for dev in self.pci_devs: if (dev.status == fields.PciDeviceStatus.CLAIMED and dev.instance_uuid == instance['uuid']): - self._free_device(dev) + self._free_device(dev, instance) def free_instance( self, context: ctx.RequestContext, instance: 'objects.Instance', diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py index d7ceaaf04ca6..2108151e711a 100644 --- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py +++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py @@ -3042,11 +3042,8 @@ class PCIResourceRequestReschedulingTest(_PCIServersTestBase): def validate_group_policy(manager, instance, *args, **kwargs): nonlocal validate_group_policy_called - if validate_group_policy_called: - # FIXME(johngarbutt): This is bug 1860555, it should be 1. - self.assertEqual(2, len(instance.pci_devices)) - else: - self.assertEqual(1, len(instance.pci_devices)) + self.assertEqual(1, len(instance.pci_devices)) + if not validate_group_policy_called: validate_group_policy_called = True raise exception.RescheduledException( instance_uuid='fake-uuid', diff --git a/nova/tests/unit/compute/test_shelve.py b/nova/tests/unit/compute/test_shelve.py index ebd721689fe0..42fef1f7c5eb 100644 --- a/nova/tests/unit/compute/test_shelve.py +++ b/nova/tests/unit/compute/test_shelve.py @@ -596,14 +596,6 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): # This is before we've failed. self.assertEqual(task_states.SPAWNING, instance.task_state) tracking['last_state'] = instance.task_state - elif tracking['last_state'] == task_states.SPAWNING: - # This is after we've failed. - self.assertIsNone(instance.host) - self.assertIsNone(instance.node) - self.assertIsNone(instance.task_state) - tracking['last_state'] = instance.task_state - else: - self.fail('Unexpected save!') with mock.patch.object(instance, 'save') as mock_save: mock_save.side_effect = check_save @@ -614,6 +606,10 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): request_spec=objects.RequestSpec(), accel_uuids=[]) + self.assertIsNone(instance.host) + self.assertIsNone(instance.node) + self.assertIsNone(instance.task_state) + mock_notify_instance_action.assert_called_once_with( self.context, instance, 'fake-mini', action='unshelve', phase='start', bdms=mock_bdms) diff --git a/nova/tests/unit/pci/test_manager.py b/nova/tests/unit/pci/test_manager.py index bcd4cecb8573..2cc9de253eea 100644 --- a/nova/tests/unit/pci/test_manager.py +++ b/nova/tests/unit/pci/test_manager.py @@ -528,7 +528,9 @@ class PciDevTrackerTestCase(test.NoDBTestCase): # Ensure that the claim actually fixes the inconsistency so when the # parent if freed the children become available too. self.tracker.free_instance( - mock.sentinel.context, {'uuid': uuidsentinel.instance1}) + mock.sentinel.context, + {'uuid': uuidsentinel.instance1, + 'pci_devices': [objects.PciDevice(id=pf['id'])]}) pf_dev = self._get_device_by_address(pf['address']) self.assertEqual('available', pf_dev.status) @@ -586,7 +588,9 @@ class PciDevTrackerTestCase(test.NoDBTestCase): # Ensure that the claim actually fixes the inconsistency so when the # parent if freed the children become available too. self.tracker.free_instance( - mock.sentinel.context, {'uuid': uuidsentinel.instance1}) + mock.sentinel.context, + {'uuid': uuidsentinel.instance1, + 'pci_devices': [objects.PciDevice(id=pf['id'])]}) pf_dev = self._get_device_by_address(pf['address']) self.assertEqual('available', pf_dev.status)