Merge "Allow claiming PCI PF if child VF is unavailable"

This commit is contained in:
Zuul 2022-05-05 13:34:46 +00:00 committed by Gerrit Code Review
commit 7520711a0e
3 changed files with 162 additions and 26 deletions

View File

@ -346,10 +346,40 @@ class PciDevice(base.NovaPersistentObject, base.NovaObject):
# Update PF status to CLAIMED if all of it dependants are free
# and set their status to UNCLAIMABLE
vfs_list = self.child_devices
if not all([vf.is_available() for vf in vfs_list]):
raise exception.PciDeviceVFInvalidStatus(
compute_node_id=self.compute_node_id,
address=self.address)
non_free_dependants = [
vf for vf in vfs_list if not vf.is_available()]
if non_free_dependants:
# NOTE(gibi): There should not be any dependent devices that
# are UNCLAIMABLE or UNAVAILABLE as the parent is AVAILABLE,
# but we got reports in bug 1969496 that this inconsistency
# can happen. So check if the only non-free devices are in
# state UNCLAIMABLE or UNAVAILABLE then we log a warning but
# allow to claim the parent.
actual_statuses = {
child.status for child in non_free_dependants}
allowed_non_free_statues = {
fields.PciDeviceStatus.UNCLAIMABLE,
fields.PciDeviceStatus.UNAVAILABLE,
}
if actual_statuses - allowed_non_free_statues == set():
LOG.warning(
"Some child device of parent %s is in an inconsistent "
"state. If you can reproduce this warning then please "
"report a bug at "
"https://bugs.launchpad.net/nova/+filebug with "
"reproduction steps. Inconsistent children with "
"state: %s",
self.address,
",".join(
"%s - %s" % (child.address, child.status)
for child in non_free_dependants
),
)
else:
raise exception.PciDeviceVFInvalidStatus(
compute_node_id=self.compute_node_id,
address=self.address)
self._bulk_update_status(vfs_list,
fields.PciDeviceStatus.UNCLAIMABLE)

View File

@ -279,8 +279,12 @@ class PciDeviceStats(object):
if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF:
vfs_list = pci_dev.child_devices
if vfs_list:
free_devs = self.get_free_devs()
for vf in vfs_list:
self.remove_device(vf)
# NOTE(gibi): do not try to remove a device that are
# already removed
if vf in free_devs:
self.remove_device(vf)
elif pci_dev.dev_type in (
fields.PciDeviceType.SRIOV_VF,
fields.PciDeviceType.VDPA,

View File

@ -507,32 +507,134 @@ class PciDevTrackerTestCase(test.NoDBTestCase):
)
# now try to claim and allocate the PF. It should work as it is
# available
# This is bug 1969496 as the claim fails with exception
ex = self.assertRaises(
exception.PciDevicePoolEmpty,
self.tracker.claim_instance,
mock.sentinel.context,
pci_requests_obj,
None
)
self.assertIn(
'Attempt to consume PCI device 1:0000:00:02.1 from empty pool',
str(ex)
)
self.tracker.claim_instance(
mock.sentinel.context, pci_requests_obj, None)
self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
pf_dev = self._get_device_by_address(pf['address'])
self.assertEqual('available', pf_dev.status)
self.assertEqual('allocated', pf_dev.status)
vf_dev = self._get_device_by_address(vf['address'])
self.assertEqual('unavailable', vf_dev.status)
# This should work when the bug is fixed
# self.tracker.claim_instance(
# mock.sentinel.context, pci_requests_obj, None)
# self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
self.assertIn(
'Some child device of parent 0000:00:01.1 is in an inconsistent '
'state. If you can reproduce this warning then please report a '
'bug at https://bugs.launchpad.net/nova/+filebug with '
'reproduction steps. Inconsistent children with state: '
'0000:00:02.1 - unavailable',
self.stdlog.logger.output
)
# pf_dev = self._get_device_by_address(pf['address'])
# self.assertEqual('allocated', pf_dev.status)
# vf_dev = self._get_device_by_address(vf['address'])
# self.assertEqual('unavailable', vf_dev.status)
# Ensure that the claim actually fixes the inconsistency so when the
# parent if freed the children become available too.
self.tracker.free_instance(
mock.sentinel.context, {'uuid': uuidsentinel.instance1})
pf_dev = self._get_device_by_address(pf['address'])
self.assertEqual('available', pf_dev.status)
vf_dev = self._get_device_by_address(vf['address'])
self.assertEqual('available', vf_dev.status)
def test_claim_available_pf_while_children_vfs_are_in_mixed_state(self):
# We start with a PF parent and two VF children. The PF is available
# and one of the VF is unavailable while the other is available.
pf = copy.deepcopy(fake_db_dev_3)
vf1 = copy.deepcopy(fake_db_dev_4)
vf1['status'] = fields.PciDeviceStatus.UNAVAILABLE
vf2 = copy.deepcopy(fake_db_dev_5)
vf2['status'] = fields.PciDeviceStatus.AVAILABLE
self._create_tracker([pf, vf1, vf2])
pf_dev = self._get_device_by_address(pf['address'])
self.assertEqual('available', pf_dev.status)
vf1_dev = self._get_device_by_address(vf1['address'])
self.assertEqual('unavailable', vf1_dev.status)
vf2_dev = self._get_device_by_address(vf2['address'])
self.assertEqual('available', vf2_dev.status)
pci_requests_obj = self._create_pci_requests_object(
[
{
'count': 1,
'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
}
],
instance_uuid=uuidsentinel.instance1,
)
# now try to claim and allocate the PF. It should work as it is
# available
self.tracker.claim_instance(
mock.sentinel.context, pci_requests_obj, None)
self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
pf_dev = self._get_device_by_address(pf['address'])
self.assertEqual('allocated', pf_dev.status)
vf1_dev = self._get_device_by_address(vf1['address'])
self.assertEqual('unavailable', vf1_dev.status)
vf2_dev = self._get_device_by_address(vf2['address'])
self.assertEqual('unavailable', vf2_dev.status)
self.assertIn(
'Some child device of parent 0000:00:01.1 is in an inconsistent '
'state. If you can reproduce this warning then please report a '
'bug at https://bugs.launchpad.net/nova/+filebug with '
'reproduction steps. Inconsistent children with state: '
'0000:00:02.1 - unavailable',
self.stdlog.logger.output
)
# Ensure that the claim actually fixes the inconsistency so when the
# parent if freed the children become available too.
self.tracker.free_instance(
mock.sentinel.context, {'uuid': uuidsentinel.instance1})
pf_dev = self._get_device_by_address(pf['address'])
self.assertEqual('available', pf_dev.status)
vf1_dev = self._get_device_by_address(vf1['address'])
self.assertEqual('available', vf1_dev.status)
vf2_dev = self._get_device_by_address(vf2['address'])
self.assertEqual('available', vf2_dev.status)
def test_claim_available_pf_while_a_child_is_used(self):
pf = copy.deepcopy(fake_db_dev_3)
vf1 = copy.deepcopy(fake_db_dev_4)
vf1['status'] = fields.PciDeviceStatus.UNAVAILABLE
vf2 = copy.deepcopy(fake_db_dev_5)
vf2['status'] = fields.PciDeviceStatus.CLAIMED
self._create_tracker([pf, vf1, vf2])
pf_dev = self._get_device_by_address(pf['address'])
self.assertEqual('available', pf_dev.status)
vf1_dev = self._get_device_by_address(vf1['address'])
self.assertEqual('unavailable', vf1_dev.status)
vf2_dev = self._get_device_by_address(vf2['address'])
self.assertEqual('claimed', vf2_dev.status)
pci_requests_obj = self._create_pci_requests_object(
[
{
'count': 1,
'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
}
],
instance_uuid=uuidsentinel.instance1,
)
# now try to claim and allocate the PF. The claim should fail as on of
# the child is used.
self.assertRaises(
exception.PciDeviceVFInvalidStatus,
self.tracker.claim_instance,
mock.sentinel.context,
pci_requests_obj,
None,
)
pf_dev = self._get_device_by_address(pf['address'])
self.assertEqual('available', pf_dev.status)
vf1_dev = self._get_device_by_address(vf1['address'])
self.assertEqual('unavailable', vf1_dev.status)
vf2_dev = self._get_device_by_address(vf2['address'])
self.assertEqual('claimed', vf2_dev.status)
def test_update_pci_for_instance_active(self):
pci_requests_obj = self._create_pci_requests_object(fake_pci_requests)