From 03915cd59d5ead425ac11f63bfe84c8c2b0dc293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Ribaud?= Date: Thu, 2 Jan 2025 19:43:58 +0100 Subject: [PATCH] Update driver to deal with managed flag The target goal of these series of patch is to enable VFIO devices with kernel variant drivers. Implements: blueprint enable-vfio-devices-with-kernel-variant-drivers Change-Id: I7949ba6da8b6257865d8e9e48bf3feabc10bdf17 --- doc/source/admin/pci-passthrough.rst | 72 ++++++++++- nova/conf/pci.py | 24 ++++ .../libvirt/test_pci_sriov_servers.py | 122 ++++++++++++++++++ nova/tests/unit/virt/libvirt/test_driver.py | 113 +++++++++++----- nova/virt/libvirt/driver.py | 11 +- ...rnel-variant-drivers-fb675539545d2db2.yaml | 8 ++ 6 files changed, 311 insertions(+), 39 deletions(-) create mode 100644 releasenotes/notes/enable-vfio-devices-with-kernel-variant-drivers-fb675539545d2db2.yaml diff --git a/doc/source/admin/pci-passthrough.rst b/doc/source/admin/pci-passthrough.rst index 41f44d5a689b..0f82a227a8a7 100644 --- a/doc/source/admin/pci-passthrough.rst +++ b/doc/source/admin/pci-passthrough.rst @@ -69,6 +69,11 @@ capabilities. Nova provides Placement based scheduling support for servers with flavor based PCI requests. This support is disable by default. +.. versionchanged:: 31.0.0 (2025.1 Epoxy): + Add managed tag to define if the PCI device is managed by libvirt. + This is required to support SR-IOV devices using the new kernel variant + driver interface. + Enabling PCI passthrough ------------------------ @@ -222,6 +227,31 @@ have special meaning: place. It is recommended to test specific devices, drivers and firmware versions before assuming this feature can be used. +``managed`` + Users must specify whether the PCI device is managed by libvirt to allow + detachment from the host and assignment to the guest, or vice versa. + The managed mode of a device depends on the specific device and the support + provided by its driver. 
- ``managed='yes'`` means that nova will let libvirt detach the device
+ +To enable Nova to model this, if you configure the VFs with different +resource allocations, you will need to use separate resource_classes for each. + +This can be achieved by following the steps below: + +- Enable PCI in Placement: This is necessary to track PCI devices with + custom resource classes in the placement service. + +- Define Device Specifications: Use a custom resource class to represent + a specific VF type and ensure that the VFs existing on the hypervisor are + matched via the VF's PCI address. + +- Specify Type-Specific Flavors: Define flavors with an alias that matches + the resource class to ensure proper allocation. + +Examples: + +.. note:: + The following example demonstrates device specifications and alias + configurations, utilizing resource classes as part of the "PCI in + placement" feature. + +.. code-block:: shell + + [pci] + device_spec = { "vendor_id": "10de", "product_id": "25b6", "address": "0000:25:00.4", "resource_class": "CUSTOM_A16_16A", "managed": "no" } + device_spec = { "vendor_id": "10de", "product_id": "25b6", "address": "0000:25:00.5", "resource_class": "CUSTOM_A16_8A", "managed": "no" } + alias = { "device_type": "type-VF", resource_class: "CUSTOM_A16_16A", "name": "A16_16A" } Virtual IOMMU support --------------------- diff --git a/nova/conf/pci.py b/nova/conf/pci.py index badb04abd34c..9a6dc2e07e16 100644 --- a/nova/conf/pci.py +++ b/nova/conf/pci.py @@ -174,6 +174,21 @@ Possible values: VPD capability with a card serial number (either on a VF itself on its corresponding PF), otherwise they will be ignored and not available for allocation. + - ``managed`` - Specify if the PCI device is managed by libvirt. + May have boolean-like string values case-insensitive values: + "yes" or "no". By default, "yes" is assumed for all devices. + + - ``managed='yes'`` means that nova will use libvirt to detach the + device from the host before attaching it to the guest and re-attach + it to the host after the guest is deleted. 
+ + - ``managed='no'`` means that nova will not request libvirt to + detach / attach the device from / to the host. In this case nova + assumes that the operator configured the host in a way that these + VFs are not attached to the host. + + Warning: Incorrect configuration of this parameter may result in compute + node crashes. - ``resource_class`` - optional Placement resource class name to be used to track the matching PCI devices in Placement when [pci]report_in_placement is True. @@ -234,6 +249,15 @@ Possible values: "address": "0000:82:00.0", "resource_class": "PGPU", "traits": "HW_GPU_API_VULKAN,my-awesome-gpu"} + device_spec = {"vendor_id":"10de", + "product_id":"25b6", + "address": "0000:25:00.4", + "managed": "no"} + device_spec = {"vendor_id":"10de", + "product_id":"25b6", + "address": "0000:25:00.4", + "resource_class": "CUSTOM_A16_16A", + "managed": "no"} The following are invalid, as they specify mutually exclusive options:: diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py index 2108151e711a..37b0c645012c 100644 --- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py +++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py @@ -402,6 +402,68 @@ class SRIOVServersTest(_PCIServersWithMigrationTestBase): for device in vfs_to_delete: del pci_info.devices[device] + def _verify_guest_xml(self, xml, expected_managed): + """Helper method to check the generated XML for PCI device settings.""" + tree = etree.fromstring(xml) + elem = tree.find("./devices/hostdev") + + # Check managed attribute + actual_managed = elem.get("managed") + self.assertEqual(expected_managed, actual_managed) + + # Compare PCI address + addr_elem = tree.find("./devices/hostdev/source/address") + expected_addr = ("0x81", "0x00", "0x1") + actual_addr = ( + addr_elem.get("bus"), + addr_elem.get("slot"), + addr_elem.get("function"), + ) + self.assertEqual(expected_addr, actual_addr) + + def 
_run_create_server_test( + self, + pci_info, + expected_managed, + device_spec=None, + ): + """Runs a create server test with a specified PCI setup and checks + Guest.create call. + """ + if device_spec: + self.flags( + device_spec=[jsonutils.dumps(x) for x in device_spec], + group="pci", + ) + + with mock.patch.object( + nova.virt.libvirt.guest.Guest, + "create", + wraps=nova.virt.libvirt.guest.Guest.create, + ) as mock_create: + compute = self.start_compute( + pci_info=pci_info, + ) + self.host = self.computes[compute].driver._host + + # Create a server + extra_spec = {"pci_passthrough:alias": "%s:1" % + self.VFS_ALIAS_NAME} + flavor_id = self._create_flavor(extra_spec=extra_spec) + self._create_server(flavor_id=flavor_id, networks="none") + + # Ensure the method was called + mock_create.assert_called_once() + + # Verify the XML generated by the create method + xml = mock_create.call_args[0][ + 0 + ] # Extract the XML from the call args + self._verify_guest_xml(xml, expected_managed) + + # Ensure the filter was called + self.assertTrue(self.mock_filter.called) + def test_create_server_with_VF(self): """Create a server with an SR-IOV VF-type PCI device.""" @@ -416,6 +478,66 @@ class SRIOVServersTest(_PCIServersWithMigrationTestBase): # ensure the filter was called self.assertTrue(self.mock_filter.called) + def test_create_server_with_VF_and_managed_set_to_yes(self): + device_spec = [ + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.PF_PROD_ID, + "physical_network": "physnet4", + }, + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.VF_PROD_ID, + "physical_network": "physnet4", + "managed": "yes", + }, + ] + pci_info = fakelibvirt.HostPCIDevicesInfo(num_pfs=1, num_vfs=1) + self._run_create_server_test( + pci_info, + expected_managed="yes", + device_spec=device_spec, + ) + + def test_create_server_with_VF_and_managed_set_to_no(self): + device_spec = [ + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": 
fakelibvirt.PF_PROD_ID, + "physical_network": "physnet4", + }, + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.VF_PROD_ID, + "physical_network": "physnet4", + "managed": "no", + }, + ] + pci_info = fakelibvirt.HostPCIDevicesInfo(num_pfs=1, num_vfs=1) + self._run_create_server_test( + pci_info, + expected_managed="no", + device_spec=device_spec, + ) + + def test_create_server_with_VF_and_managed_not_set(self): + device_spec = [ + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.PF_PROD_ID, + "physical_network": "physnet4", + }, + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.VF_PROD_ID, + "physical_network": "physnet4", + }, + ] + pci_info = fakelibvirt.HostPCIDevicesInfo(num_pfs=1, num_vfs=1) + self._run_create_server_test( + pci_info, expected_managed="yes", device_spec=device_spec + ) + def test_create_server_with_PF(self): """Create a server with an SR-IOV PF-type PCI device.""" diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 436c3a04c172..c51d292b644c 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -7942,26 +7942,28 @@ class LibvirtConnTestCase(test.NoDBTestCase, compute_ref = objects.ComputeNode(**compute_info) return (service_ref, compute_ref) - def test_get_guest_config_with_pci_passthrough_kvm(self): - self.flags(virt_type='kvm', group='libvirt') - service_ref, compute_ref = self._create_fake_service_compute() - + def _setup_instance_and_pci_device( + self, compute_ref, pci_address, managed=None + ): instance = objects.Instance(**self.test_instance) image_meta = objects.ImageMeta.from_dict(self.test_image_meta) pci_device_info = dict(test_pci_device.fake_db_dev) - pci_device_info.update(compute_node_id=1, - label='fake', - status=fields.PciDeviceStatus.ALLOCATED, - address='0000:00:00.1', - compute_id=compute_ref.id, - instance_uuid=instance.uuid, - 
request_id=uuids.pci_req1, - extra_info={}) + pci_device_info.update( + compute_node_id=1, + label="fake", + status=fields.PciDeviceStatus.ALLOCATED, + address=pci_address, + compute_id=compute_ref.id, + instance_uuid=instance.uuid, + request_id=uuids.pci_req1, + extra_info={"managed": managed} if managed is not None else {}, + ) + pci_device = objects.PciDevice(**pci_device_info) - pci_list = objects.PciDeviceList() - pci_list.objects.append(pci_device) + pci_list = objects.PciDeviceList(objects=[pci_device]) instance.pci_devices = pci_list + instance.pci_requests = objects.InstancePCIRequests( requests=[ objects.InstancePCIRequest( @@ -7970,27 +7972,74 @@ class LibvirtConnTestCase(test.NoDBTestCase, ] ) + return instance, image_meta + + def _assert_pci_device_config( + self, cfg, expected_managed, expected_function + ): + had_pci = [ + dev + for dev in cfg.devices + if isinstance(dev, vconfig.LibvirtConfigGuestHostdevPCI) + ] + self.assertEqual(len(had_pci), 1) + + pci_dev = had_pci[0] + self.assertEqual(pci_dev.type, "pci") + if expected_managed is not None: + self.assertEqual(pci_dev.managed, expected_managed) + self.assertEqual(pci_dev.mode, "subsystem") + self.assertEqual(pci_dev.domain, "0000") + self.assertEqual(pci_dev.bus, "00") + self.assertEqual(pci_dev.slot, "00") + self.assertEqual(pci_dev.function, expected_function) + + def _test_get_guest_config_with_pci( + self, pci_address, managed, expected_managed, expected_function + ): + service_ref, compute_ref = self._create_fake_service_compute() + instance, image_meta = self._setup_instance_and_pci_device( + compute_ref, pci_address, managed + ) + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) - disk_info = blockinfo.get_disk_info(CONF.libvirt.virt_type, - instance, - image_meta) - cfg = drvr._get_guest_config(instance, [], - image_meta, disk_info) + disk_info = blockinfo.get_disk_info( + CONF.libvirt.virt_type, instance, image_meta + ) + cfg = drvr._get_guest_config(instance, [], 
image_meta, disk_info) - had_pci = 0 - # care only about the PCI devices - for dev in cfg.devices: - if type(dev) is vconfig.LibvirtConfigGuestHostdevPCI: - had_pci += 1 - self.assertEqual(dev.type, 'pci') - self.assertEqual(dev.managed, 'yes') - self.assertEqual(dev.mode, 'subsystem') + self._assert_pci_device_config( + cfg, expected_managed, expected_function) - self.assertEqual(dev.domain, "0000") - self.assertEqual(dev.bus, "00") - self.assertEqual(dev.slot, "00") - self.assertEqual(dev.function, "1") - self.assertEqual(had_pci, 1) + def test_get_guest_config_with_pci_passthrough_kvm(self): + self._test_get_guest_config_with_pci("0000:00:00.1", None, "yes", "1") + + def test_get_guest_config_with_pci_passthrough_kvm_managed_yes(self): + self._test_get_guest_config_with_pci( + "0000:00:00.2", "true", "yes", "2") + + def test_get_guest_config_with_pci_passthrough_kvm_managed_no(self): + self._test_get_guest_config_with_pci( + "0000:00:00.3", "false", "no", "3") + + @mock.patch('nova.virt.libvirt.driver.LOG', autospec=True) + def test_log_in_set_managed_node(self, mock_log): + self.flags(virt_type='parallels', group='libvirt') + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) + # Just a fake class to check result + + class PciDevice(): + managed = None + + pcidev = PciDevice() + drvr._set_managed_mode(pcidev, "yes") + + mock_log.debug.assert_called_once_with( + "Managed mode set to '%s' but it is overwritten by parallels " + "hypervisor settings.", + "yes", + ) + self.assertEqual(pcidev.managed, "no") def test_get_guest_config_os_command_line_through_image_meta(self): self.flags(virt_type="kvm", diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 850d0d6f2fba..cfb22b5cbd1f 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -6172,19 +6172,22 @@ class LibvirtDriver(driver.ComputeDriver): return sysinfo - def _set_managed_mode(self, pcidev): + def _set_managed_mode(self, pcidev, managed): # only kvm 
support managed mode if CONF.libvirt.virt_type in ('parallels',): pcidev.managed = 'no' + LOG.debug("Managed mode set to '%s' but it is overwritten by " + "parallels hypervisor settings.", managed) if CONF.libvirt.virt_type in ('kvm', 'qemu'): - pcidev.managed = 'yes' + pcidev.managed = "yes" if managed == "true" else "no" def _get_guest_pci_device(self, pci_device): dbsf = pci_utils.parse_address(pci_device.address) dev = vconfig.LibvirtConfigGuestHostdevPCI() dev.domain, dev.bus, dev.slot, dev.function = dbsf - self._set_managed_mode(dev) + managed = pci_device.extra_info.get('managed', 'true') + self._set_managed_mode(dev, managed) return dev @@ -7769,7 +7772,7 @@ class LibvirtDriver(driver.ComputeDriver): dev.domain, dev.bus, dev.slot, dev.function = ( pci_addr['domain'], pci_addr['bus'], pci_addr['device'], pci_addr['function']) - self._set_managed_mode(dev) + self._set_managed_mode(dev, "true") guest.add_device(dev) diff --git a/releasenotes/notes/enable-vfio-devices-with-kernel-variant-drivers-fb675539545d2db2.yaml b/releasenotes/notes/enable-vfio-devices-with-kernel-variant-drivers-fb675539545d2db2.yaml new file mode 100644 index 000000000000..ec84ca8dd2d8 --- /dev/null +++ b/releasenotes/notes/enable-vfio-devices-with-kernel-variant-drivers-fb675539545d2db2.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + This release adds support for SR-IOV devices + using the new kernel VFIO SR-IOV variant driver interface. + See the `OpenStack pci-passthrough documentation`__ for more details. + + .. __: https://docs.openstack.org/nova/latest/admin/pci-passthrough.html