pci: implement the 'socket' NUMA affinity policy
This patch enables the 'socket' PCI NUMA affinity policy. The PCI
manager gets a new method to implement it, and the libvirt driver
starts reporting the necessary trait, enabling it to receive instances
with the 'socket' policy.

Implements: blueprint pci-socket-affinity
Change-Id: Ia875c9c3542ef4138d0d7a2c26c0cf49dcca0761
parent 890b6d54a6
commit b2471dd578
@@ -495,7 +495,7 @@ PCI NUMA Affinity Policy
 PCI passthrough devices and neutron SR-IOV interfaces via the
 ``hw:pci_numa_affinity_policy`` flavor extra spec or
 ``hw_pci_numa_affinity_policy`` image property. The allowed values are
-``required``, ``preferred`` or ``legacy`` (default).
+``required``, ``socket``, ``preferred`` or ``legacy`` (default).

 **required**
     This value will mean that nova will boot instances with PCI devices
@@ -504,6 +504,25 @@ PCI NUMA Affinity Policy
     devices could not be determined, those PCI devices wouldn't be consumable
     by the instance. This provides maximum performance.

+**socket**
+    This means that the PCI device must be affined to the same host socket as
+    at least one of the guest NUMA nodes. For example, consider a system with
+    two sockets, each with two NUMA nodes, numbered node 0 and node 1 on
+    socket 0, and node 2 and node 3 on socket 1. There is a PCI device
+    affined to node 0. An instance with two guest NUMA nodes and the
+    ``socket`` policy can be affined to either:
+
+    * node 0 and node 1
+    * node 0 and node 2
+    * node 0 and node 3
+    * node 1 and node 2
+    * node 1 and node 3
+
+    The instance cannot be affined to node 2 and node 3, as neither of those
+    are on the same socket as the PCI device. If the other nodes are consumed
+    by other instances and only nodes 2 and 3 are available, the instance
+    will not boot.
+
 **preferred**
     This value will mean that ``nova-scheduler`` will choose a compute host
     with minimal consideration for the NUMA affinity of PCI devices.
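As a quick sanity check of the placement list above, here is a small standalone Python sketch (not part of this change; the node-to-socket map is hard-coded from the example topology) that enumerates the valid two-node placements::

    import itertools

    # Topology from the example: two sockets, two NUMA nodes each,
    # and a PCI device affined to node 0 (hence socket 0).
    node_to_socket = {0: 0, 1: 0, 2: 1, 3: 1}
    device_socket = node_to_socket[0]

    # Under the 'socket' policy, a placement is acceptable when at
    # least one guest NUMA node shares the device's socket.
    for pair in itertools.combinations(sorted(node_to_socket), 2):
        if any(node_to_socket[n] == device_socket for n in pair):
            print('valid placement:', pair)
    # Prints every pair except (2, 3), matching the list in the doc.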
@@ -298,6 +298,15 @@ class PciDeviceStats(object):
                 pool['count'] for pool in filtered_pools) >= requested_count:
             return filtered_pools

+        # the SOCKET policy is a bit of a special case. It's less strict than
+        # REQUIRED (so REQUIRED will automatically fulfil SOCKET, at least
+        # with our assumption of never having multiple sockets per NUMA node),
+        # but not always more strict than LEGACY: a PCI device with no NUMA
+        # affinity will fulfil LEGACY but not SOCKET. If we have SOCKET,
+        # process it here and don't continue.
+        if requested_policy == fields.PCINUMAAffinityPolicy.SOCKET:
+            return self._filter_pools_for_socket_affinity(pools, numa_cells)
+
         # some systems don't report NUMA node info for PCI devices, in which
         # case None is reported in 'pci_device.numa_node'. The LEGACY policy
         # allows us to use these devices so we include None in the list of
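To illustrate the ordering described in the comment above, here is a toy, self-contained comparison of the three policies (the function names and plain data types are illustrative only, not nova's API)::

    def satisfies_required(dev_node, guest_nodes):
        # REQUIRED: the device must share a NUMA node with the guest.
        return dev_node in guest_nodes

    def satisfies_legacy(dev_node, guest_nodes):
        # LEGACY: like REQUIRED, but also accepts devices that report
        # no NUMA affinity at all (dev_node is None).
        return dev_node is None or dev_node in guest_nodes

    def satisfies_socket(dev_node, guest_nodes, node_to_socket):
        # SOCKET: needs a real NUMA node so its socket can be looked up.
        if dev_node is None:
            return False
        guest_sockets = {node_to_socket[n] for n in guest_nodes}
        return node_to_socket[dev_node] in guest_sockets

    # REQUIRED implies SOCKET (one socket per NUMA node assumed)...
    assert satisfies_socket(0, {0}, {0: 0, 1: 1})
    # ...but a device with no NUMA info passes LEGACY, not SOCKET.
    assert satisfies_legacy(None, {0})
    assert not satisfies_socket(None, {0}, {0: 0})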
@@ -323,6 +332,39 @@ class PciDeviceStats(object):
         return sorted(
             pools, key=lambda pool: pool.get('numa_node') not in numa_cell_ids)

+    def _filter_pools_for_socket_affinity(self, pools, numa_cells):
+        host_cells = self.numa_topology.cells
+        # bail early if we don't have socket information for all host_cells.
+        # This could happen if we're running on a weird older system with
+        # multiple sockets per NUMA node, which is a configuration that we
+        # explicitly chose not to support.
+        if any(cell.socket is None for cell in host_cells):
+            LOG.debug('No socket information in host NUMA cell(s).')
+            return []
+
+        # get a set of host sockets that the guest cells are in. Since guest
+        # cell IDs map to host cell IDs, we can just look up the latter's
+        # socket.
+        socket_ids = set()
+        for guest_cell in numa_cells:
+            for host_cell in host_cells:
+                if guest_cell.id == host_cell.id:
+                    socket_ids.add(host_cell.socket)
+
+        # now get a set of host NUMA nodes that are in the above sockets
+        allowed_numa_nodes = set()
+        for host_cell in host_cells:
+            if host_cell.socket in socket_ids:
+                allowed_numa_nodes.add(host_cell.id)
+
+        # filter out pools that are not in one of the correct host NUMA nodes.
+        return [
+            pool for pool in pools if any(
+                utils.pci_device_prop_match(pool, [{'numa_node': numa_node}])
+                for numa_node in allowed_numa_nodes
+            )
+        ]
+
     def _filter_pools_for_unrequested_pfs(self, pools, request):
         """Filter out pools with PFs, unless these are required.
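For readers skimming the diff, a condensed standalone version of the same two-step set construction, with plain tuples and dicts standing in for nova's objects (all names here are illustrative)::

    # Toy stand-ins: host cells as (id, socket) pairs, pools as dicts.
    host_cells = [(0, 0), (1, 0), (2, 1), (3, 1)]
    guest_cell_ids = {0}
    pools = [{'numa_node': 1, 'count': 1}, {'numa_node': 3, 'count': 1}]

    # Step 1: the set of host sockets the guest's cells sit on.
    socket_ids = {socket for cell_id, socket in host_cells
                  if cell_id in guest_cell_ids}
    # Step 2: every host NUMA node on one of those sockets.
    allowed = {cell_id for cell_id, socket in host_cells
               if socket in socket_ids}
    # Keep only pools whose device NUMA node is in the allowed set.
    print([p for p in pools if p['numa_node'] in allowed])
    # [{'numa_node': 1, 'count': 1}] -- node 3 is on the other socket.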
@@ -383,8 +425,8 @@ class PciDeviceStats(object):
             return None

         # Next, let's exclude all devices that aren't on the correct NUMA node
-        # *assuming* we have devices and care about that, as determined by
-        # policy
+        # or socket, *assuming* we have devices and care about that, as
+        # determined by policy
         before_count = after_count
         pools = self._filter_pools_for_numa_cells(pools, request, numa_cells)
         after_count = sum([pool['count'] for pool in pools])
@@ -23,6 +23,7 @@ import mock
 from oslo_config import cfg
 from oslo_log import log as logging
 from oslo_serialization import jsonutils
+from oslo_utils import units

 import nova
 from nova import context
@@ -1027,3 +1028,72 @@ class PCIServersWithSRIOVAffinityPoliciesTest(_PCIServersTestBase):
                 group='pci')

         self._test_policy(pci_numa_node, status, 'required')

+    def test_socket_policy_pass(self):
+        # With 1 socket containing 2 NUMA nodes, make the first node's CPUs
+        # available for pinning, but affine the PCI device to the second
+        # node. This should pass.
+        host_info = fakelibvirt.HostInfo(
+            cpu_nodes=2, cpu_sockets=1, cpu_cores=2, cpu_threads=2,
+            kB_mem=(16 * units.Gi) // units.Ki)
+        self.flags(cpu_dedicated_set='0-3', group='compute')
+        pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=1, numa_node=1)
+
+        self.start_compute(host_info=host_info, pci_info=pci_info)
+
+        extra_spec = {
+            'hw:cpu_policy': 'dedicated',
+            'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME,
+            'hw:pci_numa_affinity_policy': 'socket'
+        }
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        self._create_server(flavor_id=flavor_id)
+        self.assertTrue(self.mock_filter.called)
+
+    def test_socket_policy_fail(self):
+        # With 2 sockets containing 1 NUMA node each, make the first socket's
+        # CPUs available for pinning, but affine the PCI device to the second
+        # NUMA node in the second socket. This should fail.
+        host_info = fakelibvirt.HostInfo(
+            cpu_nodes=1, cpu_sockets=2, cpu_cores=2, cpu_threads=2,
+            kB_mem=(16 * units.Gi) // units.Ki)
+        self.flags(cpu_dedicated_set='0-3', group='compute')
+        pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=1, numa_node=1)
+        self.start_compute(host_info=host_info, pci_info=pci_info)
+
+        extra_spec = {
+            'hw:cpu_policy': 'dedicated',
+            'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME,
+            'hw:pci_numa_affinity_policy': 'socket'
+        }
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        server = self._create_server(
+            flavor_id=flavor_id, expected_state='ERROR')
+        self.assertIn('fault', server)
+        self.assertIn('No valid host', server['fault']['message'])
+
+    def test_socket_policy_multi_numa_pass(self):
+        # 2 sockets, 2 NUMA nodes each, with the PCI device on NUMA 0 and
+        # socket 0. If we restrict cpu_dedicated_set to NUMA 1, 2 and 3, we
+        # should still be able to boot an instance with hw:numa_nodes=3 and
+        # the `socket` policy, because one of the instance's NUMA nodes will
+        # be on the same socket as the PCI device (even if there is no direct
+        # NUMA node affinity).
+        host_info = fakelibvirt.HostInfo(
+            cpu_nodes=2, cpu_sockets=2, cpu_cores=2, cpu_threads=1,
+            kB_mem=(16 * units.Gi) // units.Ki)
+        self.flags(cpu_dedicated_set='2-7', group='compute')
+        pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=1, numa_node=0)
+
+        self.start_compute(host_info=host_info, pci_info=pci_info)
+
+        extra_spec = {
+            'hw:numa_nodes': '3',
+            'hw:cpu_policy': 'dedicated',
+            'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME,
+            'hw:pci_numa_affinity_policy': 'socket'
+        }
+        flavor_id = self._create_flavor(vcpu=6, memory_mb=3144,
+                                        extra_spec=extra_spec)
+        self._create_server(flavor_id=flavor_id)
+        self.assertTrue(self.mock_filter.called)
@@ -97,7 +97,16 @@ class PciDeviceStatsTestCase(test.NoDBTestCase):

     def setUp(self):
         super(PciDeviceStatsTestCase, self).setUp()
-        self.pci_stats = stats.PciDeviceStats(objects.NUMATopology())
+        self._setup_pci_stats()
+
+    def _setup_pci_stats(self, numa_topology=None):
+        """Exists for tests that need to set up pci_stats with a specific
+        NUMA topology, while still allowing tests that don't care to get the
+        default "empty" one.
+        """
+        if not numa_topology:
+            numa_topology = objects.NUMATopology()
+        self.pci_stats = stats.PciDeviceStats(numa_topology)
         # The following two calls need to be made before adding the devices.
         patcher = fakes.fake_pci_whitelist()
         self.addCleanup(patcher.stop)
@@ -229,6 +238,25 @@ class PciDeviceStatsTestCase(test.NoDBTestCase):

         self.assertFalse(self.pci_stats.support_requests(pci_requests, cells))

+    def test_filter_pools_for_socket_affinity_no_socket(self):
+        self._setup_pci_stats(
+            objects.NUMATopology(
+                cells=[objects.NUMACell(socket=None)]))
+        self.assertEqual(
+            [],
+            self.pci_stats._filter_pools_for_socket_affinity(
+                self.pci_stats.pools, [objects.InstanceNUMACell()]))
+
+    def test_filter_pools_for_socket_affinity(self):
+        self._setup_pci_stats(
+            objects.NUMATopology(
+                cells=[objects.NUMACell(id=1, socket=1)]))
+        pools = self.pci_stats._filter_pools_for_socket_affinity(
+            self.pci_stats.pools, [objects.InstanceNUMACell(id=1)])
+        self.assertEqual(1, len(pools))
+        self.assertEqual('p2', pools[0]['product_id'])
+        self.assertEqual('v2', pools[0]['vendor_id'])
+
     def test_consume_requests(self):
         devs = self.pci_stats.consume_requests(pci_requests)
         self.assertEqual(2, len(devs))
@@ -1210,6 +1210,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
             'COMPUTE_NET_VIF_MODEL_VIRTIO': True,
             'COMPUTE_SECURITY_TPM_1_2': False,
             'COMPUTE_SECURITY_TPM_2_0': False,
+            'COMPUTE_SOCKET_PCI_NUMA_AFFINITY': True,
         }

         static_traits = drvr.static_traits
@@ -1255,6 +1256,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
             'COMPUTE_NET_VIF_MODEL_VIRTIO': True,
             'COMPUTE_SECURITY_TPM_1_2': False,
             'COMPUTE_SECURITY_TPM_2_0': False,
+            'COMPUTE_SOCKET_PCI_NUMA_AFFINITY': True,
         }

         static_traits = drvr.static_traits
@@ -8105,6 +8105,7 @@ class LibvirtDriver(driver.ComputeDriver):
         traits.update(self._get_video_model_traits())
         traits.update(self._get_vif_model_traits())
         traits.update(self._get_tpm_traits())
+        traits.update({ot.COMPUTE_SOCKET_PCI_NUMA_AFFINITY: True})

         _, invalid_traits = ot.check_traits(traits)
         for invalid_trait in invalid_traits:
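This trait is what lets instances with the ``socket`` policy be steered toward updated hosts, per the commit message. Conceptually it reduces to a membership check on the host's reported trait set; a toy predicate (illustrative only, not nova's scheduler code)::

    # Hosts report COMPUTE_SOCKET_PCI_NUMA_AFFINITY as a static trait;
    # a 'socket'-policy request can only land on hosts that have it.
    def host_supports_socket_policy(host_traits):
        return 'COMPUTE_SOCKET_PCI_NUMA_AFFINITY' in host_traits

    assert host_supports_socket_policy({'COMPUTE_SOCKET_PCI_NUMA_AFFINITY'})
    assert not host_supports_socket_policy({'COMPUTE_SECURITY_TPM_2_0'})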
@@ -0,0 +1,11 @@
+---
+features:
+  - |
+    A new PCI NUMA affinity policy is available. The
+    ``hw:pci_numa_affinity_policy`` flavor extra spec and
+    ``hw_pci_numa_affinity_policy`` image metadata property now accept a
+    ``socket`` policy value. This value indicates that the PCI device must be
+    affined to the same host socket as at least one of the guest NUMA nodes.
+    For more information, see the `PCI Passthrough`__ guide.
+
+    .. __: https://docs.openstack.org/nova/latest/admin/pci-passthrough.html