pci: implement the 'socket' NUMA affinity policy
This patch enables the 'socket' PCI NUMA affinity policy. The PCI
manager gets a new method to implement it, and the libvirt driver
starts reporting the necessary trait, enabling it to receive instances
with the 'socket' policy.

Implements: blueprint pci-socket-affinity
Change-Id: Ia875c9c3542ef4138d0d7a2c26c0cf49dcca0761
parent 890b6d54a6
commit b2471dd578
@@ -495,7 +495,7 @@ PCI NUMA Affinity Policy
 PCI passthrough devices and neutron SR-IOV interfaces via the
 ``hw:pci_numa_affinity_policy`` flavor extra spec or
 ``hw_pci_numa_affinity_policy`` image property. The allowed values are
-``required``, ``preferred`` or ``legacy`` (default).
+``required``, ``socket``, ``preferred`` or ``legacy`` (default).

 **required**
     This value will mean that nova will boot instances with PCI devices
@@ -504,6 +504,25 @@ PCI NUMA Affinity Policy
     devices could not be determined, those PCI devices wouldn't be consumable
     by the instance. This provides maximum performance.

+**socket**
+    This means that the PCI device must be affined to the same host socket as
+    at least one of the guest NUMA nodes. For example, consider a system with
+    two sockets, each with two NUMA nodes, numbered node 0 and node 1 on
+    socket 0, and node 2 and node 3 on socket 1. There is a PCI device
+    affined to node 0. An instance with two guest NUMA nodes and the
+    ``socket`` policy can be affined to either:
+
+    * node 0 and node 1
+    * node 0 and node 2
+    * node 0 and node 3
+    * node 1 and node 2
+    * node 1 and node 3
+
+    The instance cannot be affined to node 2 and node 3, as neither of those
+    are on the same socket as the PCI device. If the other nodes are consumed
+    by other instances and only nodes 2 and 3 are available, the instance
+    will not boot.
+
 **preferred**
     This value will mean that ``nova-scheduler`` will choose a compute host
     with minimal consideration for the NUMA affinity of PCI devices.
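As a quick sanity check of the placement list above, here is a small standalone Python sketch (not part of this change; the node-to-socket map is hard-coded from the example topology) that enumerates the valid two-node placements::

    import itertools

    # Topology from the example: two sockets, two NUMA nodes each,
    # and a PCI device affined to node 0 (hence socket 0).
    node_to_socket = {0: 0, 1: 0, 2: 1, 3: 1}
    device_socket = node_to_socket[0]

    # Under the 'socket' policy, a placement is acceptable when at
    # least one guest NUMA node shares the device's socket.
    for pair in itertools.combinations(sorted(node_to_socket), 2):
        if any(node_to_socket[n] == device_socket for n in pair):
            print('valid placement:', pair)
    # Prints every pair except (2, 3), matching the list in the doc.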
@@ -298,6 +298,15 @@ class PciDeviceStats(object):
                 pool['count'] for pool in filtered_pools) >= requested_count:
             return filtered_pools

+        # the SOCKET policy is a bit of a special case. It's less strict than
+        # REQUIRED (so REQUIRED will automatically fulfil SOCKET, at least
+        # with our assumption of never having multiple sockets per NUMA node),
+        # but not always more strict than LEGACY: a PCI device with no NUMA
+        # affinity will fulfil LEGACY but not SOCKET. If we have SOCKET,
+        # process it here and don't continue.
+        if requested_policy == fields.PCINUMAAffinityPolicy.SOCKET:
+            return self._filter_pools_for_socket_affinity(pools, numa_cells)
+
         # some systems don't report NUMA node info for PCI devices, in which
         # case None is reported in 'pci_device.numa_node'. The LEGACY policy
         # allows us to use these devices so we include None in the list of
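To illustrate the ordering described in the comment above, here is a toy, self-contained comparison of the three policies (the function names and plain data types are illustrative only, not nova's API)::

    def satisfies_required(dev_node, guest_nodes):
        # REQUIRED: the device must share a NUMA node with the guest.
        return dev_node in guest_nodes

    def satisfies_legacy(dev_node, guest_nodes):
        # LEGACY: like REQUIRED, but also accepts devices that report
        # no NUMA affinity at all (dev_node is None).
        return dev_node is None or dev_node in guest_nodes

    def satisfies_socket(dev_node, guest_nodes, node_to_socket):
        # SOCKET: needs a real NUMA node so its socket can be looked up.
        if dev_node is None:
            return False
        guest_sockets = {node_to_socket[n] for n in guest_nodes}
        return node_to_socket[dev_node] in guest_sockets

    # REQUIRED implies SOCKET (one socket per NUMA node assumed)...
    assert satisfies_socket(0, {0}, {0: 0, 1: 1})
    # ...but a device with no NUMA info passes LEGACY, not SOCKET.
    assert satisfies_legacy(None, {0})
    assert not satisfies_socket(None, {0}, {0: 0})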
@@ -323,6 +332,39 @@ class PciDeviceStats(object):
         return sorted(
             pools, key=lambda pool: pool.get('numa_node') not in numa_cell_ids)

+    def _filter_pools_for_socket_affinity(self, pools, numa_cells):
+        host_cells = self.numa_topology.cells
+        # bail early if we don't have socket information for all host_cells.
+        # This could happen if we're running on a weird older system with
+        # multiple sockets per NUMA node, which is a configuration that we
+        # explicitly chose not to support.
+        if any(cell.socket is None for cell in host_cells):
+            LOG.debug('No socket information in host NUMA cell(s).')
+            return []
+
+        # get a set of host sockets that the guest cells are in. Since guest
+        # cell IDs map to host cell IDs, we can just look up the latter's
+        # socket.
+        socket_ids = set()
+        for guest_cell in numa_cells:
+            for host_cell in host_cells:
+                if guest_cell.id == host_cell.id:
+                    socket_ids.add(host_cell.socket)
+
+        # now get a set of host NUMA nodes that are in the above sockets
+        allowed_numa_nodes = set()
+        for host_cell in host_cells:
+            if host_cell.socket in socket_ids:
+                allowed_numa_nodes.add(host_cell.id)
+
+        # filter out pools that are not in one of the correct host NUMA nodes.
+        return [
+            pool for pool in pools if any(
+                utils.pci_device_prop_match(pool, [{'numa_node': numa_node}])
+                for numa_node in allowed_numa_nodes
+            )
+        ]
+
     def _filter_pools_for_unrequested_pfs(self, pools, request):
         """Filter out pools with PFs, unless these are required.
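For readers skimming the diff, a condensed standalone version of the same two-step set construction, with plain tuples and dicts standing in for nova's objects (all names here are illustrative)::

    # Toy stand-ins: host cells as (id, socket) pairs, pools as dicts.
    host_cells = [(0, 0), (1, 0), (2, 1), (3, 1)]
    guest_cell_ids = {0}
    pools = [{'numa_node': 1, 'count': 1}, {'numa_node': 3, 'count': 1}]

    # Step 1: the set of host sockets the guest's cells sit on.
    socket_ids = {socket for cell_id, socket in host_cells
                  if cell_id in guest_cell_ids}
    # Step 2: every host NUMA node on one of those sockets.
    allowed = {cell_id for cell_id, socket in host_cells
               if socket in socket_ids}
    # Keep only pools whose device NUMA node is in the allowed set.
    print([p for p in pools if p['numa_node'] in allowed])
    # [{'numa_node': 1, 'count': 1}] -- node 3 is on the other socket.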
@@ -383,8 +425,8 @@ class PciDeviceStats(object):
             return None

         # Next, let's exclude all devices that aren't on the correct NUMA node
-        # *assuming* we have devices and care about that, as determined by
-        # policy
+        # or socket, *assuming* we have devices and care about that, as
+        # determined by policy
         before_count = after_count
         pools = self._filter_pools_for_numa_cells(pools, request, numa_cells)
         after_count = sum([pool['count'] for pool in pools])
@@ -23,6 +23,7 @@ import mock
 from oslo_config import cfg
 from oslo_log import log as logging
 from oslo_serialization import jsonutils
+from oslo_utils import units

 import nova
 from nova import context
@@ -1027,3 +1028,72 @@ class PCIServersWithSRIOVAffinityPoliciesTest(_PCIServersTestBase):
                 group='pci')

         self._test_policy(pci_numa_node, status, 'required')

+    def test_socket_policy_pass(self):
+        # With 1 socket containing 2 NUMA nodes, make the first node's CPUs
+        # available for pinning, but affine the PCI device to the second
+        # node. This should pass.
+        host_info = fakelibvirt.HostInfo(
+            cpu_nodes=2, cpu_sockets=1, cpu_cores=2, cpu_threads=2,
+            kB_mem=(16 * units.Gi) // units.Ki)
+        self.flags(cpu_dedicated_set='0-3', group='compute')
+        pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=1, numa_node=1)
+
+        self.start_compute(host_info=host_info, pci_info=pci_info)
+
+        extra_spec = {
+            'hw:cpu_policy': 'dedicated',
+            'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME,
+            'hw:pci_numa_affinity_policy': 'socket'
+        }
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        self._create_server(flavor_id=flavor_id)
+        self.assertTrue(self.mock_filter.called)
+
+    def test_socket_policy_fail(self):
+        # With 2 sockets containing 1 NUMA node each, make the first socket's
+        # CPUs available for pinning, but affine the PCI device to the second
+        # NUMA node in the second socket. This should fail.
+        host_info = fakelibvirt.HostInfo(
+            cpu_nodes=1, cpu_sockets=2, cpu_cores=2, cpu_threads=2,
+            kB_mem=(16 * units.Gi) // units.Ki)
+        self.flags(cpu_dedicated_set='0-3', group='compute')
+        pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=1, numa_node=1)
+        self.start_compute(host_info=host_info, pci_info=pci_info)
+
+        extra_spec = {
+            'hw:cpu_policy': 'dedicated',
+            'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME,
+            'hw:pci_numa_affinity_policy': 'socket'
+        }
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        server = self._create_server(
+            flavor_id=flavor_id, expected_state='ERROR')
+        self.assertIn('fault', server)
+        self.assertIn('No valid host', server['fault']['message'])
+
+    def test_socket_policy_multi_numa_pass(self):
+        # 2 sockets, 2 NUMA nodes each, with the PCI device on NUMA 0 and
+        # socket 0. If we restrict cpu_dedicated_set to NUMA 1, 2 and 3, we
+        # should still be able to boot an instance with hw:numa_nodes=3 and
+        # the `socket` policy, because one of the instance's NUMA nodes will
+        # be on the same socket as the PCI device (even if there is no direct
+        # NUMA node affinity).
+        host_info = fakelibvirt.HostInfo(
+            cpu_nodes=2, cpu_sockets=2, cpu_cores=2, cpu_threads=1,
+            kB_mem=(16 * units.Gi) // units.Ki)
+        self.flags(cpu_dedicated_set='2-7', group='compute')
+        pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=1, numa_node=0)
+
+        self.start_compute(host_info=host_info, pci_info=pci_info)
+
+        extra_spec = {
+            'hw:numa_nodes': '3',
+            'hw:cpu_policy': 'dedicated',
+            'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME,
+            'hw:pci_numa_affinity_policy': 'socket'
+        }
+        flavor_id = self._create_flavor(vcpu=6, memory_mb=3144,
+                                        extra_spec=extra_spec)
+        self._create_server(flavor_id=flavor_id)
+        self.assertTrue(self.mock_filter.called)
@@ -97,7 +97,16 @@ class PciDeviceStatsTestCase(test.NoDBTestCase):

     def setUp(self):
         super(PciDeviceStatsTestCase, self).setUp()
-        self.pci_stats = stats.PciDeviceStats(objects.NUMATopology())
+        self._setup_pci_stats()
+
+    def _setup_pci_stats(self, numa_topology=None):
+        """Exists for tests that need to set up pci_stats with a specific
+        NUMA topology, while still allowing tests that don't care to get the
+        default "empty" one.
+        """
+        if not numa_topology:
+            numa_topology = objects.NUMATopology()
+        self.pci_stats = stats.PciDeviceStats(numa_topology)
         # The following two calls need to be made before adding the devices.
         patcher = fakes.fake_pci_whitelist()
         self.addCleanup(patcher.stop)
@@ -229,6 +238,25 @@ class PciDeviceStatsTestCase(test.NoDBTestCase):

         self.assertFalse(self.pci_stats.support_requests(pci_requests, cells))

+    def test_filter_pools_for_socket_affinity_no_socket(self):
+        self._setup_pci_stats(
+            objects.NUMATopology(
+                cells=[objects.NUMACell(socket=None)]))
+        self.assertEqual(
+            [],
+            self.pci_stats._filter_pools_for_socket_affinity(
+                self.pci_stats.pools, [objects.InstanceNUMACell()]))
+
+    def test_filter_pools_for_socket_affinity(self):
+        self._setup_pci_stats(
+            objects.NUMATopology(
+                cells=[objects.NUMACell(id=1, socket=1)]))
+        pools = self.pci_stats._filter_pools_for_socket_affinity(
+            self.pci_stats.pools, [objects.InstanceNUMACell(id=1)])
+        self.assertEqual(1, len(pools))
+        self.assertEqual('p2', pools[0]['product_id'])
+        self.assertEqual('v2', pools[0]['vendor_id'])
+
     def test_consume_requests(self):
         devs = self.pci_stats.consume_requests(pci_requests)
         self.assertEqual(2, len(devs))
@@ -1210,6 +1210,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
             'COMPUTE_NET_VIF_MODEL_VIRTIO': True,
             'COMPUTE_SECURITY_TPM_1_2': False,
             'COMPUTE_SECURITY_TPM_2_0': False,
+            'COMPUTE_SOCKET_PCI_NUMA_AFFINITY': True,
         }

         static_traits = drvr.static_traits
@@ -1255,6 +1256,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
             'COMPUTE_NET_VIF_MODEL_VIRTIO': True,
             'COMPUTE_SECURITY_TPM_1_2': False,
             'COMPUTE_SECURITY_TPM_2_0': False,
+            'COMPUTE_SOCKET_PCI_NUMA_AFFINITY': True,
         }

         static_traits = drvr.static_traits
@@ -8105,6 +8105,7 @@ class LibvirtDriver(driver.ComputeDriver):
         traits.update(self._get_video_model_traits())
         traits.update(self._get_vif_model_traits())
         traits.update(self._get_tpm_traits())
+        traits.update({ot.COMPUTE_SOCKET_PCI_NUMA_AFFINITY: True})

         _, invalid_traits = ot.check_traits(traits)
         for invalid_trait in invalid_traits:
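This trait is what lets instances with the ``socket`` policy be steered toward updated hosts, per the commit message. Conceptually it reduces to a membership check on the host's reported trait set; a toy predicate (illustrative only, not nova's scheduler code)::

    # Hosts report COMPUTE_SOCKET_PCI_NUMA_AFFINITY as a static trait;
    # a 'socket'-policy request can only land on hosts that have it.
    def host_supports_socket_policy(host_traits):
        return 'COMPUTE_SOCKET_PCI_NUMA_AFFINITY' in host_traits

    assert host_supports_socket_policy({'COMPUTE_SOCKET_PCI_NUMA_AFFINITY'})
    assert not host_supports_socket_policy({'COMPUTE_SECURITY_TPM_2_0'})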
@@ -0,0 +1,11 @@
+---
+features:
+  - |
+    A new PCI NUMA affinity policy is available. The
+    ``hw:pci_numa_affinity_policy`` flavor extra spec and
+    ``hw_pci_numa_affinity_policy`` image metadata property now accept a
+    ``socket`` policy value. This value indicates that the PCI device must be
+    affined to the same host socket as at least one of the guest NUMA nodes.
+    For more information, see the `PCI Passthrough`__ guide.
+
+    .. __: https://docs.openstack.org/nova/latest/admin/pci-passthrough.html