diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index ea5c2982b569..7b94a44412ca 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -3038,6 +3038,9 @@ class NodeCacheTestCase(test.NoDBTestCase): mock_instances.return_value = instances mock_nodes.return_value = nodes mock_hosts.side_effect = hosts + parent_mock = mock.MagicMock() + parent_mock.attach_mock(mock_nodes, 'get_node_list') + parent_mock.attach_mock(mock_instances, 'get_uuids_by_host') if not can_send_146: mock_can_send.side_effect = ( exception.IronicAPIVersionNotAvailable(version='1.46')) @@ -3050,6 +3053,15 @@ class NodeCacheTestCase(test.NoDBTestCase): self.driver._refresh_cache() + # assert if get_node_list() is called before get_uuids_by_host() + parent_mock.assert_has_calls( + [ + mock.call.get_node_list(fields=ironic_driver._NODE_FIELDS, + **kwargs), + mock.call.get_uuids_by_host(mock.ANY, self.host) + ] + ) + mock_hash_ring.assert_called_once_with(mock.ANY) mock_instances.assert_called_once_with(mock.ANY, self.host) mock_nodes.assert_called_once_with(fields=ironic_driver._NODE_FIELDS, diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index 6c03d03b7b7a..aabc05776ed4 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -733,10 +733,15 @@ class IronicDriver(virt_driver.ComputeDriver): def _refresh_cache(self): ctxt = nova_context.get_admin_context() self._refresh_hash_ring(ctxt) - instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host) node_cache = {} def _get_node_list(**kwargs): + # NOTE(TheJulia): This call can take a substantial amount + # of time as it may be attempting to retrieve thousands of + # baremetal nodes. Depending on the version of Ironic, + # this can be as long as 2-10 seconds per every thousand + # nodes, and this call may retrieve all nodes in a deployment, + # depending on if any filter paramters are applied. return self._get_node_list(fields=_NODE_FIELDS, **kwargs) # NOTE(jroll) if partition_key is set, we need to limit nodes that @@ -760,6 +765,15 @@ class IronicDriver(virt_driver.ComputeDriver): else: nodes = _get_node_list() + # NOTE(saga): As _get_node_list() will take a long + # time to return in large clusters we need to call it before + # get_uuids_by_host() method. Otherwise the instances list we get from + # get_uuids_by_host() method will become stale. + # A stale instances list can cause a node that is managed by this + # compute host to be excluded in error and cause the compute node + # to be orphaned and associated resource provider to be deleted. + instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host) + for node in nodes: # NOTE(jroll): we always manage the nodes for instances we manage if node.instance_uuid in instances: diff --git a/releasenotes/notes/minimize-bug-1841481-race-window-f76912d4985770ad.yaml b/releasenotes/notes/minimize-bug-1841481-race-window-f76912d4985770ad.yaml new file mode 100644 index 000000000000..a49d64c6ac58 --- /dev/null +++ b/releasenotes/notes/minimize-bug-1841481-race-window-f76912d4985770ad.yaml @@ -0,0 +1,13 @@ +--- +fixes: + - | + Minimizes a race condition window when using the ``ironic`` virt driver + where the data generated for the Resource Tracker may attempt to compare + potentially stale instance information with the latest known baremetal + node information. While this doesn't completely prevent nor resolve the + underlying race condition identified in + `bug 1841481 `_, + this change allows Nova to have the latest state information, as opposed + to state information which may be out of date due to the time which it may + take to retrieve the status from Ironic. This issue was most observable + on baremetal clusters with several thousand physical nodes.