Avoid race condition by using persist_uuid

When creating a network and adding a port, the API requests will
often be handled by different workers. Since each worker maintains
its own internal copy of the OVN DBs, it is possible that the
request for creating the port arrives at a worker that has not yet
received notification that the Logical Switch for the network has
been created. When the code for adding the Logical Switch Port is
called, it tries to look up the LS, and fails.

Currently, a retry-and-wait solution is in place for the LS/LSP
issue in create_port(), but a new feature introduced in OVS 3.1
allows specifying the UUID of the LS, so that we can use a value
known to the other worker, and add the LSP to the LS by that UUID
even if the LS isn't in our local DB cache. This allows the
ovsdb-server to be the source of truth on the existance of the LS.

There are lots of other interactions that could result in this
kind of race condition, any create/update of objects that happen
rapidly on systems under high load for example. This method of
creating and referencing objects should be generalizable.

In this patch, the UUID for the LS is defined as the UUID for
the neutron network. The name of the LS continues to be
neutron-$UUID to match existing usage and to keep lookups by
name working.

Change-Id: Icc27c2b8825d7f96c9dac87dec8bbb55d493d942
This commit is contained in:
Terry Wilson 2024-06-11 11:37:11 -05:00
parent 5c22bcca01
commit 71ff8ee881
8 changed files with 147 additions and 27 deletions

View File

@ -69,6 +69,8 @@ HAChassisGroupInfo = collections.namedtuple(
'HAChassisGroupInfo', ['group_name', 'chassis_list', 'az_hints',
'ignore_chassis', 'external_ids'])
_OVS_PERSIST_UUID = _SENTINEL = object()
class OvsdbClientCommand:
_CONNECTION = 0
@ -1379,3 +1381,17 @@ def validate_port_forwarding_configuration():
if any(net_type in provider_network_types
for net_type in cfg.CONF.ml2.tenant_network_types):
raise ovn_exc.InvalidPortForwardingConfiguration()
def ovs_persist_uuid_supported(nb_idl):
# OVS 3.1+ contain the persist_uuid feature that allows choosing the UUID
# that will be stored in the DB. It was broken prior to 3.1.5/3.2.3/3.3.1
# so this will return True only for the fixed version. As actually testing
# the fix requires committing a transaction, an implementation detail is
# tested. This can be removed once a fixed version is required.
global _OVS_PERSIST_UUID
if _OVS_PERSIST_UUID is _SENTINEL:
_OVS_PERSIST_UUID = isinstance(
next(iter(nb_idl.tables["NB_Global"].rows.data.values())), list)
LOG.debug(f"OVS persist_uuid supported={_OVS_PERSIST_UUID}")
return _OVS_PERSIST_UUID

View File

@ -69,6 +69,7 @@ from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import impl_idl_ovn
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import maintenance
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_client
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovs_fixes
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import worker
from neutron import service
from neutron.services.logapi.drivers.ovn import driver as log_driver
@ -412,6 +413,9 @@ class OVNMechanismDriver(api.MechanismDriver):
self._post_fork_event.clear()
self._ovn_client_inst = None
# Patch python-ovs for fixes not yet released
ovs_fixes.apply_ovs_fixes()
if worker_class == wsgi.WorkerService:
self._setup_hash_ring()

View File

@ -13,10 +13,13 @@
# under the License.
import abc
import uuid
from oslo_utils import timeutils
from ovs.db import idl as ovs_idl
from ovsdbapp.backend.ovs_idl import command
from ovsdbapp.backend.ovs_idl import idlutils
from ovsdbapp.backend.ovs_idl import rowview
from ovsdbapp.schema.ovn_northbound import commands as ovn_nb_commands
from ovsdbapp import utils as ovsdbapp_utils
@ -110,19 +113,62 @@ class CheckLivenessCommand(command.BaseCommand):
self.result = self.api.nb_global.nb_cfg
class AddLSwitchPortCommand(command.BaseCommand):
def __init__(self, api, lport, lswitch, may_exist, **columns):
class AddNetworkCommand(command.AddCommand):
table_name = 'Logical_Switch'
def __init__(self, api, network_id, may_exist=False, **columns):
super().__init__(api)
self.lport = lport
self.lswitch = lswitch
self.network_uuid = uuid.UUID(str(network_id))
self.may_exist = may_exist
self.columns = columns
def run_idl(self, txn):
table = self.api.tables[self.table_name]
try:
ls = table.rows[self.network_uuid]
if self.may_exist:
self.result = rowview.RowView(ls)
return
msg = _("Switch %s already exists") % self.network_uuid
raise RuntimeError(msg)
except KeyError:
# Adding a new LS
if utils.ovs_persist_uuid_supported(txn.idl):
ls = txn.insert(table, new_uuid=self.network_uuid,
persist_uuid=True)
else:
ls = txn.insert(table)
self.set_columns(ls, **self.columns)
ls.name = utils.ovn_name(self.network_uuid)
self.result = ls.uuid
class AddLSwitchPortCommand(command.BaseCommand):
def __init__(self, api, lport, lswitch, may_exist, network_id=None,
**columns):
super().__init__(api)
self.lport = lport
self.lswitch = lswitch
self.may_exist = may_exist
self.network_uuid = uuid.UUID(str(network_id)) if network_id else None
self.columns = columns
def run_idl(self, txn):
try:
# We must look in the local cache first, because the LS may have
# been created as part of the current transaction. or in the case
# of adding an LSP to a LS that was created before persist_uuid
lswitch = idlutils.row_by_value(self.api.idl, 'Logical_Switch',
'name', self.lswitch)
except idlutils.RowNotFound:
if self.network_uuid and utils.ovs_persist_uuid_supported(txn.idl):
# Create a "fake" row with the right UUID so python-ovs creates
# a transaction referencing the Row, even though we might not
# have received the update for the row ourselves.
lswitch = ovs_idl.Row(self.api.idl,
self.api.tables['Logical_Switch'],
uuid=self.network_uuid, data={})
else:
msg = _("Logical Switch %s does not exist") % self.lswitch
raise RuntimeError(msg)
if self.may_exist:
@ -210,8 +256,8 @@ class SetLSwitchPortCommand(command.BaseCommand):
else:
new_port_dhcp_opts.add(dhcpv6_options.result)
port.dhcpv6_options = [dhcpv6_options.result]
for uuid in cur_port_dhcp_opts - new_port_dhcp_opts:
self.api._tables['DHCP_Options'].rows[uuid].delete()
for uuid_ in cur_port_dhcp_opts - new_port_dhcp_opts:
self.api._tables['DHCP_Options'].rows[uuid_].delete()
external_ids_update = self.external_ids_update or {}
external_ids = getattr(port, 'external_ids', {})
@ -293,8 +339,8 @@ class DelLSwitchPortCommand(command.BaseCommand):
# Delete DHCP_Options records no longer referred by this port.
cur_port_dhcp_opts = get_lsp_dhcp_options_uuids(
lport, self.lport)
for uuid in cur_port_dhcp_opts:
self.api._tables['DHCP_Options'].rows[uuid].delete()
for uuid_ in cur_port_dhcp_opts:
self.api._tables['DHCP_Options'].rows[uuid_].delete()
_delvalue_from_list(lswitch, 'ports', lport)
self.api._tables['Logical_Switch_Port'].rows[lport.uuid].delete()

View File

@ -275,10 +275,17 @@ class OvsdbNbOvnIdl(nb_impl_idl.OvnNbApiIdlImpl, Backend):
if revision_mismatch_raise:
raise e
def ls_add(self, switch=None, may_exist=False, network_id=None, **columns):
if network_id is None:
return super().ls_add(switch, may_exist, **columns)
return cmd.AddNetworkCommand(self, network_id, may_exist=may_exist,
**columns)
def create_lswitch_port(self, lport_name, lswitch_name, may_exist=True,
**columns):
network_id=None, **columns):
return cmd.AddLSwitchPortCommand(self, lport_name, lswitch_name,
may_exist, **columns)
may_exist, network_id=network_id,
**columns)
def set_lswitch_port(self, lport_name, external_ids_update=None,
if_exists=True, **columns):

View File

@ -587,7 +587,9 @@ class OVNClient:
# controller does not yet see that network in its local cache of the
# OVN northbound database. Check if the logical switch is present
# or not in the idl's local copy of the database before creating
# the lswitch port.
# the lswitch port. Once we require an ovs version with working
# persist_uuid support, this can be removed.
if not utils.ovs_persist_uuid_supported(self._nb_idl):
self._nb_idl.check_for_row_by_value_and_retry(
'Logical_Switch', 'name', lswitch_name)
@ -601,6 +603,7 @@ class OVNClient:
kwargs = {
'lport_name': port['id'],
'lswitch_name': lswitch_name,
'network_id': port['network_id'],
'addresses': port_info.addresses,
'external_ids': external_ids,
'parent_name': port_info.parent_name,
@ -2069,6 +2072,7 @@ class OVNClient:
cmd = self._nb_idl.create_lswitch_port(
lport_name=utils.ovn_provnet_port_name(segment['id']),
lswitch_name=utils.ovn_name(network_id),
network_id=network_id,
addresses=[ovn_const.UNKNOWN_ADDR],
external_ids={},
type=ovn_const.LSP_TYPE_LOCALNET,
@ -2131,14 +2135,13 @@ class OVNClient:
# UUID. This provides an easy way to refer to the logical switch
# without having to track what UUID OVN assigned to it.
lswitch_params = self._gen_network_parameters(network)
lswitch_name = utils.ovn_name(network['id'])
# NOTE(mjozefcz): Remove this workaround when bug
# 1869877 will be fixed.
segments = segments_db.get_network_segments(
context, network['id'])
with self._nb_idl.transaction(check_error=True) as txn:
txn.add(self._nb_idl.ls_add(lswitch_name, **lswitch_params,
may_exist=True))
txn.add(self._nb_idl.ls_add(network_id=network['id'],
**lswitch_params, may_exist=True))
for segment in segments:
if segment.get(segment_def.PHYSICAL_NETWORK):
self.create_provnet_port(network['id'], segment, txn=txn,

View File

@ -0,0 +1,37 @@
# Copyright (c) 2024
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from ovs.db import idl
import ovs.ovsuuid
# Temporarily fix ovs.db.idl.Transaction._substitute_uuids to support handling
# the persist_uuid feature
def _substitute_uuids(self, json):
if isinstance(json, (list, tuple)):
if (len(json) == 2 and
json[0] == 'uuid' and
ovs.ovsuuid.is_valid_string(json[1])):
uuid = ovs.ovsuuid.from_string(json[1])
row = self._txn_rows.get(uuid, None)
if row and row._data is None and not row._persist_uuid:
return ["named-uuid", idl._uuid_name_from_uuid(uuid)]
else:
return [self._substitute_uuids(elem) for elem in json]
return json
def apply_ovs_fixes():
idl.Transaction._substitute_uuids = _substitute_uuids

View File

@ -108,6 +108,8 @@ class MechDriverSetupBase(abc.ABC):
neutron_agent.AgentCache().get_agents.return_value = [agent1]
self.mock_vp_parents = mock.patch.object(
ovn_utils, 'get_virtual_port_parents', return_value=None).start()
mock.patch.object(ovn_utils, 'ovs_persist_uuid_supported',
return_value=True).start()
def _add_chassis_private(self, nb_cfg, name=None):
chassis_private = mock.Mock()
@ -881,15 +883,19 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
self.mech_driver.update_network_precommit,
fake_network_context)
def _verify_ls_add(self, net_id, may_exist=True, **kwargs):
ls_add = self.mech_driver._ovn_client._nb_idl.ls_add
ls_add.assert_called_once_with(
external_ids=mock.ANY,
may_exist=may_exist, network_id=net_id, **kwargs)
def _create_network_igmp_snoop(self, enabled):
cfg.CONF.set_override('igmp_snooping_enable', enabled, group='OVS')
nb_idl = self.mech_driver._ovn_client._nb_idl
net = self._make_network(self.fmt, name='net1',
admin_state_up=True)['network']
value = 'true' if enabled else 'false'
nb_idl.ls_add.assert_called_once_with(
ovn_utils.ovn_name(net['id']), external_ids=mock.ANY,
may_exist=True,
self._verify_ls_add(
net_id=net['id'],
other_config={ovn_const.MCAST_SNOOP: value,
ovn_const.MCAST_FLOOD_UNREGISTERED: 'false',
ovn_const.VLAN_PASSTHRU: 'false'})
@ -901,7 +907,6 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
self._create_network_igmp_snoop(enabled=False)
def _create_network_vlan_passthru(self, vlan_transparent, qinq):
nb_idl = self.mech_driver._ovn_client._nb_idl
net = self._make_network(
self.fmt, name='net1',
as_admin=True,
@ -917,9 +922,8 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
'provider:physical_network': 'physnet1'})['network']
value = 'true' if vlan_transparent or qinq else 'false'
expected_fdb_age_treshold = ovn_conf.get_fdb_age_threshold()
nb_idl.ls_add.assert_called_once_with(
ovn_utils.ovn_name(net['id']), external_ids=mock.ANY,
may_exist=True,
self._verify_ls_add(
net_id=net['id'],
other_config={
ovn_const.MCAST_SNOOP: 'false',
ovn_const.MCAST_FLOOD_UNREGISTERED: 'false',
@ -958,6 +962,7 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
self.context, net['id'])
nb_idl.create_lswitch_port.assert_called_once_with(
addresses=[ovn_const.UNKNOWN_ADDR],
network_id=net['id'],
external_ids={},
lport_name=ovn_utils.ovn_provnet_port_name(segments[0]['id']),
lswitch_name=ovn_utils.ovn_name(net['id']),
@ -3438,6 +3443,7 @@ class TestOVNMechanismDriverSegment(MechDriverSetupBase,
segmentation_id=200, network_type='vlan')['segment']
ovn_nb_api.create_lswitch_port.assert_called_once_with(
addresses=[ovn_const.UNKNOWN_ADDR],
network_id=net['id'],
external_ids={},
lport_name=ovn_utils.ovn_provnet_port_name(new_segment['id']),
lswitch_name=ovn_utils.ovn_name(net['id']),
@ -3456,6 +3462,7 @@ class TestOVNMechanismDriverSegment(MechDriverSetupBase,
segmentation_id=300, network_type='vlan')['segment']
ovn_nb_api.create_lswitch_port.assert_called_once_with(
addresses=[ovn_const.UNKNOWN_ADDR],
network_id=net['id'],
external_ids={},
lport_name=ovn_utils.ovn_provnet_port_name(new_segment['id']),
lswitch_name=ovn_utils.ovn_name(net['id']),

View File

@ -44,7 +44,7 @@ oslo.versionedobjects>=1.35.1 # Apache-2.0
osprofiler>=2.3.0 # Apache-2.0
os-ken>=3.0.0 # Apache-2.0
os-resource-classes>=1.1.0 # Apache-2.0
ovs>=2.12.0 # Apache-2.0
ovs>3.3.0 # Apache-2.0
ovsdbapp>=2.11.0 # Apache-2.0
psutil>=6.1.0 # BSD
pyroute2>=0.7.3;sys_platform!='win32' # Apache-2.0 (+ dual licensed GPL2)