Avoid race condition by using persist_uuid
When creating a network and adding a port, the API requests will often be handled by different workers. Since each worker maintains its own internal copy of the OVN DBs, it is possible that the request for creating the port arrives at a worker that has not yet received notification that the Logical Switch for the network has been created. When the code for adding the Logical Switch Port is called, it tries to look up the LS, and fails. Currently, a retry-and-wait solution is in place for the LS/LSP issue in create_port(), but a new feature introduced in OVS 3.1 allows specifying the UUID of the LS, so that we can use a value known to the other worker, and add the LSP to the LS by that UUID even if the LS isn't in our local DB cache. This allows the ovsdb-server to be the source of truth on the existance of the LS. There are lots of other interactions that could result in this kind of race condition, any create/update of objects that happen rapidly on systems under high load for example. This method of creating and referencing objects should be generalizable. In this patch, the UUID for the LS is defined as the UUID for the neutron network. The name of the LS continues to be neutron-$UUID to match existing usage and to keep lookups by name working. Change-Id: Icc27c2b8825d7f96c9dac87dec8bbb55d493d942
This commit is contained in:
parent
5c22bcca01
commit
71ff8ee881
@ -69,6 +69,8 @@ HAChassisGroupInfo = collections.namedtuple(
|
||||
'HAChassisGroupInfo', ['group_name', 'chassis_list', 'az_hints',
|
||||
'ignore_chassis', 'external_ids'])
|
||||
|
||||
_OVS_PERSIST_UUID = _SENTINEL = object()
|
||||
|
||||
|
||||
class OvsdbClientCommand:
|
||||
_CONNECTION = 0
|
||||
@ -1379,3 +1381,17 @@ def validate_port_forwarding_configuration():
|
||||
if any(net_type in provider_network_types
|
||||
for net_type in cfg.CONF.ml2.tenant_network_types):
|
||||
raise ovn_exc.InvalidPortForwardingConfiguration()
|
||||
|
||||
|
||||
def ovs_persist_uuid_supported(nb_idl):
|
||||
# OVS 3.1+ contain the persist_uuid feature that allows choosing the UUID
|
||||
# that will be stored in the DB. It was broken prior to 3.1.5/3.2.3/3.3.1
|
||||
# so this will return True only for the fixed version. As actually testing
|
||||
# the fix requires committing a transaction, an implementation detail is
|
||||
# tested. This can be removed once a fixed version is required.
|
||||
global _OVS_PERSIST_UUID
|
||||
if _OVS_PERSIST_UUID is _SENTINEL:
|
||||
_OVS_PERSIST_UUID = isinstance(
|
||||
next(iter(nb_idl.tables["NB_Global"].rows.data.values())), list)
|
||||
LOG.debug(f"OVS persist_uuid supported={_OVS_PERSIST_UUID}")
|
||||
return _OVS_PERSIST_UUID
|
||||
|
@ -69,6 +69,7 @@ from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import impl_idl_ovn
|
||||
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import maintenance
|
||||
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_client
|
||||
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync
|
||||
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovs_fixes
|
||||
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import worker
|
||||
from neutron import service
|
||||
from neutron.services.logapi.drivers.ovn import driver as log_driver
|
||||
@ -412,6 +413,9 @@ class OVNMechanismDriver(api.MechanismDriver):
|
||||
self._post_fork_event.clear()
|
||||
self._ovn_client_inst = None
|
||||
|
||||
# Patch python-ovs for fixes not yet released
|
||||
ovs_fixes.apply_ovs_fixes()
|
||||
|
||||
if worker_class == wsgi.WorkerService:
|
||||
self._setup_hash_ring()
|
||||
|
||||
|
@ -13,10 +13,13 @@
|
||||
# under the License.
|
||||
|
||||
import abc
|
||||
import uuid
|
||||
|
||||
from oslo_utils import timeutils
|
||||
from ovs.db import idl as ovs_idl
|
||||
from ovsdbapp.backend.ovs_idl import command
|
||||
from ovsdbapp.backend.ovs_idl import idlutils
|
||||
from ovsdbapp.backend.ovs_idl import rowview
|
||||
from ovsdbapp.schema.ovn_northbound import commands as ovn_nb_commands
|
||||
from ovsdbapp import utils as ovsdbapp_utils
|
||||
|
||||
@ -110,21 +113,64 @@ class CheckLivenessCommand(command.BaseCommand):
|
||||
self.result = self.api.nb_global.nb_cfg
|
||||
|
||||
|
||||
class AddLSwitchPortCommand(command.BaseCommand):
|
||||
def __init__(self, api, lport, lswitch, may_exist, **columns):
|
||||
class AddNetworkCommand(command.AddCommand):
|
||||
table_name = 'Logical_Switch'
|
||||
|
||||
def __init__(self, api, network_id, may_exist=False, **columns):
|
||||
super().__init__(api)
|
||||
self.lport = lport
|
||||
self.lswitch = lswitch
|
||||
self.network_uuid = uuid.UUID(str(network_id))
|
||||
self.may_exist = may_exist
|
||||
self.columns = columns
|
||||
|
||||
def run_idl(self, txn):
|
||||
table = self.api.tables[self.table_name]
|
||||
try:
|
||||
ls = table.rows[self.network_uuid]
|
||||
if self.may_exist:
|
||||
self.result = rowview.RowView(ls)
|
||||
return
|
||||
msg = _("Switch %s already exists") % self.network_uuid
|
||||
raise RuntimeError(msg)
|
||||
except KeyError:
|
||||
# Adding a new LS
|
||||
if utils.ovs_persist_uuid_supported(txn.idl):
|
||||
ls = txn.insert(table, new_uuid=self.network_uuid,
|
||||
persist_uuid=True)
|
||||
else:
|
||||
ls = txn.insert(table)
|
||||
self.set_columns(ls, **self.columns)
|
||||
ls.name = utils.ovn_name(self.network_uuid)
|
||||
self.result = ls.uuid
|
||||
|
||||
|
||||
class AddLSwitchPortCommand(command.BaseCommand):
|
||||
def __init__(self, api, lport, lswitch, may_exist, network_id=None,
|
||||
**columns):
|
||||
super().__init__(api)
|
||||
self.lport = lport
|
||||
self.lswitch = lswitch
|
||||
self.may_exist = may_exist
|
||||
self.network_uuid = uuid.UUID(str(network_id)) if network_id else None
|
||||
self.columns = columns
|
||||
|
||||
def run_idl(self, txn):
|
||||
try:
|
||||
# We must look in the local cache first, because the LS may have
|
||||
# been created as part of the current transaction. or in the case
|
||||
# of adding an LSP to a LS that was created before persist_uuid
|
||||
lswitch = idlutils.row_by_value(self.api.idl, 'Logical_Switch',
|
||||
'name', self.lswitch)
|
||||
except idlutils.RowNotFound:
|
||||
msg = _("Logical Switch %s does not exist") % self.lswitch
|
||||
raise RuntimeError(msg)
|
||||
if self.network_uuid and utils.ovs_persist_uuid_supported(txn.idl):
|
||||
# Create a "fake" row with the right UUID so python-ovs creates
|
||||
# a transaction referencing the Row, even though we might not
|
||||
# have received the update for the row ourselves.
|
||||
lswitch = ovs_idl.Row(self.api.idl,
|
||||
self.api.tables['Logical_Switch'],
|
||||
uuid=self.network_uuid, data={})
|
||||
else:
|
||||
msg = _("Logical Switch %s does not exist") % self.lswitch
|
||||
raise RuntimeError(msg)
|
||||
if self.may_exist:
|
||||
port = idlutils.row_by_value(self.api.idl,
|
||||
'Logical_Switch_Port', 'name',
|
||||
@ -210,8 +256,8 @@ class SetLSwitchPortCommand(command.BaseCommand):
|
||||
else:
|
||||
new_port_dhcp_opts.add(dhcpv6_options.result)
|
||||
port.dhcpv6_options = [dhcpv6_options.result]
|
||||
for uuid in cur_port_dhcp_opts - new_port_dhcp_opts:
|
||||
self.api._tables['DHCP_Options'].rows[uuid].delete()
|
||||
for uuid_ in cur_port_dhcp_opts - new_port_dhcp_opts:
|
||||
self.api._tables['DHCP_Options'].rows[uuid_].delete()
|
||||
|
||||
external_ids_update = self.external_ids_update or {}
|
||||
external_ids = getattr(port, 'external_ids', {})
|
||||
@ -293,8 +339,8 @@ class DelLSwitchPortCommand(command.BaseCommand):
|
||||
# Delete DHCP_Options records no longer referred by this port.
|
||||
cur_port_dhcp_opts = get_lsp_dhcp_options_uuids(
|
||||
lport, self.lport)
|
||||
for uuid in cur_port_dhcp_opts:
|
||||
self.api._tables['DHCP_Options'].rows[uuid].delete()
|
||||
for uuid_ in cur_port_dhcp_opts:
|
||||
self.api._tables['DHCP_Options'].rows[uuid_].delete()
|
||||
|
||||
_delvalue_from_list(lswitch, 'ports', lport)
|
||||
self.api._tables['Logical_Switch_Port'].rows[lport.uuid].delete()
|
||||
|
@ -275,10 +275,17 @@ class OvsdbNbOvnIdl(nb_impl_idl.OvnNbApiIdlImpl, Backend):
|
||||
if revision_mismatch_raise:
|
||||
raise e
|
||||
|
||||
def ls_add(self, switch=None, may_exist=False, network_id=None, **columns):
|
||||
if network_id is None:
|
||||
return super().ls_add(switch, may_exist, **columns)
|
||||
return cmd.AddNetworkCommand(self, network_id, may_exist=may_exist,
|
||||
**columns)
|
||||
|
||||
def create_lswitch_port(self, lport_name, lswitch_name, may_exist=True,
|
||||
**columns):
|
||||
network_id=None, **columns):
|
||||
return cmd.AddLSwitchPortCommand(self, lport_name, lswitch_name,
|
||||
may_exist, **columns)
|
||||
may_exist, network_id=network_id,
|
||||
**columns)
|
||||
|
||||
def set_lswitch_port(self, lport_name, external_ids_update=None,
|
||||
if_exists=True, **columns):
|
||||
|
@ -587,9 +587,11 @@ class OVNClient:
|
||||
# controller does not yet see that network in its local cache of the
|
||||
# OVN northbound database. Check if the logical switch is present
|
||||
# or not in the idl's local copy of the database before creating
|
||||
# the lswitch port.
|
||||
self._nb_idl.check_for_row_by_value_and_retry(
|
||||
'Logical_Switch', 'name', lswitch_name)
|
||||
# the lswitch port. Once we require an ovs version with working
|
||||
# persist_uuid support, this can be removed.
|
||||
if not utils.ovs_persist_uuid_supported(self._nb_idl):
|
||||
self._nb_idl.check_for_row_by_value_and_retry(
|
||||
'Logical_Switch', 'name', lswitch_name)
|
||||
|
||||
with self._nb_idl.transaction(check_error=True) as txn:
|
||||
dhcpv4_options, dhcpv6_options = self.update_port_dhcp_options(
|
||||
@ -601,6 +603,7 @@ class OVNClient:
|
||||
kwargs = {
|
||||
'lport_name': port['id'],
|
||||
'lswitch_name': lswitch_name,
|
||||
'network_id': port['network_id'],
|
||||
'addresses': port_info.addresses,
|
||||
'external_ids': external_ids,
|
||||
'parent_name': port_info.parent_name,
|
||||
@ -2069,6 +2072,7 @@ class OVNClient:
|
||||
cmd = self._nb_idl.create_lswitch_port(
|
||||
lport_name=utils.ovn_provnet_port_name(segment['id']),
|
||||
lswitch_name=utils.ovn_name(network_id),
|
||||
network_id=network_id,
|
||||
addresses=[ovn_const.UNKNOWN_ADDR],
|
||||
external_ids={},
|
||||
type=ovn_const.LSP_TYPE_LOCALNET,
|
||||
@ -2131,14 +2135,13 @@ class OVNClient:
|
||||
# UUID. This provides an easy way to refer to the logical switch
|
||||
# without having to track what UUID OVN assigned to it.
|
||||
lswitch_params = self._gen_network_parameters(network)
|
||||
lswitch_name = utils.ovn_name(network['id'])
|
||||
# NOTE(mjozefcz): Remove this workaround when bug
|
||||
# 1869877 will be fixed.
|
||||
segments = segments_db.get_network_segments(
|
||||
context, network['id'])
|
||||
with self._nb_idl.transaction(check_error=True) as txn:
|
||||
txn.add(self._nb_idl.ls_add(lswitch_name, **lswitch_params,
|
||||
may_exist=True))
|
||||
txn.add(self._nb_idl.ls_add(network_id=network['id'],
|
||||
**lswitch_params, may_exist=True))
|
||||
for segment in segments:
|
||||
if segment.get(segment_def.PHYSICAL_NETWORK):
|
||||
self.create_provnet_port(network['id'], segment, txn=txn,
|
||||
|
@ -0,0 +1,37 @@
|
||||
# Copyright (c) 2024
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from ovs.db import idl
|
||||
import ovs.ovsuuid
|
||||
|
||||
|
||||
# Temporarily fix ovs.db.idl.Transaction._substitute_uuids to support handling
|
||||
# the persist_uuid feature
|
||||
def _substitute_uuids(self, json):
|
||||
if isinstance(json, (list, tuple)):
|
||||
if (len(json) == 2 and
|
||||
json[0] == 'uuid' and
|
||||
ovs.ovsuuid.is_valid_string(json[1])):
|
||||
uuid = ovs.ovsuuid.from_string(json[1])
|
||||
row = self._txn_rows.get(uuid, None)
|
||||
if row and row._data is None and not row._persist_uuid:
|
||||
return ["named-uuid", idl._uuid_name_from_uuid(uuid)]
|
||||
else:
|
||||
return [self._substitute_uuids(elem) for elem in json]
|
||||
return json
|
||||
|
||||
|
||||
def apply_ovs_fixes():
|
||||
idl.Transaction._substitute_uuids = _substitute_uuids
|
@ -108,6 +108,8 @@ class MechDriverSetupBase(abc.ABC):
|
||||
neutron_agent.AgentCache().get_agents.return_value = [agent1]
|
||||
self.mock_vp_parents = mock.patch.object(
|
||||
ovn_utils, 'get_virtual_port_parents', return_value=None).start()
|
||||
mock.patch.object(ovn_utils, 'ovs_persist_uuid_supported',
|
||||
return_value=True).start()
|
||||
|
||||
def _add_chassis_private(self, nb_cfg, name=None):
|
||||
chassis_private = mock.Mock()
|
||||
@ -881,15 +883,19 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
|
||||
self.mech_driver.update_network_precommit,
|
||||
fake_network_context)
|
||||
|
||||
def _verify_ls_add(self, net_id, may_exist=True, **kwargs):
|
||||
ls_add = self.mech_driver._ovn_client._nb_idl.ls_add
|
||||
ls_add.assert_called_once_with(
|
||||
external_ids=mock.ANY,
|
||||
may_exist=may_exist, network_id=net_id, **kwargs)
|
||||
|
||||
def _create_network_igmp_snoop(self, enabled):
|
||||
cfg.CONF.set_override('igmp_snooping_enable', enabled, group='OVS')
|
||||
nb_idl = self.mech_driver._ovn_client._nb_idl
|
||||
net = self._make_network(self.fmt, name='net1',
|
||||
admin_state_up=True)['network']
|
||||
value = 'true' if enabled else 'false'
|
||||
nb_idl.ls_add.assert_called_once_with(
|
||||
ovn_utils.ovn_name(net['id']), external_ids=mock.ANY,
|
||||
may_exist=True,
|
||||
self._verify_ls_add(
|
||||
net_id=net['id'],
|
||||
other_config={ovn_const.MCAST_SNOOP: value,
|
||||
ovn_const.MCAST_FLOOD_UNREGISTERED: 'false',
|
||||
ovn_const.VLAN_PASSTHRU: 'false'})
|
||||
@ -901,7 +907,6 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
|
||||
self._create_network_igmp_snoop(enabled=False)
|
||||
|
||||
def _create_network_vlan_passthru(self, vlan_transparent, qinq):
|
||||
nb_idl = self.mech_driver._ovn_client._nb_idl
|
||||
net = self._make_network(
|
||||
self.fmt, name='net1',
|
||||
as_admin=True,
|
||||
@ -917,9 +922,8 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
|
||||
'provider:physical_network': 'physnet1'})['network']
|
||||
value = 'true' if vlan_transparent or qinq else 'false'
|
||||
expected_fdb_age_treshold = ovn_conf.get_fdb_age_threshold()
|
||||
nb_idl.ls_add.assert_called_once_with(
|
||||
ovn_utils.ovn_name(net['id']), external_ids=mock.ANY,
|
||||
may_exist=True,
|
||||
self._verify_ls_add(
|
||||
net_id=net['id'],
|
||||
other_config={
|
||||
ovn_const.MCAST_SNOOP: 'false',
|
||||
ovn_const.MCAST_FLOOD_UNREGISTERED: 'false',
|
||||
@ -958,6 +962,7 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
|
||||
self.context, net['id'])
|
||||
nb_idl.create_lswitch_port.assert_called_once_with(
|
||||
addresses=[ovn_const.UNKNOWN_ADDR],
|
||||
network_id=net['id'],
|
||||
external_ids={},
|
||||
lport_name=ovn_utils.ovn_provnet_port_name(segments[0]['id']),
|
||||
lswitch_name=ovn_utils.ovn_name(net['id']),
|
||||
@ -3438,6 +3443,7 @@ class TestOVNMechanismDriverSegment(MechDriverSetupBase,
|
||||
segmentation_id=200, network_type='vlan')['segment']
|
||||
ovn_nb_api.create_lswitch_port.assert_called_once_with(
|
||||
addresses=[ovn_const.UNKNOWN_ADDR],
|
||||
network_id=net['id'],
|
||||
external_ids={},
|
||||
lport_name=ovn_utils.ovn_provnet_port_name(new_segment['id']),
|
||||
lswitch_name=ovn_utils.ovn_name(net['id']),
|
||||
@ -3456,6 +3462,7 @@ class TestOVNMechanismDriverSegment(MechDriverSetupBase,
|
||||
segmentation_id=300, network_type='vlan')['segment']
|
||||
ovn_nb_api.create_lswitch_port.assert_called_once_with(
|
||||
addresses=[ovn_const.UNKNOWN_ADDR],
|
||||
network_id=net['id'],
|
||||
external_ids={},
|
||||
lport_name=ovn_utils.ovn_provnet_port_name(new_segment['id']),
|
||||
lswitch_name=ovn_utils.ovn_name(net['id']),
|
||||
|
@ -44,7 +44,7 @@ oslo.versionedobjects>=1.35.1 # Apache-2.0
|
||||
osprofiler>=2.3.0 # Apache-2.0
|
||||
os-ken>=3.0.0 # Apache-2.0
|
||||
os-resource-classes>=1.1.0 # Apache-2.0
|
||||
ovs>=2.12.0 # Apache-2.0
|
||||
ovs>3.3.0 # Apache-2.0
|
||||
ovsdbapp>=2.11.0 # Apache-2.0
|
||||
psutil>=6.1.0 # BSD
|
||||
pyroute2>=0.7.3;sys_platform!='win32' # Apache-2.0 (+ dual licensed GPL2)
|
||||
|
Loading…
x
Reference in New Issue
Block a user