
Previously, when the secondary site association sync status was set to 'out-of-sync', when the primary site becomes online again, it would update the primary site rehome-data with the secondary site data. This causes an issue because the systemcontroller_gateway_address present on the rehome-data is exclusive to each site and can't be shared across sites, causing an invalid route being created on the primary site. This commit prevents that by only syncing all the other rehome-data attributes, except the systemcontroller_gateway_address. This commit also prevents the association sync status from being set as 'out-of-sync' if the local association is already marked as 'in-sync' whenever the primary site detects that the secondary site is reachable. It also causes the peer monitor thread to immediately audit the peer site after the association is created, instead of waiting for the heartbeat interval. Additionally, this commit fixes an incorrect dictionary key usage from "sync_status" to "sync-status" that caused the peer site association sync status to be updated unnecessarily. It also fixes an issue introduced by [1] where the subcloud update would fail when there was an update in the systemcontroller_gateway_address attribute because the validation function was expecting some attributes that were not available as part of the request payload. This commit also improves some log messages and adds type annotations to some geo-redundancy related methods. Test Plan: 01. PASS - Run an end-to-end geo-redundancy test, migrating a subcloud from the primary site to the secondary and back. 02. PASS - Re-run the GR test, but cause a failure during the migration to the secondary site. Update the bootstrap values on the secondary site and verify that the sync-status is out-of-sync. Run the migration to secondary site again, then migrate back to primary site and verify that the rehome-data synchronization does not synchronize the systemcontroller_gateway_address attribute. 03. PASS: Do the following steps: - Create a system peer with an incorrect systemcontroller gateway address that's inside the management subnet, but outside the reserved IP range and then create an association. Verify that the secondary subcloud and a route was created using the incorrect IP. - Update the system peer with the correct systemcontroller gateway address on the primary site. Verify that the PGA sync status is set to 'out-of-sync' on both sites. - Sync the PGA and verify that the secondary subcloud systemcontroller gateway address was updated and that the old route was deleted and a new one using the new address was created. - Migrate the SPG to the non-primary site and verify that it completes successfully and that the subcloud becomes online and managed. 04. PASS - After creating a peer group and the association, verify that the peer monitor thread is started and that the first heartbeat check is executed without waiting for the heartbeat interval. [1]: https://review.opendev.org/c/starlingx/distcloud/+/922255 Closes-Bug: 2089715 Change-Id: I857f30e2d691dfb18196f123ba5a2a52fd8ddb64 Signed-off-by: Gustavo Herzmann <gustavo.herzmann@windriver.com>
486 lines
21 KiB
Python
486 lines
21 KiB
Python
#
|
|
# Copyright (c) 2023-2024 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
from __future__ import annotations
|
|
import json
|
|
import threading
|
|
from typing import TYPE_CHECKING
|
|
|
|
from fm_api import constants as fm_const
|
|
from fm_api import fm_api
|
|
from oslo_config import cfg
|
|
from oslo_log import log as logging
|
|
|
|
from dccommon import consts as dccommon_consts
|
|
from dccommon.drivers.openstack.dcmanager_v1 import DcmanagerClient
|
|
from dcmanager.common import consts
|
|
from dcmanager.common import context
|
|
from dcmanager.common.i18n import _
|
|
from dcmanager.common import manager
|
|
from dcmanager.common import utils
|
|
from dcmanager.db import api as db_api
|
|
from dcmanager.db.sqlalchemy import models
|
|
from dcmanager.manager.system_peer_manager import SystemPeerManager
|
|
|
|
# Use TYPE_CHECKING to avoid circular import
|
|
if TYPE_CHECKING:
|
|
from dcmanager.manager.subcloud_manager import SubcloudManager
|
|
|
|
CONF = cfg.CONF
|
|
LOG = logging.getLogger(__name__)
|
|
|
|
|
|
class PeerGroupAuditManager(manager.Manager):
|
|
"""Manages audit related tasks."""
|
|
|
|
def __init__(
|
|
self, subcloud_manager: SubcloudManager, peer_group_id: int, *args, **kwargs
|
|
):
|
|
LOG.debug(_("PeerGroupAuditManager initialization..."))
|
|
super().__init__(service_name="peer_group_audit_manager", *args, **kwargs)
|
|
self.context = context.get_admin_context()
|
|
self.fm_api = fm_api.FaultAPIs()
|
|
self.subcloud_manager = subcloud_manager
|
|
self.peer_group_id = peer_group_id
|
|
self.require_audit_flag = True
|
|
self.thread = None
|
|
self.thread_lock = threading.Lock()
|
|
|
|
def _get_subclouds_by_peer_group_from_system_peer(
|
|
self, dc_client, system_peer, peer_group_name
|
|
):
|
|
try:
|
|
subclouds = dc_client.get_subcloud_list_by_peer_group(peer_group_name)
|
|
return subclouds
|
|
except Exception:
|
|
LOG.exception(
|
|
f"Failed to get subclouds of peer group {peer_group_name} "
|
|
f"from DC: {system_peer.peer_name}"
|
|
)
|
|
|
|
@staticmethod
|
|
def _get_association_sync_status_from_peer_site(
|
|
dc_client: DcmanagerClient, system_peer: models.SystemPeer, peer_group_id: int
|
|
) -> str:
|
|
try:
|
|
# Get peer site system peer
|
|
dc_peer_system_peer = dc_client.get_system_peer(
|
|
utils.get_local_system().uuid
|
|
)
|
|
association = dc_client.get_peer_group_association_with_peer_id_and_pg_id(
|
|
dc_peer_system_peer.get("id"), peer_group_id
|
|
)
|
|
return association.get("sync-status")
|
|
except Exception:
|
|
LOG.exception(
|
|
f"Failed to get subclouds of peer group {peer_group_id} "
|
|
f"from DC: {system_peer.peer_name}"
|
|
)
|
|
|
|
def _update_remote_peer_group_migration_status(
|
|
self, system_peer, peer_group_name, migration_status
|
|
):
|
|
dc_client = SystemPeerManager.get_peer_dc_client(system_peer)
|
|
peer_group_kwargs = {"migration_status": migration_status}
|
|
dc_client.update_subcloud_peer_group(peer_group_name, **peer_group_kwargs)
|
|
LOG.info(
|
|
f"Updated Subcloud Peer Group {peer_group_name} on peer site "
|
|
f"{system_peer.peer_name}, set migration_status to: {migration_status}"
|
|
)
|
|
|
|
def _get_local_subclouds_to_update_and_delete(
|
|
self,
|
|
local_peer_group: models.SubcloudPeerGroup,
|
|
remote_subclouds: list[dict],
|
|
remote_sync_status: str,
|
|
) -> tuple[list[models.Subcloud], list[models.Subcloud], bool]:
|
|
local_subclouds_to_update = list()
|
|
local_subclouds_to_delete = list()
|
|
any_rehome_failed = False
|
|
remote_subclouds_dict = {
|
|
remote_subcloud.get("region-name"): remote_subcloud
|
|
for remote_subcloud in remote_subclouds
|
|
}
|
|
local_subclouds = db_api.subcloud_get_for_peer_group(
|
|
self.context, local_peer_group.id
|
|
)
|
|
|
|
for local_subcloud in local_subclouds:
|
|
remote_subcloud = remote_subclouds_dict.get(local_subcloud.region_name)
|
|
if remote_subcloud:
|
|
# Check if the remote subcloud meets the conditions for update
|
|
# if it is 'managed' and the local subcloud is not
|
|
# in 'secondary' status
|
|
MANAGED = dccommon_consts.MANAGEMENT_MANAGED
|
|
if remote_subcloud.get(
|
|
"management-state"
|
|
) == MANAGED and not utils.subcloud_is_secondary_state(
|
|
local_subcloud.deploy_status
|
|
):
|
|
local_subclouds_to_update.append(local_subcloud)
|
|
# Sync rehome_data from remote to local subcloud if the remote
|
|
# PGA sync_status is out-of-sync once migration completes,
|
|
# indicating any bootstrap values/address updates to
|
|
# the subcloud on the remote site.
|
|
if remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC:
|
|
LOG.info(
|
|
"Peer association is out-of-sync, syncing rehome "
|
|
f"data of subcloud '{local_subcloud.name}' from "
|
|
"peer to current site"
|
|
)
|
|
self._sync_rehome_data(
|
|
local_subcloud, remote_subcloud.get("rehome_data")
|
|
)
|
|
elif remote_subcloud.get("deploy-status") in (
|
|
consts.DEPLOY_STATE_REHOME_FAILED,
|
|
consts.DEPLOY_STATE_REHOME_PREP_FAILED,
|
|
):
|
|
# Set local subcloud to rehome-failed if the remote is
|
|
# rehome-failed or rehome-prep-failed, otherwise, the
|
|
# deploy_status will remain rehome-pending, which will
|
|
# block the correction of the bootstrap values/address.
|
|
db_api.subcloud_update(
|
|
self.context,
|
|
local_subcloud.id,
|
|
deploy_status=consts.DEPLOY_STATE_REHOME_FAILED,
|
|
)
|
|
any_rehome_failed = True
|
|
else:
|
|
local_subclouds_to_delete.append(local_subcloud)
|
|
|
|
return local_subclouds_to_update, local_subclouds_to_delete, any_rehome_failed
|
|
|
|
def _set_local_subcloud_to_secondary(self, subcloud: models.Subcloud) -> None:
|
|
try:
|
|
LOG.info("Set local subcloud %s to secondary" % subcloud.name)
|
|
# There will be an exception when unmanage
|
|
# a subcloud in 'unamaged' state.
|
|
if subcloud.management_state != dccommon_consts.MANAGEMENT_UNMANAGED:
|
|
self.subcloud_manager.update_subcloud(
|
|
self.context,
|
|
subcloud.id,
|
|
management_state=dccommon_consts.MANAGEMENT_UNMANAGED,
|
|
)
|
|
self.subcloud_manager.update_subcloud(
|
|
self.context, subcloud.id, deploy_status=consts.DEPLOY_STATE_SECONDARY
|
|
)
|
|
except Exception as e:
|
|
LOG.exception(
|
|
"Failed to update local non-secondary and offline subcloud "
|
|
f"[{subcloud.name}], err: {e}"
|
|
)
|
|
raise e
|
|
|
|
def _sync_rehome_data(self, subcloud: models.Subcloud, rehome_data: str) -> None:
|
|
try:
|
|
remote_rehome_data = json.loads(rehome_data)
|
|
local_rehome_data = json.loads(subcloud.rehome_data)
|
|
|
|
# The systemcontroller_gateway_address can't be synced from the
|
|
# peer to the local site as it's specific to each site
|
|
remote_rehome_data["saved_payload"]["systemcontroller_gateway_address"] = (
|
|
local_rehome_data["saved_payload"]["systemcontroller_gateway_address"]
|
|
)
|
|
new_rehome_data = json.dumps(remote_rehome_data)
|
|
|
|
db_api.subcloud_update(
|
|
self.context, subcloud.id, rehome_data=new_rehome_data
|
|
)
|
|
except Exception as e:
|
|
LOG.error(
|
|
"Unable to sync rehome data of subcloud "
|
|
f"'{subcloud.name}' from peer to current site: {str(e)}"
|
|
)
|
|
raise
|
|
|
|
def audit(
|
|
self,
|
|
system_peer: models.SystemPeer,
|
|
remote_peer_group: dict,
|
|
local_peer_group: models.SubcloudPeerGroup,
|
|
) -> None:
|
|
if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
|
|
LOG.info("Local peer group in migrating state, quit audit")
|
|
return
|
|
|
|
LOG.info(
|
|
"Auditing remote subcloud peer group: [%s], migration_status: [%s], "
|
|
"group_priority: [%s], local subcloud peer group: [%s], "
|
|
"migration_status: [%s], group_priority: [%s]"
|
|
% (
|
|
remote_peer_group.get("peer_group_name"),
|
|
remote_peer_group.get("migration_status"),
|
|
remote_peer_group.get("group_priority"),
|
|
local_peer_group.peer_group_name,
|
|
local_peer_group.migration_status,
|
|
local_peer_group.group_priority,
|
|
)
|
|
)
|
|
|
|
# if remote subcloud peer group's migration_status is 'migrating',
|
|
# 'unmanaged' all local subclouds in local peer group and change its
|
|
# deploy status to consts.DEPLOY_STATE_REHOME_PENDING to stop cert-mon
|
|
# audits.
|
|
if remote_peer_group.get("migration_status") == consts.PEER_GROUP_MIGRATING:
|
|
# Unmanaged all local subclouds of peer group
|
|
LOG.info(
|
|
"Unmanaged all local subclouds of peer group "
|
|
f"{local_peer_group.peer_group_name} since remote is in migrating state"
|
|
)
|
|
subclouds = db_api.subcloud_get_for_peer_group(
|
|
self.context, local_peer_group.id
|
|
)
|
|
for subcloud in subclouds:
|
|
try:
|
|
# update_subcloud raises an exception when trying to umanage
|
|
# an already unmanaged subcloud, so the deploy status
|
|
# update must be done separately
|
|
if (
|
|
subcloud.management_state
|
|
!= dccommon_consts.MANAGEMENT_UNMANAGED
|
|
):
|
|
# Unmanage and update the deploy-status
|
|
LOG.info(
|
|
"Unmanaging and setting the local subcloud "
|
|
f"{subcloud.name} deploy status to "
|
|
f"{consts.DEPLOY_STATE_REHOME_PENDING}"
|
|
)
|
|
self.subcloud_manager.update_subcloud(
|
|
self.context,
|
|
subcloud.id,
|
|
management_state=dccommon_consts.MANAGEMENT_UNMANAGED,
|
|
deploy_status=consts.DEPLOY_STATE_REHOME_PENDING,
|
|
)
|
|
else:
|
|
# Already unmanaged, just update the deploy-status
|
|
LOG.info(
|
|
f"Setting the local subcloud {subcloud.name} "
|
|
f"deploy status to {consts.DEPLOY_STATE_REHOME_PENDING}"
|
|
)
|
|
self.subcloud_manager.update_subcloud(
|
|
self.context,
|
|
subcloud.id,
|
|
deploy_status=consts.DEPLOY_STATE_REHOME_PENDING,
|
|
)
|
|
except Exception as e:
|
|
LOG.exception(
|
|
f"Fail to unmanage local subcloud {subcloud.name}, err: {e}"
|
|
)
|
|
raise e
|
|
SystemPeerManager.update_sync_status(
|
|
self.context,
|
|
system_peer,
|
|
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC,
|
|
local_peer_group,
|
|
remote_peer_group,
|
|
)
|
|
self.require_audit_flag = False
|
|
|
|
# if remote subcloud peer group's migration_status is 'complete',
|
|
# get remote subclouds. For 'managed+online' subclouds,
|
|
# set 'unmanaged+secondary' to local on same subclouds
|
|
elif (
|
|
remote_peer_group.get("migration_status")
|
|
== consts.PEER_GROUP_MIGRATION_COMPLETE
|
|
):
|
|
dc_client = SystemPeerManager.get_peer_dc_client(system_peer)
|
|
remote_subclouds = self._get_subclouds_by_peer_group_from_system_peer(
|
|
dc_client, system_peer, remote_peer_group.get("peer_group_name")
|
|
)
|
|
remote_sync_status = self._get_association_sync_status_from_peer_site(
|
|
dc_client, system_peer, remote_peer_group.get("id")
|
|
)
|
|
|
|
local_subclouds_to_update, local_subclouds_to_delete, any_rehome_failed = (
|
|
self._get_local_subclouds_to_update_and_delete(
|
|
local_peer_group, remote_subclouds, remote_sync_status
|
|
)
|
|
)
|
|
|
|
for subcloud in local_subclouds_to_update:
|
|
self._set_local_subcloud_to_secondary(subcloud)
|
|
|
|
# Change the local subcloud not exist on peer site's SPG to
|
|
# secondary status then delete it
|
|
for subcloud in local_subclouds_to_delete:
|
|
self._set_local_subcloud_to_secondary(subcloud)
|
|
try:
|
|
self.subcloud_manager.delete_subcloud(self.context, subcloud.id)
|
|
LOG.info(f"Deleted local subcloud {subcloud.name}")
|
|
except Exception as e:
|
|
SystemPeerManager.update_sync_status(
|
|
self.context,
|
|
system_peer,
|
|
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC,
|
|
local_peer_group,
|
|
remote_peer_group,
|
|
)
|
|
LOG.exception(
|
|
f"Failed to delete local subcloud [{subcloud.name}] that does "
|
|
"not exist under the same subcloud_peer_group on peer site "
|
|
f"{system_peer.peer_name}, err: {e}"
|
|
)
|
|
raise e
|
|
|
|
if remote_peer_group.get("system_leader_id") == system_peer.peer_uuid:
|
|
self._clear_or_raise_alarm(
|
|
system_peer, local_peer_group, remote_peer_group
|
|
)
|
|
db_api.subcloud_peer_group_update(
|
|
self.context,
|
|
local_peer_group.id,
|
|
system_leader_id=system_peer.peer_uuid,
|
|
system_leader_name=system_peer.peer_name,
|
|
)
|
|
|
|
self._update_remote_peer_group_migration_status(
|
|
system_peer, remote_peer_group.get("peer_group_name"), None
|
|
)
|
|
|
|
if not (
|
|
remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC
|
|
and any_rehome_failed
|
|
):
|
|
SystemPeerManager.update_sync_status(
|
|
self.context,
|
|
system_peer,
|
|
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
|
|
local_peer_group,
|
|
remote_peer_group,
|
|
)
|
|
self.require_audit_flag = False
|
|
else:
|
|
# If remote peer group migration_status is 'None'
|
|
self.require_audit_flag = False
|
|
|
|
def _clear_or_raise_alarm(
|
|
self,
|
|
system_peer: models.SystemPeer,
|
|
local_peer_group: models.SubcloudPeerGroup,
|
|
remote_peer_group: dict,
|
|
) -> None:
|
|
# If local subcloud peer group's group_priority is
|
|
# lower than remote subcloud peer group's group_priority,
|
|
# an alarm will be raised.
|
|
# lower number means higher priority
|
|
entity_instance_id = "peer_group=%s,peer=%s" % (
|
|
local_peer_group.peer_group_name,
|
|
system_peer.peer_uuid,
|
|
)
|
|
if local_peer_group.group_priority < remote_peer_group.get("group_priority"):
|
|
LOG.warning(
|
|
f"Alarm: local subcloud peer group [{local_peer_group.peer_group_name}]"
|
|
f" is managed by remote system peer [{system_peer.peer_name}]"
|
|
)
|
|
try:
|
|
fault = fm_api.Fault(
|
|
alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
|
|
alarm_state=fm_const.FM_ALARM_STATE_SET,
|
|
entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD_PEER_GROUP,
|
|
entity_instance_id=entity_instance_id,
|
|
severity=fm_const.FM_ALARM_SEVERITY_MAJOR,
|
|
reason_text=(
|
|
"Subcloud peer group (peer_group_name=%s) is managed by "
|
|
"remote system peer (peer_uuid=%s) with a lower priority."
|
|
% (local_peer_group.peer_group_name, system_peer.peer_uuid)
|
|
),
|
|
alarm_type=fm_const.FM_ALARM_TYPE_0,
|
|
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_UNKNOWN,
|
|
proposed_repair_action=(
|
|
"Check the reported peer group state. Migrate it back to the "
|
|
"current system if the state is 'rehomed' and the current "
|
|
"system is stable. Otherwise, wait until these conditions "
|
|
"are met."
|
|
),
|
|
service_affecting=False,
|
|
)
|
|
self.fm_api.set_fault(fault)
|
|
except Exception as e:
|
|
LOG.exception(e)
|
|
else:
|
|
try:
|
|
fault = self.fm_api.get_fault(
|
|
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
|
|
entity_instance_id,
|
|
)
|
|
if fault:
|
|
LOG.info(f"Clear alarm: {entity_instance_id}")
|
|
self.fm_api.clear_fault(
|
|
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
|
|
entity_instance_id,
|
|
)
|
|
except Exception:
|
|
LOG.exception(
|
|
f"Problem clearing fault [{entity_instance_id}], alarm_id="
|
|
f"{fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED}"
|
|
)
|
|
|
|
def _do_audit(self, system_peer, remote_peer_group, local_peer_group):
|
|
with self.thread_lock:
|
|
try:
|
|
self.audit(system_peer, remote_peer_group, local_peer_group)
|
|
except Exception as e:
|
|
LOG.exception("audit error occurred: %s" % e)
|
|
|
|
def stop(self):
|
|
if self.thread:
|
|
self.thread.join()
|
|
LOG.info(f"Stopped peer group {self.peer_group_id} audit thread")
|
|
else:
|
|
LOG.info(f"No peer group {self.peer_group_id} audit thread to stop")
|
|
|
|
def start(self, system_peer, remote_peer_group, local_peer_group):
|
|
if self.thread_lock.locked():
|
|
LOG.warning(
|
|
f"Audit thread for {local_peer_group.peer_group_name} "
|
|
"has already started"
|
|
)
|
|
else:
|
|
self.thread = threading.Thread(
|
|
target=self._do_audit,
|
|
args=(system_peer, remote_peer_group, local_peer_group),
|
|
)
|
|
self.thread.start()
|
|
|
|
def audit_peer_group_from_system(
|
|
self, system_peer, remote_peer_group, local_peer_group
|
|
):
|
|
LOG.info(
|
|
f"Audit peer group [{local_peer_group.peer_group_name}] "
|
|
f"with remote system peer {system_peer.peer_name}"
|
|
)
|
|
self.start(system_peer, remote_peer_group, local_peer_group)
|
|
|
|
@staticmethod
|
|
def send_audit_peer_group(
|
|
system_peers: list[models.SystemPeer], peer_group: models.SubcloudPeerGroup
|
|
):
|
|
if not system_peers:
|
|
return
|
|
local_system = utils.get_local_system()
|
|
for system in system_peers:
|
|
try:
|
|
dc_client = SystemPeerManager.get_peer_dc_client(system)
|
|
payload = db_api.subcloud_peer_group_db_model_to_dict(peer_group)
|
|
if "created-at" in payload:
|
|
del payload["created-at"]
|
|
if "updated-at" in payload:
|
|
del payload["updated-at"]
|
|
payload["peer_uuid"] = local_system.uuid
|
|
LOG.info(
|
|
"Send audit payload [%s] of peer group %s"
|
|
% (payload, peer_group.peer_group_name)
|
|
)
|
|
response = dc_client.audit_subcloud_peer_group(
|
|
peer_group.peer_group_name, **payload
|
|
)
|
|
if response:
|
|
return response
|
|
except Exception:
|
|
LOG.exception(
|
|
"Failed to send audit request for peer group "
|
|
f"{peer_group.peer_group_name} to DC: {system.peer_name}"
|
|
)
|