distcloud/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py
Cristian Mondo a6a6b84258 Subcloud Name Reconfiguration
This change adds the capability to rename the subcloud after
bootstrap or during subcloud rehome operation.

Added a field in the database to separate the region name
from the subcloud name.
The region name determines the subcloud reference in the
Openstack core, through which it is possible to access
the endpoints of a given subcloud. Since the region name
cannot be changed, this commit adds the ability to maintain
a unique region name based on the UUID format, and allows
subcloud renaming when necessary without any endpoint
impact.
The region name is randomly generated when a subcloud is
created, so this only applies to subclouds added in the
future. On systems with existing subclouds, the region name
stays as it was on day 0, that is, identical to the original
subcloud name, but those subclouds can still be renamed.

This topic involves changes to dcmanager, dcmanager-client
and GUI. To ensure the region name reference needed by the
cert-monitor, a mechanism to determine if the request is
coming from the cert-monitor has been created.

Usage for subcloud rename:
dcmanager subcloud update <subcloud-name> --name <new-name>

Usage for subcloud rehoming:
dcmanager subcloud add --name <subcloud-name> --migrate ...

Note: The upgrade test from StarlingX 8 -> 9 for this commit
is deferred until upgrade functionality in master is
restored. Any issue found during the upgrade test will be
addressed in a separate commit.

Test Plan:
PASS: Run dcmanager subcloud passing subcommands:
      - add/delete/migrate/list/show/show --detail
      - errors/manage/unmanage/reinstall/reconfig
      - update/deploy
PASS: Run dcmanager subcloud add supplying --name
      parameter and validate the operation is not allowed
PASS: Run dcmanager supplying subcommands:
      - kube/patch/prestage strategies
PASS: Run dcmanager to apply patch and remove it
PASS: Run dcmanager subcloud-backup:
      - create/delete/restore/show/upload
PASS: Run subcloud-group:
      - add/delete/list/list-subclouds/show/update
PASS: Run dcmanager subcloud strategy for:
      - patch/kubernetes/firmware
PASS: Run dcmanager subcloud update command passing --name
      parameter supplying the following values:
      - current subcloud name (not changed)
      - different existing subcloud name
PASS: Run dcmanager to migrate a subcloud passing --name
      parameter supplying a new subcloud name
PASS: Run dcmanager to migrate a subcloud without --name
      parameter
PASS: Run dcmanager to migrate a subcloud passing --name
      parameter supplying a new subcloud name and
      different subcloud name in bootstrap file
PASS: Test dcmanager API response using cURL command line
      to validate new region name field
PASS: Run full DC sanity and regression

Story: 2010788
Task: 48217

Signed-off-by: Cristian Mondo <cristian.mondo@windriver.com>
Change-Id: Id04f42504b8e325d9ec3880c240fe4a06e3a20b7
2023-09-07 10:30:06 -03:00

552 lines
25 KiB
Python

# Copyright 2017 Ericsson AB.
# Copyright (c) 2017-2023 Wind River Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
from keystoneauth1 import exceptions as keystone_exceptions
from oslo_config import cfg
from oslo_log import log as logging
from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
from dcmanager.audit import alarm_aggregation
from dcmanager.audit import firmware_audit
from dcmanager.audit import kube_rootca_update_audit
from dcmanager.audit import kubernetes_audit
from dcmanager.audit import patch_audit
from dcmanager.audit.subcloud_audit_manager import HELM_APP_OPENSTACK
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import manager
from dcmanager.common import prestage
from dcmanager.common import scheduler
from dcmanager.db import api as db_api
from dcmanager.rpc import client as dcmanager_rpc_client
CONF = cfg.CONF
LOG = logging.getLogger(__name__)

# The state of each subcloud is pushed to dcorch roughly once per hour;
# this is the number of audit iterations that make up that hour.
SUBCLOUD_STATE_UPDATE_ITERATIONS = (
    dccommon_consts.SECONDS_IN_HOUR // CONF.scheduler.subcloud_audit_interval
)
class SubcloudAuditWorkerManager(manager.Manager):
    """Manages tasks related to audits.

    Every subcloud handed to :meth:`audit_subclouds` is audited on its own
    greenthread. The started workers are tracked in ``subcloud_workers``,
    keyed by the subcloud region name, and removed again when the audit of
    that subcloud finishes.
    """

    def __init__(self, *args, **kwargs):
        """Create the RPC clients, the worker pool and the audit helpers.

        ``*args`` and ``**kwargs`` are accepted but not forwarded; the parent
        manager is always initialised with a fixed service name.
        """
        LOG.debug(_('SubcloudAuditWorkerManager initialization...'))
        super(SubcloudAuditWorkerManager, self).__init__(
            service_name="subcloud_audit_worker_manager")
        self.context = context.get_admin_context()
        self.dcmanager_rpc_client = dcmanager_rpc_client.ManagerClient()
        self.state_rpc_client = dcmanager_rpc_client.SubcloudStateClient()
        # Keeps track of greenthreads we create to do work.
        self.thread_group_manager = scheduler.ThreadGroupManager(
            thread_pool_size=100)
        # Track workers created for each subcloud.
        self.subcloud_workers = dict()
        self.alarm_aggr = alarm_aggregation.AlarmAggregation(self.context)
        # todo(abailey): refactor the design pattern for adding new audits
        self.patch_audit = patch_audit.PatchAudit(
            self.context, self.state_rpc_client)
        self.firmware_audit = firmware_audit.FirmwareAudit(
            self.context, self.state_rpc_client)
        self.kubernetes_audit = kubernetes_audit.KubernetesAudit(
            self.context, self.state_rpc_client)
        self.kube_rootca_update_audit = \
            kube_rootca_update_audit.KubeRootcaUpdateAudit(
                self.context,
                self.state_rpc_client)
        self.pid = os.getpid()

    def audit_subclouds(self,
                        context,
                        subcloud_ids,
                        patch_audit_data,
                        firmware_audit_data,
                        kubernetes_audit_data,
                        do_openstack_audit,
                        kube_rootca_update_audit_data):
        """Run audits of the specified subcloud(s).

        :param context: request context. Not used here; database access
            goes through ``self.context``.
        :param subcloud_ids: ids of the subclouds to audit.
        :param patch_audit_data: data handed to the patch audit.
        :param firmware_audit_data: data handed to the firmware audit.
        :param kubernetes_audit_data: data handed to the kubernetes audit.
        :param do_openstack_audit: whether the openstack application audit
            should be run for each subcloud.
        :param kube_rootca_update_audit_data: data handed to the
            kube-rootca-update audit.
        """
        LOG.debug('PID: %s, subclouds to audit: %s, do_openstack_audit: %s',
                  self.pid, subcloud_ids, do_openstack_audit)

        # Deploy states for which a subcloud is audited. Failure states are
        # included so that the subcloud can be set as offline. Built once
        # here because it does not change between loop iterations.
        auditable_deploy_states = (
            consts.DEPLOY_STATE_DONE,
            consts.DEPLOY_STATE_CONFIGURING,
            consts.DEPLOY_STATE_CONFIG_FAILED,
            consts.DEPLOY_STATE_CONFIG_ABORTED,
            consts.DEPLOY_STATE_PRE_CONFIG_FAILED,
            consts.DEPLOY_STATE_INSTALL_FAILED,
            consts.DEPLOY_STATE_INSTALL_ABORTED,
            consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
            consts.DEPLOY_STATE_INSTALLING,
            consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
            consts.DEPLOY_STATE_MIGRATED,
            consts.DEPLOY_STATE_RESTORING,
            consts.DEPLOY_STATE_RESTORE_PREP_FAILED,
            consts.DEPLOY_STATE_RESTORE_FAILED,
        )

        for subcloud_id in subcloud_ids:
            # Retrieve the subcloud and subcloud audit info
            try:
                subcloud = db_api.subcloud_get(self.context, subcloud_id)
                subcloud_audits = db_api.subcloud_audits_get_and_start_audit(
                    self.context, subcloud_id)
            except exceptions.SubcloudNotFound:
                # Possibility subcloud could have been deleted since the
                # list of subclouds to audit was created.
                LOG.info('Ignoring SubcloudNotFound when auditing '
                         'subcloud %s', subcloud_id)
                continue

            LOG.debug("PID: %s, starting audit of subcloud: %s.",
                      self.pid, subcloud.name)

            # Skip the audit when the deploy state is neither auditable nor
            # a prestage state, or when the subcloud is offline while it is
            # still installing.
            not_auditable = (
                subcloud.deploy_status not in auditable_deploy_states
                and not prestage.is_deploy_status_prestage(
                    subcloud.deploy_status))
            offline_install = (
                subcloud.deploy_status == consts.DEPLOY_STATE_INSTALLING and
                subcloud.availability_status ==
                dccommon_consts.AVAILABILITY_OFFLINE)
            if not_auditable or offline_install:
                LOG.debug("Skip subcloud %s audit, deploy_status: %s",
                          subcloud.name, subcloud.deploy_status)
                # This DB API call will set the "audit_finished_at"
                # timestamp so it won't get audited again for a while.
                audits_done = []
                db_api.subcloud_audits_end_audit(self.context,
                                                 subcloud_id, audits_done)
                continue

            # Check the per-subcloud audit flags
            do_load_audit = subcloud_audits.load_audit_requested
            # Currently we do the load audit as part of the patch audit,
            # so if we want a load audit we need to do a patch audit.
            do_patch_audit = (subcloud_audits.patch_audit_requested or
                              do_load_audit)
            do_firmware_audit = subcloud_audits.firmware_audit_requested
            do_kubernetes_audit = subcloud_audits.kubernetes_audit_requested
            do_kube_rootca_update_audit = \
                subcloud_audits.kube_rootca_update_audit_requested
            update_subcloud_state = subcloud_audits.state_update_requested

            # Create a new greenthread for each subcloud to allow the audits
            # to be done in parallel. If there are not enough greenthreads
            # in the pool, this will block until one becomes available.
            self.subcloud_workers[subcloud.region_name] = \
                self.thread_group_manager.start(
                    self._do_audit_subcloud,
                    subcloud,
                    update_subcloud_state,
                    do_openstack_audit,
                    patch_audit_data,
                    firmware_audit_data,
                    kubernetes_audit_data,
                    kube_rootca_update_audit_data,
                    do_patch_audit,
                    do_load_audit,
                    do_firmware_audit,
                    do_kubernetes_audit,
                    do_kube_rootca_update_audit)

    def update_subcloud_endpoints(self, context, subcloud_name, endpoints):
        """Update the service endpoints of a subcloud in the endpoint cache.

        EndpointNotFound, ConnectFailure and IndexError are logged and
        swallowed; any other exception propagates to the caller.

        :param context: request context (not used by this method).
        :param subcloud_name: identifier of the subcloud, passed through to
            the endpoint cache and used in the log messages.
        :param endpoints: endpoint data passed through to the endpoint cache.
        """
        try:
            LOG.info("Updating service endpoints for subcloud %s "
                     "in endpoint cache", subcloud_name)
            # The cache is reached through the keystone client of CLOUD_0.
            endpoint_cache = OpenStackDriver(
                region_name=dccommon_consts.CLOUD_0
            ).keystone_client.endpoint_cache
            endpoint_cache.update_master_service_endpoint_region(
                subcloud_name, endpoints)
        except (keystone_exceptions.EndpointNotFound,
                keystone_exceptions.ConnectFailure,
                IndexError):
            LOG.error("Failed to update the service endpoints "
                      "for subcloud %s.", subcloud_name)

    def _update_subcloud_audit_fail_count(self, subcloud,
                                          audit_fail_count):
        """Update the subcloud's audit_fail_count directly to db.

        It's safe to update audit_fail_count because only the audit actually
        cares about it, dcmanager itself doesn't do anything with the value.
        If audit_fail_count is the only field to update, we want to update
        the db by an audit worker directly to eliminate unnecessary
        notifications to dcmanager.

        Note: this method should not be used for updating any other data.

        :param subcloud: the subcloud object to be updated.
        :param audit_fail_count: count of failed audit.
        """
        try:
            db_api.subcloud_update(self.context, subcloud.id,
                                   audit_fail_count=audit_fail_count)
        except exceptions.SubcloudNotFound:
            # Possibly subcloud could have been deleted since we found it
            # in db, ignore this benign error.
            LOG.info('Ignoring SubcloudNotFound when attempting update '
                     'audit_fail_count for subcloud: %s', subcloud.name)

    def _update_subcloud_availability(self, subcloud_name,
                                      subcloud_region,
                                      availability_status=None,
                                      update_state_only=False,
                                      audit_fail_count=None):
        """Report a subcloud availability update to dcmanager-state.

        Any exception raised by the RPC call is logged and swallowed.

        :param subcloud_name: name of the subcloud.
        :param subcloud_region: region name of the subcloud.
        :param availability_status: availability status to report.
        :param update_state_only: forwarded unchanged to the RPC call.
        :param audit_fail_count: forwarded unchanged to the RPC call.
        """
        try:
            self.state_rpc_client.update_subcloud_availability(
                self.context, subcloud_name, subcloud_region,
                availability_status, update_state_only, audit_fail_count)
            LOG.info('Notifying dcmanager-state, subcloud:%s, '
                     'availability:%s', subcloud_name, availability_status)
        except Exception:
            LOG.exception('Problem informing dcmanager-state of subcloud '
                          'availability state change, subcloud: %s',
                          subcloud_name)

    @staticmethod
    def _get_subcloud_availability_status(subcloud_name, sysinv_client):
        """Derive the availability of a subcloud from its service groups.

        The subcloud is declared online when at least one service group is
        active and every service group name reported as non-active also
        appears as active. Otherwise, including when the service groups
        cannot be retrieved or the list is empty, it is declared offline.

        :param subcloud_name: name of the subcloud, used for logging only.
        :param sysinv_client: client providing ``get_service_groups()``.
        :returns: dccommon_consts.AVAILABILITY_ONLINE or
            dccommon_consts.AVAILABILITY_OFFLINE.
        """
        avail_to_set = dccommon_consts.AVAILABILITY_OFFLINE
        svc_groups = None

        # get a list of service groups in the subcloud
        try:
            svc_groups = sysinv_client.get_service_groups()
        except Exception as e:
            LOG.warning('Cannot retrieve service groups for '
                        'subcloud: %s, %s', subcloud_name, e)

        if svc_groups:
            active_sgs = []
            inactive_sgs = []

            # Build 2 lists, 1 of active service groups,
            # one with non-active.
            for sg in svc_groups:
                if sg.state != consts.SERVICE_GROUP_STATUS_ACTIVE:
                    inactive_sgs.append(sg.service_group_name)
                else:
                    active_sgs.append(sg.service_group_name)

            # Create a list of service groups that are only present
            # in non-active list
            inactive_only = [sg for sg in inactive_sgs if
                             sg not in active_sgs]

            # An empty inactive only list and a non-empty active list
            # means we're good to go.
            if not inactive_only and active_sgs:
                avail_to_set = dccommon_consts.AVAILABILITY_ONLINE
            else:
                LOG.info("Subcloud:%s has non-active "
                         "service groups: %s",
                         subcloud_name, inactive_only)
        return avail_to_set

    def _audit_subcloud_openstack_app(self, subcloud_name, sysinv_client,
                                      openstack_installed):
        """Notify dcmanager when the openstack application state changed.

        The openstack application counts as installed when an application
        whose name ends with HELM_APP_OPENSTACK is active. dcmanager is only
        notified when that differs from ``openstack_installed``.

        :param subcloud_name: identifier forwarded to dcmanager and used in
            log messages. The caller visible in this class passes the
            subcloud region name here.
        :param sysinv_client: client providing ``get_applications()``.
        :param openstack_installed: the previously recorded installed state.
        """
        # get a list of installed apps in the subcloud
        try:
            apps = sysinv_client.get_applications()
        except Exception:
            LOG.exception('Cannot retrieve installed apps for subcloud:%s',
                          subcloud_name)
            return

        # True as soon as one active openstack application is found.
        openstack_installed_current = any(
            app.name.endswith(HELM_APP_OPENSTACK) and app.active
            for app in apps)

        # Both directions (installed -> removed and removed -> installed)
        # need the same notification, so a single state-changed test is
        # enough.
        if openstack_installed_current != bool(openstack_installed):
            self.dcmanager_rpc_client.update_subcloud_sync_endpoint_type(
                self.context,
                subcloud_name,
                dccommon_consts.ENDPOINT_TYPES_LIST_OS,
                openstack_installed_current)

    def _do_audit_subcloud(self,
                           subcloud,
                           update_subcloud_state,
                           do_audit_openstack,
                           patch_audit_data,
                           firmware_audit_data,
                           kubernetes_audit_data,
                           kube_rootca_update_audit_data,
                           do_patch_audit,
                           do_load_audit,
                           do_firmware_audit,
                           do_kubernetes_audit,
                           do_kube_rootca_update_audit):
        """Worker entry point: audit one subcloud and record completion.

        Exceptions raised by the audit itself are logged and swallowed so
        the audit completion is still recorded. All arguments are forwarded
        unchanged to :meth:`_audit_subcloud`.
        """
        audits_done = list()
        failures = list()
        try:
            # Do the actual subcloud audit.
            try:
                audits_done, failures = self._audit_subcloud(
                    subcloud,
                    update_subcloud_state,
                    do_audit_openstack,
                    patch_audit_data,
                    firmware_audit_data,
                    kubernetes_audit_data,
                    kube_rootca_update_audit_data,
                    do_patch_audit,
                    do_load_audit,
                    do_firmware_audit,
                    do_kubernetes_audit,
                    do_kube_rootca_update_audit)
            except Exception:
                LOG.exception("Got exception auditing subcloud: %s",
                              subcloud.name)

            if len(failures) > 1:
                # extra log for multiple failures:
                LOG.error("Multiple failures auditing subcloud %s: "
                          "for endpoints: %s",
                          subcloud.name, ", ".join(sorted(failures)))

            # Update the audit completion timestamp so it doesn't get
            # audited again for a while.
            db_api.subcloud_audits_end_audit(self.context,
                                             subcloud.id, audits_done)
        finally:
            # Remove the worker for this subcloud, even if recording the
            # audit completion failed, so the tracking entry is not leaked.
            self.subcloud_workers.pop(subcloud.region_name, None)

        LOG.debug("PID: %s, done auditing subcloud: %s.",
                  self.pid, subcloud.name)

    def _audit_subcloud(self,
                        subcloud,
                        update_subcloud_state,
                        do_audit_openstack,
                        patch_audit_data,
                        firmware_audit_data,
                        kubernetes_audit_data,
                        kube_rootca_update_audit_data,
                        do_patch_audit,
                        do_load_audit,
                        do_firmware_audit,
                        do_kubernetes_audit,
                        do_kube_rootca_update_audit):
        """Audit a single subcloud.

        First determines the availability of the subcloud and reports any
        change. Then, if the subcloud is managed, online and has completed
        its first identity sync, runs the requested endpoint audits.

        :param subcloud: the subcloud database object to audit.
        :param update_subcloud_state: send a state update even when nothing
            has changed.
        :param do_audit_openstack: run the openstack application audit.
        :param patch_audit_data: data for the patch audit; the patch audit
            is skipped when this is empty.
        :param firmware_audit_data: data for the firmware audit.
        :param kubernetes_audit_data: data for the kubernetes audit.
        :param kube_rootca_update_audit_data: data for the
            kube-rootca-update audit.
        :param do_patch_audit: run the patch audit.
        :param do_load_audit: run the load audit as part of the patch audit.
        :param do_firmware_audit: run the firmware audit.
        :param do_kubernetes_audit: run the kubernetes audit.
        :param do_kube_rootca_update_audit: run the kube-rootca-update audit.
        :returns: tuple ``(audits_done, failures)``, two lists of audit
            names.
        """
        online = dccommon_consts.AVAILABILITY_ONLINE
        offline = dccommon_consts.AVAILABILITY_OFFLINE

        avail_status_current = subcloud.availability_status
        audit_fail_count = subcloud.audit_fail_count
        subcloud_name = subcloud.name
        subcloud_region = subcloud.region_name
        audits_done = list()
        failures = list()

        # Set defaults to None and disabled so we will still set disabled
        # status if we encounter an error.
        sysinv_client = None
        fm_client = None
        avail_to_set = offline

        try:
            os_client = OpenStackDriver(region_name=subcloud_region,
                                        thread_name='subcloud-audit',
                                        region_clients=['fm', 'sysinv'])
            sysinv_client = os_client.sysinv_client
            fm_client = os_client.fm_client
        except keystone_exceptions.ConnectTimeout:
            if avail_status_current == offline:
                LOG.debug("Identity or Platform endpoint for %s not "
                          "found, ignoring for offline "
                          "subcloud.", subcloud_name)
                return audits_done, failures
            else:
                # The subcloud will be marked as offline below.
                LOG.error("Identity or Platform endpoint for online "
                          "subcloud: %s not found.", subcloud_name)
        except keystone_exceptions.NotFound:
            if subcloud.first_identity_sync_complete \
                    and avail_status_current == online:
                # The first identity sync is already complete
                # Therefore this is an error
                LOG.error("Identity or Platform endpoint for online "
                          "subcloud: %s not found.", subcloud_name)
            else:
                LOG.debug("Identity or Platform endpoint for %s not "
                          "found, ignoring for offline "
                          "subcloud or identity sync not done.",
                          subcloud_name)
                return audits_done, failures
        except (keystone_exceptions.EndpointNotFound,
                keystone_exceptions.ConnectFailure,
                IndexError):
            if avail_status_current == offline:
                LOG.info("Identity or Platform endpoint for %s not "
                         "found, ignoring for offline "
                         "subcloud.", subcloud_name)
                return audits_done, failures
            else:
                # The subcloud will be marked as offline below.
                LOG.error("Identity or Platform endpoint for online "
                          "subcloud: %s not found.", subcloud_name)
        except Exception:
            LOG.exception("Failed to get OS Client for subcloud: %s",
                          subcloud_name)

        # Check availability of the subcloud
        if sysinv_client:
            # Avoid a network call to sysinv here if possible:
            # If prestaging is active we can assume that the subcloud
            # is online (otherwise prestaging will fail):
            if subcloud.deploy_status in (consts.PRESTAGE_STATE_PACKAGES,
                                          consts.PRESTAGE_STATE_IMAGES):
                avail_to_set = online
            else:
                avail_to_set = self._get_subcloud_availability_status(
                    subcloud_name, sysinv_client)

        if avail_to_set == offline:
            if audit_fail_count < consts.AVAIL_FAIL_COUNT_MAX:
                audit_fail_count = audit_fail_count + 1
            if (avail_status_current == online) and \
                    (audit_fail_count < consts.AVAIL_FAIL_COUNT_TO_ALARM):
                # Do not set offline until we have failed audit
                # the requisite number of times
                avail_to_set = online
        else:
            # In the case of a one off blip, we may need to set the
            # fail count back to 0
            audit_fail_count = 0

        if avail_to_set != avail_status_current:
            if avail_to_set == online:
                audit_fail_count = 0

            LOG.debug('Setting new availability status: %s '
                      'on subcloud: %s', avail_to_set, subcloud_name)
            self._update_subcloud_availability(
                subcloud_name,
                subcloud_region,
                availability_status=avail_to_set,
                audit_fail_count=audit_fail_count)
        elif audit_fail_count != subcloud.audit_fail_count:
            # The availability is unchanged and only the fail count moved,
            # so update the audit_fail_count in db directly by an audit
            # worker to eliminate unnecessary notification to the dcmanager
            self._update_subcloud_audit_fail_count(
                subcloud,
                audit_fail_count=audit_fail_count)
        elif update_subcloud_state:
            # Nothing has changed, but we want to send a state update for
            # this subcloud as an audit.
            LOG.debug('Updating subcloud state unconditionally for '
                      'subcloud %s', subcloud_name)
            self._update_subcloud_availability(
                subcloud_name,
                subcloud_region,
                availability_status=avail_status_current,
                update_state_only=True)

        # If subcloud is managed and online and the identity was synced once,
        # audit additional resources
        if (subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED
                and avail_to_set == online
                and subcloud.first_identity_sync_complete):
            # Get alarm summary and store in db,
            if fm_client:
                self.alarm_aggr.update_alarm_summary(subcloud_name, fm_client)

            failmsg = "Audit failure subcloud: %s, endpoint: %s"

            # If we have patch audit data, audit the subcloud
            if do_patch_audit and patch_audit_data:
                try:
                    self.patch_audit.subcloud_patch_audit(subcloud_name,
                                                          subcloud_region,
                                                          patch_audit_data,
                                                          do_load_audit)
                    audits_done.append('patch')
                    if do_load_audit:
                        audits_done.append('load')
                except Exception:
                    LOG.exception(failmsg, subcloud_name, 'patch/load')
                    failures.append('patch')
                    if do_load_audit:
                        # Currently there's no way to differentiate,
                        # so include same under 'load':
                        failures.append('load')

            # Perform firmware audit
            if do_firmware_audit:
                try:
                    self.firmware_audit.subcloud_firmware_audit(
                        subcloud_name,
                        subcloud_region,
                        firmware_audit_data)
                    audits_done.append('firmware')
                except Exception:
                    LOG.exception(failmsg, subcloud_name, 'firmware')
                    failures.append('firmware')

            # Perform kubernetes audit
            if do_kubernetes_audit:
                try:
                    self.kubernetes_audit.subcloud_kubernetes_audit(
                        subcloud_name,
                        subcloud_region,
                        kubernetes_audit_data)
                    audits_done.append('kubernetes')
                except Exception:
                    LOG.exception(failmsg, subcloud_name, 'kubernetes')
                    failures.append('kubernetes')

            # Perform kube rootca update audit
            if do_kube_rootca_update_audit:
                try:
                    self.kube_rootca_update_audit.subcloud_audit(
                        subcloud_name,
                        subcloud_region,
                        kube_rootca_update_audit_data)
                    audits_done.append('kube-rootca-update')
                except Exception:
                    LOG.exception(failmsg, subcloud_name,
                                  'kube-rootca-update')
                    failures.append('kube-rootca-update')

            # Audit openstack application in the subcloud
            if do_audit_openstack and sysinv_client:
                # We don't want an exception here to cause our
                # audits_done to be empty:
                try:
                    # The region name is what identifies the subcloud here.
                    self._audit_subcloud_openstack_app(
                        subcloud_region, sysinv_client,
                        subcloud.openstack_installed)
                except Exception:
                    LOG.exception(failmsg, subcloud_name, 'openstack')
                    failures.append('openstack')

        return audits_done, failures