
On previous installations, the subcloud kube-rootca certificate can differ from the one on the system controller. Currently, the audit compares cert_id values and declares the subcloud out-of-sync if they don't match, which leaves the endpoint out-of-sync after an upgrade. This commit changes the audit logic to audit by alarms first, so upgraded subclouds can remain in-sync. Auditing by cert_id still happens, but only if the subcloud was rehomed.

Additionally, the force parameter was re-introduced in kube-rootca update orchestration. Since the alarm-based audit can report in-sync even when the cert_ids differ, the user might still want to update the subcloud certificate to match the system controller, and the force parameter allows this.

Note: Dcagent did not previously allow extra_args to be sent in the payload. To avoid breaking the audit on previous dcagent versions by sending an unknown key in the payload (which would throw an error), the extra_args are sent in a request header with the key "X-DCAGENT-HEADERS". Support for extra_args in the payload was added, but it can only be used once all supported dcagent versions have this option.

Note: Due to the current issue that blocks the upgrade test, subcloud upgrade was not tested in this commit, but the scenario would follow a path similar to the second test case below, where updating a subcloud rootca to a certificate different from the system controller's results in an in-sync endpoint status.

Test plan:
- PASS: Deploy a subcloud and verify kube-rootca_sync_status is in-sync.
- PASS: Perform a kube-rootca update orchestration directly in the subcloud without passing a cert, so it auto-generates one, and verify kube-rootca_sync_status is still in-sync.
- PASS: Rehome the subcloud from the previous test and verify kube-rootca_sync_status is out-of-sync.
- PASS: Perform a kube-rootca update orchestration using dcmanager on an out-of-sync subcloud, providing the system controller certs, and verify the final sync status is in-sync.
- PASS: Perform a kube-rootca update orchestration using dcmanager on an in-sync subcloud with the force parameter, without providing certs, and verify the final sync status is in-sync.
- PASS: Install an N-1 release and verify kube-rootca_sync_status is in-sync.

Closes-bug: 2092069
Change-Id: If0cc002d0d4970730771ae90d80dc50c7daf4d4c
Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>
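The kube-rootca audit change itself lives in the audit and orchestration modules rather than in the worker file below, but a minimal sketch of the decision rule described above may help. It is illustrative only: the function name and arguments are hypothetical, and only the rule (alarm-based audit by default, cert_id comparison for rehomed subclouds) comes from this commit message.

    # Illustrative sketch only -- not the actual KubeRootcaUpdateAudit code.
    def kube_rootca_sync_status(rehomed, has_rootca_alarms,
                                subcloud_cert_id=None, controller_cert_id=None):
        if rehomed:
            # Rehomed subclouds are still compared by cert_id, so a mismatch
            # is reported as out-of-sync and can be corrected by orchestration.
            if subcloud_cert_id == controller_cert_id:
                return "in-sync"
            return "out-of-sync"
        # Otherwise the audit is alarm based: differing cert_ids (e.g. after
        # an upgrade) stay in-sync as long as no kube-rootca alarms are raised.
        return "out-of-sync" if has_rootca_alarms else "in-sync"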
# Copyright 2017 Ericsson AB.
# Copyright (c) 2017-2024 Wind River Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#

import copy
import json
import os
import threading
import time

from keystoneauth1 import exceptions as keystone_exceptions
from oslo_config import cfg
from oslo_log import log as logging
from oslo_utils import timeutils

from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.dcagent_v1 import DcagentClient
from dccommon.drivers.openstack.fm import FmClient
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
from dccommon import endpoint_cache
from dccommon import utils as dccommon_utils
from dcmanager.audit import alarm_aggregation
from dcmanager.audit import base_audit
from dcmanager.audit import firmware_audit
from dcmanager.audit import kube_rootca_update_audit
from dcmanager.audit import kubernetes_audit
from dcmanager.audit import patch_audit
from dcmanager.audit import software_audit
from dcmanager.audit.subcloud_audit_manager import HELM_APP_OPENSTACK
from dcmanager.audit import utils as audit_utils
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import manager
from dcmanager.common import scheduler
from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.db.sqlalchemy import models
from dcmanager.rpc import client as dcmanager_rpc_client
from dcorch.rpc import client as dcorch_rpc_client

CONF = cfg.CONF
LOG = logging.getLogger(__name__)

# We will update the state of each subcloud in the dcorch about once per hour.
# Calculate how many iterations that will be.
SUBCLOUD_STATE_UPDATE_ITERATIONS = (
    dccommon_consts.SECONDS_IN_HOUR // CONF.scheduler.subcloud_audit_interval
)


class SubcloudAuditWorkerManager(manager.Manager):
"""Manages tasks related to audits."""
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
LOG.debug(_("SubcloudAuditWorkerManager initialization..."))
|
|
|
|
super(SubcloudAuditWorkerManager, self).__init__(
|
|
service_name="subcloud_audit_worker_manager"
|
|
)
|
|
self.audit_lock = threading.Lock()
|
|
self.audits_finished = dict()
|
|
self.context = context.get_admin_context()
|
|
self.dcmanager_rpc_client = dcmanager_rpc_client.ManagerClient()
|
|
self.dcorch_client = dcorch_rpc_client.EngineWorkerClient()
|
|
self.state_rpc_client = dcmanager_rpc_client.SubcloudStateClient()
|
|
# Keeps track of greenthreads we create to do work.
|
|
self.thread_group_manager = scheduler.ThreadGroupManager(thread_pool_size=150)
|
|
self.thread_group_manager.start(self._update_subclouds_end_audit)
|
|
# Track workers created for each subcloud.
|
|
self.subcloud_workers = dict()
|
|
self.alarm_aggr = alarm_aggregation.AlarmAggregation(self.context)
|
|
# todo(abailey): refactor the design pattern for adding new audits
|
|
self.patch_audit = patch_audit.PatchAudit(self.context)
|
|
self.firmware_audit = firmware_audit.FirmwareAudit()
|
|
self.kubernetes_audit = kubernetes_audit.KubernetesAudit()
|
|
self.kube_rootca_update_audit = kube_rootca_update_audit.KubeRootcaUpdateAudit()
|
|
self.software_audit = software_audit.SoftwareAudit()
|
|
self.pid = os.getpid()
|
|
|
|
def _update_subclouds_end_audit(self):
|
|
while True:
|
|
audits_to_set_finished = None
|
|
|
|
with self.audit_lock:
|
|
if len(self.audits_finished) > 0:
|
|
audits_to_set_finished = copy.deepcopy(self.audits_finished)
|
|
self.audits_finished = dict()
|
|
|
|
if audits_to_set_finished:
|
|
# Update the audit completion timestamp so it doesn't get
|
|
# audited again for a while.
|
|
try:
|
|
db_api.subcloud_audits_bulk_end_audit(
|
|
self.context, audits_to_set_finished
|
|
)
|
|
except Exception as e:
|
|
LOG.error(f"An error occurred when updating end audit: {e}")
|
|
|
|
with self.audit_lock:
|
|
self.audits_finished.update(audits_to_set_finished)
|
|
|
|
time.sleep(2)
|
|
|
|
def audit_subclouds(
|
|
        self,
        context,
        subcloud_ids,
        patch_audit_data,
        firmware_audit_data,
        kubernetes_audit_data,
        do_openstack_audit,
        kube_rootca_update_audit_data,
        software_audit_data,
        use_cache,
    ):
        """Run audits of the specified subcloud(s)"""

        LOG.debug(
            "PID: %s, subclouds to audit: %s, do_openstack_audit: %s"
            % (self.pid, subcloud_ids, do_openstack_audit)
        )

        for subcloud_id in subcloud_ids:
            # Retrieve the subcloud and subcloud audit info
            try:
                subcloud = db_api.subcloud_get(self.context, subcloud_id)
                subcloud_audits = db_api.subcloud_audits_get_and_start_audit(
                    self.context, subcloud_id
                )
            except exceptions.SubcloudNotFound:
                # Possibility subcloud could have been deleted since the list of
                # subclouds to audit was created.
                LOG.info(
                    "Ignoring SubcloudNotFound when auditing subcloud %s" % subcloud_id
                )
                continue

            LOG.debug(
                "PID: %s, starting audit of subcloud: %s." % (self.pid, subcloud.name)
            )

            # Check the per-subcloud audit flags
            do_load_audit = subcloud_audits.load_audit_requested
            # Currently we do the load audit as part of the patch audit,
            # so if we want a load audit we need to do a patch audit.
            do_patch_audit = subcloud_audits.patch_audit_requested or do_load_audit
            do_firmware_audit = subcloud_audits.firmware_audit_requested
            do_kubernetes_audit = subcloud_audits.kubernetes_audit_requested
            do_kube_rootca_update_audit = (
                subcloud_audits.kube_rootca_update_audit_requested
            )
            update_subcloud_state = subcloud_audits.state_update_requested
            do_software_audit = subcloud_audits.spare_audit_requested

            # Create a new greenthread for each subcloud to allow the audits
            # to be done in parallel. If there are not enough greenthreads
            # in the pool, this will block until one becomes available.
            self.subcloud_workers[subcloud.region_name] = (
                self.thread_group_manager.start(
                    self._do_audit_subcloud,
                    subcloud,
                    update_subcloud_state,
                    do_openstack_audit,
                    patch_audit_data,
                    firmware_audit_data,
                    kubernetes_audit_data,
                    kube_rootca_update_audit_data,
                    software_audit_data,
                    do_patch_audit,
                    do_load_audit,
                    do_firmware_audit,
                    do_kubernetes_audit,
                    do_kube_rootca_update_audit,
                    do_software_audit,
                    use_cache,
                )
            )

    def update_subcloud_endpoints(self, context, subcloud_name, endpoints):
        LOG.info(
            f"Updating service endpoints for subcloud {subcloud_name} in endpoint cache"
        )
        endpoint_cache.EndpointCache.update_master_service_endpoint_region(
            subcloud_name, endpoints
        )

    def _update_subcloud_audit_fail_count(self, subcloud, audit_fail_count):
"""Update the subcloud's audit_fail_count directly to db.
|
|
|
|
It's safe to update audit_fail_count because only the audit actually cares
|
|
about it, dcmanager itself doesn't do anything with the value. If
|
|
audit_fail_count is the only field to update, we want to update the db by
|
|
an audit worker directly to eliminate unnecessary notifications to dcmanager.
|
|
Note: this method should not be used for updating any other data.
|
|
param subcloud: the subcloud object to be updated.
|
|
param audit_fail_count: count of failed audit.
|
|
"""
|
|
try:
|
|
db_api.subcloud_update(
|
|
self.context, subcloud.id, audit_fail_count=audit_fail_count
|
|
)
|
|
except exceptions.SubcloudNotFound:
|
|
# Possibly subcloud could have been deleted since we found it in db,
|
|
# ignore this benign error.
|
|
LOG.info(
|
|
"Ignoring SubcloudNotFound when attempting update "
|
|
"audit_fail_count for subcloud: %s" % subcloud.name
|
|
)
|
|
|
|
def _audit_subcloud_openstack_app(
|
|
        self, subcloud_name, sysinv_client, openstack_installed
    ):
        openstack_installed_current = False
        # get a list of installed apps in the subcloud
        try:
            apps = sysinv_client.get_applications()
        except Exception:
            LOG.exception(
                "Cannot retrieve installed apps for subcloud: %s" % subcloud_name
            )
            return

        for app in apps:
            if app.name.endswith(HELM_APP_OPENSTACK) and app.active:
                # audit find openstack app is installed and active in
                # the subcloud
                openstack_installed_current = True
                break

        endpoint_type_list = dccommon_consts.ENDPOINT_TYPES_LIST_OS
        if openstack_installed_current and not openstack_installed:
            self.dcmanager_rpc_client.update_subcloud_sync_endpoint_type(
                self.context,
                subcloud_name,
                endpoint_type_list,
                openstack_installed_current,
            )
        elif not openstack_installed_current and openstack_installed:
            self.dcmanager_rpc_client.update_subcloud_sync_endpoint_type(
                self.context,
                subcloud_name,
                endpoint_type_list,
                openstack_installed_current,
            )

    def _do_audit_subcloud(
        self,
        subcloud: models.Subcloud,
        update_subcloud_state: bool,
        do_audit_openstack: bool,
        patch_audit_data,
        firmware_audit_data,
        kubernetes_audit_data,
        kube_rootca_update_audit_data,
        software_audit_data,
        do_patch_audit: bool,
        do_load_audit: bool,
        do_firmware_audit: bool,
        do_kubernetes_audit: bool,
        do_kube_rootca_update_audit: bool,
        do_software_audit: bool,
        use_cache: bool,
    ):
        audits_done = list()
        failures = list()
        # Do the actual subcloud audit.
        try:
            audits_done, failures = self._audit_subcloud(
                subcloud,
                update_subcloud_state,
                do_audit_openstack,
                patch_audit_data,
                firmware_audit_data,
                kubernetes_audit_data,
                kube_rootca_update_audit_data,
                software_audit_data,
                do_patch_audit,
                do_load_audit,
                do_firmware_audit,
                do_kubernetes_audit,
                do_kube_rootca_update_audit,
                do_software_audit,
                use_cache,
            )
        except Exception:
            LOG.exception("Got exception auditing subcloud: %s" % subcloud.name)

        if failures and len(failures) > 1:
            # extra log for multiple failures:
            LOG.error(
                "Multiple failures auditing subcloud %s: for endpoints: %s",
                subcloud.name,
                ", ".join(sorted(failures)),
            )

        with self.audit_lock:
            self.audits_finished[subcloud.id] = {
                "timestamp": timeutils.utcnow(),
                "audits_finished": audits_done,
            }

        # Remove the worker for this subcloud
        self.subcloud_workers.pop(subcloud.region_name, None)
        LOG.debug("PID: %s, done auditing subcloud: %s." % (self.pid, subcloud.name))

    @staticmethod
    def _should_perform_additional_audit(
        subcloud_management_state, subcloud_avail_status, first_identity_sync_complete
    ):
        return (
            subcloud_management_state == dccommon_consts.MANAGEMENT_MANAGED
            and subcloud_avail_status == dccommon_consts.AVAILABILITY_ONLINE
            and first_identity_sync_complete
        )

    def _build_dcagent_payload(
        self,
        should_perform_additional_audit,
        firmware_audit_data,
        kubernetes_audit_data,
        kube_rootca_update_audit_data,
        software_audit_data,
        do_firmware_audit,
        do_kubernetes_audit,
        do_kube_rootca_update_audit,
        do_software_audit,
        use_cache,
    ):
        audit_payload = {dccommon_consts.BASE_AUDIT: ""}
        if should_perform_additional_audit:
            if do_firmware_audit and firmware_audit_data:
                audit_payload[dccommon_consts.FIRMWARE_AUDIT] = firmware_audit_data
            if do_kubernetes_audit and kubernetes_audit_data:
                audit_payload[dccommon_consts.KUBERNETES_AUDIT] = kubernetes_audit_data
            if do_kube_rootca_update_audit and kube_rootca_update_audit_data:
                audit_payload[dccommon_consts.KUBE_ROOTCA_AUDIT] = (
                    kube_rootca_update_audit_data
                )
            if do_software_audit and software_audit_data:
                audit_payload[dccommon_consts.SOFTWARE_AUDIT] = software_audit_data
        # If the audit was forced, we don't want to use the cache
        if not use_cache:
            audit_payload["use_cache"] = use_cache
        return audit_payload

    def _build_dcagent_request_headers(self, subcloud: models.Subcloud):
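        # The extra audit arguments (currently only the "rehomed" flag) are
        # JSON-encoded into the "X-DCAGENT-HEADERS" request header rather than
        # the payload, so that older dcagent versions, which reject unknown
        # payload keys, keep working.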
        dc_agent_headers = {}
        if subcloud.rehomed:
            dc_agent_headers["rehomed"] = subcloud.rehomed
        header = {"X-DCAGENT-HEADERS": json.dumps(dc_agent_headers)}
        return header

    def _update_sw_sync_status_from_deploy_status(self, subcloud, audit_results):
        # If the subcloud deploy_status is in any of the following states,
        # the sync_status should be set to out-of-sync for software audit.
        # This allows the user to reapply the strategy to resolve the deploy_status.
        if subcloud.deploy_status in [
            consts.DEPLOY_STATE_SW_DEPLOY_APPLY_STRATEGY_FAILED,
            consts.DEPLOY_STATE_SW_DEPLOY_IN_PROGRESS,
        ] and audit_results.get(dccommon_consts.SOFTWARE_AUDIT):
            LOG.info(
                "Setting software sync_status to out-of-sync due to deploy_status. "
                f"subcloud: {subcloud.name} deploy_status: {subcloud.deploy_status}"
            )
            audit_results[dccommon_consts.SOFTWARE_AUDIT][
                "sync_status"
            ] = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC
        return audit_results

    def _audit_subcloud(
        self,
        subcloud: models.Subcloud,
        update_subcloud_state: bool,
        do_audit_openstack: bool,
        patch_audit_data,
        firmware_audit_data,
        kubernetes_audit_data,
        kube_rootca_update_audit_data,
        software_audit_data,
        do_patch_audit: bool,
        do_load_audit: bool,
        do_firmware_audit: bool,
        do_kubernetes_audit: bool,
        do_kube_rootca_update_audit: bool,
        do_software_audit: bool,
        use_cache: bool,
    ):
        """Audit a single subcloud."""

        avail_status_current = subcloud.availability_status
        audit_fail_count = subcloud.audit_fail_count
        subcloud_name = subcloud.name
        subcloud_region = subcloud.region_name
        subcloud_management_ip = subcloud.management_start_ip
        audits_done = list()
        failures = list()
        availability_data = dict()
        endpoint_data = dict()
        has_dcagent = dccommon_utils.subcloud_has_dcagent(subcloud.software_version)

        # Set defaults to None and disabled so we will still set disabled
        # status if we encounter an error.

        keystone_client = None
        dcagent_client = None
        sysinv_client = None
        fm_client = None
        avail_to_set = dccommon_consts.AVAILABILITY_OFFLINE
        failmsg = "Audit failure subcloud: %s, endpoint: %s"
        try:
            keystone_client = OpenStackDriver(
                region_name=subcloud_region,
                region_clients=None,
                fetch_subcloud_ips=utils.fetch_subcloud_mgmt_ips,
                attempts=1,
            ).keystone_client
            admin_session = keystone_client.session
            if has_dcagent:
                dcagent_client = DcagentClient(
                    subcloud_region,
                    admin_session,
                    endpoint=dccommon_utils.build_subcloud_endpoint(
                        subcloud_management_ip, "dcagent"
                    ),
                )
            sysinv_client = SysinvClient(
                subcloud_region,
                admin_session,
                endpoint=keystone_client.endpoint_cache.get_endpoint("sysinv"),
            )
            fm_client = FmClient(
                subcloud_region,
                admin_session,
                endpoint=keystone_client.endpoint_cache.get_endpoint("fm"),
            )
        # TODO(vgluzrom): Revise and improve the debug and error messages
        # as well as the exception causes
        except keystone_exceptions.ConnectTimeout:
            if avail_status_current == dccommon_consts.AVAILABILITY_OFFLINE:
                LOG.debug(
                    "Identity or Platform endpoint for %s not found, ignoring for "
                    "offline subcloud." % subcloud_name
                )
                return audits_done, failures
            else:
                # The subcloud will be marked as offline below.
                LOG.error(
                    "Identity or Platform endpoint for online subcloud: %s not found."
                    % subcloud_name
                )

        except keystone_exceptions.NotFound:
            if (
                subcloud.first_identity_sync_complete
                and avail_status_current == dccommon_consts.AVAILABILITY_ONLINE
            ):
                # The first identity sync is already complete
                # Therefore this is an error
                LOG.error(
                    "Identity or Platform endpoint for online subcloud: %s not found."
                    % subcloud_name
                )
            else:
                LOG.debug(
                    "Identity or Platform endpoint for %s not found, ignoring for "
                    "offline subcloud or identity sync not done." % subcloud_name
                )
                return audits_done, failures

        except (
            keystone_exceptions.EndpointNotFound,
            keystone_exceptions.ConnectFailure,
            IndexError,
        ):
            if avail_status_current == dccommon_consts.AVAILABILITY_OFFLINE:
                LOG.info(
                    "Identity or Platform endpoint for %s not found, ignoring for "
                    "offline subcloud." % subcloud_name
                )
                return audits_done, failures
            # The subcloud will be marked as offline below.
            LOG.error(
                "Identity or Platform endpoint for online subcloud: %s not found."
                % subcloud_name
            )

        except Exception:
            LOG.exception("Failed to create clients for subcloud: %s" % subcloud_name)

        if has_dcagent and dcagent_client:
LOG.debug(f"Starting dcagent audit for subcloud: {subcloud_name}")
|
|
# If we don't have the audit data, we won't send the request to the
|
|
# dcagent service, so we set the status to "in sync"
|
|
shoud_perform_additional_audit = self._should_perform_additional_audit(
|
|
subcloud.management_state,
|
|
avail_status_current,
|
|
subcloud.first_identity_sync_complete,
|
|
)
|
|
if shoud_perform_additional_audit:
|
|
if do_firmware_audit and not firmware_audit_data:
|
|
endpoint_data[dccommon_consts.ENDPOINT_TYPE_FIRMWARE] = (
|
|
dccommon_consts.SYNC_STATUS_IN_SYNC
|
|
)
|
|
audits_done.append(dccommon_consts.ENDPOINT_TYPE_FIRMWARE)
|
|
if do_kubernetes_audit and not kubernetes_audit_data:
|
|
endpoint_data[dccommon_consts.ENDPOINT_TYPE_KUBERNETES] = (
|
|
dccommon_consts.SYNC_STATUS_IN_SYNC
|
|
)
|
|
audits_done.append(dccommon_consts.ENDPOINT_TYPE_KUBERNETES)
|
|
if do_kube_rootca_update_audit and not kube_rootca_update_audit_data:
|
|
endpoint_data[dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA] = (
|
|
dccommon_consts.SYNC_STATUS_IN_SYNC
|
|
)
|
|
audits_done.append(dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA)
|
|
if do_software_audit and not software_audit_data:
|
|
endpoint_data[dccommon_consts.AUDIT_TYPE_SOFTWARE] = {
|
|
"sync_status": dccommon_consts.SYNC_STATUS_IN_SYNC,
|
|
"software_version": "",
|
|
}
|
|
audits_done.append(dccommon_consts.AUDIT_TYPE_SOFTWARE)
|
|
LOG.debug(
|
|
f"Skipping following audits for subcloud {subcloud_name} because "
|
|
f"RegionOne audit data is not available: {audits_done}"
|
|
)
|
|
audit_payload = self._build_dcagent_payload(
|
|
shoud_perform_additional_audit,
|
|
firmware_audit_data,
|
|
kubernetes_audit_data,
|
|
kube_rootca_update_audit_data,
|
|
software_audit_data,
|
|
do_firmware_audit,
|
|
do_kubernetes_audit,
|
|
do_kube_rootca_update_audit,
|
|
do_software_audit,
|
|
use_cache,
|
|
)
|
|
headers = self._build_dcagent_request_headers(subcloud)
|
|
audit_results = {}
|
|
try:
|
|
audit_results = dcagent_client.audit(audit_payload, headers)
|
|
except Exception:
|
|
LOG.exception(failmsg % (subcloud.name, "dcagent"))
|
|
failures.append("dcagent")
|
|
LOG.debug(f"Audits results for subcloud {subcloud_name}: {audit_results}")
|
|
audit_results = self._update_sw_sync_status_from_deploy_status(
|
|
subcloud, audit_results
|
|
)
|
|
for audit_type, audit_value in audit_results.items():
|
|
if audit_type == dccommon_consts.BASE_AUDIT:
|
|
avail_to_set = audit_value.get("availability")
|
|
if avail_to_set == dccommon_consts.AVAILABILITY_OFFLINE:
|
|
inactive_sg = audit_value.get("inactive_sg")
|
|
msg = f"Inactive service groups: {inactive_sg}"
|
|
dccommon_utils.log_subcloud_msg(
|
|
LOG.debug, msg, subcloud_name, avail_to_set
|
|
)
|
|
alarms = audit_value.get("alarms")
|
|
if (
|
|
alarms
|
|
and subcloud.management_state
|
|
== dccommon_consts.MANAGEMENT_MANAGED
|
|
):
|
|
self.alarm_aggr.update_alarm_summary(subcloud_name, alarms)
|
|
elif audit_value:
|
|
endpoint_type = dccommon_consts.DCAGENT_ENDPOINT_TYPE_MAP[
|
|
audit_type
|
|
]
|
|
endpoint_data[endpoint_type] = audit_value
|
|
audits_done.append(endpoint_type)
|
|
# Patch and load audits are not done in dcagent,
|
|
# so we need to do it separately
|
|
if self._should_perform_additional_audit(
|
|
subcloud.management_state,
|
|
avail_to_set,
|
|
subcloud.first_identity_sync_complete,
|
|
):
|
|
# TODO(nicodemos): Remove this when patching is no longer supported
|
|
if do_patch_audit:
|
|
try:
|
|
endpoint_data[dccommon_consts.ENDPOINT_TYPE_PATCHING] = (
|
|
self.patch_audit.subcloud_patch_audit(
|
|
keystone_client.keystone_client,
|
|
subcloud,
|
|
)
|
|
)
|
|
audits_done.append(dccommon_consts.ENDPOINT_TYPE_PATCHING)
|
|
except Exception:
|
|
LOG.exception(
|
|
failmsg
|
|
% (subcloud.name, dccommon_consts.ENDPOINT_TYPE_PATCHING)
|
|
)
|
|
failures.append(dccommon_consts.ENDPOINT_TYPE_PATCHING)
|
|
# TODO(nicodemos): Remove this when patching is no longer supported
|
|
if do_load_audit:
|
|
try:
|
|
endpoint_data[dccommon_consts.ENDPOINT_TYPE_LOAD] = (
|
|
self.patch_audit.subcloud_load_audit()
|
|
)
|
|
audits_done.append(dccommon_consts.ENDPOINT_TYPE_LOAD)
|
|
except Exception:
|
|
LOG.exception(
|
|
failmsg
|
|
% (subcloud.name, dccommon_consts.ENDPOINT_TYPE_LOAD)
|
|
)
|
|
failures.append(dccommon_consts.ENDPOINT_TYPE_LOAD)
|
|
|
|
# Check availability for subcloud that doesn't have dcagent
|
|
        if not has_dcagent and sysinv_client:
            # Avoid a network call to sysinv here if possible:
            # If prestaging is active we can assume that the subcloud
            # is online (otherwise prestaging will fail):
            if subcloud.prestage_status in consts.STATES_FOR_ONGOING_PRESTAGE:
                avail_to_set = dccommon_consts.AVAILABILITY_ONLINE
            else:
                avail_to_set, _ = base_audit.get_subcloud_availability_status(
                    sysinv_client, subcloud_name
                )

        if avail_to_set == dccommon_consts.AVAILABILITY_OFFLINE:
            if audit_fail_count < consts.AVAIL_FAIL_COUNT_MAX:
                audit_fail_count = audit_fail_count + 1
            if (avail_status_current == dccommon_consts.AVAILABILITY_ONLINE) and (
                audit_fail_count < consts.AVAIL_FAIL_COUNT_TO_ALARM
            ):
                # Do not set offline until we have failed audit
                # the requisite number of times
                avail_to_set = dccommon_consts.AVAILABILITY_ONLINE
        else:
            # In the case of a one off blip, we may need to set the
            # fail count back to 0
            audit_fail_count = 0

        if avail_to_set != avail_status_current:

            if avail_to_set == dccommon_consts.AVAILABILITY_ONLINE:
                audit_fail_count = 0

            LOG.debug(
                "Setting new availability status: %s "
                "on subcloud: %s" % (avail_to_set, subcloud_name)
            )
            availability_data.update(
                {
                    "availability_status": avail_to_set,
                    "update_state_only": False,
                    "audit_fail_count": audit_fail_count,
                }
            )
            if avail_to_set == dccommon_consts.AVAILABILITY_OFFLINE:
                utils.clear_subcloud_alarm_summary(self.context, subcloud_name)

        elif audit_fail_count != subcloud.audit_fail_count:
            # The subcloud remains offline, we only need to update
            # the audit_fail_count in db directly by an audit worker
            # to eliminate unnecessary notification to the dcmanager
            self._update_subcloud_audit_fail_count(
                subcloud, audit_fail_count=audit_fail_count
            )

        elif update_subcloud_state:
            # Nothing has changed, but we want to send a state update for this
            # subcloud as an audit.
            LOG.debug(
                "Updating subcloud state unconditionally for subcloud %s"
                % subcloud_name
            )
            availability_data.update(
                {
                    "availability_status": avail_status_current,
                    "update_state_only": True,
                    "audit_fail_count": None,
                }
            )

        # If subcloud is managed, online, the identity was synced once
        # and it doesn't have dcagent, audit additional resources
        if not has_dcagent and self._should_perform_additional_audit(
            subcloud.management_state,
            avail_to_set,
            subcloud.first_identity_sync_complete,
        ):
            # Get alarm summary and store in db,
            if fm_client:
                try:
                    alarm_updates = self.alarm_aggr.get_alarm_summary(
                        fm_client, subcloud_name
                    )
                    self.alarm_aggr.update_alarm_summary(subcloud_name, alarm_updates)
                except Exception:
                    # Exception was logged already
                    pass

            failmsg = "Audit failure subcloud: %s, endpoint: %s"
            # TODO(nicodemos): Remove this when patching is no longer supported
            if do_patch_audit:
                try:
                    endpoint_data[dccommon_consts.ENDPOINT_TYPE_PATCHING] = (
                        self.patch_audit.subcloud_patch_audit(
                            keystone_client.keystone_client,
                            subcloud,
                        )
                    )
                    audits_done.append(dccommon_consts.ENDPOINT_TYPE_PATCHING)
                except Exception:
                    LOG.exception(
                        failmsg
                        % (subcloud.name, dccommon_consts.ENDPOINT_TYPE_PATCHING)
                    )
                    failures.append(dccommon_consts.ENDPOINT_TYPE_PATCHING)
            # TODO(nicodemos): Remove this when patching is no longer supported
            if do_load_audit:
                try:
                    endpoint_data[dccommon_consts.ENDPOINT_TYPE_LOAD] = (
                        self.patch_audit.subcloud_load_audit()
                    )
                    audits_done.append(dccommon_consts.ENDPOINT_TYPE_LOAD)
                except Exception:
                    LOG.exception(
                        failmsg % (subcloud.name, dccommon_consts.ENDPOINT_TYPE_LOAD)
                    )
                    failures.append(dccommon_consts.ENDPOINT_TYPE_LOAD)
            # Perform firmware audit
            if do_firmware_audit:
                try:
                    endpoint_data[dccommon_consts.ENDPOINT_TYPE_FIRMWARE] = (
                        self.firmware_audit.subcloud_firmware_audit(
                            sysinv_client, subcloud_name, firmware_audit_data
                        )
                    )
                    audits_done.append(dccommon_consts.ENDPOINT_TYPE_FIRMWARE)
                except Exception:
                    LOG.exception(
                        failmsg
                        % (subcloud.name, dccommon_consts.ENDPOINT_TYPE_FIRMWARE)
                    )
                    failures.append(dccommon_consts.ENDPOINT_TYPE_FIRMWARE)
            # Perform kubernetes audit
            if do_kubernetes_audit:
                try:
                    endpoint_data[dccommon_consts.ENDPOINT_TYPE_KUBERNETES] = (
                        self.kubernetes_audit.subcloud_kubernetes_audit(
                            sysinv_client, subcloud_name, kubernetes_audit_data
                        )
                    )
                    audits_done.append(dccommon_consts.ENDPOINT_TYPE_KUBERNETES)
                except Exception:
                    LOG.exception(
                        failmsg
                        % (subcloud.name, dccommon_consts.ENDPOINT_TYPE_KUBERNETES)
                    )
                    failures.append(dccommon_consts.ENDPOINT_TYPE_KUBERNETES)
            # Perform kube rootca update audit
            if do_kube_rootca_update_audit:
                try:
                    endpoint_data[dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA] = (
                        self.kube_rootca_update_audit.subcloud_kube_rootca_audit(
                            sysinv_client,
                            fm_client,
                            subcloud,
                            kube_rootca_update_audit_data,
                        )
                    )
                    audits_done.append(dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA)
                except Exception:
                    LOG.exception(
                        failmsg
                        % (subcloud.name, dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA)
                    )
                    failures.append(dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA)
            # Audit openstack application in the subcloud
            if do_audit_openstack:
                # We don't want an exception here to cause our
                # audits_done to be empty:
                try:
                    self._audit_subcloud_openstack_app(
                        subcloud_region, sysinv_client, subcloud.openstack_installed
                    )
                except Exception:
                    LOG.exception(failmsg % (subcloud.name, "openstack"))
                    failures.append("openstack")
            # Perform software audit
            if do_software_audit:
                try:
                    endpoint_data[dccommon_consts.AUDIT_TYPE_SOFTWARE] = (
                        self.software_audit.subcloud_software_audit(
                            keystone_client.keystone_client,
                            subcloud,
                            software_audit_data,
                        )
                    )
                    audits_done.append(dccommon_consts.AUDIT_TYPE_SOFTWARE)
                except Exception:
                    LOG.exception(
                        failmsg % (subcloud.name, dccommon_consts.AUDIT_TYPE_SOFTWARE)
                    )
                    failures.append(dccommon_consts.AUDIT_TYPE_SOFTWARE)

        # Update the software_version if the software audit detects a different
        # value. This can occur during a manual subcloud upgrade initiated by
        # calling VIM commands directly on the subcloud.
        audit_utils.update_subcloud_software_version(
            self.context, subcloud, endpoint_data, self.dcorch_client
        )

        # Filter the endpoint_data to remove values that did not have any modification
        # from the available data on the subcloud table
        audit_utils.filter_endpoint_data(self.context, subcloud, endpoint_data)

        # Create a new variable to store the update method to avoid line too long error
        bulk_update_subcloud_availability_and_endpoint_status = (
            self.state_rpc_client.bulk_update_subcloud_availability_and_endpoint_status
        )

        if availability_data or (endpoint_data and any(endpoint_data.values())):
            simplified_subcloud = {
                "id": subcloud.id,
                "name": subcloud.name,
                "availability_status": subcloud.availability_status,
                "management_state": subcloud.management_state,
                "deploy_status": subcloud.deploy_status,
                "region_name": subcloud.region_name,
            }

            try:
                # If a value is not None, an update should be sent to the rpc client
                bulk_update_subcloud_availability_and_endpoint_status(
                    self.context,
                    simplified_subcloud,
                    availability_data,
                    endpoint_data,
                )
                LOG.debug(
                    f"Notifying dcmanager-state, subcloud: {subcloud_name}, bulk "
                    "availability and endpoint status update"
                )
            except Exception:
                LOG.exception(
                    "Failed to notify dcmanager-state of subcloud batch "
                    "availability and endpoint status update, "
                    f"subcloud: {subcloud_name}"
                )

        return audits_done, failures