distcloud/distributedcloud/dcmanager/audit/kube_rootca_update_audit.py
Raphael Lima 7181b6e476 Remove client cache in audit worker manager
This commit removes the client cache used with OpenStackDriver in the
subcloud's audit worker manager in order to remove the connection
persistence, deleting all of them once the audit process finishes.

The resulting improvements, considering 5K subclouds, were:
- Time taken to complete the audit process with a subcloud with full
  audit: from 5 seconds to 1 second
- Time taken to create the client: from ~1 second when there wasn't an
  available cache and ~500ms when there was, to ~400ms

Test plan:
1. PASS: Execute a complete audit process in a DC system with over
4K subclouds
2. PASS: Verify that all of the keystone connections are closed
once the audit process is finished
3. PASS: Verify that the number of connections created during
audit is equal to or less than the number of subclouds to audit.

Story: 2011106
Task: 50224

Change-Id: I02597786977b2916fb9dc3fc9e70492670345636
Signed-off-by: Raphael Lima <Raphael.Lima@windriver.com>
2024-06-14 10:10:06 -03:00

201 lines
7.9 KiB
Python

#
# Copyright (c) 2021-2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
from oslo_config import cfg
from oslo_log import log as logging
from fm_api.constants import FM_ALARM_ID_CERT_EXPIRED
from fm_api.constants import FM_ALARM_ID_CERT_EXPIRING_SOON
from dccommon import consts as dccommon_consts
from dccommon import utils as dccommon_utils
from dccommon.drivers.openstack.sdk_platform import (
OptimizedOpenStackDriver as OpenStackDriver
)
from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
from dcmanager.audit.auditor import Auditor
from dcmanager.common import utils
# Global oslo.config configuration object.
CONF = cfg.CONF
# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)

# FM alarm IDs queried from a subcloud by the alarm based audit.
KUBE_ROOTCA_ALARM_LIST = [
    FM_ALARM_ID_CERT_EXPIRED,
    FM_ALARM_ID_CERT_EXPIRING_SOON,
]

# An alarm only counts towards the audit when its entity instance ID is one of
# these.
MONITORED_ALARM_ENTITIES = [
    'system.certificate.kubernetes-root-ca',
]
class KubeRootcaUpdateAudit(Auditor):
    """Manages tasks related to kube rootca update audits."""

    def __init__(self, context, dcmanager_state_rpc_client):
        """Initialize the auditor for the kube rootca endpoint type.

        :param context: forwarded unchanged to the Auditor base class
        :param dcmanager_state_rpc_client: forwarded unchanged to the Auditor
            base class
        """
        super().__init__(
            context,
            dcmanager_state_rpc_client,
            dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA,
        )
        # Human readable audit name, only used in log messages.
        self.audit_type = "kube rootca update"
        LOG.debug("%s audit initialized", self.audit_type)

    def get_regionone_audit_data(self):
        """Query RegionOne to determine kube rootca update information.

        Kube rootca audit is based on the root CA cert ID. This identifier will
        consist of a hash from certificate issuer representation and its serial
        number.

        :return: A string of the root CA cert ID, or None when the clients
            cannot be created or the cert ID cannot be retrieved
        """
        try:
            m_os_ks_client = OpenStackDriver(
                region_name=dccommon_consts.DEFAULT_REGION_NAME,
                region_clients=None,
                fetch_subcloud_ips=utils.fetch_subcloud_mgmt_ips,
            ).keystone_client
            endpoint = m_os_ks_client.endpoint_cache.get_endpoint('sysinv')
            sysinv_client = SysinvClient(
                dccommon_consts.DEFAULT_REGION_NAME,
                m_os_ks_client.session,
                endpoint=endpoint,
            )
        except Exception:
            # Best effort: without a client there is nothing to audit against.
            LOG.exception("Failed init OS Client, skip Kubernetes root CA "
                          "audit.")
            return None

        try:
            # Ignore the success flag as the sysinv get_kube_rootca_id is
            # already introduced on system controllers.
            _, cc_cert = sysinv_client.get_kube_rootca_cert_id()
        except Exception:
            # Cannot get the cert ID from central cloud, return None
            LOG.exception("Failed to get Kubernetes root CA from Region One, "
                          "skip Kubernetes root CA audit.")
            return None

        regionone_rootca_certid = cc_cert.cert_id
        LOG.debug("RegionOne kubernetes rootca update data: "
                  f"{regionone_rootca_certid}.")
        return regionone_rootca_certid

    def subcloud_kube_rootca_audit(
        self, sysinv_client, fm_client, subcloud, regionone_rootca_certid
    ):
        """Perform an audit of kube root CA update info in a subcloud.

        The audit logic is as follows:
            CentOS subclouds -> alarm based
            Debian subclouds:
                not rehomed(initially deployed or re-deployed) -> alarm based
                rehomed subclouds:
                    Not region one cert ID -> skip audit
                    subcloud doesn't have the API to get cert ID -> alarm based
                    region one cert ID -> cert based

        :param sysinv_client: the sysinv client object
        :param fm_client: the fm client object
        :param subcloud: the subcloud obj
        :param regionone_rootca_certid: the cert ID of the region one, as
            returned by get_regionone_audit_data (may be None)
        """
        subcloud_name = subcloud.name
        subcloud_region = subcloud.region_name
        LOG.info("Triggered %s audit for: %s", self.audit_type, subcloud_name)

        # Firstly, apply alarm based audit against the subclouds deployed in
        # the distributed cloud and the subcloud running on old software
        # version that cannot search for the k8s root CA cert id.
        if (dccommon_utils.is_centos(subcloud.software_version)
                or not subcloud.rehomed):
            self.subcloud_audit_alarm_based(
                fm_client, subcloud_name, subcloud_region
            )
            return

        # Skip the audit if cannot get the region one cert ID. The endpoint
        # is reported as in-sync in that case.
        if not regionone_rootca_certid:
            self.set_subcloud_endpoint_in_sync(subcloud_name, subcloud_region)
            LOG.debug(f"No region one audit data, skip {self.audit_type} "
                      f"audit for subcloud: {subcloud_name}.")
            return

        try:
            success, subcloud_cert_data = \
                sysinv_client.get_kube_rootca_cert_id()
        except Exception:
            # The sync status is intentionally left untouched on failure.
            LOG.exception("Failed to get Kubernetes root CA cert ID of "
                          f"subcloud: {subcloud_name}, skip "
                          f"{self.audit_type} audit.")
            return

        if success:
            self.subcloud_audit_cert_based(
                subcloud_name,
                subcloud_region,
                subcloud_cert_data,
                regionone_rootca_certid,
            )
        else:
            # if not success, the subcloud is a Debian based subcloud without
            # the sysinv API to get the cert ID, audit the subcloud based on
            # its alarm.
            self.subcloud_audit_alarm_based(
                fm_client, subcloud_name, subcloud_region
            )

    def subcloud_audit_alarm_based(
        self, fm_client, subcloud_name, subcloud_region
    ):
        """Audit a subcloud's kube root CA based on its alarms.

        Used when the subcloud doesn't have the method to get the Kubernetes
        root CA cert ID. The endpoint is set out-of-sync when any alarm from
        KUBE_ROOTCA_ALARM_LIST has an entity instance ID listed in
        MONITORED_ALARM_ENTITIES, and in-sync otherwise.

        :param fm_client: the fm client object
        :param subcloud_name: the name of the subcloud
        :param subcloud_region: the region of the subcloud
        """
        detected_alarms = fm_client.get_alarms_by_ids(KUBE_ROOTCA_ALARM_LIST)
        # "or ()" keeps a falsy result (e.g. None or an empty list) iterable.
        out_of_sync = any(
            alarm.entity_instance_id in MONITORED_ALARM_ENTITIES
            for alarm in detected_alarms or ()
        )

        if out_of_sync:
            self.set_subcloud_endpoint_out_of_sync(subcloud_name,
                                                   subcloud_region)
        else:
            self.set_subcloud_endpoint_in_sync(subcloud_name, subcloud_region)
        LOG.info("%s audit completed for: %s", self.audit_type, subcloud_name)

    def subcloud_audit_cert_based(self, subcloud_name, subcloud_region,
                                  subcloud_cert_data, regionone_rootca_certid):
        """Audit if a subcloud's k8s root CA cert matches the region one's.

        The endpoint is set out-of-sync when the cert IDs differ and in-sync
        when they are equal. When the subcloud cert data carries an error the
        audit is skipped and the sync status is left untouched.

        :param subcloud_name: the name of the subcloud
        :param subcloud_region: the region of the subcloud
        :param subcloud_cert_data: subcloud's cert info
        :param regionone_rootca_certid: the cert ID of the region one
        """
        if subcloud_cert_data.error:
            # No exception is being handled here, so log with error() rather
            # than exception(), which would append an empty traceback.
            LOG.error("Failed to get Kubernetes root CA cert id for "
                      f"subcloud:{subcloud_name}, error: "
                      f"{subcloud_cert_data.error}, skip {self.audit_type} "
                      "audit.")
            return

        if subcloud_cert_data.cert_id != regionone_rootca_certid:
            self.set_subcloud_endpoint_out_of_sync(subcloud_name,
                                                   subcloud_region)
        else:
            self.set_subcloud_endpoint_in_sync(subcloud_name, subcloud_region)
        LOG.info("%s audit completed for: %s", self.audit_type, subcloud_name)