
This commit introduces an in-memory, dictionary-based token caching mechanism to reduce the number of token requests made to subclouds' identity APIs. The caching is implemented by subclassing the v3.Password authentication class, which normally handles HTTP requests to the identity API. The cache first checks whether a valid, non-expired token exists and returns it if found; otherwise, the actual request is made and the new token is cached for future use. Tokens can be invalidated early when all fernet keys are rotated (e.g., during the initial sync between a subcloud and the system controller). The cache leverages Keystone's session reauthentication mechanism to automatically invalidate cached tokens when necessary.

This commit also raises the open file descriptor limit for the DC orchestrator service, since the use of sessions means TCP connections are reused rather than closed immediately after each request. Illustrative sketches of the caching plugin and of raising the file descriptor limit follow the commit trailers below.

Test Plan:
01. PASS - Deploy a subcloud and verify token caching behavior.
02. PASS - Deploy a subcloud with remote install, ensuring the token cache works.
03. PASS - Prestage a subcloud for install and software deployment, validating token caching during the process.
04. PASS - Run prestage orchestration and verify proper use of the token cache.
05. PASS - Manage a subcloud for the first time and verify that the initial sync functions as expected. Ensure fernet key rotation causes cached tokens to be invalidated, and confirm reauthentication requests are made.
06. PASS - Unmanage a subcloud, rotate all fernet keys manually, then manage the subcloud again. Verify token invalidation and reauthentication function as expected.
07. PASS - Create a subcloud backup and ensure no token cache issues arise.
08. PASS - Restore a subcloud from backup and verify proper functionality of the token cache.
09. PASS - Deploy an N-1 subcloud and validate token caching for this subcloud.
10. PASS - Verify that audits correctly identify an N-1 subcloud without the USM patch as missing the USM service.
11. PASS - Apply the USM patch to the N-1 subcloud and verify that the audit detects the USM service and that prestage orchestration for software deployment functions correctly.
12. PASS - Test DC orchestration audit and sync by creating a new OpenStack user, and verify the user is replicated to the subcloud.
13. PASS - Apply a patch to subclouds using software deployment orchestration, verifying token cache performance.
14. PASS - Test dcmanager API commands that send requests to subclouds (e.g., 'dcmanager subcloud show <subcloud> --details'), ensuring the token cache is used.
15. PASS - Conduct a soak test of all DC services to verify token expiration, renewal, and cache behavior over extended use.
16. PASS - Monitor TCP connections to ensure they are properly closed after each use, preventing lingering open connections during token caching or HTTP request handling.
17. PASS - Run an end-to-end geo-redundancy operation and verify that it completes successfully.
18. PASS - Run kube rootca update orchestration and verify that it completes successfully.
19. PASS - Verify that the number of POST token requests made by the DC audit to the subcloud per hour is equal to the number of DC audit workers on the system controller.
20. PASS - Monitor the number of open file descriptors to ensure it does not reach the new limit while executing a DC kube rootca update strategy with the maximum number of supported subclouds. Additionally, verify that all sessions are closed after the strategy is complete.
Closes-Bug: 2084490
Change-Id: Ie3c17f58c09ae08df8cd9f0c92f50ab0c556c263
Signed-off-by: Gustavo Herzmann <gustavo.herzmann@windriver.com>
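The following is a minimal, illustrative sketch of the kind of token-caching plugin described above, assuming keystoneauth1's v3.Password interface; the class name, cache layout, and cache key are hypothetical and do not reflect the actual dcmanager implementation.

from keystoneauth1.identity import v3


class CachedV3Password(v3.Password):
    """Hypothetical v3.Password variant that caches issued tokens in memory."""

    # Hypothetical process-wide dictionary cache, keyed by auth_url
    _token_cache = {}

    def get_auth_ref(self, session, **kwargs):
        cached = self._token_cache.get(self.auth_url)
        # Return the cached token if one exists and is not about to expire
        if cached is not None and not cached.will_expire_soon():
            return cached
        # Otherwise issue the real token request and cache the result
        auth_ref = super().get_auth_ref(session, **kwargs)
        self._token_cache[self.auth_url] = auth_ref
        return auth_ref

    def invalidate(self):
        # Invoked by keystoneauth1's session reauthentication path (e.g., after
        # a 401 caused by fernet key rotation); drop the cached token so the
        # next request authenticates again.
        self._token_cache.pop(self.auth_url, None)
        return super().invalidate()

A session constructed with such a plugin transparently reuses the cached token across requests and falls back to a fresh authentication when the cached token expires or is rejected by the subcloud.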
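Likewise, a minimal sketch, assuming Python's standard resource module, of what raising the open file descriptor limit might look like; the helper name mirrors the utils.set_open_file_limit() call used by the service below, but the actual implementation may differ.

import resource

from oslo_log import log as logging

LOG = logging.getLogger(__name__)


def set_open_file_limit(new_soft_limit):
    """Raise the process soft RLIMIT_NOFILE if it is below the requested value."""
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    if new_soft_limit > soft:
        # The soft limit may only be raised up to the current hard limit
        target = min(new_soft_limit, hard)
        resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))
        LOG.info("Raised RLIMIT_NOFILE soft limit from %d to %d", soft, target)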
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2017-2024 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#

import functools

from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging
from oslo_service import service

from dccommon import consts as dccommon_consts
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import messaging as rpc_messaging
from dcmanager.common import utils
from dcmanager.state.subcloud_state_manager import SubcloudStateManager

LOG = logging.getLogger(__name__)


def request_context(func):
    @functools.wraps(func)
    def wrapped(self, ctx, *args, **kwargs):
        if ctx is not None and not isinstance(ctx, context.RequestContext):
            ctx = context.RequestContext.from_dict(ctx.to_dict())
        try:
            return func(self, ctx, *args, **kwargs)
        except exceptions.DCManagerException:
            raise oslo_messaging.rpc.dispatcher.ExpectedException()

    return wrapped


class DCManagerStateService(service.Service):
    """Lifecycle manager for a running service.

    - All the methods in here are called from the RPC client.
    - If a RPC call does not have a corresponding method here, an exception
      will be thrown.
    - Arguments to these calls are added dynamically and will be treated as
      keyword arguments by the RPC client.
    """

    def __init__(self, host):
        super(DCManagerStateService, self).__init__()
        self.host = cfg.CONF.host
        self.rpc_api_version = consts.RPC_API_VERSION
        self.topic = consts.TOPIC_DC_MANAGER_STATE
        # The following are initialized here, but assigned in start() which
        # happens after the fork when spawning multiple worker processes
        self.engine_id = None
        self.target = None
        self._rpc_server = None
        self.subcloud_state_manager = None
        self.audit_rpc_client = None

    def _init_managers(self):
        self.subcloud_state_manager = SubcloudStateManager()

    def start(self):
        LOG.info("Starting %s", self.__class__.__name__)
        # Raise the open file descriptor limit, since reusable sessions keep
        # TCP connections open instead of closing them after each request
        utils.set_open_file_limit(cfg.CONF.state_worker_rlimit_nofile)
        self._init_managers()
        target = oslo_messaging.Target(
            version=self.rpc_api_version, server=self.host, topic=self.topic
        )
        self.target = target
        self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
        self._rpc_server.start()
        # Used to notify dcmanager-audit
        self.audit_rpc_client = dcmanager_audit_rpc_client.ManagerAuditClient()

        super(DCManagerStateService, self).start()

    def _stop_rpc_server(self):
        # Stop RPC connection to prevent new requests
        LOG.debug(_("Attempting to stop engine service..."))
        try:
            self._rpc_server.stop()
            self._rpc_server.wait()
            LOG.info("Engine service stopped successfully")
        except Exception as ex:
            LOG.error("Failed to stop engine service: %s", str(ex))

    def stop(self):
        LOG.info("Stopping %s", self.__class__.__name__)
        self._stop_rpc_server()
        # Terminate the engine process
        LOG.info("All threads were gone, terminating engine")
        super(DCManagerStateService, self).stop()

    @request_context
    def update_subcloud_endpoint_status(
        self,
        context,
        subcloud_name=None,
        subcloud_region=None,
        endpoint_type=None,
        sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
        alarmable=True,
        ignore_endpoints=None,
    ):
        # Updates subcloud endpoint sync status
        LOG.info(
            "Handling update_subcloud_endpoint_status request for subcloud: "
            f"({subcloud_name if subcloud_name is not None else subcloud_region}) "
            f"endpoint: ({endpoint_type}) status: ({sync_status})"
        )

        self.subcloud_state_manager.update_subcloud_endpoint_status(
            context,
            subcloud_region,
            endpoint_type,
            sync_status,
            alarmable,
            ignore_endpoints,
        )

        # If the patching sync status is being set to unknown, trigger the
        # patching audit so it can update the sync status ASAP.
        if (
            endpoint_type == dccommon_consts.ENDPOINT_TYPE_PATCHING
            and sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
        ):
            self.audit_rpc_client.trigger_patch_audit(context)

        # If the software sync status is being set to unknown, trigger the
        # software audit so it can update the sync status ASAP.
        if (
            endpoint_type == dccommon_consts.AUDIT_TYPE_SOFTWARE
            and sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
        ):
            self.audit_rpc_client.trigger_software_audit(context)

        # If the firmware sync status is being set to unknown, trigger the
        # firmware audit so it can update the sync status ASAP.
        if (
            endpoint_type == dccommon_consts.ENDPOINT_TYPE_FIRMWARE
            and sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
        ):
            self.audit_rpc_client.trigger_firmware_audit(context)

        # If the kubernetes sync status is being set to unknown, trigger the
        # kubernetes audit so it can update the sync status ASAP.
        if (
            endpoint_type == dccommon_consts.ENDPOINT_TYPE_KUBERNETES
            and sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
        ):
            self.audit_rpc_client.trigger_kubernetes_audit(context)

        return

    @request_context
    def update_subcloud_availability(
        self,
        context,
        subcloud_name,
        subcloud_region,
        availability_status,
        update_state_only=False,
        audit_fail_count=None,
    ):
        # Updates subcloud availability
        LOG.info(
            "Handling update_subcloud_availability request for: %s" % subcloud_name
        )
        self.subcloud_state_manager.update_subcloud_availability(
            context,
            subcloud_region,
            availability_status,
            update_state_only,
            audit_fail_count,
        )

    def bulk_update_subcloud_availability_and_endpoint_status(
        self, context, simplified_subcloud, availability_data, endpoint_data
    ):
        LOG.debug(
            "Handling bulk_update_subcloud_availability_and_endpoint_status request "
            f"for subcloud: {simplified_subcloud['name']}"
        )

        manager = self.subcloud_state_manager
        manager.bulk_update_subcloud_availability_and_endpoint_status(
            context, simplified_subcloud, availability_data, endpoint_data
        )