distcloud/distributedcloud/dcorch/engine/generic_sync_manager.py
Li Zhu 287246bf4f DCorch Engine Update for Scalability
1. Refactor dcorch's generic_sync_manager.py and initial_sync_manager
   into a main process manager and a worker manager. The main manager
   will handle the allocation of eligible subclouds to each worker.
2. Rename the current EngineService to EngineWorkerService and introduce
   a new EngineService for the main process, similar to
   DCManagerAuditService and DCManagerAuditWorkerService.
3. Rename the current RPC EngineClient to EngineWorkerClient and
   introduce a new EngineClient. Adapt the RPC methods to accommodate
   the modifications in these main process managers and worker managers.
4. Move master resources data retrieval from each sync_thread to engine
   workers.
5. Implement 2 new db APIs for subcloud batch sync and state updates.
6. Remove code related to sync_lock and its associated db table schema.
7. Add ocf script for managing the start and stop of the dcorch
   engine-worker service, and make changes in packaging accordingly.
8. Bug fixes for the issues related to the usage of
   base64.urlsafe_b64encode and base64.urlsafe_b64decode in python3.
9. Update unit tests for the main process and worker managers.

Test Plan:
PASS: Verify that the dcorch audit runs properly every 5 minutes.
PASS: Verify that the initial sync runs properly every 10 seconds.
PASS: Verify that the sync subclouds operation runs properly every 5
      seconds.
PASS: Successfully start and stop the dcorch-engine and
      dcorch-engine-worker services using the sm commands.
PASS: Change the admin password on the system controller using
      the command "openstack --os-region-name SystemController user
      password set". Verify that the admin password is synchronized
      to the subcloud and the dcorch receives the corresponding sync
      request, followed by successful execution of sync resources for
      the subcloud.
PASS: Unmanage and then manage a subcloud, and verify that the initial
      sync is executed successfully for that subcloud.
PASS: Verify the removal of the sync_lock table from the dcorch db.

Story: 2011106
Task: 50013

Change-Id: I329847bd1107ec43e67ec59bdd1e3111b7b37cd3
Signed-off-by: lzhu1 <li.zhu@windriver.com>
2024-05-15 10:49:13 -04:00

144 lines
5.7 KiB
Python

# Copyright 2017 Ericsson AB.
# Copyright (c) 2020-2024 Wind River Systems, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import eventlet
from oslo_config import cfg
from oslo_log import log as logging
from dccommon import consts as dccommon_consts
from dcorch.common import consts as dco_consts
from dcorch.common import context
from dcorch.db import api as db_api
from dcorch.rpc import client
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
# How often (in seconds) the audit loop wakes up to look for audit work.
CHECK_AUDIT_INTERVAL = 300 # frequency to check for audit work
# How often (in seconds) the sync loop wakes up to look for sync work.
CHECK_SYNC_INTERVAL = 5 # frequency to check for sync work
# Minimum time (in seconds) between audits of a given subcloud endpoint.
AUDIT_INTERVAL = 1200 # Default audit interval
class GenericSyncManager(object):
    """Manages tasks related to resource management.

    Runs in the main dcorch engine process. Periodically selects eligible
    subcloud (region_name, endpoint_type) pairs from the database, marks
    them in-progress, and fans the work out to engine workers in batches
    via RPC.
    """

    def __init__(self, *args, **kwargs):
        self.context = context.get_admin_context()
        # RPC client used to dispatch batched sync/audit work to the
        # engine-worker services.
        self.engine_worker_rpc_client = client.EngineWorkerClient()

    def sync_job_thread(self):
        """Perform sync request for subclouds as required."""
        while True:
            try:
                self.sync_subclouds()
                eventlet.greenthread.sleep(CHECK_SYNC_INTERVAL)
            except eventlet.greenlet.GreenletExit:
                # We have been told to exit
                return
            except Exception as e:
                LOG.exception(e)

    def sync_audit_thread(self):
        """Perform sync audit for subclouds as required.

        NOTE: the original docstring was a copy-paste of the sync loop's;
        this loop drives the periodic audit, not on-demand sync requests.
        """
        while True:
            try:
                self.run_sync_audit()
                eventlet.greenthread.sleep(CHECK_AUDIT_INTERVAL)
            except eventlet.greenlet.GreenletExit:
                # We have been told to exit
                return
            except Exception as e:
                LOG.exception(e)

    def _process_subclouds(self, rpc_method, subcloud_sync_list):
        """Split the eligible subclouds into chunks and RPC each chunk.

        :param rpc_method: bound RPC client method to invoke per chunk
            (e.g. sync_subclouds or run_sync_audit).
        :param subcloud_sync_list: list of (subcloud, endpoint_type) work
            items to distribute across the engine workers.
        """
        # We want a chunksize of at least 1 so add the number of workers.
        chunksize = \
            (len(subcloud_sync_list) + CONF.workers) // (CONF.workers)
        subcloud_sync_chunk = []
        for subcloud_sync in subcloud_sync_list:
            subcloud_sync_chunk.append(subcloud_sync)
            if len(subcloud_sync_chunk) == chunksize:
                # We've gathered a batch of subclouds, send it to engine worker
                # to process.
                self._send_chunk(rpc_method, subcloud_sync_chunk)
                subcloud_sync_chunk = []
        if subcloud_sync_chunk:
            # We've got a partial batch...send it off for processing.
            self._send_chunk(rpc_method, subcloud_sync_chunk)
        LOG.debug(f"Done sending {rpc_method.__name__} request messages.")

    def sync_subclouds(self):
        """Dispatch sync work for all subclouds with pending sync requests."""
        LOG.info("Start sync_subclouds")
        # get a list of eligible subclouds (region_name, endpoint_type),
        # and mark them as in-progress.
        subcloud_sync_list = db_api.subcloud_sync_update_all_to_in_progress(
            self.context,
            management_state=dccommon_consts.MANAGEMENT_MANAGED,
            availability_status=dccommon_consts.AVAILABILITY_ONLINE,
            initial_sync_state=dco_consts.INITIAL_SYNC_STATE_COMPLETED,
            sync_requests=[dco_consts.SYNC_STATUS_REQUESTED,
                           dco_consts.SYNC_STATUS_FAILED])
        if subcloud_sync_list:
            self._process_subclouds(
                self.engine_worker_rpc_client.sync_subclouds, subcloud_sync_list)
        else:
            LOG.debug("No eligible subclouds for sync.")

    def run_sync_audit(self):
        """Dispatch audit work for all subclouds due for an audit."""
        LOG.info("Start run_sync_audit")
        # get a list of eligible subclouds (region_name, endpoint_type),
        # and mark them as in-progress.
        # check if the last audit time is equal or greater than the audit
        # interval only if the status is completed or in progress (in case
        # the process is dead while audit is in progress), or go ahead with
        # audit if the status is failed or none.
        subcloud_sync_list = db_api.subcloud_audit_update_all_to_in_progress(
            self.context,
            management_state=dccommon_consts.MANAGEMENT_MANAGED,
            availability_status=dccommon_consts.AVAILABILITY_ONLINE,
            initial_sync_state=dco_consts.INITIAL_SYNC_STATE_COMPLETED,
            audit_interval=AUDIT_INTERVAL)
        if subcloud_sync_list:
            self._process_subclouds(
                self.engine_worker_rpc_client.run_sync_audit, subcloud_sync_list)
        else:
            LOG.debug("No eligible subclouds for audit.")

    def sync_request(self, ctxt, endpoint_type):
        """Mark the given endpoint type as sync-requested for all managed
        subclouds.

        :param ctxt: request context.
        :param endpoint_type: endpoint type to flag for sync.
        """
        # Someone has enqueued a sync job. set the endpoint sync_request to
        # requested
        db_api.subcloud_sync_update_all(
            ctxt, dccommon_consts.MANAGEMENT_MANAGED, endpoint_type,
            values={'sync_request': dco_consts.SYNC_STATUS_REQUESTED})

    def _send_chunk(self, rpc_method, subcloud_sync_chunk):
        """Send one batch of work items via the given RPC method.

        Errors are logged but not re-raised so one failed batch does not
        abort the dispatch of the remaining batches.
        """
        try:
            rpc_method(self.context, subcloud_sync_chunk)
            LOG.debug(f"Sent {rpc_method.__name__} request message for "
                      f"{len(subcloud_sync_chunk)} (subcloud, endpoint_type) "
                      f"pairs.")
        except Exception as e:
            LOG.error(f"Exception occurred in {rpc_method.__name__} for "
                      f"subclouds {subcloud_sync_chunk}: {e}")