# Copyright (c) 2017-2025 Wind River Systems, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import eventlet
import threading

from oslo_concurrency import lockutils
from oslo_config import cfg
from oslo_log import log as logging
from oslo_utils import timeutils

from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack import sdk_platform as sdk
from dccommon.endpoint_cache import EndpointCache
from dccommon import utils as cutils
from dcdbsync.dbsyncclient import client as dbsyncclient
from dcmanager.rpc import client as dcmanager_rpc_client
from dcorch.common import consts
from dcorch.common import context
from dcorch.common import exceptions
from dcorch.common import utils
from dcorch.db import api as db_api
from dcorch.engine.fernet_key_manager import FERNET_REPO_MASTER_ID
from dcorch.objects import orchrequest
from dcorch.objects import resource
from dcorch.objects.subcloud import Subcloud
from dcorch.objects import subcloud_resource

from keystoneclient import client as keystoneclient

# Some of the variables defined in this file cannot be recognized by the
# current pylint check and will raise errors that fail the tox check.
# The pylint check is temporarily skipped on this file.
# pylint: skip-file

LOG = logging.getLogger(__name__)

SYNC_TIMEOUT = 600  # Timeout for subcloud sync

# sync request states, should be in SyncRequest class
STATE_QUEUED = "queued"
STATE_IN_PROGRESS = "in-progress"
STATE_TIMEDOUT = "timedout"
STATE_ABORTED = "aborted"
STATE_FAILED = "failed"
STATE_COMPLETED = "completed"

# Audit findings
AUDIT_RESOURCE_MISSING = "missing"
AUDIT_RESOURCE_EXTRA = "extra_resource"

AUDIT_LOCK_NAME = "dcorch-audit"


def get_master_os_client(region_clients=None):
    # Used by the master clients only. The subcloud clients don't need to be
    # cached in the openstack driver, because we don't want to hold the admin
    # sessions for the subclouds.
    try:
        os_client = sdk.OpenStackDriver(region_clients=region_clients)
    except Exception as e:
        LOG.error(
            "Failed to get os_client for "
            f"{cutils.get_region_one_name()}/{region_clients}: {e}."
        )
        raise e
    return os_client


class SyncThread(object):
    """Manages tasks related to resource management."""

    MAX_RETRY = 3

    PENDING_SYNC_REQUEST_STATES = (
        consts.ORCH_REQUEST_QUEUED,
        consts.ORCH_REQUEST_IN_PROGRESS,
        consts.ORCH_REQUEST_FAILED,
    )

    # used by the audit to cache the master resources
    master_resources_dict = collections.defaultdict(dict)

    def __init__(
        self,
        subcloud_name,
        endpoint_type=None,
        management_ip=None,
        software_version=None,
        subcloud_id=None,
        engine_id=None,
    ):
        self.endpoint_type = endpoint_type  # endpoint type
        self.subcloud_name = subcloud_name  # subcloud name
        self.management_ip = management_ip
        self.software_version = software_version
        self.subcloud_id = subcloud_id
        self.engine_id = engine_id
        self.ctxt = context.get_admin_context()
        self.sync_handler_map = {}
        self.master_region_name = cutils.get_region_one_name()
        self.audit_resources = []
        self.log_extra = {"instance": self.subcloud_name + ": "}
        self.dcmanager_state_rpc_client = dcmanager_rpc_client.SubcloudStateClient()
        self.dcmanager_rpc_client = dcmanager_rpc_client.ManagerClient()
        self.sc_admin_session = None
        self.sc_auth_url = None
        self.admin_session = None
        self.ks_client = None
        self.dbs_client = None

    def should_exit(self):
        # Return whether the sync/audit threads should exit.
        try:
            db_api.subcloud_sync_get(self.ctxt, self.subcloud_name, self.endpoint_type)
        except exceptions.SubcloudSyncNotFound:
            return True
        return False

    def is_subcloud_managed(self):
        # is this subcloud managed
        subcloud = Subcloud.get_by_name(self.ctxt, self.subcloud_name)
        return subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED

    def is_subcloud_enabled(self):
        # is this subcloud enabled
        subcloud = Subcloud.get_by_name(self.ctxt, self.subcloud_name)
        # We only enable syncing if the subcloud is online and the initial
        # sync has completed.
        if subcloud.availability_status == dccommon_consts.AVAILABILITY_ONLINE and (
            subcloud.initial_sync_state == consts.INITIAL_SYNC_STATE_COMPLETED
        ):
            return True
        else:
            return False

    def initialize(self):
        # base implementation of initializing the master client.
        # The specific SyncThread subclasses may extend this.
        if self.endpoint_type in dccommon_consts.ENDPOINT_TYPES_LIST:
            config = cfg.CONF.endpoint_cache
            self.admin_session = EndpointCache.get_admin_session(
                config.auth_uri,
                config.username,
                config.user_domain_name,
                config.password,
                config.project_name,
                config.project_domain_name,
                timeout=60,
            )
        elif self.endpoint_type in dccommon_consts.ENDPOINT_TYPES_LIST_OS:
            config = cfg.CONF.openstack_cache
            self.admin_session = EndpointCache.get_admin_session(
                config.auth_uri,
                config.admin_username,
                config.admin_user_domain_name,
                config.admin_password,
                config.admin_tenant,
                config.admin_project_domain_name,
                timeout=60,
            )
        else:
            raise exceptions.EndpointNotSupported(endpoint=self.endpoint_type)

        # keystone client
        self.ks_client = keystoneclient.Client(
            session=self.admin_session, region_name=cutils.get_region_one_name()
        )
        # dcdbsync client
        self.dbs_client = dbsyncclient.Client(
            endpoint_type=consts.DBS_ENDPOINT_INTERNAL,
            session=self.admin_session,
            region_name=cutils.get_region_one_name(),
        )

    def initialize_sc_clients(self):
        # base implementation of initializing the subcloud specific
        # clients, only used by the subclasses.
        # The specific SyncThread subclasses may extend this.
        if not self.sc_admin_session:
            # Subclouds will use a token from the subcloud-specific Keystone,
            # so define a session against that subcloud's keystone endpoint
            self.sc_auth_url = cutils.build_subcloud_endpoint(
                self.management_ip, "keystone"
            )
            LOG.debug(
                f"Built sc_auth_url {self.sc_auth_url} for subcloud "
                f"{self.subcloud_name}"
            )

            if self.endpoint_type in dccommon_consts.ENDPOINT_TYPES_LIST:
                config = cfg.CONF.endpoint_cache
                self.sc_admin_session = EndpointCache.get_admin_session(
                    self.sc_auth_url,
                    config.username,
                    config.user_domain_name,
                    config.password,
                    config.project_name,
                    config.project_domain_name,
                    timeout=60,
                )
            elif self.endpoint_type in dccommon_consts.ENDPOINT_TYPES_LIST_OS:
                config = cfg.CONF.openstack_cache
                self.sc_admin_session = EndpointCache.get_admin_session(
                    self.sc_auth_url,
                    config.admin_username,
                    config.admin_user_domain_name,
                    config.admin_password,
                    config.admin_tenant,
                    config.admin_project_domain_name,
                    timeout=60,
                )

    def initial_sync(self):
        # Return True to indicate initial sync success
        return True

    def enable(self):
        # Called when DC manager thinks this subcloud is good to go.
        self.run_sync_audit()

    def get_db_subcloud_resource(self, rsrc_id):
        try:
            if self.subcloud_id is None:
                self.subcloud_id = Subcloud.get_by_name(
                    self.ctxt, self.subcloud_name
                ).id
            subcloud_rsrc = (
                subcloud_resource.SubcloudResource.get_by_resource_and_subcloud(
                    self.ctxt, rsrc_id, self.subcloud_id
                )
            )  # pylint: disable=E1101
            return subcloud_rsrc
        except exceptions.SubcloudResourceNotFound:
            LOG.info(
                "{} not found in subcloud {} resource table".format(
                    rsrc_id, self.subcloud_id
                ),
                extra=self.log_extra,
            )
        return None

    def persist_db_subcloud_resource(self, db_rsrc_id, subcloud_rsrc_id):
        # This function can be invoked after creating a subcloud resource.
        # Persist the subcloud resource to the DB for later.
        #
        # Parameters:
        #   db_rsrc_id: the "id" field of the resource in the DB
        #   subcloud_rsrc_id: the unique identifier of the subcloud resource
        subcloud_rsrc = self.get_db_subcloud_resource(db_rsrc_id)
        if not subcloud_rsrc:
            if self.subcloud_id is None:
                self.subcloud_id = Subcloud.get_by_name(
                    self.ctxt, self.subcloud_name
                ).id
            subcloud_rsrc = subcloud_resource.SubcloudResource(
                self.ctxt,
                subcloud_resource_id=subcloud_rsrc_id,
                resource_id=db_rsrc_id,
                subcloud_id=self.subcloud_id,
            )  # pylint: disable=E1101
            # There is no race condition for creation of
            # subcloud_resource as it is always done from the same thread.
            subcloud_rsrc.create()
        elif subcloud_rsrc.subcloud_resource_id != subcloud_rsrc_id:
            # Maybe the resource was manually deleted from the subcloud, so
            # update the dcorch DB with the new resource id from the subcloud.
            subcloud_rsrc.subcloud_resource_id = subcloud_rsrc_id
            LOG.info(
                "Updating {}:{} [{}]".format(
                    db_rsrc_id, subcloud_rsrc.subcloud_resource_id, subcloud_rsrc_id
                ),
                extra=self.log_extra,
            )
            subcloud_rsrc.save()
        else:
            LOG.info(
                "subcloud_rsrc {}:{} [{}] is up-to-date".format(
                    db_rsrc_id, subcloud_rsrc.subcloud_resource_id, subcloud_rsrc_id
                ),
                extra=self.log_extra,
            )
        return subcloud_rsrc.subcloud_resource_id

    def sync_resource(self, sync_request):
        rsrc = resource.Resource.get_by_id(
            self.ctxt, sync_request.orch_job.resource_id
        )  # pylint: disable=E1101
        handler = self.sync_handler_map[rsrc.resource_type]
        LOG.info(
            "{} Invoking {} for {} [{}]".format(
                self.engine_id,
                handler.__name__,
                rsrc.resource_type,
                sync_request.orch_job.operation_type,
            ),
            extra=self.log_extra,
        )
        handler(sync_request, rsrc)

    def set_sync_status(self, sync_status, alarmable=True):
        # Only report sync_status when managed
        subcloud_managed = self.is_subcloud_managed()
        if not subcloud_managed:
            LOG.debug(
                "set_sync_status: skip sync status update for unmanaged "
                "subcloud {}".format(self.subcloud_name)
            )
            return

        subcloud_sync = db_api.subcloud_sync_get(
            self.ctxt, self.subcloud_name, self.endpoint_type
        )
        if subcloud_sync.sync_status_report_time:
            delta = timeutils.delta_seconds(
                subcloud_sync.sync_status_report_time, timeutils.utcnow()
            )
            if delta < 3600:
                if subcloud_sync.sync_status_reported == sync_status:
                    LOG.debug(
                        "skip set_sync_status sync_status_reported={}, "
                        "sync_status={}".format(
                            subcloud_sync.sync_status_reported, sync_status
                        ),
                        extra=self.log_extra,
                    )
                    return

        LOG.info(
            "{}: set_sync_status {}, alarmable: {}".format(
                self.subcloud_name, sync_status, alarmable
            ),
            extra=self.log_extra,
        )

        self.dcmanager_state_rpc_client.update_subcloud_endpoint_status(
            self.ctxt,
            subcloud_region=self.subcloud_name,
            endpoint_type=self.endpoint_type,
            sync_status=sync_status,
            alarmable=alarmable,
        )

        db_api.subcloud_sync_update(
            self.ctxt,
            self.subcloud_name,
            self.endpoint_type,
            values={
                "sync_status_reported": sync_status,
                "sync_status_report_time": timeutils.utcnow(),
            },
        )

    def sync(self):
        LOG.debug(
            "{}: starting sync routine".format(self.subcloud_name),
            extra=self.log_extra,
        )
        region_name = self.subcloud_name

        sync_requests = orchrequest.OrchRequestList.get_by_attrs(
            self.ctxt,
            self.endpoint_type,
            target_region_name=region_name,
            states=self.PENDING_SYNC_REQUEST_STATES,
        )

        # Early exit in case there are no pending sync requests
        if not sync_requests:
            LOG.debug(
                "Sync resources done for subcloud - no sync requests",
                extra=self.log_extra,
            )
            self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
            return

        LOG.info(
            "Got {} sync request(s)".format(len(sync_requests)),
            extra=self.log_extra,
        )

        actual_sync_requests = []
        for req in sync_requests:
            # Failed orch requests were already taken into consideration when
            # reporting sync status to the dcmanager. They need to be removed
            # from the orch requests list before proceeding.
            if req.state != consts.ORCH_REQUEST_STATE_FAILED:
                actual_sync_requests.append(req)

        if not actual_sync_requests:
            LOG.info(
                "Sync resources done for subcloud - no valid sync requests",
                extra=self.log_extra,
            )
            # We got FAILED requests, set sync_status=out-of-sync
            self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
            return
        elif not self.is_subcloud_enabled():
            LOG.info(
                "Sync resources done for subcloud - subcloud is disabled",
                extra=self.log_extra,
            )
            self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
            return

        # Subcloud is enabled and there are pending sync requests, so
        # we have work to do.
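        # Each request below is moved to in-progress, synced with up to
        # MAX_RETRY attempts (with incremental backoff between retries), and
        # marked completed or failed; unrecoverable errors abort the
        # remaining requests via EndpointNotReachable.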
        request_aborted = False
        timeout = eventlet.timeout.Timeout(SYNC_TIMEOUT)
        try:
            for request in actual_sync_requests:
                if not self.is_subcloud_enabled() or self.should_exit():
                    # Oops, someone disabled the endpoint while
                    # we were processing work for it.
                    raise exceptions.EndpointNotReachable()
                request.state = consts.ORCH_REQUEST_STATE_IN_PROGRESS
                try:
                    request.save()  # save to DB
                except exceptions.OrchRequestNotFound:
                    # This case is handled in the loop below, but should be
                    # handled here as well.
                    LOG.info(
                        "Orch request already deleted request uuid=%s state=%s"
                        % (request.uuid, request.state),
                        extra=self.log_extra,
                    )
                    continue
                retry_count = 0
                while retry_count < self.MAX_RETRY:
                    try:
                        self.sync_resource(request)
                        # Sync succeeded, mark the request as
                        # completed for tracking/debugging purposes
                        # and tag it for purge when its deleted
                        # time exceeds the data retention period.
                        request.state = consts.ORCH_REQUEST_STATE_COMPLETED
                        request.deleted = 1
                        request.deleted_at = timeutils.utcnow()
                        request.save()
                        break
                    except exceptions.OrchRequestNotFound:
                        LOG.info(
                            "Orch request already deleted request uuid=%s state=%s"
                            % (request.uuid, request.state),
                            extra=self.log_extra,
                        )
                        break
                    except exceptions.SyncRequestTimeout:
                        request.try_count += 1
                        request.save()
                        retry_count += 1
                        if retry_count >= self.MAX_RETRY:
                            raise exceptions.EndpointNotReachable()
                    except exceptions.SyncRequestFailedRetry:
                        LOG.info(
                            "SyncRequestFailedRetry for {}/{}".format(
                                self.subcloud_name, self.endpoint_type
                            ),
                            extra=self.log_extra,
                        )
                        request.try_count += 1
                        request.state = consts.ORCH_REQUEST_STATE_FAILED
                        request.save()
                        retry_count += 1
                        # Incremental backoff defines the wait time between
                        # each attempt to retry the sync:
                        #   1st retry: 1s
                        #   2nd retry: 3s
                        if retry_count < self.MAX_RETRY:
                            # Only sleep if this is not the last retry
                            sleep_duration = 1 + (retry_count - 1) * 2
                            eventlet.greenthread.sleep(sleep_duration)
                        else:
                            LOG.error(
                                "SyncRequestFailedRetry: max retries reached "
                                "for {}/{}".format(
                                    self.subcloud_name, self.endpoint_type
                                ),
                                extra=self.log_extra,
                            )
                    except exceptions.SyncRequestFailed:
                        LOG.error(
                            "SyncRequestFailed for {}/{}".format(
                                self.subcloud_name, self.endpoint_type
                            ),
                            extra=self.log_extra,
                        )
                        request.state = consts.ORCH_REQUEST_STATE_FAILED
                        request.save()
                        retry_count = self.MAX_RETRY
                        request_aborted = True
                    except exceptions.SyncRequestAbortedBySystem:
                        request.state = consts.ORCH_REQUEST_STATE_FAILED
                        request.save()
                        retry_count = self.MAX_RETRY
                        request_aborted = True
                    except Exception as e:
                        LOG.error(
                            f"Unexpected error during sync: {e}",
                            extra=self.log_extra,
                        )
                        request.state = consts.ORCH_REQUEST_STATE_FAILED
                        request.save()
                        retry_count = self.MAX_RETRY

                # If we fall out of the retry loop we either succeeded
                # or failed multiple times and want to move to the next
                # request.

        except eventlet.timeout.Timeout:
            # The entire sync operation timed out, covering all sync requests.
            # Just log the exception and continue to check if there are
            # pending requests.
            LOG.exception(
                f"Sync timed out for {self.subcloud_name}/{self.endpoint_type}."
            )
        except exceptions.EndpointNotReachable:
            # Endpoint not reachable, throw away all the sync requests.
            LOG.info(
                "EndpointNotReachable, {} sync requests pending".format(
                    len(actual_sync_requests)
                ),
                extra=self.log_extra,
            )
            # del sync_requests[:]  # This fails due to:
            # 'OrchRequestList' object does not support item deletion
        finally:
            timeout.cancel()

        sync_requests = orchrequest.OrchRequestList.get_by_attrs(
            self.ctxt,
            self.endpoint_type,
            target_region_name=region_name,
            states=self.PENDING_SYNC_REQUEST_STATES,
        )

        alarmable = False
        for req in sync_requests:
            # Any failed state should be alarmable
            if req.state == consts.ORCH_REQUEST_STATE_FAILED:
                alarmable = True
            # Do not raise an alarm if all the sync requests are due to
            # a fernet key rotation, as these are expected to occur
            # periodically.
            if req.orch_job.source_resource_id != FERNET_REPO_MASTER_ID:
                alarmable = True

        # If there are pending requests, update the status to out-of-sync.
        if sync_requests:
            # If the request was aborted due to an expired certificate,
            # update the status to 'out-of-sync' and just return so the
            # sync_request is updated to "completed". This way, the sync
            # job won't attempt to retry the sync in the next cycle.
            if request_aborted:
                self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
                LOG.info(
                    "End of resource sync out-of-sync. {} sync request(s): "
                    "request_aborted".format(len(sync_requests)),
                    extra=self.log_extra,
                )
                return
            # Otherwise, e.g. timeout or EndpointNotReachable, update the
            # status and raise an exception to set the sync_request to
            # 'failed', so the sync job will re-attempt the sync in the next
            # sync cycle.
            else:
                self.set_sync_status(
                    dccommon_consts.SYNC_STATUS_OUT_OF_SYNC, alarmable=alarmable
                )
                LOG.info(
                    "End of resource sync out-of-sync. {} sync request(s)".format(
                        len(sync_requests)
                    ),
                    extra=self.log_extra,
                )
                msg = (
                    f"There are {len(sync_requests)} pending requests to sync. "
                    "Will retry in next sync cycle."
                )
                raise Exception(msg)
        else:
            self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
            LOG.info(
                "End of resource sync in-sync. {} sync request(s)".format(
                    len(sync_requests)
                ),
                extra=self.log_extra,
            )

        LOG.info(
            "Sync resources done for subcloud - "
            "synced {} request(s)".format(len(actual_sync_requests)),
            extra=self.log_extra,
        )

    def run_sync_audit(self, engine_id=None):
        if self.endpoint_type in cfg.CONF.disable_audit_endpoints:
            LOG.warn("Audit disabled!", extra=self.log_extra)
            return
        LOG.debug(
            "Engine id={}: sync_audit started".format(engine_id),
            extra=self.log_extra,
        )
        try:
            self.sync_audit(engine_id)
        finally:
            self.post_audit()

    def sync_audit(self, engine_id):
        LOG.debug(
            "Engine id={}: starting sync audit".format(engine_id),
            extra=self.log_extra,
        )

        most_recent_failed_request = (
            orchrequest.OrchRequest.get_most_recent_failed_request(self.ctxt)
        )

        if most_recent_failed_request:
            LOG.debug(
                "Most recent failed request id=%s, timestamp=%s",
                most_recent_failed_request.id,
                most_recent_failed_request.updated_at,
            )
        else:
            LOG.debug("There are no failed requests.")

        total_num_of_audit_jobs = 0

        # TODO(ecandotti): move this behavior to SysinvSyncThread class
        # If the endpoint is of type Platform and the subcloud has dcagent,
        # retrieve all platform resources with a single dcagent call to avoid
        # making separate get_dcagent_resources calls for each resource type.
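        # The platform_resources dict built below maps each audited resource
        # type to the subcloud resources reported by dcagent; it replaces the
        # per-resource-type sc_resources lookup inside the audit loop.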
        if self.endpoint_type == dccommon_consts.ENDPOINT_TYPE_PLATFORM and (
            self.has_dcagent
        ):
            all_master_resources = dict()
            for resource_type in self.audit_resources:
                all_master_resources[resource_type] = self.get_cached_master_resources(
                    resource_type
                )
            platform_resources = self.get_dcagent_resources(
                self.audit_resources, all_master_resources
            )
            if platform_resources is None:
                # If subcloud is not reachable, abort audit.
                return

        for resource_type in self.audit_resources:
            if not self.is_subcloud_enabled() or self.should_exit():
                LOG.info(
                    "{}: aborting sync audit, as subcloud is disabled".format(
                        threading.currentThread().getName()
                    ),
                    extra=self.log_extra,
                )
                return

            # Skip resources with outstanding sync requests
            region_name = self.subcloud_name
            sync_requests = []
            states = [
                consts.ORCH_REQUEST_QUEUED,
                consts.ORCH_REQUEST_IN_PROGRESS,
            ]
            sync_requests = orchrequest.OrchRequestList.get_by_attrs(
                self.ctxt,
                self.endpoint_type,
                resource_type=resource_type,
                target_region_name=region_name,
                states=states,
            )
            abort_resources = [
                req.orch_job.source_resource_id for req in sync_requests
            ]
            if len(sync_requests) > 0:
                LOG.info(
                    "Will not audit {}. {} sync request(s) pending".format(
                        abort_resources, len(sync_requests)
                    ),
                    extra=self.log_extra,
                )

            num_of_audit_jobs = 0
            try:
                m_resources, db_resources, sc_resources = self.get_all_resources(
                    resource_type
                )
                if self.endpoint_type == dccommon_consts.ENDPOINT_TYPE_PLATFORM and (
                    self.has_dcagent
                ):
                    sc_resources = platform_resources[resource_type]
                # todo: delete entries in db_resources with no corresponding
                # entry in m_resources?
                if sc_resources is None or m_resources is None:
                    return
                LOG.debug("Audit {}".format(resource_type), extra=self.log_extra)
                LOG.debug(
                    "Auditing {}: master={} db={} sc={}".format(
                        resource_type, m_resources, db_resources, sc_resources
                    ),
                    extra=self.log_extra,
                )
                num_of_audit_jobs += self.audit_find_missing(
                    resource_type,
                    m_resources,
                    db_resources,
                    sc_resources,
                    abort_resources,
                )
                num_of_audit_jobs += self.audit_find_extra(
                    resource_type,
                    m_resources,
                    db_resources,
                    sc_resources,
                    abort_resources,
                )
            except Exception:
                LOG.exception("Unexpected error while auditing %s", resource_type)

            # Extra resources in subcloud are not impacted by the audit.
            if not num_of_audit_jobs:
                LOG.debug(
                    "Clean audit run for {}".format(resource_type),
                    extra=self.log_extra,
                )
            else:
                LOG.info(
                    "{} num_of_audit_jobs for {}".format(
                        num_of_audit_jobs, resource_type
                    ),
                    extra=self.log_extra,
                )
            total_num_of_audit_jobs += num_of_audit_jobs

        if most_recent_failed_request:
            # Soft delete all failed requests in the previous sync audit.
            try:
                orchrequest.OrchRequest.delete_previous_failed_requests(
                    self.ctxt, most_recent_failed_request.updated_at
                )
            except Exception:
                # shouldn't get here
                LOG.exception("Unexpected error!")

        if not total_num_of_audit_jobs:
            self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
        else:
            db_api.subcloud_sync_update(
                self.ctxt,
                self.subcloud_name,
                self.endpoint_type,
                values={"sync_request": consts.SYNC_STATUS_REQUESTED},
            )

        LOG.debug(
            "{}: done sync audit".format(threading.currentThread().getName()),
            extra=self.log_extra,
        )

    def post_audit(self):
        # Some specific SyncThread subclasses may perform post audit actions
        utils.close_session(
            self.sc_admin_session, "audit", f"{self.subcloud_name}/{self.endpoint_type}"
        )

    @classmethod
    @lockutils.synchronized(AUDIT_LOCK_NAME)
    def reset_master_resources_cache(cls):
        # reset the cached master resources
        LOG.debug("Reset the cached master resources.")
        SyncThread.master_resources_dict = collections.defaultdict(dict)

    def audit_find_missing(
        self, resource_type, m_resources, db_resources, sc_resources, abort_resources
    ):
        """Find missing resources in subcloud.

        - Input param db_resources is modified in this routine to remove
          entries that match the resources in master cloud. At the end,
          db_resources will have a list of resources that are present in
          dcorch DB, but not present in the master cloud.
        """
        num_of_audit_jobs = 0
        for m_r in m_resources:
            master_id = self.get_resource_id(resource_type, m_r)
            if master_id in abort_resources:
                LOG.info(
                    "audit_find_missing: Aborting audit for {}".format(master_id),
                    extra=self.log_extra,
                )
                num_of_audit_jobs += 1
                # There are pending jobs for this resource, abort audit
                continue

            missing_resource = False
            m_rsrc_db = None
            for db_resource in db_resources:
                if db_resource.master_id == master_id:
                    m_rsrc_db = db_resource
                    db_resources.remove(db_resource)
                    break

            if m_rsrc_db:
                # resource from master cloud is present in DB.
                # Contents of "m_r" may refer to other master cloud resources.
                # Make a copy with the references updated to refer to subcloud
                # resources.
                try:
                    m_r_updated = self.update_resource_refs(resource_type, m_r)
                except exceptions.SubcloudResourceNotFound:
                    # If we couldn't find the equivalent subcloud resources,
                    # we don't know what to look for in the subcloud so skip
                    # this m_r and go to the next one.
                    continue

                # Now, look for subcloud resource in DB.
                # If present: look for actual resource in the
                # subcloud and compare the resource details.
                # If not present: create resource in subcloud.
                db_sc_resource = self.get_db_subcloud_resource(m_rsrc_db.id)
                if db_sc_resource:
                    if not db_sc_resource.is_managed():
                        LOG.info(
                            "Resource {} is not managed".format(master_id),
                            extra=self.log_extra,
                        )
                        continue
                    sc_rsrc_present = False
                    # The subcloud resource will only have "in-sync" or
                    # "out-of-sync" if returned by dcagent. For platform
                    # resources, audit_dependants will always return 0.
                    if self.is_dcagent_managed_resource():
                        sc_rsrc_present = self.is_resource_present_in_subcloud(
                            resource_type, master_id, sc_resources
                        )
                    else:
                        for sc_r in sc_resources:
                            sc_id = self.get_resource_id(resource_type, sc_r)
                            if sc_id == db_sc_resource.subcloud_resource_id:
                                if self.same_resource(
                                    resource_type, m_r_updated, sc_r
                                ):
                                    LOG.debug(
                                        "Resource type {} {} is in-sync".format(
                                            resource_type, master_id
                                        ),
                                        extra=self.log_extra,
                                    )
                                    num_of_audit_jobs += self.audit_dependants(
                                        resource_type, m_r, sc_r
                                    )
                                    sc_rsrc_present = True
                                break
                    if not sc_rsrc_present:
                        LOG.info(
                            "Subcloud resource {} found in master cloud & DB, "
                            "but the exact same resource not found in "
                            "subcloud".format(db_sc_resource.subcloud_resource_id),
                            extra=self.log_extra,
                        )
                        # Subcloud resource is present in DB, but the check
                        # for same_resource() was negative. Either the resource
                        # disappeared from subcloud or the resource details
                        # are different from that of master cloud. Let the
                        # resource implementation decide on the audit action.
                        missing_resource = self.audit_discrepancy(
                            resource_type, m_r, sc_resources
                        )
                else:
                    LOG.info(
                        "Subcloud res {} not found in DB, will create".format(
                            master_id
                        ),
                        extra=self.log_extra,
                    )
                    # Check and see if there are any subcloud resources that
                    # match the master resource, and if so set up mappings.
                    # This returns true if it finds a match.
                    if self.map_subcloud_resource(
                        resource_type, m_r_updated, m_rsrc_db, sc_resources
                    ):
                        continue
                    missing_resource = True

            else:  # master_resource not in resource DB
                LOG.info(
                    "{} not found in DB, will create it".format(master_id),
                    extra=self.log_extra,
                )
                # Check and see if there are any subcloud resources that
                # match the master resource, and if so set up mappings.
                # This returns true if it finds a match.
                # This is for the case where the resource is not even in dcorch
                # resource DB (ie, resource has not been tracked by dcorch yet)
                if self.map_subcloud_resource(
                    resource_type, m_r, m_rsrc_db, sc_resources
                ):
                    continue
                missing_resource = True

            if missing_resource:
                # Resource is missing from subcloud, take action
                num_of_audit_jobs += self.audit_action(
                    resource_type, AUDIT_RESOURCE_MISSING, m_r
                )
                # As the subcloud resource is missing, invoke
                # the hook for dependants with no subcloud resource.
                # Resource implementation should handle this.
                num_of_audit_jobs += self.audit_dependants(resource_type, m_r, None)

        if num_of_audit_jobs != 0:
            LOG.info(
                "audit_find_missing {} num_of_audit_jobs".format(num_of_audit_jobs),
                extra=self.log_extra,
            )
        return num_of_audit_jobs

    def audit_find_extra(
        self, resource_type, m_resources, db_resources, sc_resources, abort_resources
    ):
        """Find extra resources in subcloud.

        - Input param db_resources is expected to be a list of resources that
          are present in dcorch DB, but not present in the master cloud.
""" num_of_audit_jobs = 0 # At this point, db_resources contains resources present in DB, # but not in master cloud for db_resource in db_resources: if db_resource.master_id: if db_resource.master_id in abort_resources: LOG.info( "audit_find_extra: Aborting audit for {}".format( db_resource.master_id ), extra=self.log_extra, ) num_of_audit_jobs += 1 # There are pending jobs for this resource, abort audit continue LOG.debug( "Extra resource ({}) in DB".format(db_resource.id), extra=self.log_extra, ) subcloud_rsrc = self.get_db_subcloud_resource(db_resource.id) if subcloud_rsrc: if not subcloud_rsrc.is_managed(): LOG.info( "Resource {} is not managed".format( subcloud_rsrc.subcloud_resource_id ), extra=self.log_extra, ) continue # check if the resource exists in subcloud, no need to # schedule work if it doesn't exist in subcloud. # This is a precautionary action in case the resource # has already be deleted in the subcloud which can happen # for example, user deletes the resource from master right # after an audit (not through api-proxy), then user deletes # that resource manually in the subcloud before the # next audit. if not self.resource_exists_in_subcloud( subcloud_rsrc, sc_resources ): continue LOG.info( "Resource ({}) and subcloud resource ({}) " "not in sync with master cloud".format( db_resource.master_id, subcloud_rsrc.subcloud_resource_id ), extra=self.log_extra, ) # There is extra resource in the subcloud, take action. # Note that the resource is in dcorch DB, but not # actually present in the master cloud. num_of_audit_jobs += self.audit_action( resource_type, AUDIT_RESOURCE_EXTRA, db_resource ) else: # Resource is present in resource table, but not in # subcloud_resource table. We have also established that # the corresponding OpenStack resource is not present in # the master cloud. # There might be another subcloud with "unmanaged" # subcloud resource corresponding to this resource. # So, just ignore this here! pass return num_of_audit_jobs def schedule_work( self, endpoint_type, resource_type, source_resource_id, operation_type, resource_info=None, ): LOG.info( "Scheduling {} work for {}/{}".format( operation_type, resource_type, source_resource_id ), extra=self.log_extra, ) try: subcloud = Subcloud.get_by_name(self.ctxt, self.subcloud_name) utils.enqueue_work( self.ctxt, endpoint_type, resource_type, source_resource_id, operation_type, resource_info, subcloud=subcloud, ) except Exception as e: LOG.info( "Exception in schedule_work: {}".format(str(e)), extra=self.log_extra ) def get_resource_id(self, resource_type, resource): if hasattr(resource, "master_id"): # If resource from DB, return master resource id # from master cloud return resource.master_id else: # Else, return id field (by default) return resource.id # Audit functions to be overridden in inherited classes def get_all_resources(self, resource_type): m_resources = None db_resources = None sc_resources = None # Get resources from dcdbsync if the endpoint is not platform or it is # but the subcloud doesn't support dcagent. In case it has dcagent, # the subcloud resources have already been retrieved for all platform # resources previously if self.endpoint_type != dccommon_consts.ENDPOINT_TYPE_PLATFORM or not ( self.has_dcagent ): sc_resources = self.get_subcloud_resources(resource_type) # If subcloud is not reachable, abort audit. 
            if sc_resources is None:
                return m_resources, db_resources, sc_resources
        db_resources = self.get_db_master_resources(resource_type)
        m_resources = self.get_cached_master_resources(resource_type)
        return m_resources, db_resources, sc_resources

    @lockutils.synchronized(AUDIT_LOCK_NAME)
    def get_cached_master_resources(self, resource_type):
        if resource_type in SyncThread.master_resources_dict:
            m_resources = SyncThread.master_resources_dict[resource_type]
        else:
            m_resources = self.get_master_resources(resource_type)
            if m_resources is not None:
                SyncThread.master_resources_dict[resource_type] = m_resources
        return m_resources

    def get_subcloud_resources(self, resource_type):
        return None

    def get_db_master_resources(self, resource_type):
        return list(resource.ResourceList.get_all(self.ctxt, resource_type))

    def get_master_resources(self, resource_type):
        return None

    def same_resource(self, resource_type, m_resource, sc_resource):
        return True

    def has_same_ids(self, resource_type, m_resource, sc_resource):
        return False

    def is_dcagent_managed_resource(self):
        return False

    def is_resource_present_in_subcloud(self, resource_type, master_id, sc_resources):
        return False

    def map_subcloud_resource(self, resource_type, m_r, m_rsrc_db, sc_resources):
        # Child classes can override this function to map an existing subcloud
        # resource to an existing master resource. If a mapping is created
        # the function should return True.
        #
        # It is expected that update_resource_refs() has been called on m_r.
        return False

    def update_resource_refs(self, resource_type, m_r):
        # Child classes can override this function to update any references
        # to other master resources embedded within the info of this resource.
        return m_r

    def audit_dependants(self, resource_type, m_resource, sc_resource):
        num_of_audit_jobs = 0
        if not self.is_subcloud_enabled() or self.should_exit():
            return num_of_audit_jobs
        if not sc_resource:
            # Handle None value for sc_resource
            pass
        return num_of_audit_jobs

    def audit_discrepancy(self, resource_type, m_resource, sc_resources):
        # Return true to try creating the resource again
        return True

    def audit_action(self, resource_type, finding, resource):
        LOG.info(
            "audit_action: {}/{}".format(finding, resource_type),
            extra=self.log_extra,
        )
        # Default actions are create & delete. Can be overridden
        # in resource implementation
        num_of_audit_jobs = 0
        # resource can be either from dcorch DB or fetched by OpenStack query
        resource_id = self.get_resource_id(resource_type, resource)
        if finding == AUDIT_RESOURCE_MISSING:
            # default action is create for a 'missing' resource
            self.schedule_work(
                self.endpoint_type,
                resource_type,
                resource_id,
                consts.OPERATION_TYPE_CREATE,
                self.get_resource_info(
                    resource_type, resource, consts.OPERATION_TYPE_CREATE
                ),
            )
            num_of_audit_jobs += 1
        elif finding == AUDIT_RESOURCE_EXTRA:
            # default action is delete for an 'extra_resource'
            # resource passed in is db_resource (resource in dcorch DB)
            self.schedule_work(
                self.endpoint_type,
                resource_type,
                resource_id,
                consts.OPERATION_TYPE_DELETE,
            )
            num_of_audit_jobs += 1
        return num_of_audit_jobs

    def get_resource_info(self, resource_type, resource, operation_type=None):
        return ""

    # Check if the subcloud resource (from dcorch subcloud_resource table)
    # exists in subcloud resources.
    def resource_exists_in_subcloud(self, subcloud_rsrc, sc_resources):
        return True