Update dcmanager's state lock for audit update

This commit updates the lock used for the bulk update of subclouds in the subcloud state manager to use an external fair lock based on the region name. It also increases the interval of the periodic audit loop from 30s to 40s so that more subclouds are accumulated for processing.

Test plan:
1. Unmanage and manage a subcloud in a large-scale system and verify that it takes around 1m to 1m30s to identify the update and synchronise it.
2. Restart audit and verify that the subsequent audits validate around 5 times more subclouds than it initially did.
3. Activate the debug log for oslo_concurrency's lockutils and verify that the wait to acquire the lock stays around 0.001s.

Closes-bug: 2086126
Change-Id: I31b9479a5bceb151c704bd5666170e305ef5b878
Signed-off-by: Raphael Lima <Raphael.Lima@windriver.com>
2024-10-25 18:03:58 -03:00 · 2024-10-25 18:03:58 -03:00 · e5997df6cb
commit e5997df6cb
parent 62cb871068
2 changed files with 21 additions and 13 deletions
--- a/distributedcloud/dcmanager/audit/subcloud_audit_manager.py
+++ b/distributedcloud/dcmanager/audit/subcloud_audit_manager.py
@ -47,6 +47,10 @@ SUBCLOUD_STATE_UPDATE_ITERATIONS = (
    dccommon_consts.SECONDS_IN_HOUR // CONF.scheduler.subcloud_audit_interval
 )

+# Interval, in seconds, between executions of the periodic audit loop.
+# It needs to be greater than the subcloud_audit_interval.
+AUDIT_LOOP_INTERVAL = CONF.scheduler.subcloud_audit_interval + 10
+
 # Name of starlingx openstack helm application
 HELM_APP_OPENSTACK = "openstack"

@ -258,7 +262,7 @@ class SubcloudAuditManager(manager.Manager):
        # does not die.
        while True:
            try:
-                eventlet.greenthread.sleep(CONF.scheduler.subcloud_audit_interval)
+                eventlet.greenthread.sleep(AUDIT_LOOP_INTERVAL)
                self._periodic_subcloud_audit_loop()
            except eventlet.greenlet.GreenletExit:
                # We have been told to exit
--- a/distributedcloud/dcmanager/state/subcloud_state_manager.py
+++ b/distributedcloud/dcmanager/state/subcloud_state_manager.py
@ -21,7 +21,6 @@ import copy

 from fm_api import constants as fm_const
 from fm_api import fm_api
-from oslo_concurrency import lockutils
 from oslo_log import log as logging

 from dccommon import consts as dccommon_consts
@ -38,7 +37,6 @@ from dcorch.rpc import client as dcorch_rpc_client

 LOG = logging.getLogger(__name__)
 ALARM_OUT_OF_SYNC = fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC
-LOCK_NAME = "dc-audit-bulk-update"


 def sync_update_subcloud_endpoint_status(func):
@ -438,14 +436,16 @@ class SubcloudStateManager(manager.Manager):
        if endpoint_data:
            self._bulk_update_subcloud_endpoint_status(context, subcloud, endpoint_data)

-    @lockutils.synchronized(LOCK_NAME)
+    @sync_update_subcloud_endpoint_status
    def _do_bulk_update_subcloud_endpoint_status(
-        self, context, subcloud, endpoint_data
+        self, context, region_name, subcloud_id, subcloud_name, endpoint_data
    ):
        """Updates an online and managed subcloud's endpoints sync status

        :param context: request context object
-        :param subcloud: subcloud to update
+        :param region_name: region name of subcloud to update
+        :param subcloud_id: id of the subcloud to update
+        :param subcloud_name: name of the subcloud to update
        :param endpoint_data: a dict containing the endpoint as key and its sync
        status as value
        """
@ -456,14 +456,14 @@ class SubcloudStateManager(manager.Manager):
        # happen at once.
        status_to_set = [f"{key} ({value})" for key, value in endpoint_data.items()]
        LOG.info(
-            f"Updating endpoints on subcloud: {subcloud.name} "
+            f"Updating endpoints on subcloud: {subcloud_name} "
            f"endpoints: {', '.join(status_to_set)}"
        )

        # For each endpoint in endpoint_data, decide whether an alarm should be set
        # or not and create it in case it's necessary.
        faults_to_set = dict()
-        entity_instance_id = f"subcloud={subcloud.name}"
+        entity_instance_id = f"subcloud={subcloud_name}"

        # Acquire all existing alarms with the specified alarm_id for a subcloud.
        faults = self.fm_api.get_faults_by_id_n_eid(
@ -489,7 +489,7 @@ class SubcloudStateManager(manager.Manager):
            has_fault = True if endpoint in endpoints_with_faults else False

            if sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC and not has_fault:
-                faults_to_set[endpoint] = self._create_fault(subcloud.name, endpoint)
+                faults_to_set[endpoint] = self._create_fault(subcloud_name, endpoint)
            elif sync_status != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC and has_fault:
                del faults_to_set[endpoint]

@ -502,19 +502,19 @@ class SubcloudStateManager(manager.Manager):
                self.fm_api.set_faults(faults_to_set.values())
            except Exception as e:
                LOG.exception(
-                    f"An error occurred when updating subcloud {subcloud.name} "
+                    f"An error occurred when updating subcloud {subcloud_name} "
                    f"alarms: {e}"
                )

        try:
            db_api.subcloud_status_bulk_update_endpoints(
                context,
-                subcloud.id,
+                subcloud_id,
                endpoint_data,
            )
        except Exception as e:
            LOG.exception(
-                f"An error occured when updating the subcloud {subcloud.name}'s"
+                f"An error occured when updating the subcloud {subcloud_name}'s"
                f"endpoint status: {e}"
            )

@ -539,7 +539,11 @@ class SubcloudStateManager(manager.Manager):
        if endpoints_to_update:
            try:
                self._do_bulk_update_subcloud_endpoint_status(
-                    context, subcloud, endpoints_to_update
+                    context,
+                    subcloud.region_name,
+                    subcloud.id,
+                    subcloud.name,
+                    endpoints_to_update,
                )
            except Exception as e:
                LOG.exception(e)