Adjust orchestrator thread sleep time based on parallel subclouds
This update adjusts the orchestrator thread sleep time based on the number of parallel subclouds, improving operational efficiency. Previously, the orchestrator had a fixed 10-second delay between checks before proceeding to the next step in operations for subclouds. This caused unnecessary waiting. By dynamically adjusting the sleep time, the orchestrator can now move forward more quickly, reducing overall execution time. Test plan PASS: Verify that the orchestrator thread sleep time does not exceed the maximum value of 10 seconds, regardless of the number of subclouds. PASS: Verify that the orchestrator thread sleep time is reset to the default value (10 seconds) after orchestration completes, is deleted, or fails. PASS: Verify that the orchestrator sleep time is dynamically adjusted during applying, aborting, or deleting strategies based on the number of parallel subclouds. Closes-Bug: 2084822 Change-Id: I821ecf1bce96995b362d2d7e1a6aa194ce2e32cc Signed-off-by: lzhu1 <li.zhu@windriver.com>
This commit is contained in:
parent
4429018879
commit
1b938c0448
@ -52,8 +52,6 @@ FORCE_ALL_TYPES = [
|
||||
consts.SW_UPDATE_TYPE_PRESTAGE,
|
||||
]
|
||||
|
||||
MAX_PARALLEL_SUBCLOUDS_LIMIT = 5000
|
||||
|
||||
|
||||
class SwUpdateStrategyController(object):
|
||||
|
||||
@ -190,7 +188,7 @@ class SwUpdateStrategyController(object):
|
||||
pecan.abort(400, _("max-parallel-subclouds invalid"))
|
||||
if (
|
||||
max_parallel_subclouds < 1
|
||||
or max_parallel_subclouds > MAX_PARALLEL_SUBCLOUDS_LIMIT
|
||||
or max_parallel_subclouds > consts.MAX_PARALLEL_SUBCLOUDS_LIMIT
|
||||
):
|
||||
pecan.abort(400, _("max-parallel-subclouds invalid"))
|
||||
|
||||
|
@ -470,3 +470,6 @@ ASSOCIATION_SYNC_STATUS_UNKNOWN = "unknown"
|
||||
HEARTBEAT_FAILURE_POLICY_ALARM = "alarm"
|
||||
|
||||
SOFTWARE_VERSION_24_09 = "24.09"
|
||||
|
||||
# The maximum number of parallel subclouds in an orchestration process
|
||||
MAX_PARALLEL_SUBCLOUDS_LIMIT = 5000
|
||||
|
@ -37,6 +37,7 @@ from dcmanager.common import utils
|
||||
from dcmanager.db import api as db_api
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
DEFAULT_SLEEP_TIME_IN_SECONDS = 10
|
||||
|
||||
|
||||
class OrchThread(threading.Thread):
|
||||
@ -87,6 +88,8 @@ class OrchThread(threading.Thread):
|
||||
self.subcloud_workers = dict()
|
||||
# Track if the strategy setup function was executed
|
||||
self._setup = False
|
||||
# Initialize main orch thread sleep time
|
||||
self.sleep_time = DEFAULT_SLEEP_TIME_IN_SECONDS
|
||||
|
||||
@abc.abstractmethod
|
||||
def trigger_audit(self):
|
||||
@ -110,6 +113,7 @@ class OrchThread(threading.Thread):
|
||||
if self._setup:
|
||||
LOG.info("(%s) OrchThread Post-Delete Teardown" % self.update_type)
|
||||
self._setup = False
|
||||
self.sleep_time = DEFAULT_SLEEP_TIME_IN_SECONDS
|
||||
self.post_delete_teardown()
|
||||
|
||||
def post_delete_teardown(self):
|
||||
@ -254,6 +258,26 @@ class OrchThread(threading.Thread):
|
||||
LOG.debug("Remove %s from subcloud_workers dict" % region)
|
||||
del self.subcloud_workers[region]
|
||||
|
||||
def _adjust_sleep_time(self, number_of_subclouds):
|
||||
prev_sleep_time = self.sleep_time
|
||||
|
||||
if number_of_subclouds <= 0:
|
||||
new_sleep_time = DEFAULT_SLEEP_TIME_IN_SECONDS
|
||||
else:
|
||||
new_sleep_time = min(
|
||||
(DEFAULT_SLEEP_TIME_IN_SECONDS * 60)
|
||||
/ min(number_of_subclouds, consts.MAX_PARALLEL_SUBCLOUDS_LIMIT),
|
||||
DEFAULT_SLEEP_TIME_IN_SECONDS,
|
||||
)
|
||||
|
||||
if new_sleep_time != prev_sleep_time:
|
||||
self.sleep_time = new_sleep_time
|
||||
LOG.debug(
|
||||
f"Adjusted {self.update_type} orch thread sleep time from "
|
||||
f"{prev_sleep_time} to {self.sleep_time} "
|
||||
f"based on {number_of_subclouds} parallel subclouds."
|
||||
)
|
||||
|
||||
def run_orch(self):
|
||||
while not self.stopped():
|
||||
try:
|
||||
@ -287,8 +311,8 @@ class OrchThread(threading.Thread):
|
||||
# We catch all exceptions to avoid terminating the thread.
|
||||
LOG.exception("(%s) OrchThread unexpected exception" % self.update_type)
|
||||
|
||||
# Wake up every 10 seconds to see if there is work to do.
|
||||
time.sleep(10)
|
||||
# Wake up every so often to see if there is work to do.
|
||||
time.sleep(self.sleep_time)
|
||||
|
||||
LOG.info("(%s) OrchThread ended main loop" % self.update_type)
|
||||
|
||||
@ -297,6 +321,9 @@ class OrchThread(threading.Thread):
|
||||
|
||||
LOG.debug("(%s) Applying update strategy" % self.update_type)
|
||||
strategy_steps = db_api.strategy_step_get_all(self.context)
|
||||
# Adjust sleep time based on the number of subclouds being processed
|
||||
# in parallel
|
||||
self._adjust_sleep_time(len(strategy_steps))
|
||||
|
||||
stop = False
|
||||
failure_detected = False
|
||||
@ -335,6 +362,7 @@ class OrchThread(threading.Thread):
|
||||
state=consts.SW_UPDATE_STATE_FAILED,
|
||||
update_type=self.update_type,
|
||||
)
|
||||
self.sleep_time = DEFAULT_SLEEP_TIME_IN_SECONDS
|
||||
# Trigger audit to update the sync status for each subcloud.
|
||||
self.trigger_audit()
|
||||
return
|
||||
@ -372,6 +400,7 @@ class OrchThread(threading.Thread):
|
||||
update_type=self.update_type,
|
||||
)
|
||||
self.subcloud_workers.clear()
|
||||
self.sleep_time = DEFAULT_SLEEP_TIME_IN_SECONDS
|
||||
|
||||
# Trigger audit to update the sync status for each subcloud.
|
||||
LOG.info(f"Trigger audit for {self.update_type}")
|
||||
@ -393,6 +422,7 @@ class OrchThread(threading.Thread):
|
||||
state=consts.SW_UPDATE_STATE_FAILED,
|
||||
update_type=self.update_type,
|
||||
)
|
||||
self.sleep_time = DEFAULT_SLEEP_TIME_IN_SECONDS
|
||||
# Trigger audit to update the sync status for each subcloud.
|
||||
self.trigger_audit()
|
||||
return
|
||||
@ -402,6 +432,7 @@ class OrchThread(threading.Thread):
|
||||
if self.stopped():
|
||||
LOG.info("(%s) Exiting because task is stopped" % self.update_type)
|
||||
self.subcloud_workers.clear()
|
||||
self.sleep_time = DEFAULT_SLEEP_TIME_IN_SECONDS
|
||||
return
|
||||
if strategy_step.state == consts.STRATEGY_STATE_FAILED:
|
||||
LOG.debug("(%s) Intermediate step is failed" % self.update_type)
|
||||
@ -483,8 +514,10 @@ class OrchThread(threading.Thread):
|
||||
"""Delete an update strategy"""
|
||||
|
||||
LOG.info("(%s) Deleting update strategy" % self.update_type)
|
||||
|
||||
strategy_steps = db_api.strategy_step_get_all(self.context)
|
||||
# Adjust sleep time based on the number of subclouds being processed
|
||||
# in parallel
|
||||
self._adjust_sleep_time(len(strategy_steps))
|
||||
|
||||
for strategy_step in strategy_steps:
|
||||
region = self.get_region_name(strategy_step)
|
||||
@ -519,6 +552,9 @@ class OrchThread(threading.Thread):
|
||||
except Exception as e:
|
||||
LOG.exception("(%s) exception during delete" % self.update_type)
|
||||
raise e
|
||||
finally:
|
||||
self.sleep_time = DEFAULT_SLEEP_TIME_IN_SECONDS
|
||||
|
||||
LOG.info("(%s) Finished deleting update strategy" % self.update_type)
|
||||
|
||||
def delete_subcloud_strategy(self, strategy_step):
|
||||
|
Loading…
x
Reference in New Issue
Block a user