
This change adds the capability to rename a subcloud after bootstrap or
during a subcloud rehome operation.

A field was added to the database to separate the region name from the
subcloud name. The region name identifies the subcloud in the OpenStack
core and is the reference through which the endpoints of a given
subcloud are accessed. Since the region name cannot be changed, this
commit maintains a unique region name in UUID format, which allows a
subcloud to be renamed when necessary without any endpoint impact (see
the illustrative sketch after this commit message). The region name is
randomly generated when the subcloud is created and only applies to
future subclouds. For systems with existing subclouds, the region keeps
its day-0 value, that is, the region keeps the same name as the
subcloud, but the subcloud can still be renamed.

This topic involves changes to dcmanager, dcmanager-client and the GUI.

To preserve the region name reference needed by cert-monitor, a
mechanism was created to determine whether a request comes from
cert-monitor.

Usage for subcloud rename:
dcmanager subcloud update <subcloud-name> --name <new-name>

Usage for subcloud rehoming:
dcmanager subcloud add --name <subcloud-name> --migrate ...

Note: The StarlingX 8 -> 9 upgrade test for this commit is deferred
until upgrade functionality in master is restored. Any issue found
during the upgrade test will be addressed in a separate commit.

Test Plan:
PASS: Run dcmanager subcloud passing subcommands:
      - add/delete/migrate/list/show/show --detail
      - errors/manage/unmanage/reinstall/reconfig
      - update/deploy
PASS: Run dcmanager subcloud add supplying the --name parameter and
      validate that the operation is not allowed
PASS: Run dcmanager supplying subcommands:
      - kube/patch/prestage strategies
PASS: Run dcmanager to apply a patch and remove it
PASS: Run dcmanager subcloud-backup:
      - create/delete/restore/show/upload
PASS: Run subcloud-group:
      - add/delete/list/list-subclouds/show/update
PASS: Run dcmanager subcloud strategy for:
      - patch/kubernetes/firmware
PASS: Run the dcmanager subcloud update command passing the --name
      parameter with the following values:
      - the current subcloud name (not changed)
      - a different, existing subcloud name
PASS: Run dcmanager to migrate a subcloud, passing the --name parameter
      with a new subcloud name
PASS: Run dcmanager to migrate a subcloud without the --name parameter
PASS: Run dcmanager to migrate a subcloud, passing the --name parameter
      with a new subcloud name and a different subcloud name in the
      bootstrap file
PASS: Test the dcmanager API response using the cURL command line to
      validate the new region name field
PASS: Run full DC sanity and regression

Story: 2010788
Task: 48217

Signed-off-by: Cristian Mondo <cristian.mondo@windriver.com>
Change-Id: Id04f42504b8e325d9ec3880c240fe4a06e3a20b7
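For illustration, here is a minimal sketch of the naming scheme described
above. The helper and class names are hypothetical, not the actual
dcmanager code; the only assumption taken from the commit is that new
subclouds receive a UUID-format region name decoupled from the renamable
subcloud name.

import uuid


def generate_region_name():
    # Hypothetical helper: a 32-character hex UUID is unique in practice
    # and never changes, even if the subcloud is later renamed.
    return uuid.uuid4().hex


class SubcloudRecord(object):
    # Hypothetical record: 'name' is the user-facing, renamable label;
    # 'region_name' is the fixed key through which endpoints are accessed.
    def __init__(self, name):
        self.name = name
        self.region_name = generate_region_name()

    def rename(self, new_name):
        # Endpoints keyed on region_name are untouched by a rename.
        self.name = new_name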
#
# Copyright (c) 2020-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import os
import time

from dccommon import consts as dccommon_consts
from dccommon.exceptions import PlaybookExecutionFailed
from dccommon.utils import run_playbook
from dcmanager.common import consts
from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState


ANSIBLE_UPGRADE_PLAYBOOK = \
    '/usr/share/ansible/stx-ansible/playbooks/upgrade_platform.yml'

# When an unlock occurs, a reboot is triggered. During reboot, API calls fail.
# The max time allowed here is 30 minutes (ie: 30 queries with 1 minute sleep)
DEFAULT_MAX_FAILED_QUERIES = 30
DEFAULT_FAILED_SLEEP = 60

# After reboot, the unlock needs to do post-reboot activities, during which
# time the API will succeed, but the expected states will not yet be set.
# The max time allowed here is 30 minutes (ie: 30 queries with 1 minute sleep)
DEFAULT_MAX_API_QUERIES = 30
DEFAULT_API_SLEEP = 60

# Sleep for 3 minutes after ansible completes
DEFAULT_ANSIBLE_SLEEP = 180


def migrate_subcloud_data(migrate_command, log_file):
    """Run the data migration playbook, raising on playbook failure."""
    try:
        run_playbook(log_file, migrate_command)
    except PlaybookExecutionFailed:
        msg_orch = ("Failed to migrate data, check individual "
                    "log at %s or run %s for details"
                    % (log_file, consts.ERROR_DESC_CMD))
        raise Exception(msg_orch)


class MigratingDataState(BaseState):
    """Upgrade step for migrating data"""

    def __init__(self, region_name):
        super(MigratingDataState, self).__init__(
            next_state=consts.STRATEGY_STATE_UNLOCKING_CONTROLLER_0,
            region_name=region_name)

        self.ansible_sleep = DEFAULT_ANSIBLE_SLEEP
        self.max_api_queries = DEFAULT_MAX_API_QUERIES
        self.api_sleep_duration = DEFAULT_API_SLEEP
        self.max_failed_queries = DEFAULT_MAX_FAILED_QUERIES
        self.failed_sleep_duration = DEFAULT_FAILED_SLEEP

    def wait_for_unlock(self, strategy_step):
        """Return successfully when the unlock completes.

        An exception is raised if the host does not recover in time.
        """

        # This code is 'borrowed' from the unlock_host state
        # Allow separate durations for failures (ie: reboot) and api retries
        api_counter = 0
        fail_counter = 0
        # todo(abailey): only supports AIO-SX here
        target_hostname = 'controller-0'
        while True:
            # If the event handler stop has been triggered, fail the state
            if self.stopped():
                raise StrategyStoppedException()
            try:
                # Query the administrative state to see if it is the new state
                host = self.get_sysinv_client(
                    strategy_step.subcloud.region_name).get_host(target_hostname)
                if (host.administrative == consts.ADMIN_UNLOCKED and
                        host.operational == consts.OPERATIONAL_ENABLED):
                    # Success. Break out of the loop.
                    msg = "Host: %s is now: %s %s" % (target_hostname,
                                                      host.administrative,
                                                      host.operational)
                    self.info_log(strategy_step, msg)
                    break
                # No exception was raised, so reset the fail counter
                fail_counter = 0
            except Exception:
                # Handle exceptions caused by the host being unreachable for
                # a significant period of time, such as during a controller
                # swact or, in the case of AIO-SX, when the controller
                # reboots.
                fail_counter += 1
                if fail_counter >= self.max_failed_queries:
                    db_api.subcloud_update(
                        self.context, strategy_step.subcloud_id,
                        deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED)
                    raise Exception("Timeout waiting on reboot to complete")
                time.sleep(self.failed_sleep_duration)
                # Skip the api_counter
                continue
            # If the max counter is exceeded, raise a timeout exception
            api_counter += 1
            if api_counter >= self.max_api_queries:
                db_api.subcloud_update(
                    self.context, strategy_step.subcloud_id,
                    deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED)
                raise Exception("Timeout waiting for unlock to complete")
            time.sleep(self.api_sleep_duration)

    def perform_state_action(self, strategy_step):
        """Migrate data for an upgrade on a subcloud.

        Returns the next state in the state machine on success.
        Any exceptions raised by this method set the strategy to FAILED.
        """

        # To account for abrupt termination of dcmanager, check the last
        # known subcloud deploy status. If it is migrated/complete, advance
        # to the next stage. If it is 'migrating', fail the strategy. The
        # user will need to delete the existing strategy, create a new one
        # and apply it. Pre-check will set the appropriate next step for
        # this subcloud.
        subcloud = db_api.subcloud_get(self.context, strategy_step.subcloud.id)
        if (subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED or
                subcloud.deploy_status == consts.DEPLOY_STATE_DONE):
            return self.next_state
        elif subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATING_DATA:
            db_api.subcloud_update(
                self.context, strategy_step.subcloud_id,
                deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED)
            raise Exception("Previous data migration was abruptly terminated. "
                            "Please try again with a new upgrade strategy.")

        # If it gets here, the subcloud deploy status must be 'installed'.
        self.info_log(strategy_step, "Start migrating data...")
        db_api.subcloud_update(
            self.context, strategy_step.subcloud_id,
            deploy_status=consts.DEPLOY_STATE_MIGRATING_DATA)

        ansible_subcloud_inventory_file = os.path.join(
            dccommon_consts.ANSIBLE_OVERRIDES_PATH,
            strategy_step.subcloud.name + consts.INVENTORY_FILE_POSTFIX)
        log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud.name) + \
            '_playbook_output.log'
        # Send skip_patching=true to prevent the playbook from applying any
        # patches present in the upgrade_data. All the required patches will
        # be included in the generated install iso.
        data_migrating_cmd = [
            "ansible-playbook", ANSIBLE_UPGRADE_PLAYBOOK,
            "-i", ansible_subcloud_inventory_file, "-e",
            "ansible_ssh_pass=%s ansible_become_pass=%s skip_patching=true"
            % (consts.TEMP_SYSADMIN_PASSWORD, consts.TEMP_SYSADMIN_PASSWORD)]

        try:
            migrate_subcloud_data(data_migrating_cmd, log_file)
        except Exception as e:
            # Two error messages: one for the subcloud error description and
            # logs, and a shorter one for the orchestrator strategy_step
            # detail.
            msg_subcloud = utils.find_ansible_error_msg(
                strategy_step.subcloud.name, log_file,
                consts.DEPLOY_STATE_MIGRATING_DATA)
            # Get the script output in case it is available
            error_msg = utils.get_failure_msg(strategy_step.subcloud.region_name)
            failure = ('%s \n%s' % (error_msg, msg_subcloud))
            db_api.subcloud_update(
                self.context, strategy_step.subcloud_id,
                deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
                error_description=failure[0:consts.ERROR_DESCRIPTION_LENGTH])
            self.error_log(strategy_step, msg_subcloud)
            self.error_log(strategy_step, str(e))
            raise

        # Ansible invokes an unlock. Need to wait for the unlock to complete.
        # Wait 3 minutes for mtc/scripts to shut down services.
        # todo(abailey): split this into smaller sleeps to allow stopping early
        time.sleep(self.ansible_sleep)
        # Wait up to 60 minutes for the reboot and unlock to complete
        self.wait_for_unlock(strategy_step)

        db_api.subcloud_update(
            self.context, strategy_step.subcloud_id,
            deploy_status=consts.DEPLOY_STATE_MIGRATED)

        self.info_log(strategy_step, "Data migration completed.")
        return self.next_state
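
For orientation, here is a rough sketch of how an orchestrator worker
might drive this state. The driver lines are hypothetical; only
MigratingDataState and its methods come from the file above, and
strategy_step is assumed to carry the subcloud/subcloud_id attributes
the state uses.

# Hypothetical driver, for illustration only.
state = MigratingDataState(region_name=strategy_step.subcloud.region_name)
# perform_state_action runs the migration playbook, waits for the unlock,
# and returns consts.STRATEGY_STATE_UNLOCKING_CONTROLLER_0 on success;
# any exception it raises marks this strategy step FAILED.
next_state = state.perform_state_action(strategy_step)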