From c607a82a992ebe63b4ae005746f2b9e8104bb3a0 Mon Sep 17 00:00:00 2001 From: Fernando Ferraz Date: Thu, 1 Oct 2020 11:20:51 -0300 Subject: [PATCH] NetApp SolidFire: Fix clone and request timeout issues Users are experiencing timeout issues in certain environments, mostly when volumes are too big (i.e., multi-terabyte volumes), due to poor network performance or upgrade issues that revolve around the SolidFire cluster. A viable solution is to make driver timeout values configurable in cinder.conf, so users can set these timeouts according to their needs. This patch adds two timeout settings to the SolidFire driver (for cloning operation and globally to all API requests), to allow users to set the appropriate timeouts for their environment. Closes-Bug: #1898587 Change-Id: Ie330c76a5db0ea76d4fed5a6ae7b8736dadc8591 --- .../drivers/solidfire/test_solidfire.py | 13 +++- cinder/volume/drivers/solidfire.py | 61 ++++++++++++++----- ...quest-timeout-issues-56f7a7659c7ec775.yaml | 7 +++ 3 files changed, 62 insertions(+), 19 deletions(-) create mode 100644 releasenotes/notes/sf-fix-clone-and-request-timeout-issues-56f7a7659c7ec775.yaml diff --git a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py index a64325447ee..63cff69a3b4 100644 --- a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py +++ b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py @@ -221,7 +221,7 @@ class SolidFireVolumeTestCase(test.TestCase): 'volumeID': 6}] def fake_issue_api_request(self, method, params, version='1.0', - endpoint=None): + endpoint=None, timeout=None): if method == 'GetClusterCapacity': data = {} if version == '1.0': @@ -638,6 +638,12 @@ class SolidFireVolumeTestCase(test.TestCase): 'volume_type_id': None, 'created_at': timeutils.utcnow()} + fake_model_info = { + 'provider_id': '%s %s cluster-id-01' % ( + self.fake_sfvol['volumeID'], + self.fake_sfaccount['accountID']) + } + ctx = 
context.get_admin_context() testvol = fake_volume.fake_volume_obj(ctx, **updates_vol_a) testvol_b = fake_volume.fake_volume_obj(ctx, **updates_vol_b) @@ -657,7 +663,7 @@ class SolidFireVolumeTestCase(test.TestCase): return_value=[]), \ mock.patch.object(sfv, '_get_model_info', - return_value={}): + return_value=fake_model_info): sfv.create_cloned_volume(testvol_b, testvol) def test_initialize_connector_with_blocksizes(self): @@ -3041,6 +3047,7 @@ class SolidFireVolumeTestCase(test.TestCase): 'mvip': self.mvip, 'svip': self.svip} + self.configuration.sf_volume_clone_timeout = 1 sfv = solidfire.SolidFireDriver(configuration=self.configuration) sfv.replication_enabled = False @@ -3085,7 +3092,7 @@ class SolidFireVolumeTestCase(test.TestCase): mock_issue_api_request.assert_has_calls(calls) mock_test_set_cluster_pairs.assert_not_called() mock_update_attributes.assert_not_called() - mock_get_model_info.assert_called_once() + mock_get_model_info.assert_called() mock_snapshot_discovery.assert_not_called() reset_mocks() diff --git a/cinder/volume/drivers/solidfire.py b/cinder/volume/drivers/solidfire.py index be0dca7ced9..7bb6defae46 100644 --- a/cinder/volume/drivers/solidfire.py +++ b/cinder/volume/drivers/solidfire.py @@ -100,7 +100,18 @@ sf_opts = [ default=3600, min=30, help='Sets time in seconds to wait for a migrating volume to ' - 'complete pairing and sync.')] + 'complete pairing and sync.'), + cfg.IntOpt('sf_api_request_timeout', + default=30, + min=30, + help='Sets time in seconds to wait for an api request to ' + 'complete.'), + cfg.IntOpt('sf_volume_clone_timeout', + default=600, + min=60, + help='Sets time in seconds to wait for a clone of a volume or ' + 'snapshot to complete.' 
+ )] CONF = cfg.CONF CONF.register_opts(sf_opts, group=configuration.SHARED_CONF_GROUP) @@ -656,11 +667,14 @@ class SolidFireDriver(san.SanISCSIDriver): return endpoint @retry(retry_exc_tuple, tries=6) - def _issue_api_request(self, method, params, version='1.0', endpoint=None): + def _issue_api_request(self, method, params, version='1.0', + endpoint=None, timeout=None): if params is None: params = {} if endpoint is None: endpoint = self.active_cluster['endpoint'] + if not timeout: + timeout = self.configuration.sf_api_request_timeout payload = {'method': method, 'params': params} url = '%s/json-rpc/%s/' % (endpoint['url'], version) @@ -672,7 +686,7 @@ class SolidFireDriver(san.SanISCSIDriver): data=json.dumps(payload), auth=(endpoint['login'], endpoint['passwd']), verify=self.verify_ssl, - timeout=30) + timeout=timeout) response = req.json() req.close() if (('error' in response) and @@ -859,15 +873,13 @@ class SolidFireDriver(san.SanISCSIDriver): def _get_model_info(self, sfaccount, sf_volume_id, endpoint=None): volume = None - iteration_count = 0 - while not volume and iteration_count < 600: - volume_list = self._get_volumes_by_sfaccount( - sfaccount['accountID'], endpoint=endpoint) - for v in volume_list: - if v['volumeID'] == sf_volume_id: - volume = v - break - iteration_count += 1 + volume_list = self._get_volumes_by_sfaccount( + sfaccount['accountID'], endpoint=endpoint) + + for v in volume_list: + if v['volumeID'] == sf_volume_id: + volume = v + break if not volume: LOG.error('Failed to retrieve volume SolidFire-' @@ -937,10 +949,27 @@ class SolidFireDriver(san.SanISCSIDriver): params['volumeID'] = sf_cloned_id data = self._issue_api_request('ModifyVolume', params) - model_update = self._get_model_info(sf_account, sf_cloned_id) - if model_update is None: - mesg = _('Failed to get model update from clone') - raise SolidFireAPIException(mesg) + def _wait_volume_is_active(): + try: + model_info = self._get_model_info(sf_account, sf_cloned_id) + if model_info: 
+ raise loopingcall.LoopingCallDone(model_info) + except exception.VolumeNotFound: + LOG.debug('Waiting for cloned volume [%s] - [%s] to become ' + 'active', sf_cloned_id, vref.id) + pass + + try: + timer = loopingcall.FixedIntervalWithTimeoutLoopingCall( + _wait_volume_is_active) + model_update = timer.start( + interval=1, + timeout=self.configuration.sf_volume_clone_timeout).wait() + except loopingcall.LoopingCallTimeOut: + msg = _('Failed to get model update from clone [%s] - [%s]' % + (sf_cloned_id, vref.id)) + LOG.error(msg) + raise SolidFireAPIException(msg) rep_settings = self._retrieve_replication_settings(vref) if self.replication_enabled and rep_settings: diff --git a/releasenotes/notes/sf-fix-clone-and-request-timeout-issues-56f7a7659c7ec775.yaml b/releasenotes/notes/sf-fix-clone-and-request-timeout-issues-56f7a7659c7ec775.yaml new file mode 100644 index 00000000000..d6e63a3d10c --- /dev/null +++ b/releasenotes/notes/sf-fix-clone-and-request-timeout-issues-56f7a7659c7ec775.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + `Bug #1898587 <https://bugs.launchpad.net/cinder/+bug/1898587>`_: + Address cloning and API request timeout issues users may hit in + certain environments, by allowing configuring timeout values for + these operations through the cinder configuration file.