From 6458c3dba53b9a9fb903bdb6e5e08af14ad015d6 Mon Sep 17 00:00:00 2001
From: Sasha Andonov
Date: Tue, 4 Feb 2020 16:59:14 +0100
Subject: [PATCH] rbd_utils: increase _destroy_volume timeout

If the RBD backend is used for Nova ephemeral storage, Nova tries to
remove the ephemeral storage volume from Ceph in a retry loop: 10
attempts at 1 second intervals, 10 seconds overall - which, due to the
30 second Ceph watcher timeout, might result in intermittent volume
removal failures on the Ceph side.

This patch adds the config options rbd_destroy_volume_retries,
defaulting to 12, and rbd_destroy_volume_retry_interval, defaulting to
5, which multiplied together give Ceph a reasonable amount of time
(60 seconds by default) to complete the operation successfully.

Closes-Bug: #1856845
Change-Id: Icfd55617f0126f79d9610f8a2fc6b4c817d1a2bd
---
 nova/conf/libvirt.py                          | 21 ++++++++++++++++
 .../unit/virt/libvirt/storage/test_rbd.py     | 25 +++++++++++++++++--
 nova/virt/libvirt/storage/rbd_utils.py        |  8 +++---
 ...rbd-increase-timeout-c4e5a34cf5da7fdc.yaml | 19 ++++++++++++++
 4 files changed, 68 insertions(+), 5 deletions(-)
 create mode 100644 releasenotes/notes/rbd-increase-timeout-c4e5a34cf5da7fdc.yaml

diff --git a/nova/conf/libvirt.py b/nova/conf/libvirt.py
index 3da878553f05..2568d65ba371 100644
--- a/nova/conf/libvirt.py
+++ b/nova/conf/libvirt.py
@@ -1095,6 +1095,27 @@ The libvirt UUID of the secret for the rbd_user volumes.
                default=5,
                help="""
 The RADOS client timeout in seconds when initially connecting to the cluster.
+"""),
+    cfg.IntOpt('rbd_destroy_volume_retry_interval',
+               default=5,
+               min=0,
+               help="""
+Number of seconds to wait between each consecutive retry to destroy an
+RBD volume.
+
+Related options:
+
+* [libvirt]/images_type = 'rbd'
+"""),
+    cfg.IntOpt('rbd_destroy_volume_retries',
+               default=12,
+               min=0,
+               help="""
+Number of retries to destroy an RBD volume.
+
+Related options:
+
+* [libvirt]/images_type = 'rbd'
 """),
 ]
 
diff --git a/nova/tests/unit/virt/libvirt/storage/test_rbd.py b/nova/tests/unit/virt/libvirt/storage/test_rbd.py
index 2b22d5286213..788df0de8d56 100644
--- a/nova/tests/unit/virt/libvirt/storage/test_rbd.py
+++ b/nova/tests/unit/virt/libvirt/storage/test_rbd.py
@@ -435,8 +435,8 @@ class RbdTestCase(test.NoDBTestCase):
         self.driver.cleanup_volumes(filter_fn)
         rbd.remove.assert_any_call(client.__enter__.return_value.ioctx,
                                    '%s_test' % uuids.instance)
-        # NOTE(danms): 10 retries + 1 final attempt to propagate = 11
-        self.assertEqual(11, len(rbd.remove.call_args_list))
+        # NOTE(sandonov): 12 retries + 1 final attempt to propagate = 13
+        self.assertEqual(13, len(rbd.remove.call_args_list))
 
     def test_cleanup_volumes_fail_not_found(self):
         self._test_cleanup_exception('ImageBusy')
@@ -508,6 +508,27 @@ class RbdTestCase(test.NoDBTestCase):
         client.__enter__.assert_called_once_with()
         client.__exit__.assert_called_once_with(None, None, None)
 
+    @mock.patch.object(rbd_utils, 'RADOSClient')
+    @mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall')
+    def test_destroy_volume_with_retries(self, mock_loopingcall, mock_client):
+        vol = '12345_test'
+        client = mock_client.return_value
+        loopingcall = mock_loopingcall.return_value
+
+        # Try for sixty seconds: six retries at 10 second intervals
+        self.flags(rbd_destroy_volume_retries=6, group='libvirt')
+        self.flags(rbd_destroy_volume_retry_interval=10, group='libvirt')
+        self.driver.destroy_volume(vol)
+
+        # Make sure both params have the expected values
+        retryctx = mock_loopingcall.call_args.args[3]
+        self.assertEqual(retryctx, {'retries': 6})
+        loopingcall.start.assert_called_with(interval=10)
+
+        # Make sure that we entered and exited the RADOSClient
+        client.__enter__.assert_called_once_with()
+        client.__exit__.assert_called_once_with(None, None, None)
+
     @mock.patch.object(rbd_utils, 'RADOSClient')
     def test_remove_image(self, mock_client):
         name = '12345_disk.config.rescue'
diff --git a/nova/virt/libvirt/storage/rbd_utils.py b/nova/virt/libvirt/storage/rbd_utils.py
index 35316f5c4d22..8a4782f5a3c7 100644
--- a/nova/virt/libvirt/storage/rbd_utils.py
+++ b/nova/virt/libvirt/storage/rbd_utils.py
@@ -351,11 +351,13 @@ class RBDDriver(object):
             if retryctx['retries'] <= 0:
                 raise loopingcall.LoopingCallDone()
 
-        # NOTE(danms): We let it go for ten seconds
-        retryctx = {'retries': 10}
+        # NOTE(sandonov): We let it go for:
+        # rbd_destroy_volume_retries*rbd_destroy_volume_retry_interval seconds
+        retryctx = {'retries': CONF.libvirt.rbd_destroy_volume_retries}
         timer = loopingcall.FixedIntervalLoopingCall(
             _cleanup_vol, client.ioctx, volume, retryctx)
-        timed_out = timer.start(interval=1).wait()
+        timed_out = timer.start(
+            interval=CONF.libvirt.rbd_destroy_volume_retry_interval).wait()
         if timed_out:
             # NOTE(danms): Run this again to propagate the error, but
             # if it succeeds, don't raise the loopingcall exception
diff --git a/releasenotes/notes/rbd-increase-timeout-c4e5a34cf5da7fdc.yaml b/releasenotes/notes/rbd-increase-timeout-c4e5a34cf5da7fdc.yaml
new file mode 100644
index 000000000000..20fecc51c994
--- /dev/null
+++ b/releasenotes/notes/rbd-increase-timeout-c4e5a34cf5da7fdc.yaml
@@ -0,0 +1,19 @@
+---
+features:
+  - |
+    Added new config options ``[libvirt]/rbd_destroy_volume_retries``,
+    defaulting to 12, and ``[libvirt]/rbd_destroy_volume_retry_interval``,
+    defaulting to 5, which Nova uses when removing a volume from Ceph in a
+    retry loop. The maximum time allowed is the product of the two options,
+    60 seconds by default.
+fixes:
+  - |
+    Previously Nova tried to remove a volume from Ceph in a retry loop of 10
+    attempts at 1 second intervals, 10 seconds overall, which, due to the 30
+    second Ceph watcher timeout, could result in intermittent volume removal
+    failures on the Ceph side (`bug 1856845`_). The new defaults of 12 for
+    ``[libvirt]/rbd_destroy_volume_retries`` and 5 for
+    ``[libvirt]/rbd_destroy_volume_retry_interval`` give Ceph a reasonable
+    amount of time to complete the operation successfully.
+
+    .. _`bug 1856845`: https://bugs.launchpad.net/nova/+bug/1856845
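For operators, the two new options live in the ``[libvirt]`` section of
nova.conf and only matter when the RBD image backend is in use. A minimal
illustrative fragment, simply restating the defaults introduced above (the
exact values are deployment-specific, not a recommendation):

    [libvirt]
    images_type = rbd
    rbd_destroy_volume_retries = 12
    rbd_destroy_volume_retry_interval = 5

With these values the destroy loop waits up to 12 * 5 = 60 seconds before
giving up, comfortably longer than the 30 second Ceph watcher timeout.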
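To make the timing behaviour concrete, here is a minimal standalone sketch of
the retry pattern that ``_destroy_volume`` implements. It deliberately uses
only the standard library rather than oslo.service's
FixedIntervalLoopingCall, and the ``ImageBusy`` stand-in and
``destroy_volume_with_retries`` helper are illustrative names, not Nova code:

    import time


    class ImageBusy(Exception):
        """Stand-in for rbd.ImageBusy: a watcher still holds the image."""


    def destroy_volume_with_retries(remove, volume, retries=12, interval=5):
        """Retry remove(volume), giving Ceph retries * interval seconds.

        Mirrors the shape of the FixedIntervalLoopingCall loop in
        rbd_utils._destroy_volume: failed attempts are retried at a fixed
        interval, then one final attempt is made so the real error
        propagates (the "12 retries + 1 final attempt = 13" counted in the
        unit test above).
        """
        for attempt in range(retries):
            try:
                remove(volume)  # in Nova: RbdProxy().remove(ioctx, volume)
                return
            except ImageBusy:
                if attempt + 1 < retries:
                    time.sleep(interval)
        # Final attempt: if the volume is still busy, let the exception escape.
        remove(volume)

With the old hard-coded values (10 retries at 1 second intervals) a volume
held by a stale watcher had only about 10 seconds to become removable; with
the new defaults it has up to 60 seconds, which clears the 30 second watcher
timeout.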