From 6458c3dba53b9a9fb903bdb6e5e08af14ad015d6 Mon Sep 17 00:00:00 2001
From: Sasha Andonov
Date: Tue, 4 Feb 2020 16:59:14 +0100
Subject: [PATCH] rbd_utils: increase _destroy_volume timeout

If the RBD backend is used for Nova ephemeral storage, Nova tries to
remove the ephemeral storage volume from Ceph in a retry loop: 10
attempts at 1 second intervals, 10 seconds overall - which, due to the
30 second Ceph watcher timeout, might result in intermittent volume
removal failures on the Ceph side.

This patch adds the config options rbd_destroy_volume_retries,
defaulting to 12, and rbd_destroy_volume_retry_interval, defaulting to
5, which multiplied together give Ceph a reasonable amount of time
(60 seconds by default) to complete the operation successfully.

Closes-Bug: #1856845
Change-Id: Icfd55617f0126f79d9610f8a2fc6b4c817d1a2bd
---
 nova/conf/libvirt.py                          | 21 ++++++++++++++++
 .../unit/virt/libvirt/storage/test_rbd.py     | 25 +++++++++++++++++--
 nova/virt/libvirt/storage/rbd_utils.py        |  8 +++---
 ...rbd-increase-timeout-c4e5a34cf5da7fdc.yaml | 19 ++++++++++++++
 4 files changed, 68 insertions(+), 5 deletions(-)
 create mode 100644 releasenotes/notes/rbd-increase-timeout-c4e5a34cf5da7fdc.yaml

diff --git a/nova/conf/libvirt.py b/nova/conf/libvirt.py
index 3da878553f05..2568d65ba371 100644
--- a/nova/conf/libvirt.py
+++ b/nova/conf/libvirt.py
@@ -1095,6 +1095,27 @@ The libvirt UUID of the secret for the rbd_user volumes.
                default=5,
                help="""
 The RADOS client timeout in seconds when initially connecting to the cluster.
+"""),
+    cfg.IntOpt('rbd_destroy_volume_retry_interval',
+               default=5,
+               min=0,
+               help="""
+Number of seconds to wait between each consecutive retry to destroy an
+RBD volume.
+
+Related options:
+
+* [libvirt]/images_type = 'rbd'
+"""),
+    cfg.IntOpt('rbd_destroy_volume_retries',
+               default=12,
+               min=0,
+               help="""
+Number of retries to destroy an RBD volume.
+
+Related options:
+
+* [libvirt]/images_type = 'rbd'
 """),
 ]
 
diff --git a/nova/tests/unit/virt/libvirt/storage/test_rbd.py b/nova/tests/unit/virt/libvirt/storage/test_rbd.py
index 2b22d5286213..788df0de8d56 100644
--- a/nova/tests/unit/virt/libvirt/storage/test_rbd.py
+++ b/nova/tests/unit/virt/libvirt/storage/test_rbd.py
@@ -435,8 +435,8 @@ class RbdTestCase(test.NoDBTestCase):
         self.driver.cleanup_volumes(filter_fn)
         rbd.remove.assert_any_call(client.__enter__.return_value.ioctx,
                                    '%s_test' % uuids.instance)
-        # NOTE(danms): 10 retries + 1 final attempt to propagate = 11
-        self.assertEqual(11, len(rbd.remove.call_args_list))
+        # NOTE(sandonov): 12 retries + 1 final attempt to propagate = 13
+        self.assertEqual(13, len(rbd.remove.call_args_list))
 
     def test_cleanup_volumes_fail_not_found(self):
         self._test_cleanup_exception('ImageBusy')
@@ -508,6 +508,27 @@ class RbdTestCase(test.NoDBTestCase):
         client.__enter__.assert_called_once_with()
         client.__exit__.assert_called_once_with(None, None, None)
 
+    @mock.patch.object(rbd_utils, 'RADOSClient')
+    @mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall')
+    def test_destroy_volume_with_retries(self, mock_loopingcall, mock_client):
+        vol = '12345_test'
+        client = mock_client.return_value
+        loopingcall = mock_loopingcall.return_value
+
+        # Try for sixty seconds: six retries at 10 second intervals
+        self.flags(rbd_destroy_volume_retries=6, group='libvirt')
+        self.flags(rbd_destroy_volume_retry_interval=10, group='libvirt')
+        self.driver.destroy_volume(vol)
+
+        # Make sure both params have the expected values
+        retryctx = mock_loopingcall.call_args.args[3]
+        self.assertEqual(retryctx, {'retries': 6})
+        loopingcall.start.assert_called_with(interval=10)
+
+        # Make sure that we entered and exited the RADOSClient
+        client.__enter__.assert_called_once_with()
+        client.__exit__.assert_called_once_with(None, None, None)
+
     @mock.patch.object(rbd_utils, 'RADOSClient')
     def test_remove_image(self, mock_client):
         name = '12345_disk.config.rescue'
diff --git a/nova/virt/libvirt/storage/rbd_utils.py b/nova/virt/libvirt/storage/rbd_utils.py
index 35316f5c4d22..8a4782f5a3c7 100644
--- a/nova/virt/libvirt/storage/rbd_utils.py
+++ b/nova/virt/libvirt/storage/rbd_utils.py
@@ -351,11 +351,13 @@ class RBDDriver(object):
             if retryctx['retries'] <= 0:
                 raise loopingcall.LoopingCallDone()
 
-        # NOTE(danms): We let it go for ten seconds
-        retryctx = {'retries': 10}
+        # NOTE(sandonov): We let it go for:
+        # rbd_destroy_volume_retries*rbd_destroy_volume_retry_interval seconds
+        retryctx = {'retries': CONF.libvirt.rbd_destroy_volume_retries}
         timer = loopingcall.FixedIntervalLoopingCall(
             _cleanup_vol, client.ioctx, volume, retryctx)
-        timed_out = timer.start(interval=1).wait()
+        timed_out = timer.start(
+            interval=CONF.libvirt.rbd_destroy_volume_retry_interval).wait()
         if timed_out:
             # NOTE(danms): Run this again to propagate the error, but
             # if it succeeds, don't raise the loopingcall exception
diff --git a/releasenotes/notes/rbd-increase-timeout-c4e5a34cf5da7fdc.yaml b/releasenotes/notes/rbd-increase-timeout-c4e5a34cf5da7fdc.yaml
new file mode 100644
index 000000000000..20fecc51c994
--- /dev/null
+++ b/releasenotes/notes/rbd-increase-timeout-c4e5a34cf5da7fdc.yaml
@@ -0,0 +1,19 @@
+---
+features:
+  - |
+    Added new config options ``[libvirt]/rbd_destroy_volume_retries``,
+    defaulting to 12, and ``[libvirt]/rbd_destroy_volume_retry_interval``,
+    defaulting to 5, which Nova uses when removing a volume from Ceph in a
+    retry loop. The maximum time allowed is the product of the two options,
+    60 seconds by default.
+fixes:
+  - |
+    Previously Nova tried to remove a volume from Ceph in a retry loop of 10
+    attempts at 1 second intervals, 10 seconds overall, which, due to the 30
+    second Ceph watcher timeout, could result in intermittent volume removal
+    failures on the Ceph side (`bug 1856845`_). The new defaults of 12 for
+    ``[libvirt]/rbd_destroy_volume_retries`` and 5 for
+    ``[libvirt]/rbd_destroy_volume_retry_interval`` give Ceph a reasonable
+    amount of time to complete the operation successfully.
+
+    .. _`bug 1856845`: https://bugs.launchpad.net/nova/+bug/1856845
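For operators, the two new options live in the ``[libvirt]`` section of
nova.conf and only matter when the RBD image backend is in use. A minimal
illustrative fragment, simply restating the defaults introduced above (the
exact values are deployment-specific, not a recommendation):

    [libvirt]
    images_type = rbd
    rbd_destroy_volume_retries = 12
    rbd_destroy_volume_retry_interval = 5

With these values the destroy loop waits up to 12 * 5 = 60 seconds before
giving up, comfortably longer than the 30 second Ceph watcher timeout.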
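To make the timing behaviour concrete, here is a minimal standalone sketch of
the retry pattern that ``_destroy_volume`` implements. It deliberately uses
only the standard library rather than oslo.service's
FixedIntervalLoopingCall, and the ``ImageBusy`` stand-in and
``destroy_volume_with_retries`` helper are illustrative names, not Nova code:

    import time


    class ImageBusy(Exception):
        """Stand-in for rbd.ImageBusy: a watcher still holds the image."""


    def destroy_volume_with_retries(remove, volume, retries=12, interval=5):
        """Retry remove(volume), giving Ceph retries * interval seconds.

        Mirrors the shape of the FixedIntervalLoopingCall loop in
        rbd_utils._destroy_volume: failed attempts are retried at a fixed
        interval, then one final attempt is made so the real error
        propagates (the "12 retries + 1 final attempt = 13" counted in the
        unit test above).
        """
        for attempt in range(retries):
            try:
                remove(volume)  # in Nova: RbdProxy().remove(ioctx, volume)
                return
            except ImageBusy:
                if attempt + 1 < retries:
                    time.sleep(interval)
        # Final attempt: if the volume is still busy, let the exception escape.
        remove(volume)

With the old hard-coded values (10 retries at 1 second intervals) a volume
held by a stale watcher had only about 10 seconds to become removable; with
the new defaults it has up to 60 seconds, which clears the 30 second watcher
timeout.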