From 62d35009755a45c39a0b1cdc4c69791f469e469e Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Tue, 31 Oct 2017 07:46:36 -0700
Subject: [PATCH] Make put_allocations() retry on concurrent update

This adds a retries decorator to the scheduler report client
and modifies put_allocations() so that it will detect a concurrent
update, raising the Retry exception to trigger the decorator.

This should be usable by other methods in the client easily, but
this patch only modifies put_allocations() to fix the bug.

Change-Id: Ic32a54678dd413668f02e77d5e6c4195664ac24c
Closes-Bug: #1728722
---
 nova/scheduler/client/report.py               | 48 +++++++++++++++++--
 .../unit/scheduler/client/test_report.py      | 46 ++++++++++++++++++
 2 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index 63dc25d4a516..b11e0ff34ea6 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -94,6 +94,33 @@ def safe_connect(f):
     return wrapper
 
 
+class Retry(Exception):
+    def __init__(self, operation, reason):
+        self.operation = operation
+        self.reason = reason
+
+
+def retries(f):
+    """Decorator to retry a call three times if it raises Retry
+
+    Note that this returns the actual value of the inner call on success
+    or returns False if all the retries fail.
+    """
+    @functools.wraps(f)
+    def wrapper(self, *a, **k):
+        for retry in range(0, 3):
+            try:
+                return f(self, *a, **k)
+            except Retry as e:
+                LOG.debug(
+                    'Unable to %(op)s because %(reason)s; retrying...',
+                    {'op': e.operation, 'reason': e.reason})
+        LOG.error('Failed scheduler client operation %s: out of retries',
+                  f.__name__)
+        return False
+    return wrapper
+
+
 def _compute_node_to_inventory_dict(compute_node):
     """Given a supplied `objects.ComputeNode` object, return a dict, keyed
     by resource class, of various inventory information.
@@ -1125,6 +1152,7 @@ class SchedulerReportClient(object):
         return r.status_code == 204
 
     @safe_connect
+    @retries
     def put_allocations(self, rp_uuid, consumer_uuid, alloc_data, project_id,
                         user_id):
         """Creates allocation records for the supplied instance UUID against
@@ -1141,6 +1169,8 @@ class SchedulerReportClient(object):
         :param project_id: The project_id associated with the allocations.
         :param user_id: The user_id associated with the allocations.
         :returns: True if the allocations were created, False otherwise.
+        :raises: Retry if the operation should be retried due to a concurrent
+                 update.
         """
         payload = {
             'allocations': [
@@ -1164,12 +1194,20 @@ class SchedulerReportClient(object):
             payload.pop('user_id')
             r = self.put(url, payload)
         if r.status_code != 204:
-            LOG.warning(
-                'Unable to submit allocation for instance '
+            # NOTE(jaypipes): Yes, it sucks doing string comparison like this
+            # but we have no error codes, only error messages.
+            if 'concurrently updated' in r.text:
+                reason = ('another process changed the resource providers '
+                          'involved in our attempt to put allocations for '
+                          'consumer %s' % consumer_uuid)
+                raise Retry('put_allocations', reason)
+            else:
+                LOG.warning(
+                    'Unable to submit allocation for instance '
                     '%(uuid)s (%(code)i %(text)s)',
-                {'uuid': consumer_uuid,
-                 'code': r.status_code,
-                 'text': r.text})
+                    {'uuid': consumer_uuid,
+                     'code': r.status_code,
+                     'text': r.text})
         return r.status_code == 204
 
     @safe_connect
diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py
index 06dc841fc5b0..a86b4d919627 100644
--- a/nova/tests/unit/scheduler/client/test_report.py
+++ b/nova/tests/unit/scheduler/client/test_report.py
@@ -292,6 +292,52 @@ class TestPutAllocations(SchedulerReportClientTestCase):
         log_msg = mock_warn.call_args[0][0]
         self.assertIn("Unable to submit allocation for instance", log_msg)
 
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.put')
+    def test_put_allocations_retries_conflict(self, mock_put):
+
+        failed = mock.MagicMock()
+        failed.status_code = 409
+        failed.text = "concurrently updated"
+
+        succeeded = mock.MagicMock()
+        succeeded.status_code = 204
+
+        mock_put.side_effect = (failed, succeeded)
+
+        rp_uuid = mock.sentinel.rp
+        consumer_uuid = mock.sentinel.consumer
+        data = {"MEMORY_MB": 1024}
+        expected_url = "/allocations/%s" % consumer_uuid
+        resp = self.client.put_allocations(rp_uuid, consumer_uuid, data,
+                                           mock.sentinel.project_id,
+                                           mock.sentinel.user_id)
+        self.assertTrue(resp)
+        mock_put.assert_has_calls([
+            mock.call(expected_url, mock.ANY, version='1.8'),
+            mock.call(expected_url, mock.ANY, version='1.8')])
+
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.put')
+    def test_put_allocations_retry_gives_up(self, mock_put):
+
+        failed = mock.MagicMock()
+        failed.status_code = 409
+        failed.text = "concurrently updated"
+
+        mock_put.return_value = failed
+
+        rp_uuid = mock.sentinel.rp
+        consumer_uuid = mock.sentinel.consumer
+        data = {"MEMORY_MB": 1024}
+        expected_url = "/allocations/%s" % consumer_uuid
+        resp = self.client.put_allocations(rp_uuid, consumer_uuid, data,
+                                           mock.sentinel.project_id,
+                                           mock.sentinel.user_id)
+        self.assertFalse(resp)
+        mock_put.assert_has_calls([
+            mock.call(expected_url, mock.ANY, version='1.8'),
+            mock.call(expected_url, mock.ANY, version='1.8'),
+            mock.call(expected_url, mock.ANY, version='1.8')])
+
     def test_claim_resources_success(self):
         get_resp_mock = mock.Mock(status_code=200)
         get_resp_mock.json.return_value = {