Reduce retries during cluster health validation

This change reduces retries during cluster health validation. There are
multiple retry levels today:
* retry on urllib3 http level
* retry in validating cluster health
* retry in _proxy_internal
Together these cause a retry storm, which adds significant delays to API calls.
This is especially relevant when nsxlib is configured with
cluster_unavailable_retry = True (which is always the case with a
single endpoint).
This change reduces the configurable retry attempts in cluster health
validation to a single retry per endpoint.

In addition, this change fixes the scenario where a client configures
nsxlib with no validation; in that case the cluster should not mark the
endpoint as UP in validation-related code.

Change-Id: I33b4101a0e0c0f4088e10776e126cc495dabd89c
This commit is contained in:
Anna Khmelnitsky 2020-03-19 15:14:13 -07:00
parent cddf191084
commit 1e675ae459
4 changed files with 28 additions and 19 deletions

View File

@ -175,7 +175,7 @@ class MemoryMockAPIProvider(nsx_cluster.AbstractHTTPProvider):
return "Memory mock API"
def validate_connection(self, cluster_api, endpoint, conn):
return
return True
def new_connection(self, cluster_api, provider):
# all callers use the same backing
@ -335,6 +335,7 @@ class NsxClientTestCase(NsxLibTestCase):
def validate_connection(self, cluster_api, endpoint, conn):
assert conn is not None
return True
def mock_nsx_clustered_api(self, session_response=None, **kwargs):
orig_request = nsx_cluster.TimeoutSession.request

View File

@ -30,7 +30,7 @@ from vmware_nsxlib.v3 import exceptions as nsxlib_exc
def _validate_conn_up(*args, **kwargs):
return
return True
def _validate_conn_down(*args, **kwargs):
@ -312,6 +312,8 @@ class ClusteredAPITestCase(nsxlib_testcase.NsxClientTestCase):
if endpoint.provider.id == '8.9.10.11':
raise Exception()
return True
self._test_health(_validate, cluster.ClusterHealth.ORANGE)
def test_green_health(self):
@ -415,8 +417,7 @@ class ClusteredAPITestCase(nsxlib_testcase.NsxClientTestCase):
# until retries have been exhausted
api.nsxlib_config.cluster_unavailable_retry = True
self.assertEqual(api._select_endpoint(), None)
self.assertEqual(api._validate.call_count,
api.nsxlib_config.max_attempts * len(eps))
self.assertEqual(api._validate.call_count, len(eps))
# simulate the case where 1 endpoint finally goes up
self.validate_count = 0

View File

@ -215,6 +215,7 @@ class RESTClient(object):
silent=False, expected_results=None, **kwargs):
request_headers = headers.copy() if headers else {}
request_headers.update(self._default_headers)
if utils.INJECT_HEADERS_CALLBACK:
inject_headers = utils.INJECT_HEADERS_CALLBACK()
request_headers.update(inject_headers)

View File

@ -33,7 +33,6 @@ from requests import adapters
from requests import exceptions as requests_exceptions
import six
import six.moves.urllib.parse as urlparse
import tenacity
import urllib3
from vmware_nsxlib._i18n import _
@ -186,22 +185,32 @@ class NSXRequestsHTTPProvider(AbstractHTTPProvider):
return "%s-%s" % (requests.__title__, requests.__version__)
def validate_connection(self, cluster_api, endpoint, conn):
# We don't need to retry with a different endpoint during validation,
# thus limit max_attempts to 1.
# At the connection level, validation will be retried according to the
# nsxlib 'retries' and 'http_timeout' parameters.
client = nsx_client.NSX3Client(
conn, url_prefix=endpoint.provider.url,
url_path_base=cluster_api.nsxlib_config.url_base,
default_headers=conn.default_headers)
default_headers=conn.default_headers,
max_attempts=1)
validation_done = False
# Check the manager state directly
if cluster_api.nsxlib_config.validate_connection_method:
cluster_api.nsxlib_config.validate_connection_method(
client, endpoint.provider.url)
validation_done = True
# If the keepalive section returns a list, it is assumed to be non-empty
keepalive_section = cluster_api.nsxlib_config.keepalive_section
# When validate connection also has the effect of keep-alive,
# keepalive_section can be disabled by passing in an empty value
if keepalive_section:
result = client.get(keepalive_section, silent=True)
result = client.get(keepalive_section,
silent=True,
with_retries=False)
validation_done = True
if not result or result.get('result_count', 1) <= 0:
msg = _("No %(section)s found "
"for '%(url)s'") % {'section': keepalive_section,
@ -210,6 +219,8 @@ class NSXRequestsHTTPProvider(AbstractHTTPProvider):
raise exceptions.ResourceNotFound(
manager=endpoint.provider.url, operation=msg)
return validation_done
def new_connection(self, cluster_api, provider):
config = cluster_api.nsxlib_config
session = TimeoutSession(config.http_timeout,
@ -574,8 +585,12 @@ class ClusteredAPI(object):
def _validate(self, endpoint):
try:
with endpoint.pool.item() as conn:
self._http_provider.validate_connection(self, endpoint, conn)
endpoint.set_state(EndpointState.UP)
# with some configurations, validation will be skipped
result = self._http_provider.validate_connection(self,
endpoint,
conn)
if result:
endpoint.set_state(EndpointState.UP)
except exceptions.ClientCertificateNotTrusted:
LOG.warning("Failed to validate API cluster endpoint "
"'%(ep)s' due to untrusted client certificate",
@ -612,22 +627,13 @@ class ClusteredAPI(object):
return endpoint
seen += 1
@utils.retry_upon_none_result(self.nsxlib_config.max_attempts)
def _select_endpoint_internal_with_retry():
# redo endpoint selection with refreshing states
return _select_endpoint_internal(refresh=True)
# First attempt to get an UP endpoint
endpoint = _select_endpoint_internal()
if endpoint or not self.nsxlib_config.cluster_unavailable_retry:
return endpoint
# Retry the selection while refreshing the endpoints state
try:
return _select_endpoint_internal_with_retry()
except tenacity.RetryError:
# exhausted number of retries
return None
return _select_endpoint_internal(refresh=True)
def endpoint_for_connection(self, conn):
# check all endpoint pools