From b3f881572088b16f693bf52d932273429996ca60 Mon Sep 17 00:00:00 2001
From: Sean Mooney <work@seanmooney.info>
Date: Thu, 16 Jan 2025 18:02:57 +0000
Subject: [PATCH] Disable the heal instance info cache periodic task

The _heal_instance_info_cache periodic task predates
the introduction of the server external events API
which is now the canonical way to refresh the cache.

This change updates the default value of
``[compute]heal_instance_info_cache_interval``
to -1, disabling it by default.

The nova-ovs-hybrid-plug job is extended to test the
legacy configuration value, and the config override is removed
from nova-next.

Closes-Bug: #1996094
Related-Bug: #2089225
Change-Id: I33ac91bb4f3ead51af2f7005002d5eb5078540d9
---
 .zuul.yaml                                    | 10 +++--
 nova/conf/compute.py                          |  2 +-
 ..._info_cache_interval-0d9ae7c12793bf7b.yaml | 43 +++++++++++++++++++
 3 files changed, 50 insertions(+), 5 deletions(-)
 create mode 100644 releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml

diff --git a/.zuul.yaml b/.zuul.yaml
index b6a8b04fc97f..07277e065b2d 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -210,6 +210,9 @@
           $NEUTRON_CONF:
             nova:
               live_migration_events: True
+          $NOVA_CPU_CONF:
+              compute:
+                heal_instance_info_cache_interval: 60
     group-vars:
       subnode:
         devstack_localrc:
@@ -250,6 +253,9 @@
             $NEUTRON_CONF:
               nova:
                 live_migration_events: True
+            $NOVA_CPU_CONF:
+              compute:
+                heal_instance_info_cache_interval: 60
     post-run: playbooks/nova-live-migration/post-run.yaml
 
 - job:
@@ -422,10 +428,6 @@
               # reduce the number of placement calls in steady state. Added in
               # Stein.
               resource_provider_association_refresh: 0
-              # Neutron networking backends today are expected to work without
-              # the periodic healing of the cache in Nova. Turn it off to gain
-              # additional performance.
-              heal_instance_info_cache_interval: -1
             workarounds:
               # This wa is an improvement on hard reboot that cannot be turned
               # on unconditionally. But we know that ml2/ovs sends plug time
diff --git a/nova/conf/compute.py b/nova/conf/compute.py
index 3a5b3f5a7315..c8b143b5dd44 100644
--- a/nova/conf/compute.py
+++ b/nova/conf/compute.py
@@ -1085,7 +1085,7 @@ Related options:
   to be synchronized manually.
 """),
     cfg.IntOpt('heal_instance_info_cache_interval',
-        default=60,
+        default=-1,
         help="""
 Interval between instance network information cache updates.
 
diff --git a/releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml b/releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml
new file mode 100644
index 000000000000..402f5e9fcada
--- /dev/null
+++ b/releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml
@@ -0,0 +1,43 @@
+---
+upgrade:
+  - |
+    ``[compute]heal_instance_info_cache_interval`` now defaults to -1.
+
+    In the early days of Nova, all networking was internal; then ``quantum``,
+    now known as ``neutron``, was introduced.
+    When the networking subsystem was being externalized and neutron was
+    optional, Nova still needed to keep track of the ports associated with an
+    instance.
+    To that end, and to avoid expensive calls to an optional service, the
+    instance info cache was extended to include network information and a
+    periodic task was introduced to update it in
+    ``08fa534a0d28fa1be48aef927584161becb936c7`` as part of the
+    ``Essex`` release.
+
+    As we have learned over the years, per-compute periodic tasks that call
+    other services do not scale well as the number of compute nodes increases.
+    In ``ce936ea5f3ae0b4d3b816a7fe42d5f0100b20fca`` the os-server-external-events
+    API was introduced as part of the Icehouse release. The server external
+    events API allows external systems such as Neutron to trigger cache
+    refreshes on demand. With the introduction of this API, neutron was
+    modified to send network-changed events on a per-port basis as API actions
+    are performed on neutron ports. When that was introduced, the default value
+    of ``[compute]heal_instance_info_cache_interval`` was not changed
+    to ensure there was no upgrade impact.
+
+    In ``ba44c155ce1dcefede9741722a0525820d6da2b8`` as part of bug #1751923
+    the _heal_instance_info_cache periodic task was modified to pass a
+    "force_refresh" flag, forcing Nova to look up the current state of all
+    ports for the instance from neutron and fully rebuild the info_cache. This
+    has the side effect of making the already poor scaling of this optional
+    periodic task even worse.
+
+    In this release, the default behaviour of Nova has been changed to disable
+    the periodic task, optimizing for performance, scale, power consumption
+    and typical deployment topologies, where the instance network information
+    is updated by neutron via the external event API as ports are modified.
+    This should significantly reduce the background neutron API load in
+    medium to large clouds. If you have a neutron backend that does not
+    reliably send network-changed event notifications to Nova, you can
+    re-enable this periodic task by setting
+    ``[compute]heal_instance_info_cache_interval`` to a value greater than 0.
\ No newline at end of file