From b3f881572088b16f693bf52d932273429996ca60 Mon Sep 17 00:00:00 2001 From: Sean Mooney Date: Thu, 16 Jan 2025 18:02:57 +0000 Subject: [PATCH] Disable the heal instance info cache periodic task The _heal_instance_info_cache periodic task predates the introduction of the server external events API which is now the canonical way to refresh the cache. This change updates the default value of ``[compute]heal_instance_info_cache_interval`` to -1 disabling it by default. The nova-ovs-hybrid-plug job is extended to test the legacy configuration value and the config override is removed from nova-next Closes-Bug: #1996094 Related-Bug: #2089225 Change-Id: I33ac91bb4f3ead51af2f7005002d5eb5078540d9 --- .zuul.yaml | 10 +++-- nova/conf/compute.py | 2 +- ..._info_cache_interval-0d9ae7c12793bf7b.yaml | 43 +++++++++++++++++++ 3 files changed, 50 insertions(+), 5 deletions(-) create mode 100644 releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml diff --git a/.zuul.yaml b/.zuul.yaml index b6a8b04fc97f..07277e065b2d 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -210,6 +210,9 @@ $NEUTRON_CONF: nova: live_migration_events: True + $NOVA_CPU_CONF: + compute: + heal_instance_info_cache_interval: 60 group-vars: subnode: devstack_localrc: @@ -250,6 +253,9 @@ $NEUTRON_CONF: nova: live_migration_events: True + $NOVA_CPU_CONF: + compute: + heal_instance_info_cache_interval: 60 post-run: playbooks/nova-live-migration/post-run.yaml - job: @@ -422,10 +428,6 @@ # reduce the number of placement calls in steady state. Added in # Stein. resource_provider_association_refresh: 0 - # Neutron networking backends today are expected to work without - # the periodic healing of the cache in Nova. Turn it off to gain - # additional performance. - heal_instance_info_cache_interval: -1 workarounds: # This wa is an improvement on hard reboot that cannot be turned # on unconditionally. 
But we know that ml2/ovs sends plug time diff --git a/nova/conf/compute.py b/nova/conf/compute.py index 3a5b3f5a7315..c8b143b5dd44 100644 --- a/nova/conf/compute.py +++ b/nova/conf/compute.py @@ -1085,7 +1085,7 @@ Related options: to be synchronized manually. """), cfg.IntOpt('heal_instance_info_cache_interval', - default=60, + default=-1, help=""" Interval between instance network information cache updates. diff --git a/releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml b/releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml new file mode 100644 index 000000000000..402f5e9fcada --- /dev/null +++ b/releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml @@ -0,0 +1,43 @@ +--- +upgrade: + - | + ``[compute]heal_instance_info_cache_interval`` now defaults to -1. + + In the early days of Nova, all networking was internal, then ``quantum``, + now known as ``neutron``, was introduced. + When the networking subsystem was being externalized and neutron was + optional, Nova still needed to keep track of the ports associated with an + instance. + To that end, to avoid these expensive calls to an optional service, the + instance info cache was extended to include network information and a + periodic task was introduced to update it in + ``08fa534a0d28fa1be48aef927584161becb936c7`` as part of the + ``Essex`` release. + + As we have learned over the years, per-compute periodic tasks that call + other services do not scale well as the number of compute nodes increases. + In ``ce936ea5f3ae0b4d3b816a7fe42d5f0100b20fca`` the os-server-external-events + API was introduced. The server external events API allows external systems + such as Neutron to trigger cache refreshes on demand; this was part + of the Icehouse release. With the introduction of this API, neutron was + modified to send network-changed events on a per-port basis as API actions + are performed on neutron ports. 
When that was introduced, the default value + of ``[compute]heal_instance_info_cache_interval`` was not changed + to ensure there was no upgrade impact. + + In ``ba44c155ce1dcefede9741722a0525820d6da2b8`` as part of bug #1751923 + the _heal_instance_info_cache periodic task was modified to pass a + "force_refresh" forcing Nova to look up the current state of all ports for + the instance from neutron and fully rebuild the info_cache. This has the + side effect of making the already poor scaling of this optional periodic + task even worse. + + In this release, the default behaviour of Nova has been changed to + disable the periodic task, optimizing for performance, scale, power consumption + and typical deployment topologies, where the instance network information + is updated by neutron via the external event API as ports are modified. + This should significantly reduce the background neutron API load in + medium to large clouds. If you have a neutron backend that does not + reliably send network-changed event notifications to Nova, you can + re-enable this periodic task by setting + ``[compute]heal_instance_info_cache_interval`` to a value greater than 0. \ No newline at end of file