fix "down" nova-compute service spuriously marked as "up"

Currently we use the auto-updated "updated_at" field to determine
whether a service is "up".  An end-user can cause the "updated_at"
field to be updated by disabling or enabling the service, thus
potentially causing a service that is unavailable to be detected
as "up".  This could result in the scheduler trying to assign
instances to an unavailable compute node, or in the system
mistakenly preventing evacuation of an instance.

The fix is to add a new field to explicitly track the timestamp of
the last time the service sent in a status report and use that if
available when testing whether the service is up.

DocImpact
This commit will cause a behaviour change for the DB servicegroup
driver.  It will mean that enabling/disabling the service will
cause the "updated_at" field to change (as before) but that will
no longer be tied to the "up/down" status of the service. So
"nova service-list" could show the service as "down" even if it
shows a recent "updated_at".  (But this could happen for the other
servicegroup drivers already.)

Closes-Bug: #1420848
Change-Id: Ied7d47363d0489bca3cf2c711217e1a3b7d24a03
This commit is contained in:
Chris Friesen 2015-03-27 09:23:48 -06:00
parent e5fe8fd6af
commit b9bae02af2
10 changed files with 74 additions and 6 deletions

View File

@ -540,6 +540,12 @@ def service_update(context, service_id, values):
session = get_session()
with session.begin():
service_ref = _service_get(context, service_id, session=session)
# Only servicegroup.drivers.db.DbDriver._report_state() updates
# 'report_count', so if that value changes then store the timestamp
# as the last time we got a state report.
if 'report_count' in values:
if values['report_count'] > service_ref.report_count:
service_ref.last_seen_up = timeutils.utcnow()
service_ref.update(values)
return service_ref

View File

@ -0,0 +1,29 @@
# Copyright (c) 2015 Wind River Systems Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from sqlalchemy import MetaData, Table, Column, DateTime
BASE_TABLE_NAME = 'services'
NEW_COLUMN_NAME = 'last_seen_up'
def upgrade(migrate_engine):
meta = MetaData()
meta.bind = migrate_engine
for prefix in ('', 'shadow_'):
table = Table(prefix + BASE_TABLE_NAME, meta, autoload=True)
new_column = Column(NEW_COLUMN_NAME, DateTime, nullable=True)
if not hasattr(table.c, NEW_COLUMN_NAME):
table.create_column(new_column)

View File

@ -100,6 +100,7 @@ class Service(BASE, NovaBase):
report_count = Column(Integer, nullable=False, default=0)
disabled = Column(Boolean, default=False)
disabled_reason = Column(String(255))
last_seen_up = Column(DateTime, nullable=True)
class ComputeNode(BASE, NovaBase):

View File

@ -43,7 +43,8 @@ class Service(base.NovaPersistentObject, base.NovaObject,
# Version 1.10: Changes behaviour of loading compute_node
# Version 1.11: Added get_by_host_and_binary
# Version 1.12: ComputeNode version 1.11
VERSION = '1.12'
# Version 1.13: Added last_seen_up
VERSION = '1.13'
fields = {
'id': fields.IntegerField(read_only=True),
@ -55,7 +56,8 @@ class Service(base.NovaPersistentObject, base.NovaObject,
'disabled_reason': fields.StringField(nullable=True),
'availability_zone': fields.StringField(nullable=True),
'compute_node': fields.ObjectField('ComputeNode'),
}
'last_seen_up': fields.DateTimeField(nullable=True),
}
obj_relationships = {
'compute_node': [('1.1', '1.4'), ('1.3', '1.5'), ('1.5', '1.6'),
@ -65,6 +67,8 @@ class Service(base.NovaPersistentObject, base.NovaObject,
def obj_make_compatible(self, primitive, target_version):
_target_version = utils.convert_version_to_tuple(target_version)
if _target_version < (1, 13) and 'last_seen_up' in primitive:
del primitive['last_seen_up']
if _target_version < (1, 10):
target_compute_version = self.obj_calculate_child_version(
target_version, 'compute_node')
@ -194,7 +198,8 @@ class ServiceList(base.ObjectListBase, base.NovaObject):
# Version 1.8: Service version 1.10
# Version 1.9: Added get_by_binary() and Service version 1.11
# Version 1.10: Service version 1.12
VERSION = '1.10'
# Version 1.11: Service version 1.13
VERSION = '1.11'
fields = {
'objects': fields.ListOfObjectsField('Service'),
@ -212,6 +217,7 @@ class ServiceList(base.ObjectListBase, base.NovaObject):
'1.8': '1.10',
'1.9': '1.11',
'1.10': '1.12',
'1.11': '1.13',
}
@base.remotable_classmethod

View File

@ -57,7 +57,10 @@ class DbDriver(base.Driver):
"""Moved from nova.utils
Check whether a service is up based on last heartbeat.
"""
last_heartbeat = service_ref['updated_at'] or service_ref['created_at']
# Keep checking 'updated_at' if 'last_seen_up' isn't set.
# Should be able to use only 'last_seen_up' in the M release
last_heartbeat = (service_ref.get('last_seen_up') or
service_ref['updated_at'] or service_ref['created_at'])
if isinstance(last_heartbeat, six.string_types):
# NOTE(russellb) If this service_ref came in over rpc via
# conductor, then the timestamp will be a string and needs to be

View File

@ -281,6 +281,7 @@ class BaseTestCase(test.TestCase):
'updated_at': None,
'deleted_at': None,
'deleted': False,
'last_seen_up': None,
}
return service

View File

@ -693,6 +693,18 @@ class NovaMigrationsCheckers(test_migrations.ModelsMigrationsSync,
self.assertIsNone(fake_migration.migration_type)
self.assertFalse(fake_migration.hidden)
def _check_294(self, engine, data):
self.assertColumnExists(engine, 'services', 'last_seen_up')
self.assertColumnExists(engine, 'shadow_services', 'last_seen_up')
services = oslodbutils.get_table(engine, 'services')
shadow_services = oslodbutils.get_table(
engine, 'shadow_services')
self.assertIsInstance(services.c.last_seen_up.type,
sqlalchemy.types.DateTime)
self.assertIsInstance(shadow_services.c.last_seen_up.type,
sqlalchemy.types.DateTime)
class TestNovaMigrationsSQLite(NovaMigrationsCheckers,
test_base.DbTestCase,

View File

@ -1156,8 +1156,8 @@ object_data = {
'SecurityGroupList': '1.0-a3bb51998e7d2a95b3e613111e853817',
'SecurityGroupRule': '1.1-ae1da17b79970012e8536f88cb3c6b29',
'SecurityGroupRuleList': '1.1-521f1aeb7b0cc00d026175509289d020',
'Service': '1.12-220e3f7aec641ed08dc4a32af4f38915',
'ServiceList': '1.10-2f49ab65571c0edcbf623f664da612c0',
'Service': '1.13-bc6c9671a91439e08224c2652da5fc4c',
'ServiceList': '1.11-d1728430a30700c143e542b7c75f65b0',
'Tag': '1.0-616bf44af4a22e853c17b37a758ec73e',
'TagList': '1.0-e16d65894484b7530b720792ffbbbd02',
'TestSubclassedObject': '1.6-716fc8b481c9374f7e222de03ba0a621',

View File

@ -36,6 +36,7 @@ fake_service = {
'report_count': 1,
'disabled': False,
'disabled_reason': None,
'last_seen_up': None,
}
OPTIONAL = ['availability_zone', 'compute_node']

View File

@ -40,6 +40,7 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
# Up (equal)
now_mock.return_value = fts_func(fake_now)
service_ref['last_seen_up'] = fts_func(fake_now - self.down_time)
service_ref['updated_at'] = fts_func(fake_now - self.down_time)
service_ref['created_at'] = fts_func(fake_now - self.down_time)
@ -47,17 +48,25 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
self.assertTrue(result)
# Up
service_ref['last_seen_up'] = fts_func(fake_now - self.down_time + 1)
service_ref['updated_at'] = fts_func(fake_now - self.down_time + 1)
service_ref['created_at'] = fts_func(fake_now - self.down_time + 1)
result = self.servicegroup_api.service_is_up(service_ref)
self.assertTrue(result)
# Down
service_ref['last_seen_up'] = fts_func(fake_now - self.down_time - 3)
service_ref['updated_at'] = fts_func(fake_now - self.down_time - 3)
service_ref['created_at'] = fts_func(fake_now - self.down_time - 3)
result = self.servicegroup_api.service_is_up(service_ref)
self.assertFalse(result)
# "last_seen_up" says down, "updated_at" says up.
# This can happen if we do a service disable/enable while it's down.
service_ref['updated_at'] = fts_func(fake_now - self.down_time + 1)
result = self.servicegroup_api.service_is_up(service_ref)
self.assertFalse(result)
def test_join(self):
service = mock.MagicMock(report_interval=1)