fix "down" nova-compute service spuriously marked as "up"
Currently we use the auto-updated "updated_at" field to determine whether a service is "up". An end-user can cause the "updated_at" field to be updated by disabling or enabling the service, thus potentially causing a service that is unavailable to be detected as "up". This could result in the scheduler trying to assign instances to an unavailable compute node, or in the system mistakenly preventing evacuation of an instance. The fix is to add a new field to explicitly track the timestamp of the last time the service sent in a status report and use that if available when testing whether the service is up. DocImpact This commit will cause a behaviour change for the DB servicegroup driver. It will mean that enabling/disabling the service will cause the "updated_at" field to change (as before) but that will no longer be tied to the "up/down" status of the service. So "nova service-list" could show the service as "down" even if it shows a recent "updated_at". (But this could happen for the other servicegroup drivers already.) Closes-Bug: #1420848 Change-Id: Ied7d47363d0489bca3cf2c711217e1a3b7d24a03
This commit is contained in:
parent
e5fe8fd6af
commit
b9bae02af2
@ -540,6 +540,12 @@ def service_update(context, service_id, values):
|
||||
session = get_session()
|
||||
with session.begin():
|
||||
service_ref = _service_get(context, service_id, session=session)
|
||||
# Only servicegroup.drivers.db.DbDriver._report_state() updates
|
||||
# 'report_count', so if that value changes then store the timestamp
|
||||
# as the last time we got a state report.
|
||||
if 'report_count' in values:
|
||||
if values['report_count'] > service_ref.report_count:
|
||||
service_ref.last_seen_up = timeutils.utcnow()
|
||||
service_ref.update(values)
|
||||
|
||||
return service_ref
|
||||
|
@ -0,0 +1,29 @@
|
||||
# Copyright (c) 2015 Wind River Systems Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from sqlalchemy import MetaData, Table, Column, DateTime
|
||||
|
||||
BASE_TABLE_NAME = 'services'
|
||||
NEW_COLUMN_NAME = 'last_seen_up'
|
||||
|
||||
|
||||
def upgrade(migrate_engine):
|
||||
meta = MetaData()
|
||||
meta.bind = migrate_engine
|
||||
|
||||
for prefix in ('', 'shadow_'):
|
||||
table = Table(prefix + BASE_TABLE_NAME, meta, autoload=True)
|
||||
new_column = Column(NEW_COLUMN_NAME, DateTime, nullable=True)
|
||||
if not hasattr(table.c, NEW_COLUMN_NAME):
|
||||
table.create_column(new_column)
|
@ -100,6 +100,7 @@ class Service(BASE, NovaBase):
|
||||
report_count = Column(Integer, nullable=False, default=0)
|
||||
disabled = Column(Boolean, default=False)
|
||||
disabled_reason = Column(String(255))
|
||||
last_seen_up = Column(DateTime, nullable=True)
|
||||
|
||||
|
||||
class ComputeNode(BASE, NovaBase):
|
||||
|
@ -43,7 +43,8 @@ class Service(base.NovaPersistentObject, base.NovaObject,
|
||||
# Version 1.10: Changes behaviour of loading compute_node
|
||||
# Version 1.11: Added get_by_host_and_binary
|
||||
# Version 1.12: ComputeNode version 1.11
|
||||
VERSION = '1.12'
|
||||
# Version 1.13: Added last_seen_up
|
||||
VERSION = '1.13'
|
||||
|
||||
fields = {
|
||||
'id': fields.IntegerField(read_only=True),
|
||||
@ -55,7 +56,8 @@ class Service(base.NovaPersistentObject, base.NovaObject,
|
||||
'disabled_reason': fields.StringField(nullable=True),
|
||||
'availability_zone': fields.StringField(nullable=True),
|
||||
'compute_node': fields.ObjectField('ComputeNode'),
|
||||
}
|
||||
'last_seen_up': fields.DateTimeField(nullable=True),
|
||||
}
|
||||
|
||||
obj_relationships = {
|
||||
'compute_node': [('1.1', '1.4'), ('1.3', '1.5'), ('1.5', '1.6'),
|
||||
@ -65,6 +67,8 @@ class Service(base.NovaPersistentObject, base.NovaObject,
|
||||
|
||||
def obj_make_compatible(self, primitive, target_version):
|
||||
_target_version = utils.convert_version_to_tuple(target_version)
|
||||
if _target_version < (1, 13) and 'last_seen_up' in primitive:
|
||||
del primitive['last_seen_up']
|
||||
if _target_version < (1, 10):
|
||||
target_compute_version = self.obj_calculate_child_version(
|
||||
target_version, 'compute_node')
|
||||
@ -194,7 +198,8 @@ class ServiceList(base.ObjectListBase, base.NovaObject):
|
||||
# Version 1.8: Service version 1.10
|
||||
# Version 1.9: Added get_by_binary() and Service version 1.11
|
||||
# Version 1.10: Service version 1.12
|
||||
VERSION = '1.10'
|
||||
# Version 1.11: Service version 1.13
|
||||
VERSION = '1.11'
|
||||
|
||||
fields = {
|
||||
'objects': fields.ListOfObjectsField('Service'),
|
||||
@ -212,6 +217,7 @@ class ServiceList(base.ObjectListBase, base.NovaObject):
|
||||
'1.8': '1.10',
|
||||
'1.9': '1.11',
|
||||
'1.10': '1.12',
|
||||
'1.11': '1.13',
|
||||
}
|
||||
|
||||
@base.remotable_classmethod
|
||||
|
@ -57,7 +57,10 @@ class DbDriver(base.Driver):
|
||||
"""Moved from nova.utils
|
||||
Check whether a service is up based on last heartbeat.
|
||||
"""
|
||||
last_heartbeat = service_ref['updated_at'] or service_ref['created_at']
|
||||
# Keep checking 'updated_at' if 'last_seen_up' isn't set.
|
||||
# Should be able to use only 'last_seen_up' in the M release
|
||||
last_heartbeat = (service_ref.get('last_seen_up') or
|
||||
service_ref['updated_at'] or service_ref['created_at'])
|
||||
if isinstance(last_heartbeat, six.string_types):
|
||||
# NOTE(russellb) If this service_ref came in over rpc via
|
||||
# conductor, then the timestamp will be a string and needs to be
|
||||
|
@ -281,6 +281,7 @@ class BaseTestCase(test.TestCase):
|
||||
'updated_at': None,
|
||||
'deleted_at': None,
|
||||
'deleted': False,
|
||||
'last_seen_up': None,
|
||||
}
|
||||
return service
|
||||
|
||||
|
@ -693,6 +693,18 @@ class NovaMigrationsCheckers(test_migrations.ModelsMigrationsSync,
|
||||
self.assertIsNone(fake_migration.migration_type)
|
||||
self.assertFalse(fake_migration.hidden)
|
||||
|
||||
def _check_294(self, engine, data):
|
||||
self.assertColumnExists(engine, 'services', 'last_seen_up')
|
||||
self.assertColumnExists(engine, 'shadow_services', 'last_seen_up')
|
||||
|
||||
services = oslodbutils.get_table(engine, 'services')
|
||||
shadow_services = oslodbutils.get_table(
|
||||
engine, 'shadow_services')
|
||||
self.assertIsInstance(services.c.last_seen_up.type,
|
||||
sqlalchemy.types.DateTime)
|
||||
self.assertIsInstance(shadow_services.c.last_seen_up.type,
|
||||
sqlalchemy.types.DateTime)
|
||||
|
||||
|
||||
class TestNovaMigrationsSQLite(NovaMigrationsCheckers,
|
||||
test_base.DbTestCase,
|
||||
|
@ -1156,8 +1156,8 @@ object_data = {
|
||||
'SecurityGroupList': '1.0-a3bb51998e7d2a95b3e613111e853817',
|
||||
'SecurityGroupRule': '1.1-ae1da17b79970012e8536f88cb3c6b29',
|
||||
'SecurityGroupRuleList': '1.1-521f1aeb7b0cc00d026175509289d020',
|
||||
'Service': '1.12-220e3f7aec641ed08dc4a32af4f38915',
|
||||
'ServiceList': '1.10-2f49ab65571c0edcbf623f664da612c0',
|
||||
'Service': '1.13-bc6c9671a91439e08224c2652da5fc4c',
|
||||
'ServiceList': '1.11-d1728430a30700c143e542b7c75f65b0',
|
||||
'Tag': '1.0-616bf44af4a22e853c17b37a758ec73e',
|
||||
'TagList': '1.0-e16d65894484b7530b720792ffbbbd02',
|
||||
'TestSubclassedObject': '1.6-716fc8b481c9374f7e222de03ba0a621',
|
||||
|
@ -36,6 +36,7 @@ fake_service = {
|
||||
'report_count': 1,
|
||||
'disabled': False,
|
||||
'disabled_reason': None,
|
||||
'last_seen_up': None,
|
||||
}
|
||||
|
||||
OPTIONAL = ['availability_zone', 'compute_node']
|
||||
|
@ -40,6 +40,7 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
|
||||
|
||||
# Up (equal)
|
||||
now_mock.return_value = fts_func(fake_now)
|
||||
service_ref['last_seen_up'] = fts_func(fake_now - self.down_time)
|
||||
service_ref['updated_at'] = fts_func(fake_now - self.down_time)
|
||||
service_ref['created_at'] = fts_func(fake_now - self.down_time)
|
||||
|
||||
@ -47,17 +48,25 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
|
||||
self.assertTrue(result)
|
||||
|
||||
# Up
|
||||
service_ref['last_seen_up'] = fts_func(fake_now - self.down_time + 1)
|
||||
service_ref['updated_at'] = fts_func(fake_now - self.down_time + 1)
|
||||
service_ref['created_at'] = fts_func(fake_now - self.down_time + 1)
|
||||
result = self.servicegroup_api.service_is_up(service_ref)
|
||||
self.assertTrue(result)
|
||||
|
||||
# Down
|
||||
service_ref['last_seen_up'] = fts_func(fake_now - self.down_time - 3)
|
||||
service_ref['updated_at'] = fts_func(fake_now - self.down_time - 3)
|
||||
service_ref['created_at'] = fts_func(fake_now - self.down_time - 3)
|
||||
result = self.servicegroup_api.service_is_up(service_ref)
|
||||
self.assertFalse(result)
|
||||
|
||||
# "last_seen_up" says down, "updated_at" says up.
|
||||
# This can happen if we do a service disable/enable while it's down.
|
||||
service_ref['updated_at'] = fts_func(fake_now - self.down_time + 1)
|
||||
result = self.servicegroup_api.service_is_up(service_ref)
|
||||
self.assertFalse(result)
|
||||
|
||||
def test_join(self):
|
||||
service = mock.MagicMock(report_interval=1)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user