Block deleting compute services with in-progress migrations
This builds on I0bd63b655ad3d3d39af8d15c781ce0a45efc8e3a which made DELETE /os-services/{service_id} fail with a 409 response if the host has instances on it. This change checks for in-progress migrations involving the nodes on the host, either as the source or destination nodes, and returns a 409 error response if any are found. Failling to do this can lead to orphaned resource providers in placement and also failing to properly confirm or revert a pending resize or cold migration. A release note is included for the (justified) behavior change in the API. A new microversion should not be required for this since admins should not have to opt out of broken behavior. Change-Id: I70e06c607045a1c0842f13069e51fef438012a9c Closes-Bug: #1852610
This commit is contained in:
parent
f7dde6054e
commit
92fed02610
@ -349,6 +349,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances
|
|||||||
will result in a 409 HTTPConflict response. The instances will need to be
|
will result in a 409 HTTPConflict response. The instances will need to be
|
||||||
migrated or deleted before a compute service can be deleted.
|
migrated or deleted before a compute service can be deleted.
|
||||||
|
|
||||||
|
Similarly, attempts to delete a ``nova-compute`` service which is involved in
|
||||||
|
in-progress migrations will result in a 409 HTTPConflict response. The
|
||||||
|
migrations will need to be completed, for example confirming or reverting a
|
||||||
|
resize, or the instances will need to be deleted before the compute service can
|
||||||
|
be deleted.
|
||||||
|
|
||||||
.. important:: Be sure to stop the actual ``nova-compute`` process on the
|
.. important:: Be sure to stop the actual ``nova-compute`` process on the
|
||||||
physical host *before* deleting the service with this API.
|
physical host *before* deleting the service with this API.
|
||||||
Failing to do so can lead to the running service re-creating
|
Failing to do so can lead to the running service re-creating
|
||||||
|
@ -264,6 +264,16 @@ class ServiceController(wsgi.Controller):
|
|||||||
'is hosting instances. Migrate or '
|
'is hosting instances. Migrate or '
|
||||||
'delete the instances first.'))
|
'delete the instances first.'))
|
||||||
|
|
||||||
|
# Similarly, check to see if the are any in-progress migrations
|
||||||
|
# involving this host because if there are we need to block the
|
||||||
|
# service delete since we could orphan resource providers and
|
||||||
|
# break the ability to do things like confirm/revert instances
|
||||||
|
# in VERIFY_RESIZE status.
|
||||||
|
compute_nodes = objects.ComputeNodeList.get_all_by_host(
|
||||||
|
context, service.host)
|
||||||
|
self._assert_no_in_progress_migrations(
|
||||||
|
context, id, compute_nodes)
|
||||||
|
|
||||||
aggrs = self.aggregate_api.get_aggregates_by_host(context,
|
aggrs = self.aggregate_api.get_aggregates_by_host(context,
|
||||||
service.host)
|
service.host)
|
||||||
for ag in aggrs:
|
for ag in aggrs:
|
||||||
@ -274,8 +284,6 @@ class ServiceController(wsgi.Controller):
|
|||||||
# placement for the compute nodes managed by this service;
|
# placement for the compute nodes managed by this service;
|
||||||
# remember that an ironic compute service can manage multiple
|
# remember that an ironic compute service can manage multiple
|
||||||
# nodes
|
# nodes
|
||||||
compute_nodes = objects.ComputeNodeList.get_all_by_host(
|
|
||||||
context, service.host)
|
|
||||||
for compute_node in compute_nodes:
|
for compute_node in compute_nodes:
|
||||||
try:
|
try:
|
||||||
self.placementclient.delete_resource_provider(
|
self.placementclient.delete_resource_provider(
|
||||||
@ -303,6 +311,36 @@ class ServiceController(wsgi.Controller):
|
|||||||
explanation = _("Service id %s refers to multiple services.") % id
|
explanation = _("Service id %s refers to multiple services.") % id
|
||||||
raise webob.exc.HTTPBadRequest(explanation=explanation)
|
raise webob.exc.HTTPBadRequest(explanation=explanation)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _assert_no_in_progress_migrations(context, service_id, compute_nodes):
|
||||||
|
"""Ensures there are no in-progress migrations on the given nodes.
|
||||||
|
|
||||||
|
:param context: nova auth RequestContext
|
||||||
|
:param service_id: id of the Service being deleted
|
||||||
|
:param compute_nodes: ComputeNodeList of nodes on a compute service
|
||||||
|
:raises: HTTPConflict if there are any in-progress migrations on the
|
||||||
|
nodes
|
||||||
|
"""
|
||||||
|
for cn in compute_nodes:
|
||||||
|
migrations = (
|
||||||
|
objects.MigrationList.get_in_progress_by_host_and_node(
|
||||||
|
context, cn.host, cn.hypervisor_hostname))
|
||||||
|
if migrations:
|
||||||
|
# Log the migrations for the operator and then raise
|
||||||
|
# a 409 error.
|
||||||
|
LOG.info('Unable to delete compute service with id %s '
|
||||||
|
'for host %s. There are %i in-progress '
|
||||||
|
'migrations involving the host. Migrations '
|
||||||
|
'(uuid:status): %s',
|
||||||
|
service_id, cn.host, len(migrations),
|
||||||
|
','.join(['%s:%s' % (mig.uuid, mig.status)
|
||||||
|
for mig in migrations]))
|
||||||
|
raise webob.exc.HTTPConflict(
|
||||||
|
explanation=_(
|
||||||
|
'Unable to delete compute service that has '
|
||||||
|
'in-progress migrations. Complete the '
|
||||||
|
'migrations or delete the instances first.'))
|
||||||
|
|
||||||
@validation.query_schema(services.index_query_schema_275, '2.75')
|
@validation.query_schema(services.index_query_schema_275, '2.75')
|
||||||
@validation.query_schema(services.index_query_schema, '2.0', '2.74')
|
@validation.query_schema(services.index_query_schema, '2.0', '2.74')
|
||||||
@wsgi.expected_errors(())
|
@wsgi.expected_errors(())
|
||||||
|
@ -955,6 +955,18 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
|
|||||||
'compute_confirm_resize', 'success')
|
'compute_confirm_resize', 'success')
|
||||||
return server
|
return server
|
||||||
|
|
||||||
|
def _revert_resize(self, server):
|
||||||
|
self.api.post_server_action(server['id'], {'revertResize': None})
|
||||||
|
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
|
||||||
|
self._wait_for_migration_status(server, ['reverted'])
|
||||||
|
# Note that the migration status is changed to "reverted" in the
|
||||||
|
# dest host revert_resize method but the allocations are cleaned up
|
||||||
|
# in the source host finish_revert_resize method so we need to wait
|
||||||
|
# for the finish_revert_resize method to complete.
|
||||||
|
fake_notifier.wait_for_versioned_notifications(
|
||||||
|
'instance.resize_revert.end')
|
||||||
|
return server
|
||||||
|
|
||||||
def get_unused_flavor_name_id(self):
|
def get_unused_flavor_name_id(self):
|
||||||
flavors = self.api.get_flavors()
|
flavors = self.api.get_flavors()
|
||||||
flavor_names = list()
|
flavor_names = list()
|
||||||
|
@ -194,21 +194,29 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
|
|||||||
# Delete the source compute service.
|
# Delete the source compute service.
|
||||||
service = self.admin_api.get_services(
|
service = self.admin_api.get_services(
|
||||||
binary='nova-compute', host='host1')[0]
|
binary='nova-compute', host='host1')[0]
|
||||||
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
# We expect the delete request to fail with a 409 error because of the
|
||||||
# FIXME(mriedem): This is bug 1852610 where the compute service is
|
# instance in VERIFY_RESIZE status even though that instance is marked
|
||||||
# deleted but the resource provider is not because there are still
|
# as being on host2 now.
|
||||||
# migration-based allocations against the source node provider.
|
ex = self.assertRaises(api_client.OpenStackApiException,
|
||||||
|
self.admin_api.api_delete,
|
||||||
|
'/os-services/%s' % service['id'])
|
||||||
|
self.assertEqual(409, ex.response.status_code)
|
||||||
|
self.assertIn('Unable to delete compute service that has in-progress '
|
||||||
|
'migrations', six.text_type(ex))
|
||||||
|
self.assertIn('There are 1 in-progress migrations involving the host',
|
||||||
|
self.stdlog.logger.output)
|
||||||
|
# The provider is still around because we did not delete the service.
|
||||||
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
||||||
self.assertEqual(200, resp.status)
|
self.assertEqual(200, resp.status)
|
||||||
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
|
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
|
||||||
# Now try to confirm the migration.
|
# Now try to confirm the migration.
|
||||||
# FIXME(mriedem): This will fail until bug 1852610 is fixed and the
|
self._confirm_resize(server)
|
||||||
# source compute service delete is blocked while there is an
|
# Delete the host1 service since the migration is confirmed and the
|
||||||
# in-progress migration involving the node.
|
# server is on host2.
|
||||||
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
|
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
||||||
self.api.post_server_action(server['id'], {'confirmResize': None})
|
# The host1 resource provider should be gone.
|
||||||
self._wait_for_state_change(self.api, server, 'ERROR')
|
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
||||||
self.assertIn('ComputeHostNotFound', self.stdlog.logger.output)
|
self.assertEqual(404, resp.status)
|
||||||
|
|
||||||
def test_resize_revert_after_deleted_source_compute(self):
|
def test_resize_revert_after_deleted_source_compute(self):
|
||||||
"""Tests a scenario where a server is resized and while in
|
"""Tests a scenario where a server is resized and while in
|
||||||
@ -231,25 +239,34 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
|
|||||||
# Delete the source compute service.
|
# Delete the source compute service.
|
||||||
service = self.admin_api.get_services(
|
service = self.admin_api.get_services(
|
||||||
binary='nova-compute', host='host1')[0]
|
binary='nova-compute', host='host1')[0]
|
||||||
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
# We expect the delete request to fail with a 409 error because of the
|
||||||
# FIXME(mriedem): This is bug 1852610 where the compute service is
|
# instance in VERIFY_RESIZE status even though that instance is marked
|
||||||
# deleted but the resource provider is not because there are still
|
# as being on host2 now.
|
||||||
# migration-based allocations against the source node provider.
|
ex = self.assertRaises(api_client.OpenStackApiException,
|
||||||
|
self.admin_api.api_delete,
|
||||||
|
'/os-services/%s' % service['id'])
|
||||||
|
self.assertEqual(409, ex.response.status_code)
|
||||||
|
self.assertIn('Unable to delete compute service that has in-progress '
|
||||||
|
'migrations', six.text_type(ex))
|
||||||
|
self.assertIn('There are 1 in-progress migrations involving the host',
|
||||||
|
self.stdlog.logger.output)
|
||||||
|
# The provider is still around because we did not delete the service.
|
||||||
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
||||||
self.assertEqual(200, resp.status)
|
self.assertEqual(200, resp.status)
|
||||||
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
|
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
|
||||||
# Now try to revert the resize.
|
# Now revert the resize.
|
||||||
# NOTE(mriedem): This actually works because the drop_move_claim
|
self._revert_resize(server)
|
||||||
# happens in revert_resize on the dest host which still has its
|
|
||||||
# ComputeNode record. The migration-based allocations are reverted
|
|
||||||
# so the instance holds the allocations for the source provider and
|
|
||||||
# the allocations against the dest provider are dropped.
|
|
||||||
self.api.post_server_action(server['id'], {'revertResize': None})
|
|
||||||
self._wait_for_state_change(self.api, server, 'ACTIVE')
|
|
||||||
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
|
|
||||||
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
|
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
|
||||||
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
|
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
|
||||||
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
|
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
|
||||||
|
# Delete the host2 service since the migration is reverted and the
|
||||||
|
# server is on host1 again.
|
||||||
|
service2 = self.admin_api.get_services(
|
||||||
|
binary='nova-compute', host='host2')[0]
|
||||||
|
self.admin_api.api_delete('/os-services/%s' % service2['id'])
|
||||||
|
# The host2 resource provider should be gone.
|
||||||
|
resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid)
|
||||||
|
self.assertEqual(404, resp.status)
|
||||||
|
|
||||||
|
|
||||||
class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):
|
class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):
|
||||||
|
@ -0,0 +1,10 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
The ``DELETE /os-services/{service_id}`` compute API will now return a
|
||||||
|
``409 HTTPConflict`` response when trying to delete a ``nova-compute``
|
||||||
|
service which is involved in in-progress migrations. This is because doing
|
||||||
|
so would not only orphan the compute node resource provider in the
|
||||||
|
placement service on which those instances have resource allocations but
|
||||||
|
can also break the ability to confirm/revert a pending resize properly.
|
||||||
|
See https://bugs.launchpad.net/nova/+bug/1852610 for more details.
|
Loading…
x
Reference in New Issue
Block a user