diff --git a/doc/source/cli/nova-manage.rst b/doc/source/cli/nova-manage.rst
index a59eff8464d2..ee5ddec26c88 100644
--- a/doc/source/cli/nova-manage.rst
+++ b/doc/source/cli/nova-manage.rst
@@ -686,6 +686,42 @@ Placement
       - An unexpected error occurred.
 
+``nova-manage placement audit [--verbose] [--delete] [--resource_provider <uuid>]``
+  Iterates over all the Resource Providers (or only the one whose UUID you
+  provide) and verifies whether each compute allocation is related to an
+  existing instance or migration UUID.
+  If not, it reports which allocations are orphaned.
+
+  You can also ask the command to delete all the orphaned allocations it
+  finds by specifying ``--delete``.
+
+  Specify ``--verbose`` to get detailed progress output during execution.
+
+  This command requires that the
+  :oslo.config:option:`api_database.connection` and
+  :oslo.config:group:`placement` configuration options are set. Placement API
+  >= 1.14 is required.
+
+  **Return Codes**
+
+  .. list-table::
+    :widths: 20 80
+    :header-rows: 1
+
+    * - Return code
+      - Description
+    * - 0
+      - No orphaned allocations were found
+    * - 1
+      - An unexpected error occurred
+    * - 3
+      - Orphaned allocations were found
+    * - 4
+      - All found orphaned allocations were deleted
+    * - 127
+      - Invalid input
+
+
 See Also
 ========
diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py
index ccd67dc5485c..ea3c67ba7ef0 100644
--- a/nova/cmd/manage.py
+++ b/nova/cmd/manage.py
@@ -32,6 +32,7 @@ import traceback
 from dateutil import parser as dateutil_parser
 from keystoneauth1 import exceptions as ks_exc
 from neutronclient.common import exceptions as neutron_client_exc
+import os_resource_classes as orc
 from oslo_config import cfg
 from oslo_db import exception as db_exc
 from oslo_log import log as logging
@@ -2391,6 +2392,300 @@ class PlacementCommands(object):
 
         return return_code
 
+    def _get_instances_and_current_migrations(self, ctxt, cn_uuid):
+        """Returns the existing instance UUIDs and in-progress migration
+        UUIDs for the compute node identified by cn_uuid, or False if the
+        compute node can't be found in any cell.
+        """
+        if self.cn_uuid_mapping.get(cn_uuid):
+            cell_uuid, cn_host, cn_node = self.cn_uuid_mapping[cn_uuid]
+        else:
+            # We need to find the compute node record from all cells.
+            results = context.scatter_gather_skip_cell0(
+                ctxt, objects.ComputeNode.get_by_uuid, cn_uuid)
+            for result_cell_uuid, result in results.items():
+                if not context.is_cell_failure_sentinel(result):
+                    cn = result
+                    cell_uuid = result_cell_uuid
+                    break
+            else:
+                return False
+            cn_host, cn_node = (cn.host, cn.hypervisor_hostname)
+            self.cn_uuid_mapping[cn_uuid] = (cell_uuid, cn_host, cn_node)
+        cell_mapping = objects.CellMapping.get_by_uuid(ctxt, cell_uuid)
+
+        # Get all the active instances from this compute node
+        if self.instances_mapping.get(cn_uuid):
+            inst_uuids = self.instances_mapping[cn_uuid]
+        else:
+            # Get the instance list record from the cell.
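+            # Instances live in the cell database, so we have to target the
+            # cell owning this compute node before listing them by host and
+            # hypervisor node name.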
+ with context.target_cell(ctxt, cell_mapping) as cctxt: + instances = objects.InstanceList.get_by_host_and_node( + cctxt, cn_host, cn_node, expected_attrs=[]) + inst_uuids = [instance.uuid for instance in instances] + self.instances_mapping[cn_uuid] = inst_uuids + + # Get all *active* migrations for this compute node + # NOTE(sbauza): Since migrations are transient, it's better to not + # cache the results as they could be stale + with context.target_cell(ctxt, cell_mapping) as cctxt: + migs = objects.MigrationList.get_in_progress_by_host_and_node( + cctxt, cn_host, cn_node) + mig_uuids = [migration.uuid for migration in migs] + + return (inst_uuids, mig_uuids) + + def _delete_allocations_from_consumer(self, ctxt, placement, provider, + consumer_uuid, consumer_type): + """Deletes allocations from a resource provider with consumer UUID. + + :param ctxt: nova.context.RequestContext + :param placement: nova.scheduler.client.report.SchedulerReportClient + to communicate with the Placement service API. + :param provider: Resource Provider to look at. + :param consumer_uuid: the consumer UUID having allocations. + :param consumer_type: the type of consumer, + either 'instance' or 'migration' + :returns: bool whether the allocations were deleted. + """ + # We need to be careful and only remove the allocations + # against this specific RP or we would delete the + # whole instance usage and then it would require some + # healing. + # TODO(sbauza): Remove this extra check once placement + # supports querying allocation delete on both + # consumer and resource provider parameters. + allocations = placement.get_allocs_for_consumer( + ctxt, consumer_uuid) + if len(allocations['allocations']) > 1: + # This consumer has resources spreaded amongst + # multiple RPs (think nested or shared for example) + # We then need to just update the usage to remove + # the orphaned resources on the specific RP + del allocations['allocations'][provider['uuid']] + try: + placement.put_allocations( + ctxt, consumer_uuid, allocations) + except exception.AllocationUpdateFailed: + return False + + else: + try: + placement.delete_allocation_for_instance( + ctxt, consumer_uuid, consumer_type) + except exception.AllocationDeleteFailed: + return False + return True + + def _check_orphaned_allocations_for_provider(self, ctxt, placement, + output, provider, + delete): + """Finds orphaned allocations for a specific resource provider. + + :param ctxt: nova.context.RequestContext + :param placement: nova.scheduler.client.report.SchedulerReportClient + to communicate with the Placement service API. + :param output: function that takes a single message for verbose output + :param provider: Resource Provider to look at. + :param delete: deletes the found orphaned allocations. + :return: a tuple (, ) + """ + num_processed = 0 + faults = 0 + + # TODO(sbauza): Are we sure we have all Nova RCs ? + # FIXME(sbauza): Possibly use consumer types once Placement API + # supports them. + # NOTE(sbauza): We check allocations having *any* below RC, not having + # *all* of them. + NOVA_RCS = [orc.VCPU, orc.MEMORY_MB, orc.DISK_GB, orc.VGPU, + orc.NET_BW_EGR_KILOBIT_PER_SEC, + orc.NET_BW_IGR_KILOBIT_PER_SEC, + orc.PCPU, orc.MEM_ENCRYPTION_CONTEXT] + + # Since the RP can be a child RP, we need to get the root RP as it's + # the compute node UUID + # NOTE(sbauza): In case Placement doesn't support 1.14 microversion, + # that means we don't have nested RPs. + # Since we ask for microversion 1.14, all RPs have a root RP UUID. 
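+        # For a non-nested resource provider, the root provider UUID is
+        # simply its own UUID, so this lookup works for both flat and nested
+        # provider trees.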
+ cn_uuid = provider.get("root_provider_uuid") + # Now get all the existing instances and active migrations for this + # compute node + result = self._get_instances_and_current_migrations(ctxt, cn_uuid) + if result is False: + # We don't want to hard stop here because the compute service could + # have disappear while we could still have orphaned allocations. + output(_('The compute node for UUID %s can not be ' + 'found') % cn_uuid) + inst_uuids, mig_uuids = result or ([], []) + try: + pallocs = placement.get_allocations_for_resource_provider( + ctxt, provider['uuid']) + except exception.ResourceProviderAllocationRetrievalFailed: + print(_('Not able to find allocations for resource ' + 'provider %s.') % provider['uuid']) + raise + + # Verify every allocations for each consumer UUID + for consumer_uuid, consumer_resources in six.iteritems( + pallocs.allocations): + consumer_allocs = consumer_resources['resources'] + if any(rc in NOVA_RCS + for rc in consumer_allocs): + # We reset the consumer type for each allocation + consumer_type = None + # This is an allocation for Nova resources + # We need to guess whether the instance was deleted + # or if the instance is currently migrating + if not (consumer_uuid in inst_uuids or + consumer_uuid in mig_uuids): + # By default we suspect the orphaned allocation was for a + # migration... + consumer_type = 'migration' + if not(consumer_uuid in inst_uuids): + # ... but if we can't find it either for an instance, + # that means it was for this. + consumer_type = 'instance' + if consumer_type is not None: + output(_('Allocations were set against consumer UUID ' + '%(consumer_uuid)s but no existing instances or ' + 'active migrations are related. ') + % {'consumer_uuid': consumer_uuid}) + if delete: + deleted = self._delete_allocations_from_consumer( + ctxt, placement, provider, consumer_uuid, + consumer_type) + if not deleted: + print(_('Not able to delete allocations ' + 'for consumer UUID %s') + % consumer_uuid) + faults += 1 + continue + output(_('Deleted allocations for consumer UUID ' + '%(consumer_uuid)s on Resource Provider ' + '%(rp)s: %(allocations)s') + % {'consumer_uuid': consumer_uuid, + 'rp': provider['uuid'], + 'allocations': consumer_allocs}) + else: + output(_('Allocations for consumer UUID ' + '%(consumer_uuid)s on Resource Provider ' + '%(rp)s can be deleted: ' + '%(allocations)s') + % {'consumer_uuid': consumer_uuid, + 'rp': provider['uuid'], + 'allocations': consumer_allocs}) + num_processed += 1 + return (num_processed, faults) + + # TODO(sbauza): Move this to the scheduler report client ? + def _get_resource_provider(self, context, placement, uuid): + """Returns a single Resource Provider by its UUID. + + :param context: The nova.context.RequestContext auth context + :param placement: nova.scheduler.client.report.SchedulerReportClient + to communicate with the Placement service API. + :param uuid: A specific Resource Provider UUID + :return: the existing resource provider. + :raises: keystoneauth1.exceptions.base.ClientException on failure to + communicate with the placement API + """ + + resource_providers = self._get_resource_providers(context, placement, + uuid=uuid) + if not resource_providers: + # The endpoint never returns a 404, it rather returns an empty list + raise exception.ResourceProviderNotFound(name_or_uuid=uuid) + return resource_providers[0] + + def _get_resource_providers(self, context, placement, **kwargs): + """Returns all resource providers regardless of their relationships. 
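+
+        When a ``uuid`` keyword argument is passed, the request to the
+        Placement API is narrowed to that single resource provider.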
+ + :param context: The nova.context.RequestContext auth context + :param placement: nova.scheduler.client.report.SchedulerReportClient + to communicate with the Placement service API. + :param kwargs: extra attributes for the query string + :return: list of resource providers. + :raises: keystoneauth1.exceptions.base.ClientException on failure to + communicate with the placement API + """ + url = '/resource_providers' + if 'uuid' in kwargs: + url += '&uuid=%s' % kwargs['uuid'] + + resp = placement.get(url, global_request_id=context.global_id, + version='1.14') + if resp is None: + raise exception.PlacementAPIConnectFailure() + + data = resp.json() + resource_providers = data.get('resource_providers') + + return resource_providers + + @action_description( + _("Audits orphaned allocations that are no longer corresponding to " + "existing instance resources. This command requires that " + "the [api_database]/connection and [placement] configuration " + "options are set.")) + @args('--verbose', action='store_true', dest='verbose', default=False, + help='Provide verbose output during execution.') + @args('--resource_provider', metavar='', + dest='provider_uuid', + help='UUID of a specific resource provider to verify.') + @args('--delete', action='store_true', dest='delete', default=False, + help='Deletes orphaned allocations that were found.') + def audit(self, verbose=False, provider_uuid=None, delete=False): + """Provides information about orphaned allocations that can be removed + + Return codes: + + * 0: Command completed successfully and no orphaned allocations exist. + * 1: An unexpected error happened during run. + * 3: Orphaned allocations were detected. + * 4: Orphaned allocations were detected and deleted. + * 127: Invalid input. + """ + + ctxt = context.get_admin_context() + output = lambda msg: None + if verbose: + output = lambda msg: print(msg) + + placement = report.SchedulerReportClient() + # Resets two in-memory dicts for knowing instances per compute node + self.cn_uuid_mapping = collections.defaultdict(tuple) + self.instances_mapping = collections.defaultdict(list) + + num_processed = 0 + faults = 0 + + if provider_uuid: + try: + resource_provider = self._get_resource_provider( + ctxt, placement, provider_uuid) + except exception.ResourceProviderNotFound: + print(_('Resource provider with UUID %s does not exist.') % + provider_uuid) + return 127 + resource_providers = [resource_provider] + else: + resource_providers = self._get_resource_providers(ctxt, placement) + + for provider in resource_providers: + (nb_p, faults) = self._check_orphaned_allocations_for_provider( + ctxt, placement, output, provider, delete) + num_processed += nb_p + if faults > 0: + print(_('The Resource Provider %s had problems when ' + 'deleting allocations. Stopping now. Please fix the ' + 'problem by hand and run again.') % + provider['uuid']) + return 1 + if num_processed > 0: + suffix = 's.' if num_processed > 1 else '.' 
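+            # Pluralize the final summary message only when more than one
+            # allocation was processed across all the audited providers.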
+ output(_('Processed %(num)s allocation%(suffix)s') + % {'num': num_processed, + 'suffix': suffix}) + return 4 if delete else 3 + return 0 + CATEGORIES = { 'api_db': ApiDbCommands, diff --git a/nova/tests/functional/test_nova_manage.py b/nova/tests/functional/test_nova_manage.py index 2634bde14eb5..a98f97573c21 100644 --- a/nova/tests/functional/test_nova_manage.py +++ b/nova/tests/functional/test_nova_manage.py @@ -1393,6 +1393,232 @@ class TestNovaManagePlacementSyncAggregates( '%s should be in two provider aggregates' % host) +class TestNovaManagePlacementAudit( + integrated_helpers.ProviderUsageBaseTestCase): + """Functional tests for nova-manage placement audit""" + + # Let's just use a simple fake driver + compute_driver = 'fake.SmallFakeDriver' + + def setUp(self): + super(TestNovaManagePlacementAudit, self).setUp() + self.cli = manage.PlacementCommands() + # Make sure we have two computes for migrations + self.compute1 = self._start_compute('host1') + self.compute2 = self._start_compute('host2') + + # Make sure we have two hypervisors reported in the API. + hypervisors = self.admin_api.api_get( + '/os-hypervisors').body['hypervisors'] + self.assertEqual(2, len(hypervisors)) + + self.output = StringIO() + self.useFixture(fixtures.MonkeyPatch('sys.stdout', self.output)) + + self.flavor = self.api.get_flavors()[0] + + def _delete_instance_but_keep_its_allocations(self, server): + """Mocks out the call to Placement for deleting the allocations but + still performs the instance deletion. + """ + + with mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'delete_allocation_for_instance'): + self.api.delete_server(server['id']) + self._wait_until_deleted(server) + + def test_audit_orphaned_allocation_from_instance_delete(self): + """Creates a server and deletes it by retaining its allocations so the + audit command can find it. + """ + target_hostname = self.compute1.host + rp_uuid = self._get_provider_uuid_by_host(target_hostname) + + server = self._boot_and_check_allocations(self.flavor, target_hostname) + + # let's mock the allocation delete call to placement + with mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'delete_allocation_for_instance'): + self.api.delete_server(server['id']) + self._wait_until_deleted(server) + + # make sure the allocation is still around + self.assertFlavorMatchesUsage(rp_uuid, self.flavor) + + # Don't ask to delete the orphaned allocations, just audit them + ret = self.cli.audit(verbose=True) + # The allocation should still exist + self.assertFlavorMatchesUsage(rp_uuid, self.flavor) + + output = self.output.getvalue() + self.assertIn( + 'Allocations for consumer UUID %(consumer_uuid)s on ' + 'Resource Provider %(rp_uuid)s can be deleted' % + {'consumer_uuid': server['id'], + 'rp_uuid': rp_uuid}, + output) + self.assertIn('Processed 1 allocation.', output) + self.assertEqual(3, ret) + + # Now ask the audit command to delete the rogue allocations. + ret = self.cli.audit(delete=True, verbose=True) + + # The allocations are now deleted + self.assertRequestMatchesUsage({'VCPU': 0, + 'MEMORY_MB': 0, + 'DISK_GB': 0}, rp_uuid) + + output = self.output.getvalue() + self.assertIn( + 'Deleted allocations for consumer UUID %s' % server['id'], output) + self.assertIn('Processed 1 allocation.', output) + self.assertEqual(4, ret) + + def test_audit_orphaned_allocations_from_confirmed_resize(self): + """Resize a server but when confirming it, leave the migration + allocation there so the audit command can find it. 
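+
+        Confirming the resize would normally remove the allocations held by
+        the migration UUID against the source node, so that Placement call
+        is mocked out to leave them behind on purpose.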
+ """ + source_hostname = self.compute1.host + dest_hostname = self.compute2.host + + source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) + dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname) + + old_flavor = self.flavor + new_flavor = self.api.get_flavors()[1] + # we want to make sure we resize to compute2 + self.flags(allow_resize_to_same_host=False) + + server = self._boot_and_check_allocations(self.flavor, source_hostname) + + # Do a resize + post = { + 'resize': { + 'flavorRef': new_flavor['id'] + } + } + self._move_and_check_allocations( + server, request=post, old_flavor=old_flavor, + new_flavor=new_flavor, source_rp_uuid=source_rp_uuid, + dest_rp_uuid=dest_rp_uuid) + + # Retain the migration UUID record for later usage + migration_uuid = self.get_migration_uuid_for_instance(server['id']) + + # Confirm the resize so it should in theory delete the source + # allocations but mock out the allocation delete for the source + post = {'confirmResize': None} + with mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'delete_allocation_for_instance'): + self.api.post_server_action( + server['id'], post, check_response_status=[204]) + self._wait_for_state_change(server, 'ACTIVE') + + # The target host usage should be according to the new flavor... + self.assertFlavorMatchesUsage(dest_rp_uuid, new_flavor) + # ...but we should still see allocations for the source compute + self.assertFlavorMatchesUsage(source_rp_uuid, old_flavor) + + # Now, run the audit command that will find this orphaned allocation + ret = self.cli.audit(verbose=True) + output = self.output.getvalue() + self.assertIn( + 'Allocations for consumer UUID %(consumer_uuid)s on ' + 'Resource Provider %(rp_uuid)s can be deleted' % + {'consumer_uuid': migration_uuid, + 'rp_uuid': source_rp_uuid}, + output) + self.assertIn('Processed 1 allocation.', output) + self.assertEqual(3, ret) + + # Now we want to delete the orphaned allocation that is duplicate + ret = self.cli.audit(delete=True, verbose=True) + + # There should be no longer usage for the source host since the + # allocation disappeared + self.assertRequestMatchesUsage({'VCPU': 0, + 'MEMORY_MB': 0, + 'DISK_GB': 0}, source_rp_uuid) + + output = self.output.getvalue() + self.assertIn( + 'Deleted allocations for consumer UUID %(consumer_uuid)s on ' + 'Resource Provider %(rp_uuid)s' % + {'consumer_uuid': migration_uuid, + 'rp_uuid': source_rp_uuid}, + output) + self.assertIn('Processed 1 allocation.', output) + self.assertEqual(4, ret) + + # TODO(sbauza): Mock this test once bug #1829479 is fixed + def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self): + """Evacuate a server and the delete the source node so that it will + leave a source allocation that the audit command will find. 
+ """ + + source_hostname = self.compute1.host + dest_hostname = self.compute2.host + + source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) + dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname) + + server = self._boot_and_check_allocations(self.flavor, source_hostname) + + # Stop the service and fake it down + self.compute1.stop() + source_service_id = self.admin_api.get_services( + host=source_hostname, binary='nova-compute')[0]['id'] + self.admin_api.put_service(source_service_id, {'forced_down': 'true'}) + + # evacuate the instance to the target + post = {'evacuate': {"host": dest_hostname}} + self.admin_api.post_server_action(server['id'], post) + self._wait_for_server_parameter(server, + {'OS-EXT-SRV-ATTR:host': dest_hostname, + 'status': 'ACTIVE'}) + + # Now the instance is gone, we can delete the compute service + self.admin_api.api_delete('/os-services/%s' % source_service_id) + + # Since the compute is deleted, we should have in theory a single + # allocation against the destination resource provider, but evacuated + # instances are not having their allocations deleted. See bug #1829479. + # We have two allocations for the same consumer, source and destination + self._check_allocation_during_evacuate( + self.flavor, server['id'], source_rp_uuid, dest_rp_uuid) + + # Now, run the audit command that will find this orphaned allocation + ret = self.cli.audit(verbose=True) + output = self.output.getvalue() + self.assertIn( + 'Allocations for consumer UUID %(consumer_uuid)s on ' + 'Resource Provider %(rp_uuid)s can be deleted' % + {'consumer_uuid': server['id'], + 'rp_uuid': source_rp_uuid}, + output) + self.assertIn('Processed 1 allocation.', output) + self.assertEqual(3, ret) + + # Now we want to delete the orphaned allocation that is duplicate + ret = self.cli.audit(delete=True, verbose=True) + + # We finally should only have the target allocations + self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor) + self.assertRequestMatchesUsage({'VCPU': 0, + 'MEMORY_MB': 0, + 'DISK_GB': 0}, source_rp_uuid) + + output = self.output.getvalue() + self.assertIn( + 'Deleted allocations for consumer UUID %(consumer_uuid)s on ' + 'Resource Provider %(rp_uuid)s' % + {'consumer_uuid': server['id'], + 'rp_uuid': source_rp_uuid}, + output) + self.assertIn('Processed 1 allocation.', output) + self.assertEqual(4, ret) + + class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase): """Functional tests for the "nova-manage db archive_deleted_rows" CLI.""" api_major_version = 'v2.1' diff --git a/nova/tests/unit/cmd/test_manage.py b/nova/tests/unit/cmd/test_manage.py index 9782587941b6..d23b13412ae9 100644 --- a/nova/tests/unit/cmd/test_manage.py +++ b/nova/tests/unit/cmd/test_manage.py @@ -34,6 +34,7 @@ from nova.db import migration from nova.db.sqlalchemy import migration as sqla_migration from nova import exception from nova import objects +from nova.scheduler.client import report from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.unit import fake_requests @@ -2851,6 +2852,142 @@ class TestNovaManagePlacement(test.NoDBTestCase): neutron.update_port.assert_called_once_with( uuidsentinel.port_id, body=expected_update_body) + def test_audit_with_wrong_provider_uuid(self): + with mock.patch.object( + self.cli, '_get_resource_provider', + side_effect=exception.ResourceProviderNotFound( + name_or_uuid=uuidsentinel.fake_uuid)): + ret = self.cli.audit( + provider_uuid=uuidsentinel.fake_uuid) + self.assertEqual(127, ret) + output = 
self.output.getvalue() + self.assertIn( + 'Resource provider with UUID %s' % uuidsentinel.fake_uuid, + output) + + @mock.patch.object(manage.PlacementCommands, + '_check_orphaned_allocations_for_provider') + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.get') + def _test_audit(self, get_resource_providers, check_orphaned_allocs, + verbose=False, delete=False, errors=False, found=False): + rps = [ + {"generation": 1, + "uuid": uuidsentinel.rp1, + "links": None, + "name": "rp1", + "parent_provider_uuid": None, + "root_provider_uuid": uuidsentinel.rp1}, + {"generation": 1, + "uuid": uuidsentinel.rp2, + "links": None, + "name": "rp2", + "parent_provider_uuid": None, + "root_provider_uuid": uuidsentinel.rp2}, + ] + get_resource_providers.return_value = fake_requests.FakeResponse( + 200, content=jsonutils.dumps({"resource_providers": rps})) + + if errors: + # We found one orphaned allocation per RP but RP1 got a fault + check_orphaned_allocs.side_effect = ((1, 1), (1, 0)) + elif found: + # we found one orphaned allocation per RP and we had no faults + check_orphaned_allocs.side_effect = ((1, 0), (1, 0)) + else: + # No orphaned allocations are found for all the RPs + check_orphaned_allocs.side_effect = ((0, 0), (0, 0)) + + ret = self.cli.audit(verbose=verbose, delete=delete) + if errors: + # Any fault stops the audit and provides a return code equals to 1 + expected_ret = 1 + elif found and delete: + # We found orphaned allocations and deleted them + expected_ret = 4 + elif found and not delete: + # We found orphaned allocations but we left them + expected_ret = 3 + else: + # Nothing was found + expected_ret = 0 + self.assertEqual(expected_ret, ret) + + call1 = mock.call(mock.ANY, mock.ANY, mock.ANY, rps[0], delete) + call2 = mock.call(mock.ANY, mock.ANY, mock.ANY, rps[1], delete) + if errors: + # We stop checking other RPs once we got a fault + check_orphaned_allocs.assert_has_calls([call1]) + else: + # All the RPs are checked + check_orphaned_allocs.assert_has_calls([call1, call2]) + + if verbose and found: + output = self.output.getvalue() + self.assertIn('Processed 2 allocations', output) + if errors: + output = self.output.getvalue() + self.assertIn( + 'The Resource Provider %s had problems' % rps[0]["uuid"], + output) + + def test_audit_not_found_orphaned_allocs(self): + self._test_audit(found=False) + + def test_audit_found_orphaned_allocs_not_verbose(self): + self._test_audit(found=True) + + def test_audit_found_orphaned_allocs_verbose(self): + self._test_audit(found=True, verbose=True) + + def test_audit_found_orphaned_allocs_and_deleted_them(self): + self._test_audit(found=True, delete=True) + + def test_audit_found_orphaned_allocs_but_got_errors(self): + self._test_audit(errors=True) + + @mock.patch.object(manage.PlacementCommands, + '_delete_allocations_from_consumer') + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(manage.PlacementCommands,
+                       '_get_instances_and_current_migrations')
+    def test_check_orphaned_allocations_for_provider(self,
+                                                     get_insts_and_migs,
+                                                     get_allocs_for_rp,
+                                                     delete_allocs):
+        provider = {"generation": 1,
+                    "uuid": uuidsentinel.rp1,
+                    "links": None,
+                    "name": "rp1",
+                    "parent_provider_uuid": None,
+                    "root_provider_uuid": uuidsentinel.rp1}
+        compute_resources = {'VCPU': 1, 'MEMORY_MB': 2048, 'DISK_GB': 20}
+        allocations = {
+            # Some orphaned compute allocation
+            uuidsentinel.orphaned_alloc1: {'resources': compute_resources},
+            # Some existing instance allocation
+            uuidsentinel.inst1: {'resources': compute_resources},
+            # Some existing migration allocation
+            uuidsentinel.mig1: {'resources': compute_resources},
+            # Some other allocation not related to Nova
+            uuidsentinel.other_alloc1: {'resources': {'CUSTOM_GOO'}},
+        }
+
+        get_insts_and_migs.return_value = (
+            [uuidsentinel.inst1],
+            [uuidsentinel.mig1])
+        get_allocs_for_rp.return_value = report.ProviderAllocInfo(allocations)
+
+        ctxt = context.RequestContext()
+        placement = report.SchedulerReportClient()
+        ret = self.cli._check_orphaned_allocations_for_provider(
+            ctxt, placement, lambda x: x, provider, True)
+        get_allocs_for_rp.assert_called_once_with(ctxt, uuidsentinel.rp1)
+        delete_allocs.assert_called_once_with(ctxt, placement, provider,
+                                              uuidsentinel.orphaned_alloc1,
+                                              'instance')
+        self.assertEqual((1, 0), ret)
+
+
 class TestNovaManageMain(test.NoDBTestCase):
     """Tests the nova-manage:main() setup code."""
diff --git a/releasenotes/notes/placement-audit-59a00dcfb188c6ac.yaml b/releasenotes/notes/placement-audit-59a00dcfb188c6ac.yaml
new file mode 100644
index 000000000000..f7b232b6a373
--- /dev/null
+++ b/releasenotes/notes/placement-audit-59a00dcfb188c6ac.yaml
@@ -0,0 +1,12 @@
+---
+other:
+  - |
+    A new ``nova-manage placement audit`` CLI has been added to help identify
+    orphaned compute allocations in the Placement API that are no longer
+    related to either instances or migrations.
+    Because of race conditions, Nova could sometimes fail to remove the
+    allocations of instances or migrations once they were done, which would
+    eventually create capacity issues. This command lists those orphaned
+    allocations and can optionally delete them.
+    For more details on CLI usage, see the man page entry:
+    https://docs.openstack.org/nova/latest/cli/nova-manage.html#placement
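At its core, the audit described in this patch is a set-membership check per resource provider: collect the provider's allocations, keep only the consumers that hold Nova resource classes, and flag the ones that match neither a known instance nor an in-progress migration. The minimal standalone sketch below illustrates that idea only; the function name, the trimmed resource-class list and the sample UUID strings are illustrative assumptions, not part of this patch or of the actual nova-manage implementation.

```python
from typing import Dict, Iterable, List, Set

# Subset of the resource classes that Nova itself allocates; the audit only
# considers consumers holding at least one of these.
NOVA_RCS: Set[str] = {"VCPU", "MEMORY_MB", "DISK_GB", "VGPU", "PCPU"}


def find_orphaned_consumers(allocations: Dict[str, dict],
                            instance_uuids: Iterable[str],
                            migration_uuids: Iterable[str]) -> List[str]:
    """Return consumer UUIDs holding Nova allocations on one resource
    provider that match neither an existing instance nor an in-progress
    migration.

    ``allocations`` follows the shape of the ``allocations`` dict returned
    by ``GET /resource_providers/{uuid}/allocations``:
    ``{consumer_uuid: {"resources": {"VCPU": 1, ...}}, ...}``
    """
    known_consumers = set(instance_uuids) | set(migration_uuids)
    orphaned = []
    for consumer_uuid, payload in allocations.items():
        resources = payload.get("resources", {})
        # Skip consumers that hold no Nova resource class: their allocations
        # belong to another service and are left alone.
        if not any(rc in NOVA_RCS for rc in resources):
            continue
        if consumer_uuid not in known_consumers:
            orphaned.append(consumer_uuid)
    return orphaned


if __name__ == "__main__":
    allocs = {
        "9b38cb44-...": {"resources": {"VCPU": 1, "MEMORY_MB": 2048}},
        "5a4d0a46-...": {"resources": {"CUSTOM_GOO": 1}},
    }
    # With no known instances or migrations, only the Nova consumer is
    # reported as orphaned; the CUSTOM_GOO consumer is ignored.
    print(find_orphaned_consumers(allocs, [], []))
```

Deletion then operates on the flagged consumer UUIDs, which is what the ``--delete`` option automates against the Placement API.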