Enforce quota usage from placement when unshelving
When [quota]count_usage_from_placement = true or
[quota]driver = nova.quota.UnifiedLimitsDriver, cores and ram quota usage
are counted from placement. When an instance is SHELVED_OFFLOADED, it will
not have allocations in placement, so its cores and ram should not count
against quota during that time. This means, however, that when an instance
is unshelved, there is a possibility of going over quota if the cores and
ram it needs were allocated by some other instance(s) while it was
SHELVED_OFFLOADED.

This fixes a bug where quota was not being properly enforced during
unshelve of a SHELVED_OFFLOADED instance when quota usage is counted from
placement.

Test coverage is also added for the "recheck" quota cases.

Closes-Bug: #2003991
Change-Id: I4ab97626c10052c7af9934a80ff8db9ddab82738
parent 427b2cb4d6
commit 6f79d6321e
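The heart of the change is a quota delta check performed before a
SHELVED_OFFLOADED server is unshelved. The sketch below condenses that check
into a standalone function; the module names and call signatures mirror the
diff that follows, but the wrapper function itself is invented for
illustration and is not the literal patched code.

# Illustrative sketch of the pre-unshelve quota check added by this change.
# An offloaded instance still consumes 'instances' quota (counted from the
# API database) but not 'cores'/'ram' (counted from placement), so the check
# uses an instances delta of 0 plus the flavor's vcpus and memory.
from nova import objects
from nova.limit import placement as placement_limits


def check_quota_before_unshelve(context, instance, request_spec):
    deltas = {
        'instances': 0,                  # already counted while offloaded
        'cores': instance.flavor.vcpus,  # will be re-allocated in placement
        'ram': instance.flavor.memory_mb,
    }
    # Legacy counting from placement: raises nova.exception.OverQuota if the
    # project/user would exceed its limits once the server is unshelved.
    objects.Quotas.check_deltas(
        context, deltas, context.project_id, user_id=context.user_id,
        check_project_id=instance.project_id, check_user_id=instance.user_id)
    # Unified limits: same idea, with a zero delta for the 'servers' resource.
    placement_limits.enforce_num_instances_and_flavor(
        context, context.project_id, instance.flavor, request_spec.is_bfv,
        0, 0, delta_updates={'servers': 0})

In the API layer (first hunk below), an OverQuota raised by this check is
translated into an HTTP 403 response.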
@@ -81,7 +81,7 @@ class ShelveController(wsgi.Controller):
            raise exc.HTTPBadRequest(explanation=e.format_message())

    @wsgi.response(202)
    @wsgi.expected_errors((400, 404, 409))
    @wsgi.expected_errors((400, 403, 404, 409))
    @wsgi.action('unshelve')
    # In microversion 2.77 we support specifying 'availability_zone' to
    # unshelve a server. But before 2.77 there is no request body
@@ -142,3 +142,5 @@ class ShelveController(wsgi.Controller):
            exception.ComputeHostNotFound,
        ) as e:
            raise exc.HTTPBadRequest(explanation=e.format_message())
        except exception.OverQuota as e:
            raise exc.HTTPForbidden(explanation=e.format_message())
@@ -60,6 +60,7 @@ from nova.i18n import _
from nova.image import glance
from nova.limit import local as local_limit
from nova.limit import placement as placement_limits
from nova.limit import utils as limit_utils
from nova.network import constants
from nova.network import model as network_model
from nova.network import neutron
@@ -4522,6 +4523,42 @@ class API:
                            "vol_zone": volume['availability_zone']}
                    raise exception.MismatchVolumeAZException(reason=msg)

    @staticmethod
    def _check_quota_unshelve_offloaded(
        context: nova_context.RequestContext,
        instance: 'objects.Instance',
        request_spec: 'objects.RequestSpec'
    ):
        if not (CONF.quota.count_usage_from_placement or
                limit_utils.use_unified_limits()):
            return
        # TODO(melwitt): This is ugly but we have to do it this way because
        # instances quota is currently counted from the API database but cores
        # and ram are counted from placement. That means while an instance is
        # SHELVED_OFFLOADED, it will still consume instances quota but it will
        # not consume cores and ram. So we need an instances delta of
        # 0 but cores and ram deltas from the flavor.
        # Once instances usage is also being counted from placement, we can
        # replace this method with a normal check_num_instances_quota() call.
        vcpus = instance.flavor.vcpus
        memory_mb = instance.flavor.memory_mb
        # We are not looking to create a new server, we are unshelving an
        # existing one.
        deltas = {'instances': 0, 'cores': vcpus, 'ram': memory_mb}

        objects.Quotas.check_deltas(
            context,
            deltas,
            context.project_id,
            user_id=context.user_id,
            check_project_id=instance.project_id,
            check_user_id=instance.user_id,
        )
        # Do the same for unified limits.
        placement_limits.enforce_num_instances_and_flavor(
            context, context.project_id, instance.flavor, request_spec.is_bfv,
            0, 0, delta_updates={'servers': 0})

    @block_extended_resource_request
    @check_instance_lock
    @check_instance_state(
@@ -4550,6 +4587,15 @@ class API:
        request_spec = objects.RequestSpec.get_by_instance_uuid(
            context, instance.uuid)

        # Check quota before we save any changes to the database, but only if
        # we are counting quota usage from placement. When an instance is
        # SHELVED_OFFLOADED, it will not consume cores or ram resources in
        # placement. This means it is possible that an unshelve would cause the
        # project/user to go over quota.
        if instance.vm_state == vm_states.SHELVED_OFFLOADED:
            self._check_quota_unshelve_offloaded(
                context, instance, request_spec)

        # We need to check a list of preconditions and validate inputs first

        # Ensure instance is shelve offloaded
@@ -20,6 +20,7 @@ import copy
import eventlet
import functools
import sys
import typing as ty

from keystoneauth1 import exceptions as ks_exc
from oslo_config import cfg
@@ -48,6 +49,7 @@ from nova import exception
from nova.i18n import _
from nova.image import glance
from nova.limit import placement as placement_limits
from nova.limit import utils as limit_utils
from nova import manager
from nova.network import neutron
from nova import notifications
@@ -970,6 +972,33 @@ class ComputeTaskManager:
                objects.Destination(
                    cell=instance_mapping.cell_mapping))

    def _recheck_quota(
        self,
        context: nova_context.RequestContext,
        flavor: 'objects.Flavor',
        request_spec: 'objects.RequestSpec',
        orig_num_req: int,
        project_id: ty.Optional[str] = None,
        user_id: ty.Optional[str] = None
    ) -> None:
        # A quota "recheck" is a quota check that is performed *after* quota
        # limited resources are consumed. It is meant to address race
        # conditions where a request that was not over quota at the beginning
        # of the request before resources are allocated becomes over quota
        # after resources (like database rows or placement allocations) are
        # created. An example of this would be a large number of requests for
        # the same resource for the same project sent simultaneously.
        if CONF.quota.recheck_quota:
            # The orig_num_req is the number of instances requested, which is
            # the delta that was quota checked before resources were allocated.
            # This is only used for the exception message if the recheck fails
            # for lack of enough quota.
            compute_utils.check_num_instances_quota(
                context, flavor, 0, 0, project_id=project_id,
                user_id=user_id, orig_num_req=orig_num_req)
            placement_limits.enforce_num_instances_and_flavor(
                context, project_id, flavor, request_spec.is_bfv, 0, 0)

    # TODO(mriedem): Make request_spec required in ComputeTaskAPI RPC v2.0.
    @targets_cell
    def unshelve_instance(self, context, instance, request_spec=None):
@@ -1055,6 +1084,30 @@ class ComputeTaskManager:
                host_lists = self._schedule_instances(context,
                        request_spec, [instance.uuid],
                        return_alternates=False)

                # NOTE(melwitt): We recheck the quota after allocating the
                # resources in placement, to prevent users from allocating
                # more resources than their allowed quota in the event of a
                # race. This is configurable because it can be expensive if
                # strict quota limits are not required in a deployment.
                try:
                    # Quota should be checked for unshelve only if
                    # resources are being counted in placement. Legacy
                    # quotas continue to consume resources while
                    # SHELVED_OFFLOADED and will not allocate any new
                    # resources during unshelve.
                    if (CONF.quota.count_usage_from_placement or
                            limit_utils.use_unified_limits()):
                        self._recheck_quota(
                            context, instance.flavor, request_spec, 0,
                            project_id=instance.project_id,
                            user_id=instance.user_id)
                except (exception.TooManyInstances,
                        limit_exceptions.ProjectOverLimit):
                    with excutils.save_and_reraise_exception():
                        self.report_client.delete_allocation_for_instance(
                            context, instance.uuid, force=True)

                host_list = host_lists[0]
                selection = host_list[0]
                scheduler_utils.populate_filter_properties(
@@ -1677,27 +1730,22 @@ class ComputeTaskManager:
                instances.append(instance)
                cell_mapping_cache[instance.uuid] = cell

        # NOTE(melwitt): We recheck the quota after creating the
        # objects to prevent users from allocating more resources
        # NOTE(melwitt): We recheck the quota after allocating the
        # resources to prevent users from allocating more resources
        # than their allowed quota in the event of a race. This is
        # configurable because it can be expensive if strict quota
        # limits are not required in a deployment.
        if CONF.quota.recheck_quota:
            try:
                compute_utils.check_num_instances_quota(
                    context, instance.flavor, 0, 0,
                    orig_num_req=len(build_requests))
                placement_limits.enforce_num_instances_and_flavor(
                    context, context.project_id, instance.flavor,
                    request_specs[0].is_bfv, 0, 0)
            except (exception.TooManyInstances,
                    limit_exceptions.ProjectOverLimit) as exc:
                with excutils.save_and_reraise_exception():
                    self._cleanup_build_artifacts(context, exc, instances,
                                                  build_requests,
                                                  request_specs,
                                                  block_device_mapping, tags,
                                                  cell_mapping_cache)
        try:
            self._recheck_quota(context, instance.flavor, request_specs[0],
                len(build_requests), project_id=instance.project_id,
                user_id=instance.user_id
            )
        except (exception.TooManyInstances,
                limit_exceptions.ProjectOverLimit) as exc:
            with excutils.save_and_reraise_exception():
                self._cleanup_build_artifacts(
                    context, exc, instances, build_requests, request_specs,
                    block_device_mapping, tags, cell_mapping_cache)

        zipped = zip(build_requests, request_specs, host_lists, instances)
        for (build_request, request_spec, host_list, instance) in zipped:
@@ -156,7 +156,8 @@ def enforce_num_instances_and_flavor(
    is_bfvm: bool,
    min_count: int,
    max_count: int,
    enforcer: ty.Optional[limit.Enforcer] = None
    enforcer: ty.Optional[limit.Enforcer] = None,
    delta_updates: ty.Optional[ty.Dict[str, int]] = None,
) -> int:
    """Return max instances possible, else raise TooManyInstances exception."""
    if not limit_utils.use_unified_limits():
@@ -169,7 +170,10 @@ def enforce_num_instances_and_flavor(
        raise ValueError("invalid max_count")

    deltas = _get_deltas_by_flavor(flavor, is_bfvm, max_count)
    enforcer = _get_enforcer(context, project_id)
    if delta_updates:
        deltas.update(delta_updates)

    enforcer = enforcer or _get_enforcer(context, project_id)
    try:
        enforcer.enforce(project_id, deltas)
    except limit_exceptions.ProjectOverLimit as e:
@@ -414,15 +414,12 @@ class ServersTest(integrated_helpers._IntegratedTestBase):
        # quota.
        self._create_server(flavor_id=1)

        # FIXME(melwitt): This is bug #2003991, the unshelve is supposed to
        # fail if we would be over quota after unshelving.
        # Now try to unshelve the earlier instance. It should fail because it
        # would put us over quota to have 4 running instances.
        # ex = self.assertRaises(client.OpenStackApiException,
        #                        self._unshelve_server,
        #                        server)
        # self.assertEqual(403, ex.response.status_code)
        self._unshelve_server(server)
        ex = self.assertRaises(client.OpenStackApiException,
                               self._unshelve_server,
                               server)
        self.assertEqual(403, ex.response.status_code)

    def test_unshelve_offloaded_overquota_placement(self):
        # Count quota usage from placement.
@@ -1362,6 +1362,7 @@ class _BaseTaskTestCase(object):

        fake_spec = fake_request_spec.fake_spec_obj()
        fake_spec.flavor = instance.flavor
        fake_spec.is_bfv = False
        # FIXME(sbauza): Modify the fake RequestSpec object to either add a
        # non-empty SchedulerRetries object or nullify the field
        fake_spec.retry = None
@@ -1474,6 +1475,7 @@ class _BaseTaskTestCase(object):
        # 'shelved_image_id' is None for volumebacked instance
        instance.system_metadata['shelved_image_id'] = None
        self.request_spec.flavor = instance.flavor
        self.request_spec.is_bfv = True

        with test.nested(
            mock.patch.object(self.conductor_manager,
@@ -1502,6 +1504,7 @@ class _BaseTaskTestCase(object):
    def test_unshelve_instance_schedule_and_rebuild(
            self, mock_im, mock_get, mock_schedule, mock_unshelve):
        fake_spec = objects.RequestSpec()
        fake_spec.is_bfv = False
        # Set requested_destination to test setting cell_mapping in
        # existing object.
        fake_spec.requested_destination = objects.Destination(
@@ -1606,6 +1609,7 @@ class _BaseTaskTestCase(object):
    def test_unshelve_instance_schedule_and_rebuild_volume_backed(
            self, mock_im, mock_schedule, mock_unshelve):
        fake_spec = objects.RequestSpec()
        fake_spec.is_bfv = True
        mock_im.return_value = objects.InstanceMapping(
            cell_mapping=objects.CellMapping.get_by_uuid(self.context,
                                                         uuids.cell1))
@@ -1637,6 +1641,7 @@ class _BaseTaskTestCase(object):

        request_spec = objects.RequestSpec()
        request_spec.flavor = instance.flavor
        request_spec.is_bfv = False

        selection = objects.Selection(
            service_host='fake_host',
@@ -1680,6 +1685,7 @@ class _BaseTaskTestCase(object):
        request_spec = objects.RequestSpec()
        request_spec.flavor = instance.flavor
        request_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'}
        request_spec.is_bfv = False

        selection = objects.Selection(
            service_host='fake_host',
@@ -1728,6 +1734,7 @@ class _BaseTaskTestCase(object):
        request_spec = objects.RequestSpec()
        request_spec.flavor = instance.flavor
        request_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'}
        request_spec.is_bfv = False

        selection = objects.Selection(
            service_host='fake_host',
@@ -2965,6 +2972,131 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):

        self.assertTrue(mock_build.called)

    @mock.patch('nova.compute.rpcapi.ComputeAPI.unshelve_instance')
    @mock.patch('nova.scheduler.rpcapi.SchedulerAPI.select_destinations')
    @mock.patch('nova.objects.Instance.save', new=mock.Mock())
    def _test_unshelve_over_quota_during_recheck(self, mock_select,
                                                 mock_unshelve):
        mock_select.return_value = [[fake_selection1]]

        instance = fake_instance.fake_instance_obj(
            self.context, vm_state=vm_states.SHELVED_OFFLOADED)
        # This has to be set separately because fake_instance_obj() would
        # overwrite it when it calls _from_db_object().
        instance.system_metadata = {}

        im = objects.InstanceMapping(
            self.context, instance_uuid=instance.uuid,
            project_id=instance.project_id)
        im.cell_mapping = self.cell_mappings['cell1']
        im.create()

        req_spec = objects.RequestSpec(
            instance_uuid=instance.uuid, flavor=instance.flavor,
            instance_group=None, is_bfv=False)

        self.assertRaises(
            exc.TooManyInstances,
            self.conductor.unshelve_instance,
            self.context, instance, req_spec
        )

        # We should not have called the compute API unshelve().
        self.assertFalse(mock_unshelve.called)

        return instance, req_spec

    @mock.patch('nova.objects.quotas.Quotas.check_deltas')
    def test_unshelve_over_quota_during_recheck_placement(self, mock_check):
        self.flags(count_usage_from_placement=True, group='quota')

        # Simulate a race where the first check passes and the recheck fails.
        # First check occurs in compute/api.
        fake_quotas = {'instances': 5, 'cores': 10, 'ram': 4096}
        fake_headroom = {'instances': 5, 'cores': 10, 'ram': 4096}
        fake_usages = {'instances': 5, 'cores': 10, 'ram': 4096}
        e = exc.OverQuota(overs=['instances'], quotas=fake_quotas,
                          headroom=fake_headroom, usages=fake_usages)
        mock_check.side_effect = e

        instance, _ = self._test_unshelve_over_quota_during_recheck()

        # Verify we called the quota check function with expected args.
        mock_check.assert_called_once_with(
            self.context, {'instances': 0, 'cores': 0, 'ram': 0},
            instance.project_id, user_id=None,
            check_project_id=instance.project_id, check_user_id=None)

    @mock.patch.object(placement_limit, 'enforce_num_instances_and_flavor')
    @mock.patch('nova.objects.quotas.Quotas.check_deltas', new=mock.Mock())
    def test_unshelve_over_quota_during_recheck_ul(self, mock_enforce):
        self.flags(driver="nova.quota.UnifiedLimitsDriver", group="quota")

        # Simulate a race where the first check passes and the recheck fails.
        # First check occurs in compute/api.
        mock_enforce.side_effect = exc.TooManyInstances(
            overs=['instances'], req=5, used=5, allowed=5)

        instance, req_spec = self._test_unshelve_over_quota_during_recheck()

        # Verify we called the quota check function with expected args.
        mock_enforce.assert_called_once_with(
            self.context, instance.project_id, instance.flavor,
            req_spec.is_bfv, 0, 0)

    @mock.patch('nova.compute.rpcapi.ComputeAPI.unshelve_instance')
    @mock.patch.object(placement_limit, 'enforce_num_instances_and_flavor')
    @mock.patch('nova.objects.quotas.Quotas.check_deltas')
    @mock.patch('nova.scheduler.rpcapi.SchedulerAPI.select_destinations')
    @mock.patch('nova.objects.Instance.save', new=mock.Mock())
    def test_unshelve_no_quota_recheck(self, mock_select, mock_check,
                                       mock_enforce, mock_unshelve):
        # Quota should not be checked on unshelve for legacy quotas at all.
        mock_select.return_value = [[fake_selection1]]

        # This is needed to register the compute node in a cell.
        self.start_service('compute', host='host1')

        instance = fake_instance.fake_instance_obj(
            self.context, vm_state=vm_states.SHELVED_OFFLOADED)
        # This has to be set separately because fake_instance_obj() would
        # overwrite it when it calls _from_db_object().
        instance.system_metadata = {}

        im = objects.InstanceMapping(
            self.context, instance_uuid=instance.uuid,
            project_id=instance.project_id)
        im.cell_mapping = self.cell_mappings['cell1']
        im.create()

        req_spec = objects.RequestSpec(
            instance_uuid=instance.uuid, flavor=instance.flavor,
            instance_group=None)

        self.conductor.unshelve_instance(self.context, instance, req_spec)

        # check_deltas should not have been called
        mock_check.assert_not_called()

        # Same for enforce_num_instances_and_flavor
        mock_enforce.assert_not_called()

        # We should have called the compute API unshelve()
        self.assertTrue(mock_unshelve.called)

    def test_unshelve_quota_recheck_disabled(self):
        # Disable recheck_quota.
        self.flags(recheck_quota=False, group='quota')
        self.test_unshelve_no_quota_recheck()

    def test_unshelve_no_quota_recheck_disabled_placement(self):
        self.flags(count_usage_from_placement=True, group='quota')
        self.test_unshelve_quota_recheck_disabled()

    def test_unshelve_no_quota_recheck_disabled_ul(self):
        self.flags(driver='nova.quota.UnifiedLimitsDriver', group='quota')
        self.test_unshelve_quota_recheck_disabled()

    def test_schedule_and_build_instances_fill_request_spec(self):
        # makes sure there is some request group in the spec to be mapped
        self.params['request_specs'][0].requested_resources = [
@@ -0,0 +1,8 @@
fixes:
  - |
    `Bug #2003991`_: Fixes an issue where quota was not properly enforced
    during unshelve of a ``SHELVED_OFFLOADED`` server when
    ``[quota]count_usage_from_placement = true`` or
    ``[quota]driver = nova.quota.UnifiedLimitsDriver`` is configured.

    .. _Bug #2003991: https://launchpad.net/bugs/2003991
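As a minimal illustration of the configurations named in the release note
above, either of the following settings in the ``[quota]`` section of
nova.conf puts cores and ram quota counting on the placement-based path that
this fix covers. The option names come from the note; the layout and comments
are examples only, and a deployment picks one of the two approaches.

[quota]
# Either count legacy cores/ram quota usage from placement ...
count_usage_from_placement = True

# ... or switch to the unified limits quota driver.
# driver = nova.quota.UnifiedLimitsDriver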