Merge "Make allocation candidates available for scheduler filters"

Zuul 2022-12-17 17:12:31 +00:00 committed by Gerrit Code Review
commit b54beee429
4 changed files with 866 additions and 122 deletions


@@ -28,6 +28,9 @@ class BaseHostFilter(filters.BaseFilter):
# other parameters. We care about running policy filters (i.e.
# ImagePropertiesFilter) but not things that check usage on the
# existing compute node, etc.
# This also means that filters marked with RUN_ON_REBUILD = True cannot
# filter on allocation candidates, or they need to handle the rebuild
# case specially.
RUN_ON_REBUILD = False
def _filter_one(self, obj, spec):
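For illustration only, a filter built on this change might look like the following minimal sketch; the filter name and the pass condition are hypothetical and not part of this commit. Because it relies on allocation candidates, which are not generated on rebuild, it keeps RUN_ON_REBUILD = False:

from nova.scheduler import filters


class RequireCandidateFilter(filters.BaseHostFilter):
    """Hypothetical example filter: skip hosts whose allocation
    candidates were all removed by earlier filters."""

    # This filter depends on allocation candidates, which do not exist
    # during rebuild, so it must not run on rebuild.
    RUN_ON_REBUILD = False

    def host_passes(self, host_state, spec_obj):
        # host_state.allocation_candidates is the deep-copied list of
        # allocation requests attached by the scheduler manager.
        return bool(host_state.allocation_candidates)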


@@ -153,6 +153,8 @@ class HostState(object):
self.updated = None
self.allocation_candidates = []
def update(self, compute=None, service=None, aggregates=None,
inst_dict=None):
"""Update all information about a host."""
@@ -314,13 +316,21 @@ class HostState(object):
self.num_io_ops += 1
def __repr__(self):
return ("(%(host)s, %(node)s) ram: %(free_ram)sMB "
"disk: %(free_disk)sMB io_ops: %(num_io_ops)s "
"instances: %(num_instances)s" %
{'host': self.host, 'node': self.nodename,
'free_ram': self.free_ram_mb, 'free_disk': self.free_disk_mb,
'num_io_ops': self.num_io_ops,
'num_instances': self.num_instances})
return (
"(%(host)s, %(node)s) ram: %(free_ram)sMB "
"disk: %(free_disk)sMB io_ops: %(num_io_ops)s "
"instances: %(num_instances)s, "
"allocation_candidates: %(num_a_c)s"
% {
"host": self.host,
"node": self.nodename,
"free_ram": self.free_ram_mb,
"free_disk": self.free_disk_mb,
"num_io_ops": self.num_io_ops,
"num_instances": self.num_instances,
"num_a_c": len(self.allocation_candidates),
}
)
class HostManager(object):


@@ -20,6 +20,7 @@ Scheduler Service
"""
import collections
import copy
import random
from oslo_log import log as logging
@@ -299,12 +300,29 @@ class SchedulerManager(manager.Manager):
# host, we virtually consume resources on it so subsequent
# selections can adjust accordingly.
def hosts_with_alloc_reqs(hosts_gen):
"""Extend the HostState objects returned by the generator with
the allocation requests of that host
"""
for host in hosts_gen:
host.allocation_candidates = copy.deepcopy(
alloc_reqs_by_rp_uuid[host.uuid])
yield host
# Note: remember, we are using a generator-iterator here. So only
# traverse this list once. This can bite you if the hosts
# are being scanned in a filter or weighing function.
hosts = self._get_all_host_states(
elevated, spec_obj, provider_summaries)
# alloc_reqs_by_rp_uuid is None during rebuild, so this means we cannot
# run filters that use allocation candidates during rebuild
if alloc_reqs_by_rp_uuid is not None:
# Wrap the generator to extend the HostState objects with the
# allocation requests for the given host. This is needed to
# support scheduler filters filtering on allocation candidates.
hosts = hosts_with_alloc_reqs(hosts)
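Two details of this wrapper are easy to miss: each HostState gets its own deep copy of the candidate list, so a filter may prune host.allocation_candidates without mutating the shared alloc_reqs_by_rp_uuid mapping, and the wrapped result is still a generator, so it can only be traversed once. A self-contained toy sketch (FakeHostState and the data below are made up for illustration):

import copy


class FakeHostState:
    """Toy stand-in for HostState, only for this illustration."""
    def __init__(self, uuid):
        self.uuid = uuid
        self.allocation_candidates = []


alloc_reqs_by_rp_uuid = {"rp-1": [{"allocations": {"rp-1": {}}}]}


def hosts_with_alloc_reqs(hosts_gen):
    for host in hosts_gen:
        # Each host gets its own copy, so filters may drop entries from
        # host.allocation_candidates without touching the shared dict.
        host.allocation_candidates = copy.deepcopy(
            alloc_reqs_by_rp_uuid[host.uuid])
        yield host


hosts = hosts_with_alloc_reqs(iter([FakeHostState("rp-1")]))
assert len(list(hosts)) == 1  # the first traversal consumes the generator
assert list(hosts) == []      # a second traversal yields nothing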
# NOTE(sbauza): The RequestSpec.num_instances field contains the number
# of instances created when the RequestSpec was used to first boot some
# instances. This is incorrect when doing a move or resize operation,
@@ -332,6 +350,13 @@ class SchedulerManager(manager.Manager):
# the older dict format representing HostState objects.
# TODO(stephenfin): Remove this when we bump scheduler the RPC API
# version to 5.0
# NOTE(gibi): We cannot remove this branch as it is actively used
# when nova calls the scheduler during rebuild (not evacuate) to
# check if the current host is still good for the new image used
# for the rebuild. In this case placement cannot be used to
# generate candidates as that would require space on the current
# compute for double allocation. So there are no allocation candidates
# for rebuild, and therefore alloc_reqs_by_rp_uuid is None.
return self._legacy_find_hosts(
context, num_instances, spec_obj, hosts, num_alts,
instance_uuids=instance_uuids)
@@ -345,6 +370,9 @@ class SchedulerManager(manager.Manager):
# The list of hosts that have been selected (and claimed).
claimed_hosts = []
# The allocation request that was claimed in placement for each claimed host
claimed_alloc_reqs = []
for num, instance_uuid in enumerate(instance_uuids):
# In a multi-create request, the first request spec from the list
# is passed to the scheduler and that request spec's instance_uuid
@@ -371,21 +399,20 @@ class SchedulerManager(manager.Manager):
# resource provider UUID
claimed_host = None
for host in hosts:
cn_uuid = host.uuid
if cn_uuid not in alloc_reqs_by_rp_uuid:
msg = ("A host state with uuid = '%s' that did not have a "
"matching allocation_request was encountered while "
"scheduling. This host was skipped.")
LOG.debug(msg, cn_uuid)
if not host.allocation_candidates:
LOG.debug(
"The nova scheduler removed every allocation candidate "
"for host %s so this host was skipped.",
host
)
continue
alloc_reqs = alloc_reqs_by_rp_uuid[cn_uuid]
# TODO(jaypipes): Loop through all allocation_requests instead
# of just trying the first one. For now, since we'll likely
# want to order the allocation_requests in the future based on
# information in the provider summaries, we'll just try to
# claim resources using the first allocation_request
alloc_req = alloc_reqs[0]
alloc_req = host.allocation_candidates[0]
if utils.claim_resources(
elevated, self.placement_client, spec_obj, instance_uuid,
alloc_req,
@@ -405,6 +432,15 @@ class SchedulerManager(manager.Manager):
claimed_instance_uuids.append(instance_uuid)
claimed_hosts.append(claimed_host)
claimed_alloc_reqs.append(alloc_req)
# update the provider mapping in the request spec based
# on the allocated candidate, as _consume_selected_host depends
# on this information to temporarily consume PCI devices tracked in
# placement
for request_group in spec_obj.requested_resources:
request_group.provider_uuids = alloc_req[
'mappings'][request_group.requester_id]
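For context, an allocation request returned by placement (when requested with a new enough microversion) carries a "mappings" key from request group suffix to the resource providers satisfying that group. Its shape is roughly as follows; the provider UUID and group suffix below are made-up placeholders, not values from this change:

# Illustrative only; the keys' values are placeholders.
alloc_req = {
    "allocations": {
        "compute-rp-uuid": {"resources": {"VCPU": 2, "MEMORY_MB": 512}},
    },
    "mappings": {
        # request group suffix (the RequestGroup.requester_id)
        # -> list of resource provider UUIDs serving that group
        "pci-request-id-1": ["compute-rp-uuid"],
    },
}

# This is what the loop above records on each request group:
#     request_group.provider_uuids = alloc_req['mappings'][requester_id]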
# Now consume the resources so the filter/weights will change for
# the next instance.
@@ -416,11 +452,19 @@ class SchedulerManager(manager.Manager):
self._ensure_sufficient_hosts(
context, claimed_hosts, num_instances, claimed_instance_uuids)
# We have selected and claimed hosts for each instance. Now we need to
# find alternates for each host.
# We have selected and claimed hosts for each instance along with a
# claimed allocation request. Now we need to find alternates for each
# host.
return self._get_alternate_hosts(
claimed_hosts, spec_obj, hosts, num, num_alts,
alloc_reqs_by_rp_uuid, allocation_request_version)
claimed_hosts,
spec_obj,
hosts,
num,
num_alts,
alloc_reqs_by_rp_uuid,
allocation_request_version,
claimed_alloc_reqs,
)
def _ensure_sufficient_hosts(
self, context, hosts, required_count, claimed_uuids=None,
@@ -532,7 +576,21 @@ class SchedulerManager(manager.Manager):
def _get_alternate_hosts(
self, selected_hosts, spec_obj, hosts, index, num_alts,
alloc_reqs_by_rp_uuid=None, allocation_request_version=None,
selected_alloc_reqs=None,
):
"""Generate the main Selection and possible alternate Selection
objects for each "instance".
:param selected_hosts: This is a list of HostState objects. Each
HostState represents the main selection for a given instance being
scheduled (we can have multiple instances during multi create).
:param selected_alloc_reqs: This is a list of allocation requests that
are already allocated in placement for the main Selection of each
instance. This list matches selected_hosts by index. So for the first
instance the selected host is selected_hosts[0] and the already
allocated placement candidate is selected_alloc_reqs[0].
"""
# We only need to filter/weigh the hosts again if we're dealing with
# more than one instance and are going to be picking alternates.
if index > 0 and num_alts > 0:
@@ -546,11 +604,10 @@ class SchedulerManager(manager.Manager):
# representing the selected host along with alternates from the same
# cell.
selections_to_return = []
for selected_host in selected_hosts:
for i, selected_host in enumerate(selected_hosts):
# This is the list of hosts for one particular instance.
if alloc_reqs_by_rp_uuid:
selected_alloc_req = alloc_reqs_by_rp_uuid.get(
selected_host.uuid)[0]
selected_alloc_req = selected_alloc_reqs[i]
else:
selected_alloc_req = None
@@ -571,15 +628,17 @@ class SchedulerManager(manager.Manager):
if len(selected_plus_alts) >= num_alts + 1:
break
# TODO(gibi): In theory we could generate alternatives on the
# same host if that host has different possible allocation
# candidates for the request. But we don't do that today.
if host.cell_uuid == cell_uuid and host not in selected_hosts:
if alloc_reqs_by_rp_uuid is not None:
alt_uuid = host.uuid
if alt_uuid not in alloc_reqs_by_rp_uuid:
if not host.allocation_candidates:
msg = ("A host state with uuid = '%s' that did "
"not have a matching allocation_request "
"not have any remaining allocation_request "
"was encountered while scheduling. This "
"host was skipped.")
LOG.debug(msg, alt_uuid)
LOG.debug(msg, host.uuid)
continue
# TODO(jaypipes): Loop through all allocation_requests
@@ -588,7 +647,13 @@ class SchedulerManager(manager.Manager):
# the future based on information in the provider
# summaries, we'll just try to claim resources using
# the first allocation_request
alloc_req = alloc_reqs_by_rp_uuid[alt_uuid][0]
# NOTE(gibi): we are using, and re-using, allocation
# candidates for alternatives here. This is OK as
# these candidates are not yet allocated in placement
# and we don't know if an alternate will ever be used.
# To increase our chance of success we could try to use a different
# candidate for each alternative, though (a sketch of that idea
# follows this hunk).
alloc_req = host.allocation_candidates[0]
alt_selection = objects.Selection.from_host_state(
host, alloc_req, allocation_request_version)
else:
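A rough sketch of the idea mentioned in the NOTE above, not part of this change: spread the alternates of one instance across the host's remaining candidates instead of always re-using candidate [0]. The helper name is hypothetical:

def pick_alternate_candidate(candidates, num_already_selected):
    """Hypothetical helper: return a different candidate for each
    alternate built for the same instance, wrapping around when there
    are fewer candidates than alternates."""
    return candidates[num_already_selected % len(candidates)]


# With candidates [c0, c1, c2]: the main selection (0 already selected)
# keeps c0, the first alternate gets c1, the second gets c2, and so on.
assert pick_alternate_candidate(["c0", "c1", "c2"], 1) == "c1"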

File diff suppressed because it is too large.