diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubeadm-create-platform-pods-with-zero-CPU-resources.patch b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubeadm-create-platform-pods-with-zero-CPU-resources.patch new file mode 100644 index 000000000..56c1fae8d --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubeadm-create-platform-pods-with-zero-CPU-resources.patch @@ -0,0 +1,200 @@ +From 63e8d996af35f797806db199728554ff13bc3fd9 Mon Sep 17 00:00:00 2001 +From: Saba Touheed Mujawar +Date: Wed, 8 Jan 2025 07:28:12 -0500 +Subject: [PATCH] kubeadm: create platform pods with zero CPU resources + +This specifies zero CPU resources when creating the manifests +for the static platform pods, as a workaround for the lack of +separate resource tracking for platform resources. + +This specifies zero CPU and Memory resources for the coredns +deployment. manifests.go is the main source file for this; it is +unclear whether the coredns.yaml files are used, but they are +updated to be consistent. + +This specifies a CPU limit of 1 for the kube-apiserver pod so that it is +treated as burstable QoS. This gives a boost of cgroup CPUShares +since the burstable cgroup parent has significantly more CPUShares +than best-effort on typical systems. This improves kube-apiserver +API responsiveness. + +This increases kube-apiserver Readiness probe periodSeconds to 10 +based on WRS/SS joint recommendation for minimum probe settings. +This reduces the likelihood of kube-apiserver probe failure and +subsequent pod-restart under severe load. This also reduces CPU +demand. + +Signed-off-by: Daniel Safta +Signed-off-by: Saba Touheed Mujawar +Signed-off-by: Boovan Rajendran +Signed-off-by: Jim Gauld +Signed-off-by: Saba Touheed Mujawar +--- + cluster/addons/dns/coredns/coredns.yaml.base | 4 ++-- + cluster/addons/dns/coredns/coredns.yaml.in | 4 ++-- + cluster/addons/dns/coredns/coredns.yaml.sed | 4 ++-- + cmd/kubeadm/app/phases/addons/dns/dns_test.go | 8 ++++---- + cmd/kubeadm/app/phases/addons/dns/manifests.go | 4 ++-- + .../app/phases/controlplane/manifests.go | 10 ++++++---- + cmd/kubeadm/app/util/staticpod/utils.go | 17 ++++++++++++++++- + 7 files changed, 34 insertions(+), 17 deletions(-) + +diff --git a/cluster/addons/dns/coredns/coredns.yaml.base b/cluster/addons/dns/coredns/coredns.yaml.base +index 3d438dce445..41088f063eb 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.base ++++ b/cluster/addons/dns/coredns/coredns.yaml.base +@@ -139,8 +139,8 @@ spec: + limits: + memory: __DNS__MEMORY__LIMIT__ + requests: +- cpu: 100m +- memory: 70Mi ++ cpu: 0 ++ memory: 0 + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: + - name: config-volume +diff --git a/cluster/addons/dns/coredns/coredns.yaml.in b/cluster/addons/dns/coredns/coredns.yaml.in +index 419acb0e966..906d6d28890 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.in ++++ b/cluster/addons/dns/coredns/coredns.yaml.in +@@ -139,8 +139,8 @@ spec: + limits: + memory: 'dns_memory_limit' + requests: +- cpu: 100m +- memory: 70Mi ++ cpu: 0 ++ memory: 0 + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: + - name: config-volume +diff --git a/cluster/addons/dns/coredns/coredns.yaml.sed b/cluster/addons/dns/coredns/coredns.yaml.sed +index a35df71454f..af0fae57dbd 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.sed ++++ b/cluster/addons/dns/coredns/coredns.yaml.sed +@@ -139,8 +139,8 @@ spec: + limits: + memory: $DNS_MEMORY_LIMIT + requests: +- cpu: 100m +- memory: 70Mi ++ cpu: 0 ++ memory: 0 + args: [ "-conf",
"/etc/coredns/Corefile" ] + volumeMounts: + - name: config-volume +diff --git a/cmd/kubeadm/app/phases/addons/dns/dns_test.go b/cmd/kubeadm/app/phases/addons/dns/dns_test.go +index 0b6c12eb41b..4f2022d7105 100644 +--- a/cmd/kubeadm/app/phases/addons/dns/dns_test.go ++++ b/cmd/kubeadm/app/phases/addons/dns/dns_test.go +@@ -705,8 +705,8 @@ spec: + limits: + memory: 170Mi + requests: +- cpu: 100m +- memory: 70Mi ++ cpu: 0 ++ memory: 0 + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: + - name: config-volume +@@ -970,8 +970,8 @@ spec: + limits: + memory: 170Mi + requests: +- cpu: 100m +- memory: 70Mi ++ cpu: 0 ++ memory: 0 + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: + - name: config-volume +diff --git a/cmd/kubeadm/app/phases/addons/dns/manifests.go b/cmd/kubeadm/app/phases/addons/dns/manifests.go +index 905a2e050e6..2a2212d5d37 100644 +--- a/cmd/kubeadm/app/phases/addons/dns/manifests.go ++++ b/cmd/kubeadm/app/phases/addons/dns/manifests.go +@@ -104,8 +104,8 @@ spec: + limits: + memory: 170Mi + requests: +- cpu: 100m +- memory: 70Mi ++ cpu: 0 ++ memory: 0 + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: + - name: config-volume +diff --git a/cmd/kubeadm/app/phases/controlplane/manifests.go b/cmd/kubeadm/app/phases/controlplane/manifests.go +index 11b93e083db..3458efe3a03 100644 +--- a/cmd/kubeadm/app/phases/controlplane/manifests.go ++++ b/cmd/kubeadm/app/phases/controlplane/manifests.go +@@ -67,8 +67,10 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap + LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", endpoint.BindPort, v1.URISchemeHTTPS), + ReadinessProbe: staticpodutil.ReadinessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/readyz", endpoint.BindPort, v1.URISchemeHTTPS), + StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", endpoint.BindPort, v1.URISchemeHTTPS, componentHealthCheckTimeout), +- Resources: staticpodutil.ComponentResources("250m"), +- Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.APIServer.ExtraEnvs), ++ // WRS: Increase kube-apiserver cgroup CPUShares to improve API responsiveness; ++ // achieved by setting CPU Limits to make it burstable QoS. 
++ Resources: staticpodutil.ComponentLimitResources("0", "1"), ++ Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.APIServer.ExtraEnvs), + }, mounts.GetVolumes(kubeadmconstants.KubeAPIServer), + map[string]string{kubeadmconstants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey: endpoint.String()}), + kubeadmconstants.KubeControllerManager: staticpodutil.ComponentPod(v1.Container{ +@@ -79,7 +81,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap + VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeControllerManager)), + LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS), + StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS, componentHealthCheckTimeout), +- Resources: staticpodutil.ComponentResources("200m"), ++ Resources: staticpodutil.ComponentResources("0"), + Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.ControllerManager.ExtraEnvs), + }, mounts.GetVolumes(kubeadmconstants.KubeControllerManager), nil), + kubeadmconstants.KubeScheduler: staticpodutil.ComponentPod(v1.Container{ +@@ -90,7 +92,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap + VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeScheduler)), + LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS), + StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS, componentHealthCheckTimeout), +- Resources: staticpodutil.ComponentResources("100m"), ++ Resources: staticpodutil.ComponentResources("0"), + Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.Scheduler.ExtraEnvs), + }, mounts.GetVolumes(kubeadmconstants.KubeScheduler), nil), + } +diff --git a/cmd/kubeadm/app/util/staticpod/utils.go b/cmd/kubeadm/app/util/staticpod/utils.go +index fcd28eff4d0..c87d6c954eb 100644 +--- a/cmd/kubeadm/app/util/staticpod/utils.go ++++ b/cmd/kubeadm/app/util/staticpod/utils.go +@@ -99,6 +99,18 @@ func ComponentResources(cpu string) v1.ResourceRequirements { + } + } + ++// ComponentLimitResources returns the v1.ResourceRequirements object needed for allocating a specified amount of the CPU with Limits ++func ComponentLimitResources(cpu string, lcpu string) v1.ResourceRequirements { ++ return v1.ResourceRequirements{ ++ Requests: v1.ResourceList{ ++ v1.ResourceCPU: resource.MustParse(cpu), ++ }, ++ Limits: v1.ResourceList{ ++ v1.ResourceCPU: resource.MustParse(lcpu), ++ }, ++ } ++} ++ + // NewVolume creates a v1.Volume with a hostPath mount to the specified location + func NewVolume(name, path string, pathType *v1.HostPathType) v1.Volume { + return v1.Volume{ +@@ -255,7 +267,10 @@ func LivenessProbe(host, path string, port int32, scheme v1.URIScheme) *v1.Probe + func ReadinessProbe(host, path string, port int32, scheme v1.URIScheme) *v1.Probe { + // sets initialDelaySeconds as '0' because we don't want to delay user infrastructure checks + // looking for "ready" status on kubeadm static Pods +- return createHTTPProbe(host, path, port, scheme, 0, 15, 3, 1) ++ // WRS/SS joint recommendation: All pods probes should have following minimum probe ++ // settings 
unless required by the service (initialDelaySecond 0, periodSeconds 10, ++ // timeoutSeconds 5, successThreshold 1, failureThreshold 3) ++ return createHTTPProbe(host, path, port, scheme, 0, 15, 3, 10) + } + + // StartupProbe creates a Probe object with a HTTPGet handler +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubeadm-readiness-probe-timeout-core-dns.patch b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubeadm-readiness-probe-timeout-core-dns.patch new file mode 100644 index 000000000..3560f622b --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubeadm-readiness-probe-timeout-core-dns.patch @@ -0,0 +1,88 @@ +From 6725ab07375fc9fc8035b8919b6f2b7f601168c5 Mon Sep 17 00:00:00 2001 +From: Ferdinando Terada +Date: Mon, 23 Dec 2024 17:53:09 -0300 +Subject: [PATCH] Adjust timeout for coredns readinessProbe + +The timeout value for the readinessProbe of CoreDNS was increased. +This adjustment was necessary to avoid issues during stress testing, +ensuring that the component can properly handle high-load situations +and prevent premature failure in readiness checks. +--- + cluster/addons/dns/coredns/coredns.yaml.base | 1 + + cluster/addons/dns/coredns/coredns.yaml.in | 1 + + cluster/addons/dns/coredns/coredns.yaml.sed | 1 + + cmd/kubeadm/app/phases/addons/dns/dns_test.go | 2 ++ + cmd/kubeadm/app/phases/addons/dns/manifests.go | 1 + + 5 files changed, 6 insertions(+) + +diff --git a/cluster/addons/dns/coredns/coredns.yaml.base b/cluster/addons/dns/coredns/coredns.yaml.base +index 41088f063eb..4bcb2b2e4fe 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.base ++++ b/cluster/addons/dns/coredns/coredns.yaml.base +@@ -170,6 +170,7 @@ spec: + path: /ready + port: 8181 + scheme: HTTP ++ timeoutSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + capabilities: +diff --git a/cluster/addons/dns/coredns/coredns.yaml.in b/cluster/addons/dns/coredns/coredns.yaml.in +index 906d6d28890..744f8cb730a 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.in ++++ b/cluster/addons/dns/coredns/coredns.yaml.in +@@ -170,6 +170,7 @@ spec: + path: /ready + port: 8181 + scheme: HTTP ++ timeoutSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + capabilities: +diff --git a/cluster/addons/dns/coredns/coredns.yaml.sed b/cluster/addons/dns/coredns/coredns.yaml.sed +index af0fae57dbd..844f21f2abf 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.sed ++++ b/cluster/addons/dns/coredns/coredns.yaml.sed +@@ -170,6 +170,7 @@ spec: + path: /ready + port: 8181 + scheme: HTTP ++ timeoutSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + capabilities: +diff --git a/cmd/kubeadm/app/phases/addons/dns/dns_test.go b/cmd/kubeadm/app/phases/addons/dns/dns_test.go +index 4f2022d7105..1f571f48bb8 100644 +--- a/cmd/kubeadm/app/phases/addons/dns/dns_test.go ++++ b/cmd/kubeadm/app/phases/addons/dns/dns_test.go +@@ -736,6 +736,7 @@ spec: + path: /ready + port: 8181 + scheme: HTTP ++ timeoutSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + capabilities: +@@ -1001,6 +1002,7 @@ spec: + path: /ready + port: 8181 + scheme: HTTP ++ timeoutSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + capabilities: +diff --git a/cmd/kubeadm/app/phases/addons/dns/manifests.go b/cmd/kubeadm/app/phases/addons/dns/manifests.go +index 2a2212d5d37..c0be57357e4 100644 +--- a/cmd/kubeadm/app/phases/addons/dns/manifests.go ++++ b/cmd/kubeadm/app/phases/addons/dns/manifests.go +@@ -135,6 +135,7 @@ spec: + path: /ready + port: 
8181 + scheme: HTTP ++ timeoutSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + capabilities: +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubeadm-reduce-UpgradeManifestTimeout.patch b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubeadm-reduce-UpgradeManifestTimeout.patch new file mode 100644 index 000000000..1d5d30d97 --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubeadm-reduce-UpgradeManifestTimeout.patch @@ -0,0 +1,33 @@ +From 08425e565a3dff5d8b5b7cb2e9991280398cf21f Mon Sep 17 00:00:00 2001 +From: Saba Touheed Mujawar +Date: Fri, 17 Jan 2025 05:09:55 -0500 +Subject: [PATCH] kubeadm: reduce UpgradeManifestTimeout + +This modifies kubeadm UpgradeManifestTimeout from 5 minutes default +to 3 minutes to reduce the unnecessary delay in retries during +kubeadm-upgrade-apply failures. + +The typical control-plane upgrade of static pods is 75 to 85 seconds, +so 3 minutes gives adequate buffer to complete the operation. + +Signed-off-by: Saba Touheed Mujawar +--- + cmd/kubeadm/app/constants/constants.go | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/cmd/kubeadm/app/constants/constants.go b/cmd/kubeadm/app/constants/constants.go +index 0a3e0a15fce..02357d557da 100644 +--- a/cmd/kubeadm/app/constants/constants.go ++++ b/cmd/kubeadm/app/constants/constants.go +@@ -235,7 +235,7 @@ const ( + KubeletHealthCheckTimeout = 4 * time.Minute + + // UpgradeManifestsTimeout specifies the default timeout for upgradring static Pod manifests +- UpgradeManifestsTimeout = 5 * time.Minute ++ UpgradeManifestsTimeout = 3 * time.Minute + + // PullImageRetry specifies how many times ContainerRuntime retries when pulling image failed + PullImageRetry = 5 +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch new file mode 100644 index 000000000..eb84551cd --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch @@ -0,0 +1,30 @@ +From 45806d8b57d2bfdc24165264ea2f6aabadcd2e1a Mon Sep 17 00:00:00 2001 +From: Boovan Rajendran +Date: Mon, 4 Sep 2023 08:05:29 -0400 +Subject: [PATCH] kubelet CFS quota throttling for non integer cpulimit + +Signed-off-by: Boovan Rajendran +--- + pkg/kubelet/cm/internal_container_lifecycle_linux.go | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +index 376efa9e0ef..a7e56ac6911 100644 +--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go ++++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +@@ -40,7 +40,11 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain + // Disable cgroup CFS throttle at the container level. + // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_quota_us + // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_period_us +- if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed { ++ // We can only set CpuQuota to -1 if we're allocating the entire CPU. ++ // For fractional CPUs the CpuQuota is needed to enforce the limit. 
++ cpuQuantity := container.Resources.Requests[v1.ResourceCPU] ++ fractionalCpuQuantity := cpuQuantity.MilliValue()%1000 ++ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed && fractionalCpuQuantity == 0 { + containerConfig.Linux.Resources.CpuPeriod = int64(100000) + containerConfig.Linux.Resources.CpuQuota = int64(-1) + } +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-disable-CFS-quota-throttling.patch b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-disable-CFS-quota-throttling.patch new file mode 100644 index 000000000..a84762476 --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-disable-CFS-quota-throttling.patch @@ -0,0 +1,256 @@ +From 521099d83512e74d11829b464d899c05b595d482 Mon Sep 17 00:00:00 2001 +From: Boovan Rajendran +Date: Mon, 26 Feb 2024 06:27:48 -0500 +Subject: [PATCH] kubelet cpumanager disable CFS quota throttling + +This disables CFS CPU quota to avoid performance degradation due to +Linux kernel CFS quota implementation. Note that 4.18 kernel attempts +to solve the CFS throttling problem, but there are reports that it is +not completely effective. + +This disables CFS quota throttling for Guaranteed pods for both +parent and container cgroups by writing -1 to cgroup cpu.cfs_quota_us. +Disabling has a dramatic latency improvement for HTTP response times. + +This patch is refactored in 1.22.5 due to new internal_container_lifecycle +framework. We leverage the same mechanism to set Linux resources as: +cpu manager: specify the container CPU set during the creation + +Co-authored-by: Jim Gauld +Signed-off-by: Sachin Gopala Krishna +Signed-off-by: Boovan Rajendran +--- + pkg/kubelet/cm/cpumanager/cpu_manager.go | 7 +++ + pkg/kubelet/cm/cpumanager/fake_cpu_manager.go | 10 ++++- + pkg/kubelet/cm/helpers_linux.go | 10 +++++ + pkg/kubelet/cm/helpers_linux_test.go | 43 ++++++++++--------- + .../cm/internal_container_lifecycle_linux.go | 9 ++++ + 5 files changed, 57 insertions(+), 22 deletions(-) + +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index 8b5049d7d74..2d1b6eefebe 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -73,6 +73,9 @@ type Manager interface { + // State returns a read-only interface to the internal CPU manager state. + State() state.Reader + ++ // GetCPUPolicy returns the assigned CPU manager policy ++ GetCPUPolicy() string ++ + // GetTopologyHints implements the topologymanager.HintProvider Interface + // and is consulted to achieve NUMA aware resource alignment among this + // and other resource controllers. +@@ -243,6 +246,10 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe + return nil + } + ++func (m *manager) GetCPUPolicy() string { ++ return m.policy.Name() ++} ++ + func (m *manager) Allocate(p *v1.Pod, c *v1.Container) error { + // The pod is during the admission phase. 
We need to save the pod to avoid it + // being cleaned before the admission ended +diff --git a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go +index 4a03f3dd23f..b36fb0da3b4 100644 +--- a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go +@@ -28,7 +28,8 @@ import ( + ) + + type fakeManager struct { +- state state.State ++ policy Policy ++ state state.State + } + + func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error { +@@ -70,6 +71,10 @@ func (m *fakeManager) State() state.Reader { + return m.state + } + ++func (m *fakeManager) GetCPUPolicy() string { ++ return m.policy.Name() ++} ++ + func (m *fakeManager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet { + klog.InfoS("GetExclusiveCPUs", "podUID", podUID, "containerName", containerName) + return cpuset.CPUSet{} +@@ -88,6 +93,7 @@ func (m *fakeManager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet + // NewFakeManager creates empty/fake cpu manager + func NewFakeManager() Manager { + return &fakeManager{ +- state: state.NewMemoryState(), ++ policy: &nonePolicy{}, ++ state: state.NewMemoryState(), + } + } +diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go +index 8a144e7a73c..008b955ee98 100644 +--- a/pkg/kubelet/cm/helpers_linux.go ++++ b/pkg/kubelet/cm/helpers_linux.go +@@ -170,6 +170,16 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, + // build the result + result := &ResourceConfig{} + if qosClass == v1.PodQOSGuaranteed { ++ // Disable CFS CPU quota to avoid performance degradation due to ++ // Linux kernel CFS throttle implementation. ++ // NOTE: 4.18 kernel attempts to solve CFS throttling problem, ++ // but there are reports that it is not completely effective. 
++ // This will configure cgroup CFS parameters at pod level: ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods//cpu.cfs_quota_us ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods//cpu.cfs_period_us ++ cpuQuota = int64(-1) ++ cpuPeriod = uint64(100000) ++ + result.CPUShares = &cpuShares + result.CPUQuota = &cpuQuota + result.CPUPeriod = &cpuPeriod +diff --git a/pkg/kubelet/cm/helpers_linux_test.go b/pkg/kubelet/cm/helpers_linux_test.go +index fba41fd49be..60609394659 100644 +--- a/pkg/kubelet/cm/helpers_linux_test.go ++++ b/pkg/kubelet/cm/helpers_linux_test.go +@@ -64,8 +64,9 @@ func TestResourceConfigForPod(t *testing.T) { + burstablePartialShares := MilliCPUToShares(200) + burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod)) + guaranteedShares := MilliCPUToShares(100) +- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod)) +- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod)) ++ guaranteedQuotaPeriod := uint64(100000) ++ guaranteedQuota := int64(-1) ++ guaranteedTunedQuota := int64(-1) + memoryQuantity = resource.MustParse("100Mi") + cpuNoLimit := int64(-1) + guaranteedMemory := memoryQuantity.Value() +@@ -204,8 +205,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement": { + pod: &v1.Pod{ +@@ -218,8 +219,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-with-tuned-quota": { + pod: &v1.Pod{ +@@ -232,8 +233,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement-with-tuned-quota": { + pod: &v1.Pod{ +@@ -246,8 +247,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "burstable-partial-limits-with-init-containers": { + pod: &v1.Pod{ +@@ -309,8 +310,10 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + burstablePartialShares := MilliCPUToShares(200) + burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod)) + guaranteedShares := 
MilliCPUToShares(100) +- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod)) +- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod)) ++ guaranteedQuotaPeriod := uint64(100000) ++ guaranteedQuota := int64(-1) ++ guaranteedTunedQuota := int64(-1) ++ + memoryQuantity = resource.MustParse("100Mi") + cpuNoLimit := int64(-1) + guaranteedMemory := memoryQuantity.Value() +@@ -449,8 +452,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement": { + pod: &v1.Pod{ +@@ -463,8 +466,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-with-tuned-quota": { + pod: &v1.Pod{ +@@ -477,8 +480,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement-with-tuned-quota": { + pod: &v1.Pod{ +@@ -491,8 +494,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + } + +diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +index 0c3bb2e4999..376efa9e0ef 100644 +--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go ++++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +@@ -26,6 +26,7 @@ import ( + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" + runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" ++ v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" + ) + + func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error { +@@ -36,6 +37,14 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain + } + } + ++ // Disable cgroup CFS throttle at the container level. 
++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_quota_us ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_period_us ++ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed { ++ containerConfig.Linux.Resources.CpuPeriod = int64(100000) ++ containerConfig.Linux.Resources.CpuQuota = int64(-1) ++ } ++ + if i.memoryManager != nil { + numaNodes := i.memoryManager.GetMemoryNUMANodes(pod, container) + if numaNodes.Len() > 0 { +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch new file mode 100644 index 000000000..3883829dd --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch @@ -0,0 +1,789 @@ +From af61145f1ccd068da8937e6b20107f25023cdcb0 Mon Sep 17 00:00:00 2001 +From: Saba Touheed Mujawar +Date: Tue, 11 Feb 2025 05:27:58 -0500 +Subject: [PATCH] kubelet cpumanager introduce concept of isolated CPUs + +This introduces the concept of "isolated CPUs", which are CPUs that +have been isolated at the kernel level via the "isolcpus" kernel boot +parameter. + +When starting the kubelet process, two separate sets of reserved CPUs +may be specified. With this change CPUs reserved via +'--system-reserved=cpu' will be used for infrastructure pods while the +isolated CPUs should be reserved via '--kube-reserved=cpu' to cause +kubelet to skip over them for "normal" CPU resource tracking. The +kubelet code will double-check that the specified isolated CPUs match +what the kernel exposes in "/sys/devices/system/cpu/isolated". + +A plugin (outside the scope of this commit) will expose the isolated +CPUs to kubelet via the device plugin API. + +If a pod specifies some number of "isolcpus" resources, the device +manager will allocate them. In this code we check whether such +resources have been allocated, and if so we set the container cpuset to +the isolated CPUs. This does mean that it really only makes sense to +specify "isolcpus" resources for best-effort or burstable pods, not for +guaranteed ones since that would throw off the accounting code. In +order to ensure the accounting still works as designed, if "isolcpus" +are specified for guaranteed pods, the affinity will be set to the +non-isolated CPUs. + +This patch was refactored in 1.21.3 due to upstream API change +node: podresources: make GetDevices() consistent +(commit ad68f9588c72d6477b5a290c548a9031063ac659). + +The routine podIsolCPUs() was refactored in 1.21.3 since the API +p.deviceManager.GetDevices() is returning multiple devices with +a device per cpu. The resultant cpuset needs to be the aggregate. + +The routine NewStaticPolicy was refactored in 1.22.5, adding a new argument +in its signature: cpuPolicyOptions map[string]string. This change implies +shifting the new arguments (deviceManager, excludeReserved) by one position +to the right.
+ +Co-authored-by: Jim Gauld +Co-authored-by: Chris Friesen +Signed-off-by: Gleb Aronsky +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Sachin Gopala Krishna +Signed-off-by: Boovan Rajendran +Signed-off-by: Saba Touheed Mujawar +Signed-off-by: Boovan Rajendran +Signed-off-by: Saba Touheed Mujawar +--- + pkg/kubelet/cm/container_manager_linux.go | 1 + + pkg/kubelet/cm/cpumanager/cpu_manager.go | 35 +++++- + pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 31 +++-- + pkg/kubelet/cm/cpumanager/policy_static.go | 90 +++++++++++++- + .../cm/cpumanager/policy_static_test.go | 57 +++++++-- + pkg/kubelet/cm/devicemanager/manager_stub.go | 110 ++++++++++++++++++ + 6 files changed, 298 insertions(+), 26 deletions(-) + create mode 100644 pkg/kubelet/cm/devicemanager/manager_stub.go + +diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go +index 4ed1aa7b591..7eb76b4c20f 100644 +--- a/pkg/kubelet/cm/container_manager_linux.go ++++ b/pkg/kubelet/cm/container_manager_linux.go +@@ -324,6 +324,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I + cm.GetNodeAllocatableReservation(), + nodeConfig.KubeletRootDir, + cm.topologyManager, ++ cm.deviceManager, + ) + if err != nil { + klog.ErrorS(err, "Failed to initialize cpu manager") +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index e0a359932b7..acd51fdef8d 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -20,6 +20,8 @@ import ( + "context" + "fmt" + "math" ++ "os" ++ "strings" + "sync" + "time" + +@@ -32,6 +34,7 @@ import ( + "k8s.io/kubernetes/pkg/kubelet/cm/containermap" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/config" + kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" +@@ -51,6 +54,25 @@ type policyName string + // cpuManagerStateFileName is the file name where cpu manager stores its state + const cpuManagerStateFileName = "cpu_manager_state" + ++// get the system-level isolated CPUs ++func getIsolcpus() cpuset.CPUSet { ++ dat, err := os.ReadFile("/sys/devices/system/cpu/isolated") ++ if err != nil { ++ klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir") ++ return cpuset.New() ++ } ++ ++ // The isolated cpus string ends in a newline ++ cpustring := strings.TrimSuffix(string(dat), "\n") ++ cset, err := cpuset.Parse(cpustring) ++ if err != nil { ++ klog.Errorf("[cpumanager] unable to parse sysfs isolcpus string to cpuset") ++ return cpuset.New() ++ } ++ ++ return cset ++} ++ + // Manager interface provides methods for Kubelet to manage pod cpus. + type Manager interface { + // Start is called during Kubelet initialization. 
+@@ -154,7 +176,8 @@ func (s *sourcesReadyStub) AddSource(source string) {} + func (s *sourcesReadyStub) AllReady() bool { return true } + + // NewManager creates new cpu manager based on provided policy +-func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) { ++func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store, deviceManager devicemanager.Manager) (Manager, error) { ++ + var topo *topology.CPUTopology + var policy Policy + var err error +@@ -195,7 +218,15 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc + // NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset. + // This variable is primarily to make testing easier. + excludeReserved := true +- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions, excludeReserved) ++ ++ // isolCPUs is the set of kernel-isolated CPUs. They should be a subset of specificCPUs or ++ // of the CPUs that NewStaticPolicy() will pick if numReservedCPUs is set. It's only in the ++ // argument list here for ease of testing, it's really internal to the policy. ++ isolCPUs := getIsolcpus() ++ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, isolCPUs, affinity, cpuPolicyOptions, deviceManager, excludeReserved) ++ if err != nil { ++ return nil, fmt.Errorf("new static policy error: %v", err) ++ } + + if err != nil { + return nil, fmt.Errorf("new static policy error: %w", err) +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +index e5f5d07a2ad..8abf173ed5c 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +@@ -36,6 +36,7 @@ import ( + "k8s.io/kubernetes/pkg/kubelet/cm/containermap" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/utils/cpuset" + ) +@@ -263,6 +264,7 @@ func makeMultiContainerPodWithOptions(initCPUs, appCPUs []*containerOptions) *v1 + } + + func TestCPUManagerAdd(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() + testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ +@@ -278,9 +280,11 @@ func TestCPUManagerAdd(t *testing.T) { + }, + 0, + cpuset.New(), ++ cpuset.New(), + topologymanager.NewFakeManager(), + nil, +- testExcl) ++ testDM, ++ testExcl) + testCases := []struct { + description string + updateErr error +@@ -530,8 +534,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) { + } + + testExcl := false ++ testDM, _ := devicemanager.NewManagerStub() + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testExcl) + + mockState := &mockState{ + assignments: 
testCase.stAssignments, +@@ -686,7 +691,9 @@ func TestCPUManagerGenerate(t *testing.T) { + } + defer os.RemoveAll(sDir) + +- mgr, err := NewManager(testCase.cpuPolicyName, nil, 5*time.Second, machineInfo, cpuset.New(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager()) ++ testDM, err := devicemanager.NewManagerStub() ++ mgr, err := NewManager(testCase.cpuPolicyName, nil, 5*time.Second, machineInfo, cpuset.New(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM) ++ + if testCase.expectedError != nil { + if !strings.Contains(err.Error(), testCase.expectedError.Error()) { + t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), testCase.expectedError.Error()) +@@ -757,6 +764,7 @@ func TestCPUManagerRemove(t *testing.T) { + + func TestReconcileState(t *testing.T) { + testExcl := false ++ testDM, _ := devicemanager.NewManagerStub() + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 8, +@@ -775,9 +783,11 @@ func TestReconcileState(t *testing.T) { + }, + 0, + cpuset.New(), ++ cpuset.New(), + topologymanager.NewFakeManager(), + nil, +- testExcl) ++ testDM, ++ testExcl) + + testCases := []struct { + description string +@@ -1282,6 +1292,7 @@ func TestReconcileState(t *testing.T) { + // the following tests are with --reserved-cpus configured + func TestCPUManagerAddWithResvList(t *testing.T) { + testExcl := false ++ testDM, _ := devicemanager.NewManagerStub() + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 4, +@@ -1296,9 +1307,11 @@ func TestCPUManagerAddWithResvList(t *testing.T) { + }, + 1, + cpuset.New(0), ++ cpuset.New(), + topologymanager.NewFakeManager(), + nil, +- testExcl) ++ testDM, ++ testExcl) + testCases := []struct { + description string + updateErr error +@@ -1410,7 +1423,8 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) { + } + defer os.RemoveAll(sDir) + +- _, err = NewManager(testCase.cpuPolicyName, testCase.cpuPolicyOptions, 5*time.Second, machineInfo, cpuset.New(), nodeAllocatableReservation, sDir, topologymanager.NewFakeManager()) ++ testDM, err := devicemanager.NewManagerStub() ++ _, err = NewManager(testCase.cpuPolicyName, testCase.cpuPolicyOptions, 5*time.Second, machineInfo, cpuset.New(), nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM) + if err == nil { + t.Errorf("Expected error, but NewManager succeeded") + } +@@ -1424,6 +1438,7 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) { + + func TestCPUManagerGetAllocatableCPUs(t *testing.T) { + testExcl := false ++ testDm, _ := devicemanager.NewManagerStub() + nonePolicy, _ := NewNonePolicy(nil) + staticPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ +@@ -1439,9 +1454,11 @@ func TestCPUManagerGetAllocatableCPUs(t *testing.T) { + }, + 1, + cpuset.New(0), ++ cpuset.New(), + topologymanager.NewFakeManager(), + nil, +- testExcl) ++ testDm, ++ testExcl) + + testCases := []struct { + description string +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index fff571491f0..22f2962de83 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -19,6 +19,7 @@ package cpumanager + import ( + "context" + "fmt" ++ "strconv" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +@@ -33,6 +34,7 @@ import ( + "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" ++ 
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" + "k8s.io/kubernetes/pkg/kubelet/metrics" +@@ -128,6 +130,10 @@ type staticPolicy struct { + topology *topology.CPUTopology + // set of CPUs that is not available for exclusive assignment + reservedCPUs cpuset.CPUSet ++ // subset of reserved CPUs with isolcpus attribute ++ isolcpus cpuset.CPUSet ++ // parent containerManager, used to get device list ++ deviceManager devicemanager.Manager + // If true, default CPUSet should exclude reserved CPUs + excludeReserved bool + // Superset of reservedCPUs. It includes not just the reservedCPUs themselves, +@@ -150,7 +156,8 @@ var _ Policy = &staticPolicy{} + // NewStaticPolicy returns a CPU manager policy that does not change CPU + // assignments for exclusively pinned guaranteed containers after the main + // container process starts. +-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, excludeReserved bool) (Policy, error) { ++func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, isolCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, deviceManager devicemanager.Manager, excludeReserved bool) (Policy, error) { ++ + opts, err := NewStaticPolicyOptions(cpuPolicyOptions) + if err != nil { + return nil, err +@@ -165,6 +172,8 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + policy := &staticPolicy{ + topology: topology, + affinity: affinity, ++ isolcpus: isolCPUs, ++ deviceManager: deviceManager, + excludeReserved: excludeReserved, + cpusToReuse: make(map[string]cpuset.CPUSet), + options: opts, +@@ -201,6 +210,12 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + policy.reservedCPUs = reserved + policy.reservedPhysicalCPUs = reservedPhysicalCPUs + ++ if !isolCPUs.IsSubsetOf(reserved) { ++ klog.Errorf("[cpumanager] isolCPUs %v is not a subset of reserved %v", isolCPUs, reserved) ++ reserved = reserved.Union(isolCPUs) ++ klog.Warningf("[cpumanager] mismatch isolCPUs %v, force reserved %v", isolCPUs, reserved) ++ } ++ + return policy, nil + } + +@@ -234,8 +249,9 @@ func (p *staticPolicy) validateState(s state.State) error { + } else { + s.SetDefaultCPUSet(allCPUs) + } +- klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n", +- allCPUs, p.reservedCPUs, s.GetDefaultCPUSet()) ++ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, isolcpus:%v, default:%v\n", ++ allCPUs, p.reservedCPUs, p.isolcpus, s.GetDefaultCPUSet()) ++ + return nil + } + +@@ -340,16 +356,46 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai + return nil + } + +- cpuset := p.reservedCPUs ++ cpuset := p.reservedCPUs.Clone().Difference(p.isolcpus) + if cpuset.IsEmpty() { + // If this happens then someone messed up. 
+- return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reservedCPUs) ++ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v, isolcpus:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reservedCPUs, p.isolcpus) ++ + } + s.SetCPUSet(string(pod.UID), container.Name, cpuset) + klog.Infof("[cpumanager] static policy: reserved: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset) + return nil + } + ++ // This corrects kubelet cpumanager static cpuset tracking for isolcpus, ++ // since version 1.26.1 . This ensures that pods specified with ++ // isolcpus + guaranteed QoS + integer cpu requests, are affined to ++ // exclusive cpuset and tracked as non-isolated cpus. ++ cpuQuantity := container.Resources.Requests[v1.ResourceCPU] ++ fractionalCpuQuantity := cpuQuantity.MilliValue() % 1000 ++ if isolcpus := p.podIsolCPUs(pod, container); isolcpus.Size() > 0 && ++ v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed && fractionalCpuQuantity == 0 { ++ // container has requested isolated CPUs ++ if set, ok := s.GetCPUSet(string(pod.UID), container.Name); ok { ++ if set.Equals(isolcpus) { ++ klog.Infof("[cpumanager] isolcpus container already present in state, skipping (namespace: %s, pod UID: %s, pod: %s, container: %s)", ++ pod.Namespace, string(pod.UID), pod.Name, container.Name) ++ return nil ++ } else { ++ klog.Infof("[cpumanager] isolcpus container state has cpus %v, should be %v (namespace: %s, pod UID: %s, pod: %s, container: %s)", ++ isolcpus, set, pod.Namespace, string(pod.UID), pod.Name, container.Name) ++ } ++ } ++ // Note that we do not do anything about init containers here. ++ // It looks like devices are allocated per-pod based on effective requests/limits ++ // and extra devices from initContainers are not freed up when the regular containers start. ++ // TODO: confirm this is still true for 1.20 ++ s.SetCPUSet(string(pod.UID), container.Name, isolcpus) ++ klog.Infof("[cpumanager] isolcpus: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", ++ pod.Namespace, string(pod.UID), pod.Name, container.Name, isolcpus) ++ return nil ++ } ++ + numCPUs := p.guaranteedCPUs(pod, container) + if numCPUs == 0 { + // container belongs in the shared pool (nothing to do; use default cpuset) +@@ -415,7 +461,9 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai + } + s.SetCPUSet(string(pod.UID), container.Name, cpuset) + p.updateCPUsToReuse(pod, container, cpuset) +- ++ klog.Infof("[cpumanager] guaranteed: AddContainer "+ ++ "(namespace: %s, pod UID: %s, pod: %s, container: %s); numCPUS=%d, cpuset=%v", ++ pod.Namespace, string(pod.UID), pod.Name, container.Name, numCPUs, cpuset) + return nil + } + +@@ -794,6 +842,36 @@ func isKubeInfra(pod *v1.Pod) bool { + + } + ++// get the isolated CPUs (if any) from the devices associated with a specific container ++func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet { ++ // NOTE: This is required for TestStaticPolicyAdd() since makePod() does ++ // not create UID. We also need a way to properly stub devicemanager. 
++ if len(string(pod.UID)) == 0 { ++ return cpuset.New() ++ } ++ resContDevices := p.deviceManager.GetDevices(string(pod.UID), container.Name) ++ cpuSet := cpuset.New() ++ for resourceName, resourceDevs := range resContDevices { ++ // this resource name needs to match the isolcpus device plugin ++ if resourceName == "windriver.com/isolcpus" { ++ for devID, _ := range resourceDevs { ++ cpuStrList := []string{devID} ++ if len(cpuStrList) > 0 { ++ // loop over the list of strings, convert each one to int, add to cpuset ++ for _, cpuStr := range cpuStrList { ++ cpu, err := strconv.Atoi(cpuStr) ++ if err != nil { ++ panic(err) ++ } ++ cpuSet = cpuSet.Union(cpuset.New(cpu)) ++ } ++ } ++ } ++ } ++ } ++ return cpuSet ++} ++ + // isHintSocketAligned function return true if numa nodes in hint are socket aligned. + func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool { + numaNodesBitMask := hint.NUMANodeAffinity.GetBits() +diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go +index 83614619550..ee70a833d33 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go +@@ -30,6 +30,7 @@ import ( + pkgfeatures "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" + "k8s.io/utils/cpuset" +@@ -73,8 +74,9 @@ func (spt staticPolicyTest) PseudoClone() staticPolicyTest { + } + + func TestStaticPolicyName(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() + testExcl := false +- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl) ++ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testExcl) + + policyName := policy.Name() + if policyName != "static" { +@@ -84,6 +86,7 @@ func TestStaticPolicyName(t *testing.T) { + } + + func TestStaticPolicyStart(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []staticPolicyTest{ + { + description: "non-corrupted state", +@@ -159,7 +162,7 @@ func TestStaticPolicyStart(t *testing.T) { + } + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) ++ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testCase.excludeReserved) + + policy := p.(*staticPolicy) + st := &mockState{ +@@ -207,7 +210,6 @@ func TestStaticPolicyAdd(t *testing.T) { + largeTopoCPUSet := cpuset.New(largeTopoCPUids...) + largeTopoSock0CPUSet := cpuset.New(largeTopoSock0CPUids...) + largeTopoSock1CPUSet := cpuset.New(largeTopoSock1CPUids...) +- + // these are the cases which must behave the same regardless the policy options. + // So we will permutate the options to ensure this holds true. 
+ +@@ -630,7 +632,8 @@ func runStaticPolicyTestCase(t *testing.T, testCase staticPolicyTest) { + cpus = testCase.reservedCPUs.Clone() + } + testExcl := false +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, tm, testCase.options, testExcl) ++ testDM, _ := devicemanager.NewManagerStub() ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, cpus, tm, testCase.options, testDM, testExcl) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -677,6 +680,8 @@ func runStaticPolicyTestCaseWithFeatureGate(t *testing.T, testCase staticPolicyT + } + + func TestStaticPolicyReuseCPUs(t *testing.T) { ++ excludeReserved := false ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []struct { + staticPolicyTest + expCSetAfterAlloc cpuset.CPUSet +@@ -701,7 +706,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -733,6 +738,8 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + } + + func TestStaticPolicyDoNotReuseCPUs(t *testing.T) { ++ excludeReserved := false ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []struct { + staticPolicyTest + expCSetAfterAlloc cpuset.CPUSet +@@ -754,7 +761,7 @@ func TestStaticPolicyDoNotReuseCPUs(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -779,6 +786,7 @@ func TestStaticPolicyDoNotReuseCPUs(t *testing.T) { + + func TestStaticPolicyRemove(t *testing.T) { + excludeReserved := false ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []staticPolicyTest{ + { + description: "SingleSocketHT, DeAllocOneContainer", +@@ -837,7 +845,7 @@ func TestStaticPolicyRemove(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -860,6 +868,7 @@ func TestStaticPolicyRemove(t *testing.T) { + + func TestTopologyAwareAllocateCPUs(t *testing.T) { + excludeReserved := false ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []struct { + description string + topo *topology.CPUTopology +@@ -928,7 +937,8 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) { + }, + } + for _, tc := range testCases { +- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved) ++ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved) ++ + policy := p.(*staticPolicy) + st := &mockState{ + assignments: 
tc.stAssignments, +@@ -961,6 +971,7 @@ type staticPolicyTestWithResvList struct { + topo *topology.CPUTopology + numReservedCPUs int + reserved cpuset.CPUSet ++ isolcpus cpuset.CPUSet + stAssignments state.ContainerCPUAssignments + stDefaultCPUSet cpuset.CPUSet + pod *v1.Pod +@@ -972,6 +983,8 @@ type staticPolicyTestWithResvList struct { + } + + func TestStaticPolicyStartWithResvList(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() ++ testExcl := false + testCases := []staticPolicyTestWithResvList{ + { + description: "empty cpuset", +@@ -1001,10 +1014,9 @@ func TestStaticPolicyStartWithResvList(t *testing.T) { + expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"), + }, + } +- testExcl := false + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) ++ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testExcl) + + if !reflect.DeepEqual(err, testCase.expNewErr) { + t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v", +@@ -1053,6 +1065,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 1, + reserved: cpuset.New(0), ++ isolcpus: cpuset.New(), + stAssignments: state.ContainerCPUAssignments{}, + stDefaultCPUSet: cpuset.New(1, 2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"), +@@ -1066,6 +1079,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.New(0, 1), ++ isolcpus: cpuset.New(), + stAssignments: state.ContainerCPUAssignments{}, + stDefaultCPUSet: cpuset.New(2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"), +@@ -1079,6 +1093,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.New(0, 1), ++ isolcpus: cpuset.New(), + stAssignments: state.ContainerCPUAssignments{ + "fakePod": map[string]cpuset.CPUSet{ + "fakeContainer100": cpuset.New(2, 3, 6, 7), +@@ -1096,6 +1111,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.New(0, 1), ++ isolcpus: cpuset.New(), + stAssignments: state.ContainerCPUAssignments{ + "fakePod": map[string]cpuset.CPUSet{ + "fakeContainer100": cpuset.New(2, 3, 6, 7), +@@ -1108,11 +1124,30 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + expCPUAlloc: true, + expCSet: cpuset.New(0, 1), + }, ++ { ++ description: "InfraPod, SingleSocketHT, Isolcpus, ExpectAllocReserved", ++ topo: topoSingleSocketHT, ++ numReservedCPUs: 2, ++ reserved: cpuset.New(0, 1), ++ isolcpus: cpuset.New(1), ++ stAssignments: state.ContainerCPUAssignments{ ++ "fakePod": map[string]cpuset.CPUSet{ ++ "fakeContainer100": cpuset.New(2, 3, 6, 7), ++ }, ++ }, ++ stDefaultCPUSet: cpuset.New(4, 5), ++ pod: infraPod, ++ isKubeInfraPodfunc: fakeIsKubeInfraTrue, ++ expErr: nil, ++ expCPUAlloc: true, ++ expCSet: cpuset.New(0), ++ }, + } + + testExcl := true ++ testDM, _ := devicemanager.NewManagerStub() + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) ++ policy, _ := 
NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, testCase.isolcpus, topologymanager.NewFakeManager(), nil, testDM, testExcl) + + st := &mockState{ + assignments: testCase.stAssignments, +diff --git a/pkg/kubelet/cm/devicemanager/manager_stub.go b/pkg/kubelet/cm/devicemanager/manager_stub.go +new file mode 100644 +index 00000000000..f0e725c6c9f +--- /dev/null ++++ b/pkg/kubelet/cm/devicemanager/manager_stub.go +@@ -0,0 +1,110 @@ ++/* ++Copyright 2017 The Kubernetes Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++*/ ++ ++package devicemanager ++ ++import ( ++ v1 "k8s.io/api/core/v1" ++ "k8s.io/apimachinery/pkg/util/sets" ++ "k8s.io/kubernetes/pkg/kubelet/cm/containermap" ++ "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" ++ "k8s.io/kubernetes/pkg/kubelet/config" ++ "k8s.io/kubernetes/pkg/kubelet/lifecycle" ++ "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache" ++ schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" ++) ++ ++// ManagerStub provides a simple stub implementation for the Device Manager. ++type ManagerStub struct { ++ // containerMap provides a mapping from (pod, container) -> containerID ++ // for all containers in a pod. Used to detect pods running across a restart ++ containerMap containermap.ContainerMap ++ ++ // containerRunningSet identifies which container among those present in `containerMap` ++ // was reported running by the container runtime when `containerMap` was computed. ++ // Used to detect pods running across a restart ++ containerRunningSet sets.Set[string] ++} ++ ++// NewManagerStub creates a ManagerStub. ++func NewManagerStub() (*ManagerStub, error) { ++ return &ManagerStub{}, nil ++} ++ ++// Start simply returns nil. ++func (h *ManagerStub) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, initialContainers containermap.ContainerMap, initialContainerRunningSet sets.Set[string]) error { ++ return nil ++} ++ ++// Stop simply returns nil. ++func (h *ManagerStub) Stop() error { ++ return nil ++} ++ ++// Allocate simply returns nil. ++func (h *ManagerStub) Allocate(pod *v1.Pod, container *v1.Container) error { ++ return nil ++} ++ ++// UpdatePluginResources simply returns nil. ++func (h *ManagerStub) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error { ++ return nil ++} ++ ++// GetDeviceRunContainerOptions simply returns nil. ++func (h *ManagerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) { ++ return nil, nil ++} ++ ++// GetCapacity simply returns nil capacity and empty removed resource list. 
++func (h *ManagerStub) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) { ++ return nil, nil, []string{} ++} ++ ++// GetWatcherHandler returns plugin watcher interface ++func (h *ManagerStub) GetWatcherHandler() cache.PluginHandler { ++ return nil ++} ++ ++// GetTopologyHints returns an empty TopologyHint map ++func (h *ManagerStub) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint { ++ return map[string][]topologymanager.TopologyHint{} ++} ++ ++// GetPodTopologyHints returns an empty TopologyHint map ++func (h *ManagerStub) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint { ++ return map[string][]topologymanager.TopologyHint{} ++} ++ ++// GetDevices returns nil ++func (h *ManagerStub) GetDevices(_, _ string) ResourceDeviceInstances { ++ return nil ++} ++ ++// GetAllocatableDevices returns nothing ++func (h *ManagerStub) GetAllocatableDevices() ResourceDeviceInstances { ++ return nil ++} ++ ++// ShouldResetExtendedResourceCapacity returns false ++func (h *ManagerStub) ShouldResetExtendedResourceCapacity() bool { ++ return false ++} ++ ++// UpdateAllocatedDevices returns nothing ++func (h *ManagerStub) UpdateAllocatedDevices() { ++ return ++} +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-keep-normal-containers-off-reserv.patch b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-keep-normal-containers-off-reserv.patch new file mode 100644 index 000000000..33bf20b04 --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-keep-normal-containers-off-reserv.patch @@ -0,0 +1,366 @@ +From bd2514a9b62c61f2e98f199de98dca76348a8891 Mon Sep 17 00:00:00 2001 +From: Boovan Rajendran +Date: Wed, 6 Mar 2024 03:50:23 -0500 +Subject: [PATCH] kubelet cpumanager keep normal containers off reserved CPUs + +When starting the kubelet process, two separate sets of reserved CPUs +may be specified. With this change CPUs reserved via +'--system-reserved=cpu' +or '--kube-reserved=cpu' will be ignored by kubernetes itself. This +explicitly excludes the reserved CPUS from the DefaultCPUset so +that pods cannot run there. + +Co-authored-by: Jim Gauld +Signed-off-by: Sachin Gopala Krishna +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Boovan Rajendran +--- + pkg/kubelet/cm/cpumanager/cpu_manager.go | 6 ++- + pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 19 ++++++--- + pkg/kubelet/cm/cpumanager/policy_static.go | 30 +++++++++++--- + .../cm/cpumanager/policy_static_test.go | 40 ++++++++++++++----- + 4 files changed, 72 insertions(+), 23 deletions(-) + +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index 2d1b6eefebe..e0a359932b7 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -192,7 +192,11 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc + // exclusively allocated. + reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000 + numReservedCPUs := int(math.Ceil(reservedCPUsFloat)) +- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions) ++ // NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset. ++ // This variable is primarily to make testing easier. 
++ excludeReserved := true ++ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions, excludeReserved) ++ + if err != nil { + return nil, fmt.Errorf("new static policy error: %w", err) + } +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +index f6563bbca23..e5f5d07a2ad 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +@@ -263,6 +263,7 @@ func makeMultiContainerPodWithOptions(initCPUs, appCPUs []*containerOptions) *v1 + } + + func TestCPUManagerAdd(t *testing.T) { ++ testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 4, +@@ -278,7 +279,8 @@ func TestCPUManagerAdd(t *testing.T) { + 0, + cpuset.New(), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + testCases := []struct { + description string + updateErr error +@@ -527,8 +529,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) { + }, + } + ++ testExcl := false + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl) + + mockState := &mockState{ + assignments: testCase.stAssignments, +@@ -753,6 +756,7 @@ func TestCPUManagerRemove(t *testing.T) { + } + + func TestReconcileState(t *testing.T) { ++ testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 8, +@@ -772,7 +776,8 @@ func TestReconcileState(t *testing.T) { + 0, + cpuset.New(), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + + testCases := []struct { + description string +@@ -1276,6 +1281,7 @@ func TestReconcileState(t *testing.T) { + // above test cases are without kubelet --reserved-cpus cmd option + // the following tests are with --reserved-cpus configured + func TestCPUManagerAddWithResvList(t *testing.T) { ++ testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 4, +@@ -1291,7 +1297,8 @@ func TestCPUManagerAddWithResvList(t *testing.T) { + 1, + cpuset.New(0), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + testCases := []struct { + description string + updateErr error +@@ -1416,6 +1423,7 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) { + } + + func TestCPUManagerGetAllocatableCPUs(t *testing.T) { ++ testExcl := false + nonePolicy, _ := NewNonePolicy(nil) + staticPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ +@@ -1432,7 +1440,8 @@ func TestCPUManagerGetAllocatableCPUs(t *testing.T) { + 1, + cpuset.New(0), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + + testCases := []struct { + description string +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index d22a6a64d5e..9690eedab58 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -106,6 +106,8 @@ type staticPolicy struct { + topology *topology.CPUTopology + // set of CPUs that is not available for exclusive assignment + reservedCPUs cpuset.CPUSet ++ // If true, default CPUSet should exclude reserved CPUs ++ excludeReserved bool + // Superset of reservedCPUs. It includes not just the reservedCPUs themselves, + // but also any siblings of those reservedCPUs on the same physical die. 
+ // NOTE: If the reserved set includes full physical CPUs from the beginning +@@ -126,7 +128,7 @@ var _ Policy = &staticPolicy{} + // NewStaticPolicy returns a CPU manager policy that does not change CPU + // assignments for exclusively pinned guaranteed containers after the main + // container process starts. +-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string) (Policy, error) { ++func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, excludeReserved bool) (Policy, error) { + opts, err := NewStaticPolicyOptions(cpuPolicyOptions) + if err != nil { + return nil, err +@@ -141,6 +143,7 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + policy := &staticPolicy{ + topology: topology, + affinity: affinity, ++ excludeReserved: excludeReserved, + cpusToReuse: make(map[string]cpuset.CPUSet), + options: opts, + } +@@ -202,7 +205,15 @@ func (p *staticPolicy) validateState(s state.State) error { + } + // state is empty initialize + allCPUs := p.topology.CPUDetails.CPUs() +- s.SetDefaultCPUSet(allCPUs) ++ if p.excludeReserved { ++ // Exclude reserved CPUs from the default CPUSet to keep containers off them ++ // unless explicitly affined. ++ s.SetDefaultCPUSet(allCPUs.Difference(p.reservedCPUs)) ++ } else { ++ s.SetDefaultCPUSet(allCPUs) ++ } ++ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n", ++ allCPUs, p.reservedCPUs, s.GetDefaultCPUSet()) + return nil + } + +@@ -210,11 +221,12 @@ func (p *staticPolicy) validateState(s state.State) error { + // 1. Check if the reserved cpuset is not part of default cpuset because: + // - kube/system reserved have changed (increased) - may lead to some containers not being able to start + // - user tampered with file +- if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) { +- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", +- p.reservedCPUs.String(), tmpDefaultCPUset.String()) ++ if !p.excludeReserved { ++ if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) { ++ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", ++ p.reservedCPUs.String(), tmpDefaultCPUset.String()) ++ } + } +- + // 2. Check if state for static policy is consistent + for pod := range tmpAssignments { + for container, cset := range tmpAssignments[pod] { +@@ -241,6 +253,9 @@ func (p *staticPolicy) validateState(s state.State) error { + } + } + totalKnownCPUs = totalKnownCPUs.Union(tmpCPUSets...) ++ if p.excludeReserved { ++ totalKnownCPUs = totalKnownCPUs.Union(p.reservedCPUs) ++ } + if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) { + return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"", + p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String()) +@@ -381,6 +396,9 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa + cpusInUse := getAssignedCPUsOfSiblings(s, podUID, containerName) + if toRelease, ok := s.GetCPUSet(podUID, containerName); ok { + s.Delete(podUID, containerName) ++ if p.excludeReserved { ++ toRelease = toRelease.Difference(p.reservedCPUs) ++ } + // Mutate the shared pool, adding released cpus. 
+ toRelease = toRelease.Difference(cpusInUse) + s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease)) +diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go +index 8cc3fc2be74..b060201e156 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go +@@ -36,6 +36,7 @@ type staticPolicyTest struct { + description string + topo *topology.CPUTopology + numReservedCPUs int ++ excludeReserved bool + reservedCPUs *cpuset.CPUSet + podUID string + options map[string]string +@@ -69,7 +70,8 @@ func (spt staticPolicyTest) PseudoClone() staticPolicyTest { + } + + func TestStaticPolicyName(t *testing.T) { +- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ testExcl := false ++ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl) + + policyName := policy.Name() + if policyName != "static" { +@@ -99,6 +101,15 @@ func TestStaticPolicyStart(t *testing.T) { + stDefaultCPUSet: cpuset.New(), + expCSet: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), + }, ++ { ++ description: "empty cpuset exclude reserved", ++ topo: topoDualSocketHT, ++ numReservedCPUs: 2, ++ excludeReserved: true, ++ stAssignments: state.ContainerCPUAssignments{}, ++ stDefaultCPUSet: cpuset.New(), ++ expCSet: cpuset.New(1, 2, 3, 4, 5, 7, 8, 9, 10, 11), ++ }, + { + description: "reserved cores 0 & 6 are not present in available cpuset", + topo: topoDualSocketHT, +@@ -145,7 +156,8 @@ func TestStaticPolicyStart(t *testing.T) { + } + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) ++ + policy := p.(*staticPolicy) + st := &mockState{ + assignments: testCase.stAssignments, +@@ -614,7 +626,8 @@ func runStaticPolicyTestCase(t *testing.T, testCase staticPolicyTest) { + if testCase.reservedCPUs != nil { + cpus = testCase.reservedCPUs.Clone() + } +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, tm, testCase.options) ++ testExcl := false ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, tm, testCase.options, testExcl) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -685,7 +698,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -738,7 +751,7 @@ func TestStaticPolicyDoNotReuseCPUs(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -762,6 +775,7 @@ func TestStaticPolicyDoNotReuseCPUs(t *testing.T) { + } + + func 
TestStaticPolicyRemove(t *testing.T) { ++ excludeReserved := false + testCases := []staticPolicyTest{ + { + description: "SingleSocketHT, DeAllocOneContainer", +@@ -820,7 +834,7 @@ func TestStaticPolicyRemove(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -842,6 +856,7 @@ func TestStaticPolicyRemove(t *testing.T) { + } + + func TestTopologyAwareAllocateCPUs(t *testing.T) { ++ excludeReserved := false + testCases := []struct { + description string + topo *topology.CPUTopology +@@ -910,7 +925,7 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) { + }, + } + for _, tc := range testCases { +- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved) + policy := p.(*staticPolicy) + st := &mockState{ + assignments: tc.stAssignments, +@@ -982,9 +997,11 @@ func TestStaticPolicyStartWithResvList(t *testing.T) { + expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"), + }, + } ++ testExcl := false + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil) ++ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) ++ + if !reflect.DeepEqual(err, testCase.expNewErr) { + t.Errorf("StaticPolicy Start() error (%v). 
expected error: %v but got: %v", + testCase.description, testCase.expNewErr, err) +@@ -1024,7 +1041,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + numReservedCPUs: 1, + reserved: cpuset.New(0), + stAssignments: state.ContainerCPUAssignments{}, +- stDefaultCPUSet: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7), ++ stDefaultCPUSet: cpuset.New(1, 2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"), + expErr: fmt.Errorf("not enough cpus available to satisfy request: requested=8, available=7"), + expCPUAlloc: false, +@@ -1036,7 +1053,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + numReservedCPUs: 2, + reserved: cpuset.New(0, 1), + stAssignments: state.ContainerCPUAssignments{}, +- stDefaultCPUSet: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7), ++ stDefaultCPUSet: cpuset.New(2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"), + expErr: nil, + expCPUAlloc: true, +@@ -1060,8 +1077,9 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + }, + } + ++ testExcl := true + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) + + st := &mockState{ + assignments: testCase.stAssignments, +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-platform-pods-on-reserved-cpus.patch b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-platform-pods-on-reserved-cpus.patch new file mode 100644 index 000000000..cb4979cfe --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-cpumanager-platform-pods-on-reserved-cpus.patch @@ -0,0 +1,556 @@ +From 4a28be032c61624f9b16d69ae3a7f699b6a62f51 Mon Sep 17 00:00:00 2001 +From: Boovan Rajendran +Date: Tue, 5 Sep 2023 06:27:39 -0400 +Subject: [PATCH] Namespace-Label-Based Pod Identification and CPU Management + for Infra Pods + +This change assigns system infrastructure pods to the "reserved" +cpuset to isolate them from the shared pool of CPUs. Kubernetes +infrastructure pods are identified based on namespace 'kube-system'. +Platform pods are identified based on namespace 'kube-system', +or label with 'app.starlingx.io/component=platform'. + +The platform and infrastructure pods are given an isolated CPU +affinity cpuset when the CPU manager is configured "with the 'static' +policy." + +This implementation assumes that the "reserved" cpuset is large +enough to handle all infrastructure and platform pod's CPU +allocations, and it prevents the platform pods from running on +application/isolated CPUs regardless of what QoS class they're in. 
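
To make the classification rule described above easier to follow, here is a minimal, self-contained Go sketch of the decision this patch implements: treat a pod as a platform/infra pod if it lives in kube-system, or if the pod or its namespace carries the app.starlingx.io/component=platform label. The function name isPlatformPod and the idea of passing the namespace labels in as a plain map are assumptions made purely for the sketch; the patch itself implements this in an isKubeInfra() helper that fetches the namespace object from the API server via the kubelet kubeconfig.

```go
package main

import "fmt"

// isPlatformPod illustrates the classification rule from the commit message:
// a pod is treated as a platform/infra pod if it runs in kube-system, or if
// the pod or its namespace carries the app.starlingx.io/component=platform
// label. Namespace labels are passed in directly to keep the sketch
// self-contained; the real patch looks the namespace object up via the API.
func isPlatformPod(namespace string, podLabels, nsLabels map[string]string) bool {
	if namespace == "kube-system" {
		return true
	}
	if podLabels["app.starlingx.io/component"] == "platform" {
		return true
	}
	return nsLabels["app.starlingx.io/component"] == "platform"
}

func main() {
	// Pod in an application namespace that is itself labelled as platform: reserved CPUs.
	fmt.Println(isPlatformPod("monitoring", nil,
		map[string]string{"app.starlingx.io/component": "platform"})) // true

	// Ordinary application pod: stays on the shared (or isolated) CPU pools.
	fmt.Println(isPlatformPod("default", map[string]string{"app": "web"}, nil)) // false
}
```
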
+ +Co-authored-by: Jim Gauld +Signed-off-by: Gleb Aronsky +Signed-off-by: Thiago Miranda +Signed-off-by: Kaustubh Dhokte +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Sachin Gopala Krishna +Signed-off-by: Boovan Rajendran +Signed-off-by: Saba Touheed Mujawar +--- + pkg/kubelet/cm/cpumanager/policy_static.go | 121 ++++++++- + .../cm/cpumanager/policy_static_test.go | 233 +++++++++++++++--- + .../cm/cpumanager/topology_hints_test.go | 4 + + 3 files changed, 316 insertions(+), 42 deletions(-) + +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index 9690eedab58..fff571491f0 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -17,11 +17,17 @@ limitations under the License. + package cpumanager + + import ( ++ "context" + "fmt" + + v1 "k8s.io/api/core/v1" ++ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" ++ k8sclient "k8s.io/client-go/kubernetes" ++ restclient "k8s.io/client-go/rest" ++ "k8s.io/client-go/tools/clientcmd" + "k8s.io/klog/v2" ++ "k8s.io/kubernetes/cmd/kubeadm/app/constants" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" + v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" + "k8s.io/kubernetes/pkg/features" +@@ -44,6 +50,22 @@ const ( + ErrorSMTAlignment = "SMTAlignmentError" + ) + ++// Declared as variables so that they can easily more ++// overridden during testing ++type getPodNamespace func(string) (*v1.Namespace, error) ++type buildFromConfigFlag func(masterUrl string, kubeconfigPath string) (*restclient.Config, error) ++type isKubeInfraFunc func(pod *v1.Pod) bool ++ ++var varGetNamespaceObject getPodNamespace ++var varBuildConfigFromFlags buildFromConfigFlag ++var varIsKubeInfra isKubeInfraFunc ++ ++func init() { ++ varIsKubeInfra = isKubeInfra ++ varGetNamespaceObject = getPodNamespaceObject ++ varBuildConfigFromFlags = clientcmd.BuildConfigFromFlags ++} ++ + // SMTAlignmentError represents an error due to SMT alignment + type SMTAlignmentError struct { + RequestedCPUs int +@@ -141,11 +163,11 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + klog.InfoS("Static policy created with configuration", "options", opts) + + policy := &staticPolicy{ +- topology: topology, +- affinity: affinity, ++ topology: topology, ++ affinity: affinity, + excludeReserved: excludeReserved, +- cpusToReuse: make(map[string]cpuset.CPUSet), +- options: opts, ++ cpusToReuse: make(map[string]cpuset.CPUSet), ++ options: opts, + } + + allCPUs := topology.CPUDetails.CPUs() +@@ -223,8 +245,8 @@ func (p *staticPolicy) validateState(s state.State) error { + // - user tampered with file + if !p.excludeReserved { + if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) { +- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", +- p.reservedCPUs.String(), tmpDefaultCPUset.String()) ++ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", ++ p.reservedCPUs.String(), tmpDefaultCPUset.String()) + } + } + // 2. Check if state for static policy is consistent +@@ -309,6 +331,25 @@ func (p *staticPolicy) updateCPUsToReuse(pod *v1.Pod, container *v1.Container, c + } + + func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) { ++ // Process infra pods before guaranteed pods ++ if varIsKubeInfra(pod) { ++ // Container belongs in reserved pool. 
++ // We don't want to fall through to the p.guaranteedCPUs() clause below so return either nil or error. ++ if _, ok := s.GetCPUSet(string(pod.UID), container.Name); ok { ++ klog.Infof("[cpumanager] static policy: reserved container already present in state, skipping (namespace: %s, pod UID: %s, pod: %s, container: %s)", pod.Namespace, string(pod.UID), pod.Name, container.Name) ++ return nil ++ } ++ ++ cpuset := p.reservedCPUs ++ if cpuset.IsEmpty() { ++ // If this happens then someone messed up. ++ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reservedCPUs) ++ } ++ s.SetCPUSet(string(pod.UID), container.Name, cpuset) ++ klog.Infof("[cpumanager] static policy: reserved: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset) ++ return nil ++ } ++ + numCPUs := p.guaranteedCPUs(pod, container) + if numCPUs == 0 { + // container belongs in the shared pool (nothing to do; use default cpuset) +@@ -460,6 +501,10 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int + if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() { + return 0 + } ++ // Infrastructure pods use reserved CPUs even if they're in the Guaranteed QoS class ++ if varIsKubeInfra(pod) { ++ return 0 ++ } + // Safe downcast to do for all systems with < 2.1 billion CPUs. + // Per the language spec, `int` is guaranteed to be at least 32 bits wide. + // https://golang.org/ref/spec#Numeric_types +@@ -685,6 +730,70 @@ func (p *staticPolicy) generateCPUTopologyHints(availableCPUs cpuset.CPUSet, reu + return hints + } + ++func getPodNamespaceObject(podNamespaceName string) (*v1.Namespace, error) { ++ ++ kubeConfigPath := constants.GetKubeletKubeConfigPath() ++ cfg, err := varBuildConfigFromFlags("", kubeConfigPath) ++ if err != nil { ++ klog.Error("Failed to build client config from ", kubeConfigPath, err.Error()) ++ return nil, err ++ } ++ ++ clientset, err := k8sclient.NewForConfig(cfg) ++ if err != nil { ++ klog.Error("Failed to get clientset for KUBECONFIG ", kubeConfigPath, err.Error()) ++ return nil, err ++ } ++ ++ namespaceObj, err := clientset.CoreV1().Namespaces().Get(context.TODO(), podNamespaceName, metav1.GetOptions{}) ++ if err != nil { ++ klog.Error("Error getting namespace object:", err.Error()) ++ return nil, err ++ } ++ ++ return namespaceObj, nil ++ ++} ++ ++// check if a given pod is labelled as platform pod or ++// is in a namespace labelled as a platform namespace ++func isKubeInfra(pod *v1.Pod) bool { ++ ++ podName := pod.GetName() ++ podNamespaceName := pod.GetNamespace() ++ ++ if podNamespaceName == "kube-system" { ++ klog.Infof("Pod %s has %s namespace. Treating as platform pod.", podName, podNamespaceName) ++ return true ++ } ++ ++ klog.InfoS("Checking pod ", podName, " for label 'app.starlingx.io/component=platform'.") ++ podLabels := pod.GetLabels() ++ val, ok := podLabels["app.starlingx.io/component"] ++ if ok && val == "platform" { ++ klog.InfoS("Pod ", podName, " has 'app.starlingx.io/component=platform' label. Treating as platform pod.") ++ return true ++ } ++ ++ klog.V(4).InfoS("Pod ", pod.GetName(), " does not have 'app.starlingx.io/component=platform' label. 
Checking its namespace information...") ++ ++ namespaceObj, err := varGetNamespaceObject(podNamespaceName) ++ if err != nil { ++ return false ++ } ++ ++ namespaceLabels := namespaceObj.GetLabels() ++ val, ok = namespaceLabels["app.starlingx.io/component"] ++ if ok && val == "platform" { ++ klog.InfoS("For pod: ", podName, ", its Namespace ", podNamespaceName, " has 'app.starlingx.io/component=platform' label. Treating as platform pod.") ++ return true ++ } ++ ++ klog.InfoS("Neither pod ", podName, " nor its namespace ", podNamespaceName, " has 'app.starlingx.io/component=platform' label. Not treating as platform pod.") ++ return false ++ ++} ++ + // isHintSocketAligned function return true if numa nodes in hint are socket aligned. + func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool { + numaNodesBitMask := hint.NUMANodeAffinity.GetBits() +diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go +index b060201e156..83614619550 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go +@@ -17,12 +17,15 @@ limitations under the License. + package cpumanager + + import ( ++ "errors" + "fmt" + "reflect" + "testing" + + v1 "k8s.io/api/core/v1" ++ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" ++ restclient "k8s.io/client-go/rest" + featuregatetesting "k8s.io/component-base/featuregate/testing" + pkgfeatures "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" +@@ -109,7 +112,7 @@ func TestStaticPolicyStart(t *testing.T) { + stAssignments: state.ContainerCPUAssignments{}, + stDefaultCPUSet: cpuset.New(), + expCSet: cpuset.New(1, 2, 3, 4, 5, 7, 8, 9, 10, 11), +- }, ++ }, + { + description: "reserved cores 0 & 6 are not present in available cpuset", + topo: topoDualSocketHT, +@@ -954,17 +957,18 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) { + // above test cases are without kubelet --reserved-cpus cmd option + // the following tests are with --reserved-cpus configured + type staticPolicyTestWithResvList struct { +- description string +- topo *topology.CPUTopology +- numReservedCPUs int +- reserved cpuset.CPUSet +- stAssignments state.ContainerCPUAssignments +- stDefaultCPUSet cpuset.CPUSet +- pod *v1.Pod +- expErr error +- expNewErr error +- expCPUAlloc bool +- expCSet cpuset.CPUSet ++ description string ++ topo *topology.CPUTopology ++ numReservedCPUs int ++ reserved cpuset.CPUSet ++ stAssignments state.ContainerCPUAssignments ++ stDefaultCPUSet cpuset.CPUSet ++ pod *v1.Pod ++ isKubeInfraPodfunc isKubeInfraFunc ++ expErr error ++ expNewErr error ++ expCPUAlloc bool ++ expCSet cpuset.CPUSet + } + + func TestStaticPolicyStartWithResvList(t *testing.T) { +@@ -1032,35 +1036,63 @@ func TestStaticPolicyStartWithResvList(t *testing.T) { + } + } + +-func TestStaticPolicyAddWithResvList(t *testing.T) { ++func fakeIsKubeInfraTrue(pod *v1.Pod) bool { ++ return true ++} + ++func fakeIsKubeInfraFalse(pod *v1.Pod) bool { ++ return false ++} ++ ++func TestStaticPolicyAddWithResvList(t *testing.T) { ++ infraPod := makePod("fakePod", "fakeContainer2", "200m", "200m") ++ infraPod.Namespace = "kube-system" + testCases := []staticPolicyTestWithResvList{ + { +- description: "GuPodSingleCore, SingleSocketHT, ExpectError", +- topo: topoSingleSocketHT, +- numReservedCPUs: 1, +- reserved: cpuset.New(0), +- stAssignments: state.ContainerCPUAssignments{}, +- 
stDefaultCPUSet: cpuset.New(1, 2, 3, 4, 5, 6, 7), +- pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"), +- expErr: fmt.Errorf("not enough cpus available to satisfy request: requested=8, available=7"), +- expCPUAlloc: false, +- expCSet: cpuset.New(), ++ description: "GuPodSingleCore, SingleSocketHT, ExpectError", ++ topo: topoSingleSocketHT, ++ numReservedCPUs: 1, ++ reserved: cpuset.New(0), ++ stAssignments: state.ContainerCPUAssignments{}, ++ stDefaultCPUSet: cpuset.New(1, 2, 3, 4, 5, 6, 7), ++ pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"), ++ isKubeInfraPodfunc: fakeIsKubeInfraFalse, ++ expErr: fmt.Errorf("not enough cpus available to satisfy request: requested=8, available=7"), ++ expCPUAlloc: false, ++ expCSet: cpuset.New(), + }, + { +- description: "GuPodSingleCore, SingleSocketHT, ExpectAllocOneCPU", ++ description: "GuPodSingleCore, SingleSocketHT, ExpectAllocOneCPU", ++ topo: topoSingleSocketHT, ++ numReservedCPUs: 2, ++ reserved: cpuset.New(0, 1), ++ stAssignments: state.ContainerCPUAssignments{}, ++ stDefaultCPUSet: cpuset.New(2, 3, 4, 5, 6, 7), ++ pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"), ++ isKubeInfraPodfunc: fakeIsKubeInfraFalse, ++ expErr: nil, ++ expCPUAlloc: true, ++ expCSet: cpuset.New(4), // expect sibling of partial core ++ }, ++ { ++ description: "GuPodMultipleCores, SingleSocketHT, ExpectAllocOneCore", + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.New(0, 1), +- stAssignments: state.ContainerCPUAssignments{}, +- stDefaultCPUSet: cpuset.New(2, 3, 4, 5, 6, 7), +- pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"), +- expErr: nil, +- expCPUAlloc: true, +- expCSet: cpuset.New(4), // expect sibling of partial core ++ stAssignments: state.ContainerCPUAssignments{ ++ "fakePod": map[string]cpuset.CPUSet{ ++ "fakeContainer100": cpuset.New(2, 3, 6, 7), ++ }, ++ }, ++ stDefaultCPUSet: cpuset.New(0, 1, 4, 5), ++ pod: makePod("fakePod", "fakeContainer3", "2000m", "2000m"), ++ isKubeInfraPodfunc: fakeIsKubeInfraFalse, ++ expErr: nil, ++ expCPUAlloc: true, ++ expCSet: cpuset.New(4, 5), + }, + { +- description: "GuPodMultipleCores, SingleSocketHT, ExpectAllocOneCore", ++ description: "InfraPod, SingleSocketHT, ExpectAllocReserved", + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.New(0, 1), +@@ -1069,11 +1101,12 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + "fakeContainer100": cpuset.New(2, 3, 6, 7), + }, + }, +- stDefaultCPUSet: cpuset.New(0, 1, 4, 5), +- pod: makePod("fakePod", "fakeContainer3", "2000m", "2000m"), +- expErr: nil, +- expCPUAlloc: true, +- expCSet: cpuset.New(4, 5), ++ stDefaultCPUSet: cpuset.New(4, 5), ++ pod: infraPod, ++ isKubeInfraPodfunc: fakeIsKubeInfraTrue, ++ expErr: nil, ++ expCPUAlloc: true, ++ expCSet: cpuset.New(0, 1), + }, + } + +@@ -1086,6 +1119,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + defaultCPUSet: testCase.stDefaultCPUSet, + } + ++ varIsKubeInfra = testCase.isKubeInfraPodfunc + container := &testCase.pod.Spec.Containers[0] + err := policy.Allocate(st, testCase.pod, container) + if !reflect.DeepEqual(err, testCase.expErr) { +@@ -1206,6 +1240,133 @@ func TestStaticPolicyOptions(t *testing.T) { + } + } + ++func makePodWithLabels(podLabels map[string]string) *v1.Pod { ++ return &v1.Pod{ ++ ObjectMeta: metav1.ObjectMeta{ ++ Name: "test-pod", ++ Namespace: "test-namespace", ++ Labels: podLabels, ++ }, ++ } ++} ++ ++func fakeBuildConfigFromFlags(masterUrl string, kubeconfigPath string) (*restclient.Config, error) { ++ ++ return 
&restclient.Config{}, nil ++} ++ ++func fakeBuildConfigFromFlagsError(masterUrl string, kubeconfigPath string) (*restclient.Config, error) { ++ ++ errString := fmt.Sprintf("%s file not found", kubeconfigPath) ++ return nil, errors.New(errString) ++ ++} ++ ++func getFakeInfraPodNamespace(_ string) (*v1.Namespace, error) { ++ ++ return &v1.Namespace{ ++ ObjectMeta: metav1.ObjectMeta{ ++ Name: "test-namespace", ++ Labels: map[string]string{ ++ "app.starlingx.io/component": "platform", ++ }, ++ }}, nil ++} ++ ++func getFakeNonInfraPodNamespace(_ string) (*v1.Namespace, error) { ++ ++ return &v1.Namespace{ ++ ObjectMeta: metav1.ObjectMeta{ ++ Name: "test-namespace", ++ Labels: map[string]string{ ++ "fake": "label", ++ }}}, nil ++ ++} ++ ++type kubeInfraPodTestCase struct { ++ description string ++ pod *v1.Pod ++ namespaceFunc getPodNamespace ++ expectedValue bool ++} ++ ++func TestKubeInfraPod(t *testing.T) { ++ testCases := []kubeInfraPodTestCase{ ++ { ++ description: "Pod with platform label and namespace with platform label", ++ pod: makePodWithLabels(map[string]string{ ++ "app.starlingx.io/component": "platform", ++ }), ++ namespaceFunc: getFakeInfraPodNamespace, ++ expectedValue: true, ++ }, ++ { ++ description: "Pod with platform label and namespace without platform label", ++ pod: makePodWithLabels(map[string]string{ ++ "app.starlingx.io/component": "platform", ++ }), ++ namespaceFunc: getFakeNonInfraPodNamespace, ++ expectedValue: true, ++ }, ++ { ++ description: "Pod without platform label and namespace with platform label", ++ pod: makePodWithLabels(map[string]string{ ++ "test": "label", ++ }), ++ namespaceFunc: getFakeInfraPodNamespace, ++ expectedValue: true, ++ }, ++ { ++ description: "Pod without platform label and namespace without platform label", ++ pod: makePodWithLabels(map[string]string{ ++ "test": "namespace", ++ }), ++ namespaceFunc: getFakeNonInfraPodNamespace, ++ expectedValue: false, ++ }, ++ } ++ ++ for _, testCase := range testCases { ++ t.Run(testCase.description, func(t *testing.T) { ++ ++ varGetNamespaceObject = testCase.namespaceFunc ++ varBuildConfigFromFlags = fakeBuildConfigFromFlags ++ gotValue := isKubeInfra(testCase.pod) ++ ++ if gotValue != testCase.expectedValue { ++ t.Errorf("StaticPolicy isKubeInfraPod() error %v. expected value %v actual value %v", ++ testCase.description, testCase.expectedValue, gotValue) ++ } else { ++ fmt.Printf("StaticPolicy isKubeInfraPod() test successful. : %v ", testCase.description) ++ } ++ ++ }) ++ } ++ ++ test := kubeInfraPodTestCase{ ++ description: "Failure reading kubeconfig file", ++ pod: makePodWithLabels(map[string]string{ ++ "test": "namespace", ++ }), ++ namespaceFunc: getFakeNonInfraPodNamespace, ++ expectedValue: false, ++ } ++ ++ varGetNamespaceObject = getPodNamespaceObject ++ varBuildConfigFromFlags = fakeBuildConfigFromFlagsError ++ ++ gotValue := isKubeInfra(test.pod) ++ ++ if gotValue != test.expectedValue { ++ t.Errorf("StaticPolicy isKubeInfraPod() error %v. expected value %v actual value %v", ++ test.description, test.expectedValue, gotValue) ++ } else { ++ fmt.Printf("StaticPolicy isKubeInfraPod() test successful. : %v ", test.description) ++ } ++ ++} ++ + func newCPUSetPtr(cpus ...int) *cpuset.CPUSet { + ret := cpuset.New(cpus...) 
+ return &ret +diff --git a/pkg/kubelet/cm/cpumanager/topology_hints_test.go b/pkg/kubelet/cm/cpumanager/topology_hints_test.go +index 53738b613c2..ad9c0f17602 100644 +--- a/pkg/kubelet/cm/cpumanager/topology_hints_test.go ++++ b/pkg/kubelet/cm/cpumanager/topology_hints_test.go +@@ -197,6 +197,7 @@ func TestPodGuaranteedCPUs(t *testing.T) { + expectedCPU: 210, + }, + } ++ varIsKubeInfra = fakeIsKubeInfraFalse + for _, tc := range tcases { + t.Run(tc.name, func(t *testing.T) { + requestedCPU := p.podGuaranteedCPUs(tc.pod) +@@ -241,6 +242,7 @@ func TestGetTopologyHints(t *testing.T) { + sourcesReady: &sourcesReadyStub{}, + } + ++ varIsKubeInfra = fakeIsKubeInfraFalse + hints := m.GetTopologyHints(&tc.pod, &tc.container)[string(v1.ResourceCPU)] + if len(tc.expectedHints) == 0 && len(hints) == 0 { + continue +@@ -294,6 +296,7 @@ func TestGetPodTopologyHints(t *testing.T) { + sourcesReady: &sourcesReadyStub{}, + } + ++ varIsKubeInfra = fakeIsKubeInfraFalse + podHints := m.GetPodTopologyHints(&tc.pod)[string(v1.ResourceCPU)] + if len(tc.expectedHints) == 0 && len(podHints) == 0 { + continue +@@ -477,6 +480,7 @@ func TestGetPodTopologyHintsWithPolicyOptions(t *testing.T) { + sourcesReady: &sourcesReadyStub{}, + } + ++ varIsKubeInfra = fakeIsKubeInfraFalse + podHints := m.GetPodTopologyHints(&testCase.pod)[string(v1.ResourceCPU)] + sort.SliceStable(podHints, func(i, j int) bool { + return podHints[i].LessThan(podHints[j]) +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch new file mode 100644 index 000000000..8bd860867 --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch @@ -0,0 +1,50 @@ +From 75eff86c66aa8fac96b7d102d339c1a50b0ad205 Mon Sep 17 00:00:00 2001 +From: Jim Gauld +Date: Fri, 11 Feb 2022 11:06:35 -0500 +Subject: [PATCH] kubelet: sort isolcpus allocation when SMT enabled + +The existing device manager code returns CPUs as devices in unsorted +order. This numerically sorts isolcpus allocations when SMT/HT is +enabled on the host. This logs SMT pairs, singletons, and algorithm +order details to make the algorithm understandable. + +Signed-off-by: Jim Gauld +--- + pkg/kubelet/cm/devicemanager/manager.go | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go +index 8a488a9292f..6a6e1195bcf 100644 +--- a/pkg/kubelet/cm/devicemanager/manager.go ++++ b/pkg/kubelet/cm/devicemanager/manager.go +@@ -568,7 +568,16 @@ func order_devices_by_sibling(devices sets.Set[string], needed int) ([]string, e + return cpu_lst[0] + } + } ++ //Make post-analysis of selection algorithm obvious by numerical sorting ++ //the available isolated cpu_id. ++ cpu_ids := make([]int, 0, int(devices.Len())) + for cpu_id := range devices { ++ cpu_id_, _ := strconv.Atoi(cpu_id) ++ cpu_ids = append(cpu_ids, cpu_id_) ++ } ++ sort.Ints(cpu_ids) ++ for _, _cpu_id := range cpu_ids { ++ cpu_id := strconv.Itoa(_cpu_id) + // If we've already found cpu_id as a sibling, skip it. + if _, ok := _iterated_cpu[cpu_id]; ok { + continue +@@ -610,7 +619,9 @@ func order_devices_by_sibling(devices sets.Set[string], needed int) ([]string, e + } + } + } +- //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst) ++ //This algorithm will get some attention. 
Show minimal details. ++ klog.Infof("order_devices_by_sibling: needed=%d, smtpairs=%v, singletons=%v, order=%v", ++ needed, sibling_lst, single_lst, dev_lst) + return dev_lst, nil + } + func smt_enabled() bool { +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch new file mode 100644 index 000000000..28ef5386d --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch @@ -0,0 +1,150 @@ +From 67797d74a1f57e0983ca9457ac8954406bf8b183 Mon Sep 17 00:00:00 2001 +From: Saba Touheed Mujawar +Date: Wed, 8 Jan 2025 08:27:30 -0500 +Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware + +Enhance isolcpus support in Kubernetes to allocate isolated SMT +siblings to the same container when SMT/HT is enabled on the host. + +As it stands, the device manager code in Kubernetes is not SMT-aware +(since normally it doesn't deal with CPUs). However, StarlingX +exposes isolated CPUs as devices and if possible we want to allocate +all SMT siblings from a CPU core to the same container in order to +minimize cross- container interference due to resource contention +within the CPU core. + +The solution is basically to take the list of isolated CPUs and +re-order it so that the SMT siblings are next to each other. That +way the existing resource selection code will allocate the siblings +together. As an optimization, if it is known that an odd number +of isolated CPUs are desired, a singleton SMT sibling will be +inserted into the list to avoid breaking up sibling pairs. + +Signed-off-by: Tao Wang +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Boovan Rajendran +Signed-off-by: Saba Touheed Mujawar +Signed-off-by: Boovan Rajendran +Signed-off-by: Saba Touheed Mujawar +--- + pkg/kubelet/cm/devicemanager/manager.go | 83 ++++++++++++++++++++++++- + 1 file changed, 82 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go +index f1d04e97179..8a488a9292f 100644 +--- a/pkg/kubelet/cm/devicemanager/manager.go ++++ b/pkg/kubelet/cm/devicemanager/manager.go +@@ -23,6 +23,8 @@ import ( + "path/filepath" + "runtime" + "sort" ++ "strconv" ++ "strings" + "sync" + "time" + +@@ -46,6 +48,7 @@ import ( + "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache" + "k8s.io/kubernetes/pkg/kubelet/types" + schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" ++ "k8s.io/utils/cpuset" + ) + + const nodeWithoutTopology = -1 +@@ -550,6 +553,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() { + m.allocatedDevices = m.podDevices.devices() + } + ++// Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed', ++// return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list ++// contain as many hyperthread sibling pairs as possible. ++func order_devices_by_sibling(devices sets.Set[string], needed int) ([]string, error) { ++ var dev_lst []string ++ var single_lst []string ++ sibling_lst := make([]string, 0, int(devices.Len())) ++ _iterated_cpu := make(map[string]string) ++ get_sibling := func(cpu string, cpu_lst []string) string { ++ if cpu_lst[0] == cpu { ++ return cpu_lst[1] ++ } else { ++ return cpu_lst[0] ++ } ++ } ++ for cpu_id := range devices { ++ // If we've already found cpu_id as a sibling, skip it. 
++ if _, ok := _iterated_cpu[cpu_id]; ok { ++ continue ++ } ++ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id) ++ dat, err := os.ReadFile(devPath) ++ if err != nil { ++ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id) ++ } ++ cpustring := strings.TrimSuffix(string(dat), "\n") ++ cpu_pair_set, err := cpuset.Parse(cpustring) ++ if err != nil { ++ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring) ++ } ++ var cpu_pair_lst []string ++ for _, v := range cpu_pair_set.List() { ++ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v)) ++ } ++ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst) ++ if _, ok := devices[sibling_cpu_id]; ok { ++ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id) ++ _iterated_cpu[sibling_cpu_id] = "" ++ } else { ++ single_lst = append(single_lst, cpu_id) ++ } ++ _iterated_cpu[cpu_id] = "" ++ } ++ if needed%2 == 0 { ++ dev_lst = append(sibling_lst, single_lst...) ++ } else { ++ if len(single_lst) > 1 { ++ _tmp_list := append(sibling_lst, single_lst[1:]...) ++ dev_lst = append(single_lst[0:1], _tmp_list...) ++ } else { ++ if len(single_lst) == 0 { ++ dev_lst = sibling_lst ++ } else { ++ dev_lst = append(single_lst, sibling_lst...) ++ } ++ } ++ } ++ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst) ++ return dev_lst, nil ++} ++func smt_enabled() bool { ++ dat, _ := os.ReadFile("/sys/devices/system/cpu/smt/active") ++ state := strings.TrimSuffix(string(dat), "\n") ++ if state == "0" { ++ return false ++ } ++ return true ++} ++ + // Returns list of device Ids we need to allocate with Allocate rpc call. + // Returns empty list in case we don't need to issue the Allocate rpc call. + func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.Set[string]) (sets.Set[string], error) { +@@ -630,7 +702,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi + if m.allocatedDevices[resource] == nil { + m.allocatedDevices[resource] = sets.New[string]() + } +- for device := range devices.Difference(allocated) { ++ availableDevices := sets.List[string](devices.Difference(allocated)) ++ // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together. 
++ if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() { ++ var err error ++ availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed) ++ if err != nil { ++ klog.Errorf("error in order_devices_by_sibling: %v", err) ++ } ++ } ++ for _, device := range availableDevices { + m.allocatedDevices[resource].Insert(device) + allocated.Insert(device) + needed-- +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/series b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/series new file mode 100644 index 000000000..ea0ceb8e8 --- /dev/null +++ b/kubernetes/kubernetes-1.30.6/debian/deb_folder/patches/series @@ -0,0 +1,10 @@ +kubeadm-create-platform-pods-with-zero-CPU-resources.patch +kubernetes-make-isolcpus-allocation-SMT-aware.patch +kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch +kubelet-cpumanager-disable-CFS-quota-throttling.patch +kubelet-cpumanager-keep-normal-containers-off-reserv.patch +kubelet-cpumanager-platform-pods-on-reserved-cpus.patch +kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch +kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch +kubeadm-reduce-UpgradeManifestTimeout.patch +kubeadm-readiness-probe-timeout-core-dns.patch
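
As a closing illustration of the sibling-ordering technique used by kubernetes-make-isolcpus-allocation-SMT-aware.patch and kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch above, the following is a small, self-contained Go sketch. It makes simplifying assumptions: the SMT topology is supplied as an in-memory map instead of being read from /sys/devices/system/cpu/cpuN/topology/thread_siblings_list, and the identifier orderBySibling is illustrative rather than the patch's own order_devices_by_sibling signature (which works on sets of string device IDs).

```go
package main

import (
	"fmt"
	"sort"
)

// orderBySibling sketches the reordering idea from the isolcpus patches:
// walk the isolated CPUs in numeric order, group hyperthread siblings that
// are both isolated into pairs, keep the rest as singletons, and when an odd
// number of CPUs is requested, lead with a singleton so no pair gets split.
func orderBySibling(isolated map[int]bool, sibling map[int]int, needed int) []int {
	ids := make([]int, 0, len(isolated))
	for id := range isolated {
		ids = append(ids, id)
	}
	sort.Ints(ids) // deterministic, numerically sorted walk

	seen := make(map[int]bool)
	var pairs, singles []int
	for _, id := range ids {
		if seen[id] {
			continue
		}
		if sib := sibling[id]; isolated[sib] && !seen[sib] {
			pairs = append(pairs, id, sib)
			seen[sib] = true
		} else {
			singles = append(singles, id)
		}
		seen[id] = true
	}

	if needed%2 != 0 && len(singles) > 0 {
		// Odd request: spend a singleton first so sibling pairs stay intact.
		return append(append([]int{singles[0]}, pairs...), singles[1:]...)
	}
	return append(pairs, singles...)
}

func main() {
	// CPUs 1, 2 and 9 are isolated; (1,9) are SMT siblings, 2's sibling (10) is not isolated.
	isolated := map[int]bool{1: true, 2: true, 9: true}
	sibling := map[int]int{1: 9, 9: 1, 2: 10, 10: 2}

	fmt.Println(orderBySibling(isolated, sibling, 2)) // [1 9 2]: the pair is allocated together
	fmt.Println(orderBySibling(isolated, sibling, 3)) // [2 1 9]: singleton first, pair preserved
}
```

Running the sketch shows the two behaviours the commit messages call out: sibling pairs are kept adjacent so they tend to be allocated to the same container, and an odd-sized request is satisfied by spending a singleton first rather than breaking up a pair.
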