
This commit cherry-picks commits from the mainline kernel to improve Sapphire Rapids CPU support in the following components of the StarlingX kernel: intel_idle, perf/x86/RAPL and powercap, and perf/x86/cstate. (RAPL stands for "Running Average Power Limit", which is a CPU feature for measuring and limiting power consumption.) These improvements are required to support a new power metrics application in StarlingX, which is intended to work with Sapphire Rapids CPUs: https://opendev.org/starlingx/app-power-metrics The following commits are cherry-picked as part of this effort, in chronological order, organized by component: => intel_idle * commit 9edf3c0ffef0 ("intel_idle: add SPR support") (v5.18-rc1~203^2~3^3~5) * commit da0e58c038e6 ("intel_idle: add 'preferred_cstates' module argument") (v5.18-rc1~203^2~3^3~4) * commit 3a9cf77b60dc ("intel_idle: add core C6 optimization for SPR") (v5.18-rc1~203^2~3^3~3) * commit 39c184a6a9a7 ("intel_idle: Fix the 'preferred_cstates' module parameter") (v5.18-rc5~22^2^2~1) * commit 7eac3bd38d18 ("intel_idle: Fix SPR C6 optimization") (v5.18-rc5~22^2^2) * commit 1548fac47a11 ("intel_idle: make SPR C1 and C1E be independent") (v6.0-rc1~184^2~2^2^2) => perf/x86/rapl + powercap * commit ffb20c2e52e8 ("perf/x86/rapl: Add msr mask support") (v5.12-rc1~146^2~3) * commit b6f78d3fba7f ("perf/x86/rapl: Only check lower 32bits for RAPL energy counters") (v5.12-rc1~146^2~2) * commit 838342a6d6b7 ("perf/x86/rapl: Fix psys-energy event on Intel SPR platform") (v5.12-rc1~146^2~1) * commit 931da6a0de5d ("powercap: intel_rapl: support new layout of Psys PowerLimit Register on SPR") (v5.17-rc1~167^2^4^2~1) * commit 80275ca9e525 ("perf/x86/rapl: Use standard Energy Unit for SPR Dram RAPL domain") (v6.1-rc4~3^2~3) => perf/x86/cstate * commit 87bf399f86ec ("perf/x86/cstate: Add ICELAKE_X and ICELAKE_D support") (v5.14-rc1~7^2~1) * commit 528c9f1daf20 ("perf/x86/cstate: Add SAPPHIRERAPIDS_X CPU support") (v5.18-rc4~3^2) The set of commits listed above 
is a reduced version of a slightly larger superset of commits we had originally considered for cherry-picking. We opted for the commits listed above to limit potential impact on the StarlingX kernel by focusing on Sapphire Rapids support and direct dependencies only. We should note that we encountered a number of merge conflicts while cherry-picking these commits; however, none of the merge conflict resolutions required significantly altering the modifications made by the original commits. The individual patch files denote the nature of the merge conflicts. Verification: * The kernel recipes and all kernel modules were built from scratch with this commit, using the following command in a StarlingX build environment: $ build-pkgs -c -p linux,linux-rt,bnxt-en,i40e,i40e-cvl-2.54,\ i40e-cvl-4.10,iavf,iavf-cvl-2.54,iavf-cvl-4.10,ice,ice-cvl-2.54,\ ice-cvl-4.10,igb-uio,iqvlinux,kmod-opae-fpga-driver,mlnx-ofed-kernel,\ octeon-ep,qat1.7.l These packages were further packaged into a StarlingX (ostree) patch for easier deployment. * An Ansible-bootstrapped low-latency All-in-One simplex StarlingX set-up was prepared on a server with a Sapphire Rapids CPU. * The ostree patch was installed onto the server to start testing our changes. The kernel was confirmed to boot up as expected. * We enabled RAPL Psys domain reporting in the server's BIOS (originally disabled), and we also disabled the BIOS-enforced limit on the CPU *package* C-states (originally set to C0/C1). * We forcibly removed the "intel_idle.max_cstate=0" kernel command line argument by modifying the sysinv daemon's Python source code on the server (with a systemd service that bind-mounts a replacement *.py file, to avoid another ostree patch). This was required to prevent the intel_idle driver from disabling itself, so that we could confirm the sanity of the cherry-picked commits. 
* The following tests were carried out, first with the patched preempt-rt kernel, and next with the original unpatched preempt-rt kernel: * Confirm that the intel_idle CPU idling driver is active: $ cat /sys/devices/system/cpu/cpuidle/current_driver * Confirm the CPU idling state names and parameters: $ grep -s '^' \ /sys/devices/system/cpu/cpu0/cpuidle/state[0-9]*/\ {name,desc,time,latency,residency} * Confirm that the RAPL/powercap and C-state related performance monitor unit (PMU) counters are usable by the kernel and with perf: $ sudo perf list * Confirm that the CPU and package C-state residency counters are working: $ perf stat -a \ -e cstate_core/c1-residency/ -e cstate_core/c6-residency/ \ -e cstate_pkg/c2-residency/ -e cstate_pkg/c6-residency/ \ -- sleep 5 * Confirm that RAPL/powercap-related performance counters are working: $ perf stat -a \ -e power/energy-pkg/ -e power/energy-ram/ -e power/energy-psys/ \ -- sleep 5 With the unpatched kernel, we observed that the intel_idle driver used CPU idling information exposed by the ACPI tables, with the following idle state names: POLL, C1_ACPI, C2_ACPI. With the patched kernel the C-state tables embedded in the intel_idle driver were used as expected, with the following idle state names: POLL, C1, C1E, C6. With the unpatched kernel, we observed that the CPU/package C-state residency counters were not detected, whereas they were detected with the patched kernel, as expected. With both the unpatched and the patched kernels, the RAPL/powercap related performance counters were detected. We observed that the units for the DRAM domain were incorrect for the unpatched kernel, which was expected due to the lack of commit 80275ca9e525 ("perf/x86/rapl: Use standard Energy Unit for SPR Dram RAPL domain"). 
* To confirm the sanity of our results acquired with the patched kernel in the previous step, we also carried out the following experiment with the v6.4.3-rt6 kernel available in the linux-yocto repository as commit 917d160a84f6 ("Merge branch 'v6.4/standard/base' into v6.4/standard/preempt-rt/base") in the "v6.4/standard/preempt-rt/base" branch. The "notification of death" StarlingX kernel patch was forward-ported to the v6.4.3-rt6 kernel and the "kernel.sched_nr_migrate" sysctl was reintroduced to make this kernel work with the aforementioned Ansible-bootstrapped StarlingX system. Furthermore, to ensure that the RAPL/powercap features are aligned to the most recent mainline kernel version, we cherry-picked the following commits from v6.5-rc1 onto the v6.4.3-rt6 kernel: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/log/?qt=range&q=44c026a73be8..49776c712eb6 Afterwards, this v6.4.3-rt6-based test kernel was built and installed onto the test server, and test procedures discussed in the previous step were repeated. Compared to the patched StarlingX v5.10 kernel, we observed that the RAPL/powercap measurements were similar, and the CPU and package C-state residency counters were not extremely different with the v6.4.3-rt6-based test kernel. * We should note that we have repeated tests with the patched StarlingX v5.10 kernel as well, but we did not reinstall the system to acquire a standard/non-low-latency set-up. Instead, we opted for running the following command, rebooting the system into the standard kernel, followed by repeating the test procedures, which had similar results. sudo grub-editenv /boot/1/kernel.env set kernel=vmlinuz-5.10.0-6-amd64 Acknowledgements: * Thanks to Alyson Deives Pereira for his extensive help in pruning the commits that we had originally thought of cherry-picking with this commit. * Thanks to Mark Asselstine for his advice on the second phase of the commit pruning activity. 
Story: 2010773 Task: 48449 Change-Id: Ibe6bff65e8a415ac027a5d493a0e65fe58c9e344 Signed-off-by: M. Vefa Bicakci <vefa.bicakci@windriver.com>
153 lines
5.3 KiB
Diff
153 lines
5.3 KiB
Diff
From ae6725d25cfe2fa7a45be90a06a953f9ebbad8e9 Mon Sep 17 00:00:00 2001
|
|
From: Zhang Rui <rui.zhang@intel.com>
|
|
Date: Tue, 7 Dec 2021 21:17:34 +0800
|
|
Subject: [PATCH] powercap: intel_rapl: support new layout of Psys PowerLimit
|
|
Register on SPR
|
|
|
|
On Sapphire Rapids, the layout of the Psys domain Power Limit Register
|
|
is different from what it was before.
|
|
|
|
Enhance the code to support the new Psys PL register layout.
|
|
|
|
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
|
|
Reported-and-tested-by: Alkattan Dana <dana.alkattan@intel.com>
|
|
[ rjw: Subject and changelog edits ]
|
|
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
(cherry picked from commit 931da6a0de5d620425af4425344259e6ff46b654)
|
|
Signed-off-by: M. Vefa Bicakci <vefa.bicakci@windriver.com>
|
|
---
|
|
drivers/powercap/intel_rapl_common.c | 61 +++++++++++++++++++++++++++-
|
|
include/linux/intel_rapl.h | 6 +++
|
|
2 files changed, 65 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
|
|
index 285420c1eb7c..da90c7e52122 100644
|
|
--- a/drivers/powercap/intel_rapl_common.c
|
|
+++ b/drivers/powercap/intel_rapl_common.c
|
|
@@ -61,6 +61,20 @@
|
|
#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
|
|
#define PP_POLICY_MASK 0x1F
|
|
|
|
+/*
|
|
+ * SPR has different layout for Psys Domain PowerLimit registers.
|
|
+ * There are 17 bits of PL1 and PL2 instead of 15 bits.
|
|
+ * The Enable bits and TimeWindow bits are also shifted as a result.
|
|
+ */
|
|
+#define PSYS_POWER_LIMIT1_MASK 0x1FFFF
|
|
+#define PSYS_POWER_LIMIT1_ENABLE BIT(17)
|
|
+
|
|
+#define PSYS_POWER_LIMIT2_MASK (0x1FFFFULL<<32)
|
|
+#define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49)
|
|
+
|
|
+#define PSYS_TIME_WINDOW1_MASK (0x7FULL<<19)
|
|
+#define PSYS_TIME_WINDOW2_MASK (0x7FULL<<51)
|
|
+
|
|
/* Non HW constants */
|
|
#define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */
|
|
#define RAPL_PRIMITIVE_DUMMY BIT(2)
|
|
@@ -97,6 +111,7 @@ struct rapl_defaults {
|
|
bool to_raw);
|
|
unsigned int dram_domain_energy_unit;
|
|
unsigned int psys_domain_energy_unit;
|
|
+ bool spr_psys_bits;
|
|
};
|
|
static struct rapl_defaults *rapl_defaults;
|
|
|
|
@@ -669,12 +684,51 @@ static struct rapl_primitive_info rpi[] = {
|
|
RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
|
|
PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
|
|
RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
|
|
+ PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0,
|
|
+ RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
|
|
+ PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32,
|
|
+ RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
|
|
+ PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17,
|
|
+ RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
|
|
+ PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49,
|
|
+ RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
|
|
+ PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19,
|
|
+ RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
|
|
+ PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51,
|
|
+ RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
|
|
/* non-hardware */
|
|
PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
|
|
RAPL_PRIMITIVE_DERIVED),
|
|
{NULL, 0, 0, 0},
|
|
};
|
|
|
|
+static enum rapl_primitives
|
|
+prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim)
|
|
+{
|
|
+ if (!rapl_defaults->spr_psys_bits)
|
|
+ return prim;
|
|
+
|
|
+ if (rd->id != RAPL_DOMAIN_PLATFORM)
|
|
+ return prim;
|
|
+
|
|
+ switch (prim) {
|
|
+ case POWER_LIMIT1:
|
|
+ return PSYS_POWER_LIMIT1;
|
|
+ case POWER_LIMIT2:
|
|
+ return PSYS_POWER_LIMIT2;
|
|
+ case PL1_ENABLE:
|
|
+ return PSYS_PL1_ENABLE;
|
|
+ case PL2_ENABLE:
|
|
+ return PSYS_PL2_ENABLE;
|
|
+ case TIME_WINDOW1:
|
|
+ return PSYS_TIME_WINDOW1;
|
|
+ case TIME_WINDOW2:
|
|
+ return PSYS_TIME_WINDOW2;
|
|
+ default:
|
|
+ return prim;
|
|
+ }
|
|
+}
|
|
+
|
|
/* Read primitive data based on its related struct rapl_primitive_info.
|
|
* if xlate flag is set, return translated data based on data units, i.e.
|
|
* time, energy, and power.
|
|
@@ -692,7 +746,8 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
|
|
enum rapl_primitives prim, bool xlate, u64 *data)
|
|
{
|
|
u64 value;
|
|
- struct rapl_primitive_info *rp = &rpi[prim];
|
|
+ enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
|
|
+ struct rapl_primitive_info *rp = &rpi[prim_fixed];
|
|
struct reg_action ra;
|
|
int cpu;
|
|
|
|
@@ -738,7 +793,8 @@ static int rapl_write_data_raw(struct rapl_domain *rd,
|
|
enum rapl_primitives prim,
|
|
unsigned long long value)
|
|
{
|
|
- struct rapl_primitive_info *rp = &rpi[prim];
|
|
+ enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
|
|
+ struct rapl_primitive_info *rp = &rpi[prim_fixed];
|
|
int cpu;
|
|
u64 bits;
|
|
struct reg_action ra;
|
|
@@ -983,6 +1039,7 @@ static const struct rapl_defaults rapl_defaults_spr_server = {
|
|
.set_floor_freq = set_floor_freq_default,
|
|
.compute_time_window = rapl_compute_time_window_core,
|
|
.psys_domain_energy_unit = 1000000000,
|
|
+ .spr_psys_bits = true,
|
|
};
|
|
|
|
static const struct rapl_defaults rapl_defaults_byt = {
|
|
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
|
|
index 50b8398ffd21..acf72c018142 100644
|
|
--- a/include/linux/intel_rapl.h
|
|
+++ b/include/linux/intel_rapl.h
|
|
@@ -58,6 +58,12 @@ enum rapl_primitives {
|
|
THROTTLED_TIME,
|
|
PRIORITY_LEVEL,
|
|
|
|
+ PSYS_POWER_LIMIT1,
|
|
+ PSYS_POWER_LIMIT2,
|
|
+ PSYS_PL1_ENABLE,
|
|
+ PSYS_PL2_ENABLE,
|
|
+ PSYS_TIME_WINDOW1,
|
|
+ PSYS_TIME_WINDOW2,
|
|
/* below are not raw primitive data */
|
|
AVERAGE_POWER,
|
|
NR_RAPL_PRIMITIVES,
|