
When the system is stressed running pods on isolated cores (using
stress-ng [1], for instance) and the Power Metrics App [2] is also
being executed, the system hangs.

[1] https://github.com/ColinIanKing/stress-ng
[2] https://opendev.org/starlingx/app-power-metrics

Dmesg shows the following output:

WARNING: CPU: 16 PID: 207561 at kernel/events/core.c:868 perf_cgroup_switch+0x222/0x230
RIP: 0010:perf_cgroup_switch+0x222/0x230
Call Trace:
 ? __warn+0x79/0xc0
 ? perf_cgroup_switch+0x222/0x230
 ? report_bug+0x9e/0xc0
 ? handle_bug+0x41/0x90
 ? exc_invalid_op+0x14/0x70
 ? asm_exc_invalid_op+0x12/0x20
 ? perf_cgroup_switch+0x222/0x230
 ? perf_cgroup_switch+0xff/0x230
 __perf_event_task_sched_in+0x169/0x330
 ? __perf_event_task_sched_out+0x27c/0x6d0
 ? newidle_balance+0x3fd/0x480
 finish_task_switch.isra.0+0x118/0x4b0
 __schedule+0x2ae/0x930
 ? hrtimer_start_range_ns+0x2fc/0x420
 schedule+0xa7/0x110
 do_nanosleep+0x7c/0x1a0
 hrtimer_nanosleep+0x9b/0x140
 ? __hrtimer_init+0xe0/0xe0
 __x64_sys_nanosleep+0xad/0xe0
 do_syscall_64+0x30/0x40
 entry_SYSCALL_64_after_hwframe+0x61/0xc6

There is an upstream patch set that fixes a race condition in
perf_cgroup_switch(). Applying these patches to the stx kernel solved
the issue:

* commit a0827713e298 ("perf/core: Don't pass task around when ctx
  sched in") (v5.18-rc2~8^2~3)
* commit 6875186aea5c ("perf/core: Use perf_cgroup_info->active to
  check if cgroup is active") (v5.18-rc2~8^2~2)
* commit 96492a6c558a ("perf/core: Fix perf_cgroup_switch()")
  (v5.18-rc2~8^2~1)
* commit e19cd0b6fa59 ("perf/core: Always set cpuctx cgrp when enable
  cgroup event") (v5.18-rc2~8^2)

Note: it was verified that there are no "Fixes:" commits in the
mainline kernel pointing at the commits mentioned above.

Test plan:
PASS: Build ISO successfully for rt and std.
PASS: Install successfully onto an AIO-SX lab with both rt and std
      kernels.
PASS: Apply the power-metrics app, launch stress pods and confirm the
      system is stable.

Closes-Bug: 2035124
Change-Id: I30fcb63e4564a23cdb26794f4dfefa748eaa0cee
Signed-off-by: Alyson Deives Pereira <alyson.deivespereira@windriver.com>
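For context on the warning site: kernel/events/core.c:868 in the affected
kernel is the WARN_ON_ONCE(cpuctx->cgrp) check that appears as hunk context
in the cherry-picked patch below. The following is only a simplified sketch
of that pre-fix switch-in branch, pieced together from those context lines;
the PERF_CGROUP_SWIN mode flag is assumed from the pre-5.18 upstream code
and is not part of this change, and the real function has more around it:

	/*
	 * Simplified sketch of the pre-fix perf_cgroup_switch() sched-in
	 * branch.  PERF_CGROUP_SWIN is assumed from the pre-5.18 code; the
	 * other identifiers all appear in the @@ -868 hunk below.
	 */
	if (mode & PERF_CGROUP_SWIN) {
		/*
		 * kernel/events/core.c:868 - fires when cpuctx->cgrp is
		 * already set here, i.e. the race the upstream series
		 * addresses.
		 */
		WARN_ON_ONCE(cpuctx->cgrp);
		/* set cgrp before ctxsw in, so ctx sched-in sees it */
		cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
		cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
	}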
From 840bf045f2488bf182c67a8791ddc1a176766fdb Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Tue, 29 Mar 2022 23:45:20 +0800
Subject: [PATCH 71/74] perf/core: Don't pass task around when ctx sched in

The current code pass task around for ctx_sched_in(), only
to get perf_cgroup of the task, then update the timestamp
of it and its ancestors and set them to active.

But we can use cpuctx->cgrp to get active perf_cgroup and
its ancestors since cpuctx->cgrp has been set before
ctx_sched_in().

This patch remove the task argument in ctx_sched_in()
and cleanup related code.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220329154523.86438-2-zhouchengming@bytedance.com
(cherry picked from commit a0827713e298d021d3c79ae7423aea408f3f7c3a)
Signed-off-by: Alyson Deives Pereira <alyson.deivespereira@windriver.com>
---
 kernel/events/core.c | 58 ++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 32 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7e66e6eae545..a8b758ec7be0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -568,8 +568,7 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			      enum event_type_t event_type);
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type,
-			     struct task_struct *task);
+			     enum event_type_t event_type);
 
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
@@ -800,10 +799,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 }
 
 static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
-			  struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 {
-	struct perf_cgroup *cgrp;
+	struct perf_event_context *ctx = &cpuctx->ctx;
+	struct perf_cgroup *cgrp = cpuctx->cgrp;
 	struct perf_cgroup_info *info;
 	struct cgroup_subsys_state *css;
 
@@ -812,10 +811,10 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	 * ensure we do not access cgroup data
 	 * unless we have the cgroup pinned (css_get)
 	 */
-	if (!task || !ctx->nr_cgroups)
+	if (!cgrp)
 		return;
 
-	cgrp = perf_cgroup_from_task(task, ctx);
+	WARN_ON_ONCE(!ctx->nr_cgroups);
 
 	for (css = &cgrp->css; css; css = css->parent) {
 		cgrp = container_of(css, struct perf_cgroup, css);
@@ -868,14 +867,14 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 			WARN_ON_ONCE(cpuctx->cgrp);
 			/*
 			 * set cgrp before ctxsw in to allow
-			 * event_filter_match() to not have to pass
-			 * task around
+			 * perf_cgroup_set_timestamp() in ctx_sched_in()
+			 * to not have to pass task around
 			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
 			 * because cgorup events are only per-cpu
 			 */
 			cpuctx->cgrp = perf_cgroup_from_task(task,
 							     &cpuctx->ctx);
-			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL);
 		}
 		perf_pmu_enable(cpuctx->ctx.pmu);
 		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -1117,8 +1116,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 }
 
 static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
-			  struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 {
 }
 
@@ -2698,8 +2696,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type,
-	     struct task_struct *task);
+	     enum event_type_t event_type);
 
 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			       struct perf_event_context *ctx,
@@ -2715,15 +2712,14 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
-				struct perf_event_context *ctx,
-				struct task_struct *task)
+				struct perf_event_context *ctx)
 {
-	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
 	if (ctx)
-		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+		ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 }
 
 /*
@@ -2773,7 +2769,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 	else if (ctx_event_type & EVENT_PINNED)
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	perf_event_sched_in(cpuctx, task_ctx, current);
+	perf_event_sched_in(cpuctx, task_ctx);
 	perf_pmu_enable(cpuctx->ctx.pmu);
 }
 
@@ -2993,7 +2989,7 @@ static void __perf_event_enable(struct perf_event *event,
 		return;
 
 	if (!event_filter_match(event)) {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
 		return;
 	}
 
@@ -3002,7 +2998,7 @@ static void __perf_event_enable(struct perf_event *event,
 	 * then don't put it on unless the group is on.
 	 */
 	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
 		return;
 	}
 
@@ -3811,8 +3807,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type,
-	     struct task_struct *task)
+	     enum event_type_t event_type)
 {
 	int is_active = ctx->is_active;
 
@@ -3824,7 +3819,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 	if (!(is_active & EVENT_TIME)) {
 		/* start ctx time */
 		__update_context_time(ctx, false);
-		perf_cgroup_set_timestamp(task, ctx);
+		perf_cgroup_set_timestamp(cpuctx);
 		/*
 		 * CPU-release for the below ->is_active store,
 		 * see __load_acquire() in perf_event_time_now()
@@ -3855,12 +3850,11 @@ ctx_sched_in(struct perf_event_context *ctx,
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type,
-			     struct task_struct *task)
+			     enum event_type_t event_type)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type, task);
+	ctx_sched_in(ctx, cpuctx, event_type);
 }
 
 static void perf_event_context_sched_in(struct perf_event_context *ctx,
@@ -3895,7 +3889,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-	perf_event_sched_in(cpuctx, ctx, task);
+	perf_event_sched_in(cpuctx, ctx);
 
 	if (cpuctx->sched_cb_usage && pmu->sched_task)
 		pmu->sched_task(cpuctx->task_ctx, true);
@@ -4206,7 +4200,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 	if (cpu_event)
 		rotate_ctx(&cpuctx->ctx, cpu_event);
 
-	perf_event_sched_in(cpuctx, task_ctx, current);
+	perf_event_sched_in(cpuctx, task_ctx);
 
 	perf_pmu_enable(cpuctx->ctx.pmu);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4278,7 +4272,7 @@ static void perf_event_enable_on_exec(int ctxn)
 		clone_ctx = unclone_ctx(ctx);
 		ctx_resched(cpuctx, ctx, event_type);
 	} else {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
 	}
 	perf_ctx_unlock(cpuctx, ctx);
 
-- 
2.25.1