Jim Gauld 0232b8b9dc Update collectd cpu plugin and monitor-tools to diagnose cpu spikes
The collectd cpu plugin and monitor-tools are updated to
support diagnosing high cpu usage on shorter time scale.
This includes tools that assist SystemEngineering determine
the source where CPU time is coming from.

This collectd cpu plugin is updated to support Kubernetes services
under system.slice or k8splatform.slice.

This changes the frequency of read function sampling to 1 second.
We now see logs with instantaneous cpu spikes at the cgroup level.
This dispatch of results still occurs at the original plugin
interval of 30 seconds.  The logging of the 1 second sampling is
configurable via /etc/collectd.d/starlingx/python_plugins.conf
field 'hires = <true|false>'. The high-resolution samples are always
collected and used for a histogram, but it is not always desired
to log this due to the volume of output.

This adds new logs for occupancy wait. This is similar to cpu
occupancy, but instead of realtime used, it measures the aggregate
percent of time a given cgroup is waiting to schedule. This is a
measure of CPU contention.

This adds new logs for occupancy histograms for all cgroups and
aggregated groupings based on the 1 second occupancy samples.
The histograms are displayed in hirunner order. This displays
the histogram, the mean, 95th-percentile, and max value.
The histograms are logged at 5 minute intervals.

This reduces collectd cgroup to 256 CPUShare from (1024).
This smoothes out behaviour of poorly behaved audits.

The 'schedtop' tool is updated to display 'cgroup' field. This
is the systemd cgroup name, or abbreviated pod-name. This also
handles Kernel sched output format changes for 6.6.

New tool 'portscanner' is added to monitor-tools to diagnose
local host processes that are using specific ports. This has been
instrumental in discovering gunicorn/keystone API users.

New tool 'k8smetrics' is added to monitor-tools to display
the delay histogram and percentiles for kube-apiserver and
etcdserver. This gives a way to quantify performance as
a result of system load.

Partial-Bug: 2084714

TEST PLAN:
AIO-SX, AIO-DX, Standard, Storage, DC:
PASS: Fresh install ISO
PASS: Verify /var/log/collectd.logs for 1 second cpu/wait logs,
      and contains: etcd, kubelet, and containerd services.
PASS: Verify we are dispatching at 30 second granularity.
PASS: Verify we are displaying histograms every 5 minutes.
PASS: Verify we can enable/disable the display of hiresolution
      logs with /etc/collectd.d/starlingx/python_plugins.conf
      field 'hires = <true|false>'.
PASS: Verify schedtop contains 'cgroup' output.
PASS: Verify output from 'k8smetrics'.
      Cross check against Prometheus GUI for apiserver percentile.
PASS: Verify output from portscanner with port 5000.
      Verify 1-to-1 mapping against /var/log/keystone/keystone-all.log.

Change-Id: I82d4f414afdf1cecbcc99680b360cbad702ba140
Signed-off-by: Jim Gauld <James.Gauld@windriver.com>
2024-11-15 02:11:55 -05:00

1728 lines
56 KiB
Perl
Executable File

#!/usr/bin/perl
########################################################################
#
# Copyright (c) 2015-2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
########################################################################
#
# Description:
# This displays occupancy and scheduling information per sample period.
# Output includes total occupancy, per-core occupancy, loadavg, per-task cpu,
# per-task scheduling, per-task io-wait.
#
# Usage: schedtop OPTIONS
# [--delay=<seconds>] [--repeat=<num>] [--period=<seconds>]
# [--reset-hwm] [--idle] [--sort=<cpu|io>]
# [--watch-cmd=tid1,cmd1,cmd2,...] [--watch-cgroup=cgroup1,...]
# [--watch-only] [--watch-quiet]
# [--trig-delay=time]
# [--help]
use strict;
use warnings;
use Data::Dumper;
use POSIX qw(uname strftime);
use Time::HiRes qw(clock_gettime usleep CLOCK_MONOTONIC CLOCK_REALTIME);
use Benchmark ':hireswallclock';
use Carp qw(croak carp);
use Math::BigInt;
use File::Find ();
# Define toolname
our $TOOLNAME = "schedtop";
our $VERSION = "0.1";
# Constants
# SI_* are decimal (power-of-1000) scale factors; Ki/Mi/Gi are binary
# (power-of-1024) scale factors. They are used by the value formatters and
# for unit conversions in the main program.
use constant SI_k => 1.0E3;
use constant SI_M => 1.0E6;
use constant SI_G => 1.0E9;
use constant Ki => 1024.0;
use constant Mi => 1024.0*1024.0;
use constant Gi => 1024.0*1024.0*1024.0;
# Globals
# These are package variables (not lexicals): the main program refers to them
# as $::name / %::name, and subroutines alias caller data onto package
# variables through typeglobs (e.g. "local *::tids").
# %opt_V: availability flags ('sched', 'io') for optional per-task data.
# %opt_P: print level ($P_none..$P_full) per output column group.
our %opt_V = ();
our %opt_P = ();
# Suffix _0 is the previous sample, _1 the current sample.
# %percpu_*: per-cpu runtime counters; %task_*: per-tid statistics;
# %tids_*: tid -> pid map; %tids_w: tid -> pid map of watched threads.
our %percpu_0 = ();
our %percpu_1 = ();
our %task_0 = ();
our %task_1 = ();
our %tids_0 = ();
our %tids_1 = ();
our %tids_w = ();
# Per-interval results computed from the two samples above.
our %D_task = ();
our %D_percpu = ();
our %loadavg = ();
# Timestamps: tm_* are CLOCK_MONOTONIC readings, tr_* are CLOCK_REALTIME
# readings; *_elapsed is the difference between the current and previous
# reading, and tm_final is the monotonic time at which sampling should stop.
our $tm_0 = ();
our $tm_1 = ();
our $tr_0 = ();
our $tr_1 = ();
our $tm_elapsed = ();
our $tr_elapsed = ();
our $tm_final = ();
our $uptime = ();
# Number of logical cpus, a bit mask with one bit set per logical cpu, and
# the display width of the affinity column (derived from the mask).
our $num_cpus = 1;
our $affinity_mask = Math::BigInt->new('0');
our $w_aff = 10;
# System-wide counters refreshed every sample; $num_state_D counts sampled
# tasks whose state is 'D'.
our $num_tasks = 0;
our $num_blk = 0;
our $num_state_D = 0;
# 1 when /proc/schedstat exists (per-cpu stats are read from it), else 0.
our $is_schedstat = 1;
our $USER_HZ = 100; # no easy way to get this
# Nanoseconds per USER_HZ tick.
our $CLOCK_NS = SI_G / $USER_HZ;
# Passed to the header printer; NOTE(review): presumably controls whether the
# host line is printed - confirm against schedtop_header.
our $print_host = 1;
# cgroup.procs bookkeeping: every path found, the subset matching the
# --watch-cgroup patterns, and the ids read from the matching files.
our @cgroup_procs_paths = ();
our @cgroup_procs_match = ();
our @cgroup_tids = ();
# Print options
# Print levels, from no output to full detail.
our ($P_none, $P_lite, $P_brief, $P_full) = (0, 1, 2, 3);
# Output column groups; used as keys of %opt_P.
our ($P_ps, $P_cpu, $P_del, $P_io, $P_id, $P_cmd) = (0, 1, 2, 3, 4, 5);
our @P_list = ($::P_ps, $::P_cpu, $::P_del, $::P_io, $::P_id, $::P_cmd);
# Argument list parameters
# Filled in by parse_schedtop_args from the command line.
our ($arg_debug,
$arg_delay,
$arg_repeat,
$arg_period,
$arg_reset_hwm,
$arg_idle,
$arg_sort,
$arg_print,
@arg_watch_cmd,
@arg_watch_cgroup,
$arg_watch_only,
$arg_watch_quiet,
$arg_trig_delay,
) = ();
#-------------------------------------------------------------------------------
# MAIN Program
#-------------------------------------------------------------------------------
# Start-up phase: parse the command line, derive print/sort options, check
# privileges, then take the initial ("_1") sample that the first pass of the
# main loop uses as its baseline.
# NOTE: the file-scoped lexicals declared here ($MIN_DELAY, @delta_list,
# $s_key1, ...) are read by the main loop below; their names must not change.
my $ONE_BILLION = 1.0E9;
my $MIN_DELAY = 0.001;   # shortest sleep between samples (seconds)
my $MAX_DELAY = 0.001;   # longest sleep (seconds); recomputed from --delay below

# benchmark variables
my ($bd, $b0, $b1);

# Short labels indexed by the numeric scheduling policy of a task.
my @policies = ('OT', 'FF', 'RR', 'BA', 'ID', 'UN', 'UN');

# Per-task counters reported as the difference between two samples.
my @delta_list = (
    'nr_switches',
    'nr_migrations',
    'exec_runtime',
    'wait_sum',
    'wait_count',
    'iowait_sum',
    'iowait_count',
    'syscr',
    'syscw',
    'read_bytes',
    'write_bytes',
    'cancelled_write_bytes',
);

# Per-task fields reported as-is from the current sample.
my @state_list = (
    'exec_max', 'wait_max', 'block_max',
    'pid', 'ppid', 'state', 'cgroup', 'comm', 'cmdline', 'wchan', 'affinity',
    'VmSize', 'VmRSS', 'start_time',
    'nice', 'policy', 'priority', 'rt_priority', 'task_cpu'
);

# Autoflush output
select(STDERR);
$| = 1;
select(STDOUT); # default
$| = 1;

# Parse input arguments and print tool usage if necessary
&parse_schedtop_args(
    \$::arg_debug,
    \$::arg_delay,
    \$::arg_repeat,
    \$::arg_period,
    \$::arg_reset_hwm,
    \$::arg_idle,
    \$::arg_sort,
    \$::arg_print,
    \@::arg_watch_cmd,
    \@::arg_watch_cgroup,
    \$::arg_watch_only,
    \$::arg_watch_quiet,
    \$::arg_trig_delay,
);

# Set default print options
if ($::arg_print eq 'full') {
    for my $P (@::P_list) { $::opt_P{$P} = $::P_full; }
} elsif ($::arg_print eq 'brief') {
    for my $P (@::P_list) { $::opt_P{$P} = $::P_brief; }
} else {
    for my $P (@::P_list) { $::opt_P{$P} = $::P_none; }
}

# Disable some options if data not present
$::opt_V{'sched'} = &is_sched();
$::opt_V{'io'} = &is_io();
if ($::opt_V{'sched'} == 0) {
    $::opt_P{$::P_cpu} = $::P_none;
    $::opt_P{$::P_del} = $::P_none;
    $::opt_P{$::P_io} = $::P_none;
    undef $::arg_reset_hwm;
}
if ($::opt_V{'io'} == 0) {
    if ($::opt_V{'sched'} == 0) {
        $::opt_P{$::P_io} = $::P_none;
        $::arg_sort = 'cpu';
    } else {
        # Only the scheduler-derived io-wait columns can be shown.
        if ($::opt_P{$::P_io} != $::P_none) {
            $::opt_P{$::P_io} = $::P_lite;
        }
    }
}

# Check for root user
if ($>) {
    warn "$::TOOLNAME: requires root/sudo.\n";
    exit 1;
}

# Print out some debugging information
if (defined $::arg_debug) {
    $Data::Dumper::Indent = 1;
}

# Check for schedstat support; fallback to stats
$is_schedstat = (-e '/proc/schedstat') ? 1 : 0;

# Print out selected options
printf "selected options: ".
    "delay = %.3fs, repeat = %d, idle=%s, hwm=%s, sort=%s, print=%s\n",
    $::arg_delay, $::arg_repeat,
    (defined $::arg_idle ? 'idle_tasks' : 'no_idle_tasks'),
    (defined $::arg_reset_hwm ? 'reset-hwm' : 'unchanged'),
    $::arg_sort, $::arg_print;
# The banner reports both watch lists, so show it when either one is set.
if (@::arg_watch_cmd || @::arg_watch_cgroup) {
    printf "selected watch/trigger options: ".
        "watch-cmd=%s, watch-cgroup=%s, only=%s, quiet=%s, delay=%d ms\n",
        join(',', @::arg_watch_cmd),
        join(',', @::arg_watch_cgroup),
        (defined $::arg_watch_only ? 'true' : 'false'),
        (defined $::arg_watch_quiet ? 'true' : 'false'),
        $::arg_trig_delay;
}

# Capture timestamp
$b0 = Benchmark->new;

# Get number of logical cpus
&get_num_logical_cpus(\$::num_cpus);

# Build a mask with one bit set per logical cpu.
$::affinity_mask = Math::BigInt->new('0');
for (my $i=0; $i < $::num_cpus; $i++) {
    my $y = Math::BigInt->new('1');
    $y->blsft($i);
    $::affinity_mask->bior($y);
}
$w_aff = &max(length 'AFF', length $::affinity_mask->as_hex());

# Find cgroup.proc paths matching specified cgroup patterns
&find_matching_cgroup_procs(\@::cgroup_procs_match, \@::arg_watch_cgroup);
for my $file (@::cgroup_procs_match) {
    print "matched cgroup:", $file, "\n";
}

# Reset scheduling hi-water marks
if (defined $::arg_reset_hwm) {
    &get_tids(\%::tids_1);
    &reset_sched_hwm(\%::tids_1);
    # Pause for 1 ms (1000 microseconds). The builtin sleep() truncates a
    # fractional argument to 0 seconds, so the hi-res usleep() is required.
    usleep(1000);
}

# Get current hires epoch timestamp
$::tm_1 = clock_gettime(CLOCK_MONOTONIC);
$::tr_1 = clock_gettime(CLOCK_REALTIME);
$::tm_final = $::tm_1 + $::arg_delay*$::arg_repeat;

# Set initial delay
$::tm_elapsed = $::arg_delay;
$MAX_DELAY = $::arg_delay + $MIN_DELAY;

# Get overall per-cpu stats
if ($is_schedstat) {
    &read_schedstat(\%::percpu_1);
} else {
    &read_stat(\%::percpu_1);
}

# Get list of pids and tids
&get_tids(\%::tids_1);

# Get current scheduling and io info for all tids
&read_sched(\%::tids_1, \%::task_1);

# Track watched tids for monitoring
&track_watched_tids(\%::tids_1, \%::tids_w, \%::task_1, \@::arg_watch_cmd, \@::arg_watch_cgroup);

# determine column sort order; watched tasks always sort first
my $s_keyw = 'watched';
my ($s_key1, $s_key2, $s_key3) = ();
if ($::arg_sort eq 'cpu') {
    ($s_key1, $s_key2, $s_key3) = ('exec_runtime', 'nr_switches', 'pid');
} elsif ($::arg_sort eq 'io') {
    ($s_key1, $s_key2, $s_key3) = ('io', 'ios', 'exec_runtime');
} else {
    ($s_key1, $s_key2, $s_key3) = ('exec_runtime', 'nr_switches', 'pid');
}
# Main loop
# Each pass: shift the current sample to "previous", sleep, take a new sample,
# compute per-task and per-cpu deltas, evaluate the watch trigger, then print.
REPEAT_LOOP: for (my $repeat=1; $repeat <= $::arg_repeat; $repeat++) {

    # copy all state variables
    $::tm_0 = (); $::tr_0 = (); %::percpu_0 = (); %::tids_0 = (); %::task_0 = ();
    $::tm_0 = $::tm_1; $::tr_0 = $::tr_1;
    foreach my $cpu (keys %::percpu_1) { $::percpu_0{$cpu} = $::percpu_1{$cpu}; }
    foreach my $tid (keys %::tids_1) { $::tids_0{$tid} = $::tids_1{$tid}; }
    foreach my $tid (keys %::task_1) {
        foreach my $var (keys %{$::task_1{$tid}}) {
            $::task_0{$tid}{$var} = $::task_1{$tid}{$var};
        }
    }

    # estimate sleep delay to achieve desired interarrival by subtracting out
    # the measured cpu runtime of the tool.
    my $delay = $::arg_delay;
    if (defined $::D_task{$$}{'exec_runtime'}) {
        $delay -= ($::D_task{$$}{'exec_runtime'}/SI_k);
    }
    $delay = $MIN_DELAY if ($delay < $MIN_DELAY);
    $delay = $MAX_DELAY if ($delay > $MAX_DELAY);
    usleep( SI_M*$delay );

    # Collect current state
    $::tm_1 = (); $::tr_1 = (); %::percpu_1 = (); %::tids_1 = (); %::task_1 = ();

    # Get current hires epoch timestamp
    $::tm_1 = clock_gettime(CLOCK_MONOTONIC);
    $::tr_1 = clock_gettime(CLOCK_REALTIME);

    # Get overall per-cpu stats
    if ($is_schedstat) {
        &read_schedstat(\%::percpu_1);
    } else {
        &read_stat(\%::percpu_1);
    }

    if (defined $::arg_watch_only) {
        # This determines a subset of pids and tids
        # based on previous watched tids and matching cgroups.
        # This should reduce cpu impact dramatically.

        # Get list of pids and tids
        &get_tids(\%::tids_1);

        # Get array of tids corresponding to matching cgroups
        &read_cgroup_procs(\@::cgroup_tids, \@::cgroup_procs_match);
        my %cgroup_tids_h = map { $_ => 1 } @::cgroup_tids;

        # Keep previous watched tids and find new matches from cgroup.procs
        my @del_tids = ();
        foreach my $tid (keys %::tids_1) {
            my $pid = $::tids_1{$tid};
            next if (exists $::tids_w{$tid});
            if (exists $cgroup_tids_h{$tid}) {
                $::tids_w{$tid} = $pid;
                printf "ADD watching: tid=%7d\n", $tid;
                next;
            }
            push(@del_tids, $tid);
        }

        # Prune tids not actually being watched
        foreach my $tid (@del_tids) {
            delete $::tids_1{$tid};
        }

        # Prune watched tids that no longer exist
        my @del_tids_w = ();
        foreach my $tid (keys %::tids_w) {
            next if (exists $::tids_1{$tid});
            push(@del_tids_w, $tid);
        }
        foreach my $tid (@del_tids_w) {
            printf "REM watching: tid=%7d\n", $tid;
            delete $::tids_w{$tid};
        }
    } else {
        # Get list of pids and tids
        &get_tids(\%::tids_1);
    }

    # Get current scheduling and io info for all tids
    &read_sched(\%::tids_1, \%::task_1);

    # Get current uptime
    &get_uptime(\$::uptime);

    # Get current loadavg
    &get_loadavg(\%::loadavg, \$::runq, \$::num_tasks);

    # Get current processes blocked
    &get_blocked(\$::num_blk);

    # Get current processes in uninterruptible disk sleep.
    $::num_state_D = 0;
    foreach my $tid (keys %::task_1) {
        $::num_state_D++ if ($::task_1{$tid}{'state'} eq 'D');
    }

    # Delta calculation
    %::D_task = (); %::D_percpu = ();
    $::tm_elapsed = $::tm_1 - $::tm_0;
    $::tr_elapsed = $::tr_1 - $::tr_0;
    foreach my $tid (keys %::task_1) {
        # a delta needs both samples; tasks first seen now are skipped
        next if ( !(exists $::task_0{$tid}) );

        # simple delta
        foreach my $var (@delta_list) {
            $::D_task{$tid}{$var} = ($::task_1{$tid}{$var} - $::task_0{$tid}{$var});
        }

        # state information
        foreach my $state (@state_list) {
            $::D_task{$tid}{$state} = $::task_1{$tid}{$state};
        }
        if (exists $::tids_w{$tid}) {
            $::D_task{$tid}{'watched'} = 1;
        } else {
            $::D_task{$tid}{'watched'} = 0;
        }

        # derived calculations
        my $exec_runtime = $::D_task{$tid}{'exec_runtime'};
        my $nr_switches = $::D_task{$tid}{'nr_switches'};
        my $iowait_sum = $::D_task{$tid}{'iowait_sum'};
        if ($nr_switches > 0.0) {
            $::D_task{$tid}{'tlen'} = $exec_runtime / $nr_switches;
        } else {
            $::D_task{$tid}{'tlen'} = 0.0;
        }
        if ($::tm_elapsed > 0.0) {
            $::D_task{$tid}{'occ'} = 100.0*$exec_runtime/1.0E3/$::tm_elapsed;
            $::D_task{$tid}{'iowait'} = 100.0*$iowait_sum/1.0E3/$::tm_elapsed;
        } else {
            $::D_task{$tid}{'occ'} = 0.0;
            $::D_task{$tid}{'iowait'} = 0.0;
        }
        $::D_task{$tid}{'io'} = $::D_task{$tid}{'read_bytes'}
                              + $::D_task{$tid}{'write_bytes'}
                              + $::D_task{$tid}{'cancelled_write_bytes'};
        $::D_task{$tid}{'ios'} = $::D_task{$tid}{'syscw'}
                               + $::D_task{$tid}{'iowait_count'};
    }
    foreach my $cpu (keys %::percpu_1) {
        $::D_percpu{$cpu}{'runtime'} = ($::percpu_1{$cpu} - $::percpu_0{$cpu})/1.0E6;
        if ($::tm_elapsed > 0.0) {
            $::D_percpu{$cpu}{'occ'} = 100.0*$D_percpu{$cpu}{'runtime'}/1.0E3/$::tm_elapsed;
        } else {
            $::D_percpu{$cpu}{'occ'} = 0.0;
        }
    }
    my $occ_total = 0.0;
    for (my $cpu=0; $cpu < $::num_cpus; $cpu++) {
        $occ_total += $::D_percpu{$cpu}{'occ'};
    }

    # Trigger sysrq and coredump if we exceed watch trigger threshold
    my $trigger = 0;
    if ($::arg_trig_delay > 0) {
        foreach my $tid (keys %::tids_w) {
            # A watched tid may have no delta entry (thread exited, or it was
            # first seen in this sample). Reading its fields would autovivify
            # an empty %D_task entry that breaks the print loop below.
            next if ( !(exists $::D_task{$tid}) );
            if ($::D_task{$tid}{'wait_max'} > $::arg_trig_delay) {
                $trigger = 1;
                if (!defined $::arg_watch_quiet) {
                    printf "TRIGGER: delay: %.3f > %.3f milliseconds, tid: %d, comm: %s\n",
                        $::D_task{$tid}{'wait_max'}, $::arg_trig_delay, $tid, $::task_1{$tid}{'comm'};
                }
            }
            if ($::D_task{$tid}{'block_max'} > $::arg_trig_delay) {
                $trigger = 1;
                if (!defined $::arg_watch_quiet) {
                    printf "TRIGGER: block: %.3f > %.3f milliseconds, tid: %d, comm: %s\n",
                        $::D_task{$tid}{'block_max'}, $::arg_trig_delay, $tid, $::task_1{$tid}{'comm'};
                }
            }
        }
    }
    if ($trigger) {
        if (!defined $::arg_watch_quiet) {
            printf "TRIGGER sysrq.";
        }
        &sysrq_trigger_crash();
    }

    # Suppress all output
    next if (defined $::arg_watch_quiet);

    # Print summary
    &schedtop_header(
        \$::tr_1,
        \$::tm_elapsed,
        \$::tr_elapsed,
        \$::uptime,
        \$::loadavg,
        \$::runq,
        \$::num_blk,
        \$::num_state_D,
        \$::num_tasks,
        \$::print_host
    );
    printf "%-5s %7s ", 'core:', 'total';
    for (my $cpu=0; $cpu < $::num_cpus; $cpu++) {
        printf "%5s ", $cpu;
    }
    print "\n";
    printf "%-5s %7.1f ", 'occ:', $occ_total;
    for (my $cpu=0; $cpu < $::num_cpus; $cpu++) {
        printf "%5.1f ", $::D_percpu{$cpu}{'occ'};
    }
    print "\n";
    print "\n";

    # Build up output line by specific area
    my $L = '';
    $L .= sprintf "%7s %7s %7s ", "TID", "PID", "PPID";
    if ($::opt_P{$::P_ps} != $::P_none) {
        $L .= sprintf "%1s %2s %*s %2s %3s %4s ",
            "S", "P", $w_aff, "AFF", "PO", "NI", "PR";
    }
    if ($::opt_P{$::P_cpu} == $::P_brief) {
        $L .= sprintf "%6s %7s ", "ctxt", "occ";
    } elsif ($::opt_P{$::P_cpu} == $::P_full) {
        $L .= sprintf "%6s %6s %7s ", "ctxt", "migr", "occ";
    }
    if ($::opt_P{$::P_del} != $::P_none) {
        $L .= sprintf "%7s %7s %7s %7s %7s ", "tlen", "tmax", "delay", "dmax", "bmax";
    }
    if ($::opt_P{$::P_io} == $::P_lite) {
        $L .= sprintf "%7s %6s ", "iowt", "iocnt";
    } elsif ($::opt_P{$::P_io} == $::P_brief) {
        $L .= sprintf "%7s %8s %8s ", "iowt", "read", "write";
    } elsif ($::opt_P{$::P_io} == $::P_full) {
        $L .= sprintf "%7s %8s %8s %8s %8s %8s ",
            "iowt", "read", "write", "wcncl", "rsysc", "wsysc";
    }
    if ($::opt_P{$::P_id} != $::P_none) {
        $L .= sprintf "%-22s ", "wchan";
    }
    if ($::opt_P{$::P_cmd} == $::P_brief) {
        $L .= sprintf "%s", "cmdline";
    } elsif ($::opt_P{$::P_cmd} == $::P_full) {
        $L .= sprintf "%-16s %-15s %s", "cgroup", "comm", "cmdline";
    }
    print $L, "\n";

    # One row per task: watched tasks first, then by the selected sort keys
    # in descending order.
    foreach my $tid (sort {($D_task{$b}{$s_keyw} <=> $D_task{$a}{$s_keyw}) or
                           ($D_task{$b}{$s_key1} <=> $D_task{$a}{$s_key1}) or
                           ($D_task{$b}{$s_key2} <=> $D_task{$a}{$s_key2}) or
                           ($D_task{$b}{$s_key3} <=> $D_task{$a}{$s_key3})} keys %D_task) {
        my $exec_runtime = $::D_task{$tid}{'exec_runtime'};
        my $nr_switches = $::D_task{$tid}{'nr_switches'};
        my $aff = $::D_task{$tid}{'affinity'}->as_hex();

        # skip printing if there is no actual delta
        if ( !(defined $::arg_idle) ) {
            next if (($exec_runtime == 0.0) && ($nr_switches == 0));
        }

        # Build up output line by specific area
        $L = '';
        $L .= sprintf "%7d %7d %7d ",
            $tid, $::D_task{$tid}{'pid'}, $::D_task{$tid}{'ppid'};
        if ($::opt_P{$::P_ps} != $::P_none) {
            $L .= sprintf "%1s %2d %*s %2s %3d %4d ",
                $::D_task{$tid}{'state'}, $::D_task{$tid}{'task_cpu'}, $w_aff, $aff,
                $policies[$::D_task{$tid}{'policy'}], $::D_task{$tid}{'nice'},
                $::D_task{$tid}{'priority'};
        }
        if ($::opt_P{$::P_cpu} == $::P_brief) {
            $L .= sprintf "%6d %7.2f ",
                $::D_task{$tid}{'nr_switches'}, $::D_task{$tid}{'occ'};
        } elsif ($::opt_P{$::P_cpu} == $::P_full) {
            $L .= sprintf "%6d %6d %7.2f ",
                $::D_task{$tid}{'nr_switches'}, $::D_task{$tid}{'nr_migrations'},
                $::D_task{$tid}{'occ'};
        }
        if ($::opt_P{$::P_del} != $::P_none) {
            $L .= sprintf "%7.3f %7.1f %7.3f %7.1f %7.1f ",
                $::D_task{$tid}{'tlen'}, $::D_task{$tid}{'exec_max'},
                $::D_task{$tid}{'wait_sum'}, $::D_task{$tid}{'wait_max'},
                $::D_task{$tid}{'block_max'};
        }
        if ($::opt_P{$::P_io} == $::P_lite) {
            $L .= sprintf "%7.2f %6d ",
                $::D_task{$tid}{'iowait'}, $::D_task{$tid}{'iowait_count'};
        } elsif ($::opt_P{$::P_io} == $::P_brief) {
            $L .= sprintf "%7.2f %8s %8s ",
                $::D_task{$tid}{'iowait'},
                &format_SI($::D_task{$tid}{'read_bytes'}),
                &format_SI($::D_task{$tid}{'write_bytes'});
        } elsif ($::opt_P{$::P_io} == $::P_full) {
            $L .= sprintf "%7.2f %8s %8s %8s %8s %8s ",
                $::D_task{$tid}{'iowait'},
                &format_SI($::D_task{$tid}{'read_bytes'}),
                &format_SI($::D_task{$tid}{'write_bytes'}),
                &format_SI($::D_task{$tid}{'cancelled_write_bytes'}),
                &format_SI($::D_task{$tid}{'syscr'}),
                &format_SI($::D_task{$tid}{'syscw'});
        }
        if ($::opt_P{$::P_id} != $::P_none) {
            $L .= sprintf "%-22s ", substr($::D_task{$tid}{'wchan'}, 0, 22);
        }
        if ($::opt_P{$::P_cmd} == $::P_brief) {
            $L .= sprintf "%s", $::D_task{$tid}{'cmdline'};
        } elsif ($::opt_P{$::P_cmd} == $::P_full) {
            $L .= sprintf "%-16s %-15s %s",
                substr($::D_task{$tid}{'cgroup'}, 0, 16),
                substr($::D_task{$tid}{'comm'}, 0, 15),
                $::D_task{$tid}{'cmdline'};
        }
        print $L, "\n";
    }
    print "\n";

    # exit repeat loop if we have exceeded overall time
    last if ($::tm_1 > $::tm_final);
} # REPEAT LOOP

# Print that tool has finished
print "done\n";

# Capture timestamp and report delta
$b1 = Benchmark->new; $bd = Benchmark::timediff($b1, $b0);
printf "processing time: %s\n", timestr($bd);
exit 0;
#-------------------------------------------------------------------------------
# Render a number with a decimal SI suffix: xxx.yyyG, xxx.yyyM or xxx.yyyk.
# Values below 1000 are printed as a rounded integer without a suffix.
sub format_SI
{
    my ($number) = @_;

    return sprintf("%.3fG", $number/SI_G) if ($number >= SI_G);
    return sprintf("%.3fM", $number/SI_M) if ($number >= SI_M);
    return sprintf("%.3fk", $number/SI_k) if ($number >= SI_k);
    return sprintf("%.0f", $number);
}
# Render a number with an IEC binary suffix: xxx.yyyGi, xxx.yyyMi or xxxKi.
# Ki values carry no decimals because the underlying memory units are pages;
# values below 1024 are printed as a rounded integer without a suffix.
sub format_IEC
{
    my ($number) = @_;

    return sprintf("%.3fGi", $number/Gi) if ($number >= Gi);
    return sprintf("%.3fMi", $number/Mi) if ($number >= Mi);
    return sprintf("%.0fKi", $number/Ki) if ($number >= Ki);
    return sprintf("%.0f", $number);
}
# Report whether per-task scheduler statistics are exposed by the kernel.
# Returns 1 when /proc/1/task/1/sched exists, otherwise 0.
sub is_sched
{
    my $probe = '/proc/1/task/1/sched';
    return 1 if (-e $probe);
    return 0;
}
# Report whether per-task IO statistics are exposed by the kernel.
# Returns 1 when /proc/1/task/1/io exists, otherwise 0.
sub is_io
{
    my $probe = '/proc/1/task/1/io';
    return 1 if (-e $probe);
    return 0;
}
# Return the numerically largest argument.
# The first argument seeds the result; with no arguments the result is undef.
sub max {
    my @candidates = @_;
    my $largest = shift @candidates;
    foreach my $item (@candidates) {
        if ($item > $largest) {
            $largest = $item;
        }
    }
    return $largest;
}
# Build the tid -> pid mapping by walking /proc/<pid>/task/<tid>.
# Argument: reference to the hash that receives one entry per thread
#           (key = tid, value = owning pid); existing entries are kept.
# Croaks when /proc cannot be opened. A pid whose task directory cannot be
# opened is skipped.
sub get_tids
{
    my ($tid_to_pid) = @_;

    # numeric directories directly under /proc are the pids
    my $dir = '/proc';
    opendir(my $proc_dh, $dir) || croak "Cannot open directory: $dir ($!)";
    my @pid_names = grep { /^\d+$/ && -d "$dir/$_" } readdir($proc_dh);
    closedir $proc_dh;

    # numeric directories under each pid's task directory are its tids
    foreach my $owner (@pid_names) {
        my $task_dir = '/proc/' . $owner . '/task';
        opendir(my $task_dh, $task_dir) || next;
        my @tid_names = grep { /^\d+$/ && -d "$task_dir/$_" } readdir($task_dh);
        closedir $task_dh;
        $tid_to_pid->{$_} = $owner for @tid_names;
    }
}
# Reset scheduling hi-water-marks
# NOTE: Reset by write 0 to sched is finicky; use brute force
#
# Argument: reference to a hash mapping tid -> pid.
# Writes '0' to three sched files, in three passes:
#   1. /proc/<pid>/sched            (once per distinct pid)
#   2. /proc/<tid>/sched            (every tid)
#   3. /proc/<pid>/task/<tid>/sched (every tid)
# Files that cannot be opened are silently skipped (best effort).
sub reset_sched_hwm
{
    my ($tids_ref) = @_;

    # collect the distinct pids owning the given tids
    my %pids_ = ();
    foreach my $tid (keys %{$tids_ref}) {
        $pids_{ $tids_ref->{$tid} } = 1;
    }

    foreach my $pid (keys %pids_) {
        _reset_sched_file('/proc/' . $pid . '/sched');
    }
    foreach my $tid (keys %{$tids_ref}) {
        _reset_sched_file('/proc/' . $tid . '/sched');
    }
    foreach my $tid (keys %{$tids_ref}) {
        my $pid = $tids_ref->{$tid};
        _reset_sched_file('/proc/' . $pid . '/task/' . $tid . '/sched');
    }
}

# Write '0' to a single sched file; returns 1 if the file could be opened,
# 0 otherwise. Three-argument open keeps the mode separate from the path.
sub _reset_sched_file
{
    my ($file) = @_;
    open(my $fh, '>', $file) || return 0;
    print $fh "0\n";
    close($fh);
    return 1;
}
# Trigger a crash dump via sysrq, result in /var/crash .
# The following requires root privilege:
# echo 1 > /proc/sys/kernel/sysrq
# echo c > /proc/sysrq-trigger
#
# Croaks if either file cannot be opened for writing.
sub sysrq_trigger_crash
{
    # write '1' to the sysrq control file
    my $file1 = '/proc/sys/kernel/sysrq';
    open(my $fh1, '>', $file1) || croak "Cannot open file: $file1 ($!)";
    print $fh1 "1\n";
    close($fh1);

    # write the 'c' command to the sysrq trigger
    my $file2 = '/proc/sysrq-trigger';
    open(my $fh2, '>', $file2) || croak "Cannot open file: $file2 ($!)";
    print $fh2 "c\n";
    close($fh2);
}
# Track watched tids for monitoring
#
# Arguments (all references):
#   tids          hash  tid -> pid of the tasks to examine
#   tids_w        hash  tid -> pid of watched tasks; new matches are added
#   task          hash  tid -> { comm, cgroup, cmdline, ... } task details
#   arg_watch_cmd array of patterns: an all-digit pattern matches a tid or
#                 pid exactly; any pattern matches as a prefix of comm
#   arg_watch_cgroup array of patterns matched as a prefix of cgroup
#
# A tid already in tids_w is left alone. Each newly watched tid is reported
# once on the selected output handle, however many patterns it matches.
sub track_watched_tids
{
    my ($tids_ref, $watched_ref, $task_ref, $watch_cmd_ref, $watch_cgroup_ref) = @_;

    TID: foreach my $tid (keys %{$tids_ref}) {
        next TID if (exists $watched_ref->{$tid});

        my $pid = $tids_ref->{$tid};

        # Look up task details without creating an entry for tids that have
        # no task data (e.g. the task exited before it could be read).
        my $info = (exists $task_ref->{$tid}) ? $task_ref->{$tid} : {};
        my $comm = $info->{'comm'};
        my $cgroup = $info->{'cgroup'};
        my $cmdline = $info->{'cmdline'};

        my $matched = 0;
        CMD: foreach my $cmd (@{$watch_cmd_ref}) {
            if (($cmd =~ /^\d+$/) && (($tid == $cmd) || ($pid == $cmd))) {
                $matched = 1;
                last CMD;
            }
            if ((defined $comm) && ($comm =~ /^\Q$cmd\E/)) {
                $matched = 1;
                last CMD;
            }
        }
        if (!$matched) {
            CGROUP: foreach my $cg (@{$watch_cgroup_ref}) {
                if ((defined $cgroup) && ($cgroup =~ /^\Q$cg\E/)) {
                    $matched = 1;
                    last CGROUP;
                }
            }
        }
        next TID if (!$matched);

        $watched_ref->{$tid} = $pid;
        # missing details are printed as empty strings
        printf "watching: tid=%7d, cgroup=%s, comm=%s, cmdline=%.40s\n",
            $tid,
            (defined $cgroup ? $cgroup : ''),
            (defined $comm ? $comm : ''),
            (defined $cmdline ? $cmdline : '');
    }
}
# File::Find callback. Results are collected in the global array
# @::cgroup_procs_paths: the full path of every visited entry named
# 'cgroup.procs' is appended to it.
sub wanted_cgroup_procs {
    return if ($_ ne 'cgroup.procs');
    push @::cgroup_procs_paths, $File::Find::name;
}
# Find cgroup.procs paths matching the specified cgroup patterns.
#
# Arguments (references):
#   cgroup_procs_match  array that receives the matching cgroup.procs paths
#   arg_watch_cgroup    array of cgroup name patterns (matched literally)
#
# Walks /sys/fs/cgroup/pids, collecting every cgroup.procs path in the global
# @::cgroup_procs_paths, then keeps the paths where a pattern is followed by
# '.service' or '.scope', or appears as kubepods/<word>/<pattern>.
# A path is appended once for each pattern it matches.
sub find_matching_cgroup_procs
{
    my ($matches_ref, $patterns_ref) = @_;

    # Find all cgroup.procs paths for the pids cgroup controller
    File::Find::find(\&wanted_cgroup_procs, '/sys/fs/cgroup/pids');

    for my $procs_path (@::cgroup_procs_paths) {
        for my $cg (@{$patterns_ref}) {
            next unless (($procs_path =~ /\Q$cg\E(\.service|\.scope)/)
                      || ($procs_path =~ /kubepods\/\w+\/\Q$cg\E/));
            push(@{$matches_ref}, $procs_path);
        }
    }
}
# Get array of tids corresponding to matching cgroups
#
# Arguments (references):
#   tids                array that receives the ids read
#   cgroup_procs_match  array of cgroup.procs file paths to read
#
# The output array is emptied first, so it holds only the ids currently
# listed in the files; without that, repeated calls from the sampling loop
# would grow it without bound and keep ids that are no longer present.
# Files that cannot be opened are skipped; only all-digit lines are taken.
# NOTE(review): callers compare these ids with tids; confirm whether
# cgroup.procs lists thread ids or only process ids on the target kernel.
sub read_cgroup_procs
{
    my ($ids_ref, $paths_ref) = @_;

    @{$ids_ref} = ();
    foreach my $cgroup_procs (@{$paths_ref}) {
        open(my $fh, '<', $cgroup_procs) || next;
        while (my $line = <$fh>) {
            push(@{$ids_ref}, $1) if ($line =~ /^(\d+)$/);
        }
        close($fh);
    }
}
# Parse cpu and scheduling info for each tid
# - ignore the specific tid if there is incomplete data,
# (i.e., cannot obtain info because task has died,
# eg. missing ./stat, ./status, ./cmdline, ./wchan)
#
sub read_sched
{
(local *::tids, local *::task) = @_;
%::task = ();
foreach my $tid (keys %::tids) {
my ($fh, $file, $pid, $comm, $cmdline, $wchan, $id) = ();
my ($tpid, $tcomm, $state, $ppid, $pgrp, $sid,
$tty_nr, $tty_pgrp, $flags,
$min_flt, $cmin_flt, $maj_flt, $cmaj_flt,
$utime, $stime, $cutime, $cstime,
$priority, $nice, $num_threads,
$it_real_value, $start_time,
$vsize, $rss, $rsslim,
$start_code, $end_code, $start_stack, $esp, $eip,
$pending, $blocked, $sigign, $sigcatch, $wchan_addr,
$dum1, $dum2, $exit_signal, $task_cpu,
$rt_priority, $policy, $blkio_ticks,
$gtime, $cgtime,
$start_data, $end_data, $start_brk, $arg_start, $arg_end,
$env_start, $env_end, $exit_code) = ();
my ($cgroup) = ();
my ($nr_switches, $nr_migrations) = (0,0);
my ($exec_runtime, $exec_max) = (0.0, 0.0);
my ($wait_max, $wait_sum, $wait_count) = (0.0, 0.0, 0);
my ($block_max) = (0.0);
my ($iowait_sum, $iowait_count) = (0.0, 0);
my ($VmSize, $VmRSS) = ();
my $Cpus_allowed = Math::BigInt->new('0');
my $affinity = Math::BigInt->new('0');
my ($rchar, $wchar, $syscr, $syscw, $read_bytes, $write_bytes,
$cancelled_write_bytes) = (0,0,0,0,0,0,0);
my ($sched_valid, $io_valid, $status_valid, $cmdline_valid,
$wchan_valid, $stat_valid, $cgroup_valid) = ();
$pid = $::tids{$tid};
# NOTE: Format change over time: OLD: se.statistics.X, NEW: se.statistics->X
#cat /proc/1/sched
#systemd (1, #threads: 1)
#-------------------------------------------------------------------
#se.exec_start : 33792676.285222
#se.vruntime : 28019997.693224
#se.sum_exec_runtime : 21918.207287
#se.nr_migrations : 5413
#se.statistics->sum_sleep_runtime : 1166561.198533
#se.statistics->wait_start : 0.000000
#se.statistics->sleep_start : 33792676.285222
#se.statistics->block_start : 0.000000
#se.statistics->sleep_max : 18951.679990
#se.statistics->block_max : 0.000000
#se.statistics->exec_max : 0.909747
#se.statistics->slice_max : 1.790123
#se.statistics->wait_max : 4.026544
#se.statistics->wait_sum : 507.245963
#se.statistics->wait_count : 2540
#se.statistics->iowait_sum : 0.000000
#se.statistics->iowait_count : 0
#se.statistics->nr_migrations_cold : 0
#se.statistics->nr_failed_migrations_affine : 67
#se.statistics->nr_failed_migrations_running : 1
#se.statistics->nr_failed_migrations_hot : 1
#se.statistics->nr_forced_migrations : 0
#se.statistics->nr_wakeups : 2472
#se.statistics->nr_wakeups_sync : 34
#se.statistics->nr_wakeups_migrate : 176
#se.statistics->nr_wakeups_local : 1442
#se.statistics->nr_wakeups_remote : 1030
#se.statistics->nr_wakeups_affine : 155
#se.statistics->nr_wakeups_affine_attempts : 969
#se.statistics->nr_wakeups_passive : 0
#se.statistics->nr_wakeups_idle : 0
#avg_atom : 0.286970
#avg_per_cpu : 4.049179
#nr_switches : 76378
#nr_voluntary_switches : 72308
#nr_involuntary_switches : 4070
#se.load.weight : 1024
#policy : 0
#prio : 120
#clock-delta : 28
# Changes for 6.6.0 kernel
#cat /proc/1/sched
#systemd (1, #threads: 1)
#-------------------------------------------------------------------
#se.exec_start : 251536392.418317
#se.vruntime : 542073.435409
#se.sum_exec_runtime : 1097697.572750
#se.nr_migrations : 35039
#sum_sleep_runtime : 249925608.224346
#sum_block_runtime : 234992.983051
#wait_start : 0.000000
#sleep_start : 251536392.418317
#block_start : 0.000000
#sleep_max : 11967.794377
#block_max : 1230.041276
#exec_max : 147.808142
#slice_max : 78.070544
#wait_max : 180.271599
#wait_sum : 440802.706697
#wait_count : 1022180
#iowait_sum : 81.179285
#iowait_count : 63
#nr_migrations_cold : 0
#nr_failed_migrations_affine : 145872
#nr_failed_migrations_running : 67209
#nr_failed_migrations_hot : 82715
#nr_forced_migrations : 12
#nr_wakeups : 264124
#nr_wakeups_sync : 41
#nr_wakeups_migrate : 205
#nr_wakeups_local : 146458
#nr_wakeups_remote : 117666
#nr_wakeups_affine : 204
#nr_wakeups_affine_attempts : 409
#nr_wakeups_passive : 0
#nr_wakeups_idle : 0
#avg_atom : 1.072258
#avg_per_cpu : 31.327879
#nr_switches : 1023725
#nr_voluntary_switches : 264916
#nr_involuntary_switches : 758809
#se.load.weight : 1048576
#se.avg.load_sum : 1490
#se.avg.runnable_sum : 1526937
#se.avg.util_sum : 365568
#se.avg.load_avg : 32
#se.avg.runnable_avg : 32
#se.avg.util_avg : 7
#se.avg.last_update_time : 251536392418304
#se.avg.util_est.ewma : 163
#se.avg.util_est.enqueued : 7
#policy : 0
#prio : 120
#clock-delta : 112
#mm->numa_scan_seq : 0
#numa_pages_migrated : 0
#numa_preferred_nid : -1
#total_numa_faults : 0
#current_node=0, numa_group_id=0
#numa_faults node=0 task_private=0 task_shared=0 group_private=0 group_shared=0
# parse /proc/<pid>/task/<tid>/sched
$file = '/proc/' . $pid . '/task/' . $tid . '/sched';
open($fh, $file) || goto SKIP_SCHED;
$_ = <$fh>;
if (/^(.*)\s+\((\d+),\s+#threads:/) {
$comm = $1; $id = $2;
}
my ($k, $v, $c0);
LOOP_SCHED: while (<$fh>) {
if (/^wait_max\s+:\s+(\S+)/ || /^se\.statistics.{1,2}wait_max\s+:\s+(\S+)/) {
$wait_max = $1;
} elsif (/^block_max\s+:\s+(\S+)/ || /^se\.statistics.{1,2}block_max\s+:\s+(\S+)/) {
$block_max = $1;
} elsif (/^wait_sum\s+:\s+(\S+)/ || /^se\.statistics.{1,2}wait_sum\s+:\s+(\S+)/) {
$wait_sum = $1;
} elsif (/^wait_count\s+:\s+(\S+)/ || /^se\.statistics.{1,2}wait_count\s+:\s+(\S+)/) {
$wait_count = $1;
} elsif (/^exec_max\s+:\s+(\S+)/ || /^se\.statistics.{1,2}exec_max\s+:\s+(\S+)/) {
$exec_max = $1;
} elsif (/^iowait_sum\s+:\s+(\S+)/ || /^se\.statistics.{1,2}iowait_sum\s+:\s+(\S+)/) {
$iowait_sum = $1;
} elsif (/^iowait_count\s+:\s+(\S+)/ || /^se\.statistics.{1,2}iowait_count\s+:\s+(\S+)/) {
$iowait_count = $1;
} elsif (/^se\.sum_exec_runtime\s+:\s+(\S+)/) {
$exec_runtime = $1;
} elsif (/^se\.nr_migrations\s+:\s+(\S+)/) {
$nr_migrations = $1;
} elsif (/^nr_switches\s+:\s+(\S+)/) {
$nr_switches = $1;
$sched_valid = 1;
last LOOP_SCHED;
}
}
close($fh);
SKIP_SCHED:;
#cat /proc/1/io
#rchar: 3432590242
#wchar: 438665986
#syscr: 316595
#syscw: 104722
#read_bytes: 1586438144
#write_bytes: 246829056
#cancelled_write_bytes: 7798784
# parse /proc/<pid>/task/<tid>/io
$file = '/proc/' . $pid . '/task/' . $tid . '/io';
open($fh, $file) || goto SKIP_IO;
LOOP_IO: while (<$fh>) {
if (/^rchar:\s+(\S+)/) {
$rchar = $1;
} elsif (/^wchar:\s+(\S+)/) {
$wchar = $1;
} elsif (/^syscr:\s+(\S+)/) {
$syscr = $1;
} elsif (/^syscw:\s+(\S+)/) {
$syscw = $1;
} elsif (/^read_bytes:\s+(\S+)/) {
$read_bytes = $1;
} elsif (/^write_bytes:\s+(\S+)/) {
$write_bytes = $1;
} elsif (/^cancelled_write_bytes:\s+(\S+)/) {
$cancelled_write_bytes = $1;
$io_valid = 1;
last LOOP_IO;
}
}
close($fh);
SKIP_IO:;
# parse /proc/<pid>/task/<tid>/status
$file = '/proc/' . $pid . '/task/' . $tid . '/status';
open($fh, $file) || next;
LOOP_STATUS: while (<$fh>) {
if (/^Name:\s+(.*)/) {
$comm = $1;
} elsif (/^State:\s+(\S+)/) {
$state = $1;
} elsif (/^PPid:\s+(\S+)/) {
$ppid = $1;
} elsif (/^VmSize:\s+(\S+)/) {
$VmSize = $1;
} elsif (/^VmRSS:\s+(\S+)/) {
$VmRSS = $1;
} elsif (/^Cpus_allowed:\s+([0]+,)*(\S+)/) {
my $h = $2; $h =~ tr/,/_/;
$Cpus_allowed = Math::BigInt->from_hex($h);
$affinity = $Cpus_allowed->band($::affinity_mask);
$status_valid = 1;
last LOOP_STATUS;
}
}
close($fh);
# parse /proc/<pid>/task/<tid>/cmdline
$file = '/proc/' . $pid . '/task/' . $tid . '/cmdline';
open($fh, $file) || next;
LOOP_CMDLINE: while (<$fh>) {
if (/^(.*)$/) {
$cmdline = $1;
$cmdline =~ s/\000/ /g;
$cmdline_valid = 1;
last LOOP_CMDLINE;
}
}
if (!$cmdline_valid) {
$cmdline_valid = 1;
$cmdline = $comm;
}
close($fh);
# parse /proc/<pid>/task/<tid>/wchan
$file = '/proc/' . $pid . '/task/' . $tid . '/wchan';
open($fh, $file) || next;
LOOP_WCHAN: while (<$fh>) {
if (/^(.*)$/) {
$wchan = $1;
$wchan_valid = 1;
last LOOP_WCHAN;
}
}
close($fh);
#Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
#..............................................................................
# Field Content
# tpid process id (or tid, if /proc/<pid>/task/<tid>/stat)
# tcomm filename of the executable
# state state (R is running, S is sleeping, D is sleeping in an
# uninterruptible wait, Z is zombie, T is traced or stopped)
# ppid process id of the parent process
# pgrp pgrp of the process
# sid session id
# tty_nr tty the process uses
# tty_pgrp pgrp of the tty
# flags task flags
# min_flt number of minor faults
# cmin_flt number of minor faults with child's
# maj_flt number of major faults
# cmaj_flt number of major faults with child's
# utime user mode jiffies
# stime kernel mode jiffies
# cutime user mode jiffies with child's
# cstime kernel mode jiffies with child's
# priority priority level
# nice nice level
# num_threads number of threads
# it_real_value (obsolete, always 0)
# start_time time the process started after system boot
# vsize virtual memory size
# rss resident set memory size
# rsslim current limit in bytes on the rss
# start_code address above which program text can run
# end_code address below which program text can run
# start_stack address of the start of the main process stack
# esp current value of ESP
# eip current value of EIP
# pending bitmap of pending signals
# blocked bitmap of blocked signals
# sigign bitmap of ignored signals
# sigcatch bitmap of catched signals
# wchan address where process went to sleep
# 0 (place holder)
# 0 (place holder)
# exit_signal signal to send to parent thread on exit
# task_cpu which CPU the task is scheduled on
# rt_priority realtime priority
# policy scheduling policy (man sched_setscheduler)
# blkio_ticks time spent waiting for block IO
# gtime guest time of the task in jiffies
# cgtime guest time of the task children in jiffies
# start_data address above which program data+bss is placed
# end_data address below which program data+bss is placed
# start_brk address above which program heap can be expanded with brk()
# arg_start address above which program command line is placed
# arg_end address below which program command line is placed
# env_start address above which program environment is placed
# env_end address below which program environment is placed
# exit_code the thread's exit_code in the form reported by the waitpid system call
# parse /proc/<pid>/task/<tid>/stat
$file = '/proc/' . $pid . '/task/' . $tid . '/stat';
my $dummy;
open($fh, $file) || next;
$_ = <$fh>;
($tpid, $tcomm, $dummy) = /^(\d+)\s+\((.*)\)\s+(.*)/;
($state, $ppid, $pgrp, $sid,
$tty_nr, $tty_pgrp, $flags,
$min_flt, $cmin_flt, $maj_flt, $cmaj_flt,
$utime, $stime, $cutime, $cstime,
$priority, $nice, $num_threads,
$it_real_value, $start_time,
$vsize, $rss, $rsslim,
$start_code, $end_code, $start_stack, $esp, $eip,
$pending, $blocked, $sigign, $sigcatch, $wchan_addr,
$dum1, $dum2, $exit_signal, $task_cpu,
$rt_priority, $policy, $blkio_ticks, $gtime, $cgtime,
$start_data, $end_data, $start_brk, $arg_start, $arg_end,
$env_start, $env_end, $exit_code) = split(/\s+/, $dummy);
$stat_valid = 1;
close($fh);
#cat /proc/1/task/1/cgroup
#12:cpu,cpuacct:/init.scope
#11:pids:/init.scope
#10:hugetlb:/
#9:memory:/init.scope
#8:rdma:/
#7:cpuset:/
#6:net_cls,net_prio:/
#5:devices:/init.scope
#4:blkio:/init.scope
#3:freezer:/
#2:perf_event:/
#1:name=systemd:/init.scope
#0::/init.scope
# Extract the pod id:
# /k8s-infra/kubepods/burstable/pode84531c2-0bb1-45f8-b27f-e779b858552d/fdeaea0e577a525a3d9e41655ee05dd9b4edf17ce4b1bf95803cae1518f43ca2
# Extract *.service or *.scope name:
# /system.slice/acpid.service
# /system.slice/system-ceph.slice/ceph-mds.scope
# parse /proc/<pid>/task/<tid>/cgroup
$file = '/proc/' . $pid . '/task/' . $tid . '/cgroup';
open($fh, $file) || next;
LOOP_CGROUP: while (<$fh>) {
if (/^\d+:(pids|cpu,cpuacct):(.*)/) {
$_ = $2;
if (/kubepods\/\w+\/(pod[a-z0-9-]+)\/\w+$/) {
$cgroup = $1;
} elsif (/\/([a-zA-Z0-9_-@:]+)\.\w+$/) {
$cgroup = $1;
} else {
$cgroup = '-'; # '-' looks prettier than '/'
}
$cgroup_valid = 1;
last LOOP_CGROUP;
}
}
close($fh);
# sched
if (defined $sched_valid) {
$::task{$tid}{'exec_runtime'} = $exec_runtime;
$::task{$tid}{'exec_max'} = $exec_max;
$::task{$tid}{'block_max'} = $block_max;
$::task{$tid}{'wait_max'} = $wait_max;
$::task{$tid}{'wait_sum'} = $wait_sum;
$::task{$tid}{'wait_count'} = $wait_count;
$::task{$tid}{'iowait_sum'} = $iowait_sum;
$::task{$tid}{'iowait_count'} = $iowait_count;
$::task{$tid}{'nr_migrations'} = $nr_migrations;
$::task{$tid}{'nr_switches'} = $nr_switches;
} else {
$::task{$tid}{'exec_runtime'} = 0;
$::task{$tid}{'exec_max'} = 0;
$::task{$tid}{'block_max'} = 0;
$::task{$tid}{'wait_max'} = 0;
$::task{$tid}{'wait_sum'} = 0;
$::task{$tid}{'wait_count'} = 0;
$::task{$tid}{'iowait_sum'} = 0;
$::task{$tid}{'iowait_count'} = 0;
$::task{$tid}{'nr_migrations'} = 0;
$::task{$tid}{'nr_switches'} = 0;
}
# io
if (defined $io_valid) {
$::task{$tid}{'rchar'} = $rchar;
$::task{$tid}{'wchar'} = $wchar;
$::task{$tid}{'syscr'} = $syscr;
$::task{$tid}{'syscw'} = $syscw;
$::task{$tid}{'read_bytes'} = $read_bytes;
$::task{$tid}{'write_bytes'} = $write_bytes;
$::task{$tid}{'cancelled_write_bytes'} = $cancelled_write_bytes;
} else {
$::task{$tid}{'rchar'} = 0;
$::task{$tid}{'wchar'} = 0;
$::task{$tid}{'syscr'} = 0;
$::task{$tid}{'syscw'} = 0;
$::task{$tid}{'read_bytes'} = 0;
$::task{$tid}{'write_bytes'} = 0;
$::task{$tid}{'cancelled_write_bytes'} = 0;
}
# status
if (defined $status_valid) {
$::task{$tid}{'pid'} = $pid;
$::task{$tid}{'comm'} = $comm;
$::task{$tid}{'state'} = $state;
$::task{$tid}{'ppid'} = $ppid;
$::task{$tid}{'VmSize'} = $VmSize;
$::task{$tid}{'VmRSS'} = $VmRSS;
$::task{$tid}{'affinity'} = $affinity;
} else {
$::task{$tid}{'pid'} = 0;
$::task{$tid}{'comm'} = '-';
$::task{$tid}{'state'} = '-';
$::task{$tid}{'ppid'} = 0;
$::task{$tid}{'VmSize'} = 0;
$::task{$tid}{'VmRSS'} = 0;
$::task{$tid}{'affinity'} = Math::BigInt->new('0');
}
# cmdline
if (defined $cmdline_valid) {
$::task{$tid}{'cmdline'} = $cmdline;
} else {
$::task{$tid}{'cmdline'} = $comm;
}
# wchan
if (defined $cmdline_valid) {
$::task{$tid}{'wchan'} = $wchan;
} else {
$::task{$tid}{'wchan'} = '-';
}
# stat
if (defined $stat_valid) {
$::task{$tid}{'nice'} = $nice;
$::task{$tid}{'policy'} = $policy;
$::task{$tid}{'priority'} = $priority;
$::task{$tid}{'rt_priority'} = $rt_priority;
$::task{$tid}{'start_time'} = $start_time;
$::task{$tid}{'task_cpu'} = $task_cpu;
} else {
$::task{$tid}{'nice'} = 0;
$::task{$tid}{'policy'} = '-';
$::task{$tid}{'priority'} = 0;
$::task{$tid}{'rt_priority'} = 0;
$::task{$tid}{'start_time'} = '';
$::task{$tid}{'task_cpu'} = 0;
}
# cgroup
if (defined $cgroup_valid) {
$::task{$tid}{'cgroup'} = $cgroup;
} else {
$::task{$tid}{'cgroup'} = '-';
}
}
}
# Parse per-cpu hi-resolution scheduling stats from /proc/schedstat.
#
# Arguments:
#   $_[0] : reference to a hash.  It is emptied, then filled with
#           <cpu number> => <cputime>, where cputime is the 7th numeric
#           field of each "cpuN" line.
#
# Croaks when the file cannot be opened, when the leading "version" line
# cannot be parsed, or when the reported version is not 15 (the only
# layout handled here).
sub read_schedstat
{
    (local *::percpu) = @_;
    my $file = '/proc/schedstat';

    %::percpu = ();

    # parse /proc/schedstat
    open(my $fh, '<', $file) || croak "Cannot open file: $file ($!)";

    # Line 1 is "version <n>"; line 2 is "timestamp <n>" and is only skipped.
    my $header = <$fh>;
    my ($version) = defined($header) ? ($header =~ /^version\s+(\d+)/) : ();
    scalar(<$fh>);

    # Fail with an explicit message instead of comparing an undefined value.
    if (!defined $version) {
        close($fh);
        croak "schedstat version: unable to parse version line of $file";
    }
    if ($version != 15) {
        close($fh);
        croak "schedstat version: $version method not implemented.";
    }

    while (my $line = <$fh>) {
        # version 15: cputime is 7th field
        if ($line =~ /^cpu(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+/) {
            $::percpu{$1} = $2;
        }
    }
    close($fh);
}
# Parse per-cpu jiffie stats from /proc/stat.
#
# Arguments:
#   $_[0] : reference to a hash.  It is emptied, then filled with
#           <cpu number> => $CLOCK_NS * (sum of counters 1,2,3,5,6,7 of the
#           "cpuN" line).  Only the 4th counter (idle) is left out.
#
# NOTE(review): an earlier header comment stated "cputime excludes iowait",
# but the sum below does include the iowait column -- confirm which one
# is intended.  Local names follow the proc(5) column order
# (user, nice, system, idle, iowait, irq, softirq).
#
# Croaks when /proc/stat cannot be opened.
sub read_stat
{
    (local *::percpu) = @_;
    my $file = '/proc/stat';

    %::percpu = ();

    # parse /proc/stat
    open(my $fh, '<', $file) or croak "Cannot open file: $file ($!)";
    while (<$fh>) {
        # Only the per-cpu "cpuN" rows are of interest (not the "cpu" total).
        next unless /^cpu(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+/;
        my ($cpu, $user, $nice, $sys, undef, $iowt, $hirq, $sirq)
            = ($1, $2, $3, $4, $5, $6, $7, $8);
        $::percpu{$cpu} = $CLOCK_NS * ($user + $nice + $sys + $iowt + $hirq + $sirq);
    }
    close($fh);
}
# Parse load-average from /proc/loadavg.
#
# Arguments (all references, written through):
#   $_[0] : hash  -- keys '1', '5', '15' receive the first three fields
#   $_[1] : scalar -- receives the number before the '/' in the 4th field
#   $_[2] : scalar -- receives the number after the '/' in the 4th field
#
# Every output is first reset to zero and keeps that value when the line
# does not have the expected layout.  Croaks when the file cannot be opened.
sub get_loadavg
{
    (local *::loadavg, local *::runq, local *::num_tasks) = @_;
    my $file = '/proc/loadavg';

    # Defaults
    @::loadavg{'1', '5', '15'} = (0.0, 0.0, 0.0);
    ($::runq, $::num_tasks) = (0, 0);

    open(my $fh, '<', $file) or croak "Cannot open file: $file ($!)";
    $_ = <$fh>;
    if (my @field = /^(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\/(\d+)\s+\d+/) {
        @::loadavg{'1', '5', '15'} = @field[0 .. 2];
        ($::runq, $::num_tasks)    = @field[3, 4];
    }
    close($fh);
}
# Parse the "procs_blocked" counter from /proc/stat.
#
# Arguments:
#   $_[0] : reference to a scalar; set to 0, then to the value of the
#           "procs_blocked" line when one is present.
#
# Croaks when /proc/stat cannot be opened.
sub get_blocked
{
    (local *::num_blk) = @_;
    my $file = '/proc/stat';

    $::num_blk = 0;
    open(my $fh, '<', $file) or croak "Cannot open file: $file ($!)";
    while (<$fh>) {
        $::num_blk = $1 if /^procs_blocked\s+(\d+)/;
    }
    close($fh);
}
# Parse uptime from /proc/uptime.
#
# Arguments:
#   $_[0] : reference to a scalar; set to 0.0, then to the first
#           whitespace-separated field of the file when the line has at
#           least two fields.
#
# Croaks when /proc/uptime cannot be opened.
sub get_uptime
{
    (local *::uptime) = @_;
    my $file = '/proc/uptime';

    $::uptime = 0.0;
    open(my $fh, '<', $file) or croak "Cannot open file: $file ($!)";
    $_ = <$fh>;
    my ($first_field) = /^(\S+)\s+\S+/;
    $::uptime = $first_field if (defined $first_field);
    close($fh);
}
# Get number of online logical cpus by counting the "processor : N"
# (or "Processor : N") lines of /proc/cpuinfo.
#
# Arguments:
#   $_[0] : reference to a scalar; reset to 0 and then incremented once
#           per matching line.
#
# Croaks when /proc/cpuinfo cannot be opened.
sub get_num_logical_cpus {
    (local *::num_cpus) = @_;
    my $file = "/proc/cpuinfo";

    $::num_cpus = 0;
    open(my $fh, '<', $file) or croak "Cannot open file: $file ($!)";
    while (<$fh>) {
        next unless /^[Pp]rocessor\s+:\s\d+/;
        $::num_cpus += 1;
    }
    close($fh);
}
# Print header.
#
# Prints the one-line summary (tool name/version, timestamp, sample
# interval, load averages, run queue, blocked, D-state and task counts,
# uptime, optional clock skew).  When the print_host flag is true it also
# prints three lines of host/platform/CPU details and clears the flag so
# that they are shown only once.
#
# Arguments (all references, in this order):
#   tr_1        : scalar, epoch seconds (may be fractional) for the timestamp
#   tm_elapsed  : scalar, sample interval in seconds (printed as "dt" in ms);
#                 presumably measured with CLOCK_MONOTONIC -- confirm with caller
#   tr_elapsed  : scalar, same interval in seconds; presumably measured with
#                 CLOCK_REALTIME -- confirm with caller
#   uptime      : scalar, seconds, printed as <d>:<hh>:<mm>:<ss>
#   loadavg     : hash with keys '1', '5', '15'
#   runq, num_blk, num_state_D, num_tasks : scalars printed as integers
#   print_host  : scalar flag; set to 0 after the host details are printed
#
# Reads $::TOOLNAME and $::VERSION.  /etc/platform/platform.conf and
# /etc/build.info are optional (fields default to '-'); croaks only when
# /proc/cpuinfo cannot be opened.
sub schedtop_header {
    (local *::tr_1,
     local *::tm_elapsed,
     local *::tr_elapsed,
     local *::uptime,
     local *::loadavg,
     local *::runq,
     local *::num_blk,
     local *::num_state_D,
     local *::num_tasks,
     local *::print_host,
    ) = @_;

    # process epoch to get current timestamp
    my $mm_in_s = 60;
    my $hh_in_s = 60*60;
    my $dd_in_s = 24*60*60;
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime($::tr_1);
    my $msec = 1000.0*($::tr_1 - int($::tr_1));

    # convert uptime to elapsed <d>:<hh>:<mm>:<ss>
    my $up = int($::uptime);
    my $up_dd = int($up/$dd_in_s);
    $up -= $dd_in_s*$up_dd;
    my $up_hh = int($up/$hh_in_s);
    $up -= $hh_in_s*$up_hh;
    my $up_mm = int($up/$mm_in_s);
    $up -= $mm_in_s*$up_mm;
    my $up_ss = $up;

    # Calculate skew of CLOCK_REALTIME vs CLOCK_MONOTONIC,
    # and display skew if > 5% relative difference.
    # skew_ms is in milliseconds and tm_elapsed in seconds, so a ratio of
    # 50.0 corresponds to 5%.  A zero-length interval has no meaningful
    # relative skew and must not be used as a divisor.
    my $skew_ms = ($::tr_elapsed - $::tm_elapsed)*1000.0;
    my $skew = "";
    if (($::tm_elapsed != 0) && (abs($skew_ms)/$::tm_elapsed > 50.0)) {
        $skew = sprintf " skew:%.3f ms", $skew_ms;
    }

    #schedtop -- 2014/03/03 02:00:21.357 dt:2050.003 ms ldavg:0.07, 0.09, 0.08 runq:1 blk:0 D:0 nproc:440 up:6:13:00:56 skew:0.001 ms
    printf "%s %s -- ".
        "%4d-%02d-%02d %02d:%02d:%02d.%03d ".
        "dt:%.3f ms ".
        "ldavg:%.2f, %.2f, %.2f runq:%d blk:%d D:%d nproc:%d ".
        "up:%d:%02d:%02d:%02d %s\n",
        $::TOOLNAME, $::VERSION,
        1900+$year, 1+$mon, $mday, $hour, $min, $sec, $msec,
        $::tm_elapsed*1000.0,
        $::loadavg{'1'}, $::loadavg{'5'}, $::loadavg{'15'},
        $::runq, $::num_blk, $::num_state_D, $::num_tasks,
        $up_dd, $up_hh, $up_mm, $up_ss,
        $skew;

    return if (!($::print_host));

    # After first print, disable print host information
    $::print_host = 0;

    # Get host specific information (the 4th uname field is not displayed)
    my ($OSTYPE, $NODENAME, $OSRELEASE, undef, $MACHINE) = POSIX::uname();
    my ($NODETYPE, $SUBFUNCTION) = ('-', '-');
    my ($SW_VERSION, $BUILD_ID) = ('-', '-');

    # Get platform nodetype and subfunction (file is optional)
    my $platform_file = "/etc/platform/platform.conf";
    if (open(my $platform_fh, '<', $platform_file)) {
        while (my $line = <$platform_fh>) {
            $line =~ s/[\0\e\f\r\a]//g; chomp($line); # strip control characters if any
            if ($line =~ /^nodetype=(\S+)/) {
                $NODETYPE = $1;
            }
            if ($line =~ /^subfunction=(\S+)/) {
                $SUBFUNCTION = $1;
            }
        }
        close($platform_fh);
    }

    # Get loadbuild info (file is optional)
    my $build_file = "/etc/build.info";
    if (open(my $build_fh, '<', $build_file)) {
        while (my $line = <$build_fh>) {
            $line =~ s/[\0\e\f\r\a]//g; chomp($line); # strip control characters if any
            if ($line =~ /^SW_VERSION=\"([^"]+)\"/) {
                $SW_VERSION = $1;
            }
            if ($line =~ /^BUILD_ID=\"([^"]+)\"/) {
                $BUILD_ID = $1;
            }
        }
        close($build_fh);
    }
    my $BUILDINFO = join(' ', $SW_VERSION, $BUILD_ID);

    # Parse /proc/cpuinfo to get specific processor info
    my ($n_cpu, $model_name, $cpu_MHz) = (0, '-', 0);
    my $cpuinfo_file = "/proc/cpuinfo";
    open(my $cpuinfo_fh, '<', $cpuinfo_file)
        || croak "Cannot open file: $cpuinfo_file ($!)";
    while (my $line = <$cpuinfo_fh>) {
        $line =~ s/[\0\e\f\r\a]//g; chomp($line); # strip control characters if any
        if ($line =~ /^[Pp]rocessor\s+:\s+\d+/) {
            $n_cpu++;
        } elsif ($line =~ /^model name\s+:\s+(.*)$/) {
            # collapse runs of whitespace inside the model string
            ($model_name = $1) =~ s/\s+/ /g;
        } elsif ($line =~ /^cpu MHz\s+:\s+(\S+)/) {
            $cpu_MHz = $1;
        } elsif ($line =~ /^bogomips\s+:\s+(\S+)/) {
            # used as the speed only while no "cpu MHz" value has been seen
            $cpu_MHz = $1 if ($cpu_MHz == 0);
        }
    }
    close($cpuinfo_fh);

    printf " host:%s nodetype:%s subfunction:%s\n",
        $NODENAME, $NODETYPE, $SUBFUNCTION;
    printf " arch:%s processor:%s speed:%.0f #CPUs:%d\n",
        $MACHINE, $model_name, $cpu_MHz, $n_cpu;
    printf " %s %s build:%s\n", $OSTYPE, $OSRELEASE, $BUILDINFO;
}
# Parse and validate command line arguments.
#
# Arguments (all references, written through, in this order):
#   arg_debug, arg_delay, arg_repeat, arg_period, arg_reset_hwm, arg_idle,
#   arg_sort, arg_print              : scalars
#   arg_watch_cmd, arg_watch_cgroup  : arrays (comma-separated values are
#                                      split into individual elements)
#   arg_watch_only, arg_watch_quiet, arg_trig_delay : scalars
#
# Options are taken from @ARGV via Getopt::Long.  Defaults applied after
# parsing: delay 1.0, repeat 1, sort 'cpu', print 'full', trig-delay 0;
# period and repeat are derived from each other using delay.
#
# Exits 0 after printing usage when @ARGV is empty (ListHelp is called for
# --help); exits 1 after printing usage on any invalid or unexpected option.
sub parse_schedtop_args {
    (local *::arg_debug,
     local *::arg_delay,
     local *::arg_repeat,
     local *::arg_period,
     local *::arg_reset_hwm,
     local *::arg_idle,
     local *::arg_sort,
     local *::arg_print,
     local *::arg_watch_cmd,
     local *::arg_watch_cgroup,
     local *::arg_watch_only,
     local *::arg_watch_quiet,
     local *::arg_trig_delay,
    ) = @_;

    # Use the Argument processing module
    use Getopt::Long;

    # Print usage if no arguments
    if (!@::ARGV) {
        Usage();
        exit 0;
    }

    # Process input arguments
    my $arg_help;
    my $fail = 0;
    GetOptions(
        "debug:i",         \$::arg_debug,
        "delay=f",         \$::arg_delay,
        "period=i",        \$::arg_period,
        "repeat=i",        \$::arg_repeat,
        "reset-hwm",       \$::arg_reset_hwm,
        "idle",            \$::arg_idle,
        "sort=s",          \$::arg_sort,
        "print=s",         \$::arg_print,
        "watch-cmd=s@",    \@::arg_watch_cmd,
        "watch-cgroup=s@", \@::arg_watch_cgroup,
        "watch-only",      \$::arg_watch_only,
        "watch-quiet",     \$::arg_watch_quiet,
        "trig-delay=i",    \$::arg_trig_delay,
        "help|h",          \$arg_help
    ) || GetOptionsMessage();

    # Print help documentation if user has selected --help
    ListHelp() if (defined $arg_help);

    # Validate options
    if ((defined $::arg_repeat) && (defined $::arg_period)) {
        $fail = 1;
        warn "$::TOOLNAME: Input error: cannot specify both --repeat and --period options.\n";
    }
    if ((defined $::arg_delay) && ($::arg_delay < 0.01)) {
        $fail = 1;
        # warn() does not interpret printf-style formats; format explicitly.
        warn sprintf("%s: Input error: --delay %f is less than 0.01.\n",
                     $::TOOLNAME, $::arg_delay);
    }
    if ((defined $::arg_sort) && !(($::arg_sort eq 'cpu') || ($::arg_sort eq 'io'))) {
        $fail = 1;
        warn "$::TOOLNAME: Input error: --sort=$::arg_sort invalid; valid options are: cpu, io.\n";
    }
    if ((defined $::arg_print) && !(($::arg_print eq 'brief') || ($::arg_print eq 'full'))) {
        $fail = 1;
        warn "$::TOOLNAME: Input error: --print=$::arg_print invalid; valid options are: brief, full\n";
    }

    # These options are only meaningful together with a watch list.
    my $watching = (@::arg_watch_cmd || @::arg_watch_cgroup) ? 1 : 0;
    for my $dependent (['--watch-only',  $::arg_watch_only],
                       ['--watch-quiet', $::arg_watch_quiet],
                       ['--trig-delay',  $::arg_trig_delay]) {
        my ($option, $value) = @{$dependent};
        next if (!defined($value) || $watching);
        $fail = 1;
        warn "$::TOOLNAME: Input error: $option requires --watch-cmd or --watch-cgroup option.\n";
    }
    if ((defined $::arg_trig_delay) && ($::arg_trig_delay < 1)) {
        $fail = 1;
        # warn() does not interpret printf-style formats; format explicitly.
        warn sprintf("%s: Input error: --trig-delay %d is less than 1.\n",
                     $::TOOLNAME, $::arg_trig_delay);
    }

    # Each occurrence of a watch option may carry a comma-separated list.
    if (@::arg_watch_cmd) {
        @::arg_watch_cmd = map { split(',', $_) } @::arg_watch_cmd;
    }
    if (@::arg_watch_cgroup) {
        @::arg_watch_cgroup = map { split(',', $_) } @::arg_watch_cgroup;
    }

    # Anything left in @ARGV was not recognized as an option.
    if (@::ARGV) {
        $fail = 1;
        warn "$::TOOLNAME: Input error: not expecting these options: '@::ARGV'.\n";
    }

    # Set reasonable defaults
    $::arg_delay  ||= 1.0;
    $::arg_repeat ||= 1;
    if ($::arg_period) {
        $::arg_repeat = $::arg_period / $::arg_delay;
    } else {
        $::arg_period = $::arg_delay * $::arg_repeat;
    }
    $::arg_sort       ||= 'cpu';
    $::arg_print      ||= 'full';
    $::arg_trig_delay ||= 0;

    # Upon missing or invalid options, print usage
    if ($fail == 1) {
        Usage();
        exit 1;
    }
}
# Report an argument-processing failure on STDERR, show the usage
# summary, and terminate with exit status 1.
sub GetOptionsMessage {
    my $complaint = "$::TOOLNAME: Error processing input arguments.\n";
    warn $complaint;
    Usage();
    exit(1);
}
# Print out program usage: the tool name ($::TOOLNAME) followed by the
# supported options, terminated by an empty line.
sub Usage {
    my @usage = (
        "Usage: $::TOOLNAME OPTIONS",
        " [--delay=<seconds>] [--repeat=<num>] [--period=<seconds>]",
        " [--reset-hwm] [--idle] [--sort=<cpu|io>] [--print=<brief|full>]",
        " [--watch-cmd=tid1,cmd1,cmd2,...] [--watch-cgroup=cgroup1,...]",
        " [--watch-only] [--watch-quiet]",
        " [--trig-delay=time]",
        " [--help]",
        "",
    );
    # The text is passed as data, never as the printf format, so a '%' in
    # the interpolated tool name cannot be taken as a conversion.
    printf "%s\n", $_ for @usage;
}
# Print tool help: a one-line description, the usage summary (via Usage),
# and a description of every option; then terminate with exit status 0.
sub ListHelp {
    # The tool name is passed as data, never as part of the printf format.
    printf "%s -- display per-task scheduling occupancy\n", $::TOOLNAME;
    Usage();

    my @help = (
        "Options: miscellaneous",
        " --delay=<seconds> : output interval (seconds): default: 1.0",
        " --repeat=<num> : number of repeat samples: default: 1",
        " --period=<seconds> : overall tool duration (seconds): default: --",
        " --reset-hwm : reset scheduling delay hi-water marks",
        " --idle : specify printing of idle tasks",
        " --sort=<cpu|io> : sort order, select from 'cpu' or 'io'",
        " --print=<brief|full> : select 'brief' or 'full' fields to display",
        "Watch specific tasks or commands:",
        " --watch-cmd=tid1,cmd1,... : watch specific tids or 'comm' names",
        " (matches from beginning of comm with partial name, eg, --watch-cmd=sirq)",
        " --watch-cgroup=cgroup1,... : watch specific cgroup names",
        " (matches from beginning of cgroup with partial name, eg, --watch-cgroup=sm)",
        " --watch-only : display only watched tasks (reduces impact of tool)",
        " --watch-quiet : suppress output after watch starts",
        "Trigger crash dump via sysrq:",
        " --trig-delay=time : trigger delay threshold (ms)",
        " --help : this help",
    );
    printf "%s\n", $_ for @help;
    exit 0;
}
1;