mirror of
https://github.com/systemd/systemd.git
synced 2024-11-13 23:21:08 +03:00
c91d0fd2f4
On multi-core systems file /proc/schedstat may be larger than 4096 bytes and pread() will only read part of it. Fix issue https://github.com/systemd/systemd/issues/404
553 lines
19 KiB
C
553 lines
19 KiB
C
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
|
|
|
|
/***
|
|
This file is part of systemd.
|
|
|
|
Copyright (C) 2009-2013 Intel Corporation
|
|
|
|
Authors:
|
|
Auke Kok <auke-jan.h.kok@intel.com>
|
|
|
|
systemd is free software; you can redistribute it and/or modify it
|
|
under the terms of the GNU Lesser General Public License as published by
|
|
the Free Software Foundation; either version 2.1 of the License, or
|
|
(at your option) any later version.
|
|
|
|
systemd is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public License
|
|
along with systemd; If not, see <http://www.gnu.org/licenses/>.
|
|
***/
|
|
|
|
#include <unistd.h>
|
|
#include <stdlib.h>
|
|
#include <limits.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <dirent.h>
|
|
#include <fcntl.h>
|
|
#include <time.h>
|
|
|
|
#include "util.h"
|
|
#include "time-util.h"
|
|
#include "strxcpyx.h"
|
|
#include "store.h"
|
|
#include "bootchart.h"
|
|
#include "cgroup-util.h"
|
|
#include "fileio.h"
|
|
|
|
/*
 * One static 4 KiB buffer that is handed to setvbuf() for the per-process
 * smaps streams, so that stdio reads the PSS data in larger chunks and
 * issues fewer read() calls than it would with its default buffering.
 */
static char smaps_buf[4096];

/*
 * Layout of one smaps record, probed once on first use:
 *   0 - not probed yet
 *   1 - a record is the mapping line plus a 14-line block
 *   2 - as above, followed by one extra line that has to be skipped
 */
static int skip = 0;
|
|
|
|
double gettime_ns(void) {
|
|
struct timespec n;
|
|
|
|
clock_gettime(CLOCK_MONOTONIC, &n);
|
|
|
|
return (n.tv_sec + (n.tv_nsec / (double) NSEC_PER_SEC));
|
|
}
|
|
|
|
/*
 * Return a pointer to the first character after the next '\n' in buf,
 * i.e. the start of the following line. Returns NULL when buf is NULL
 * or contains no further newline.
 */
static char *bufgetline(char *buf) {
        char *newline;

        newline = buf ? strchr(buf, '\n') : NULL;

        return newline ? newline + 1 : NULL;
}
|
|
|
|
static int pid_cmdline_strscpy(int procfd, char *buffer, size_t buf_len, int pid) {
|
|
char filename[PATH_MAX];
|
|
_cleanup_close_ int fd = -1;
|
|
ssize_t n;
|
|
|
|
sprintf(filename, "%d/cmdline", pid);
|
|
fd = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
|
|
if (fd < 0)
|
|
return -errno;
|
|
|
|
n = read(fd, buffer, buf_len-1);
|
|
if (n > 0) {
|
|
int i;
|
|
for (i = 0; i < n; i++)
|
|
if (buffer[i] == '\0')
|
|
buffer[i] = ' ';
|
|
buffer[n] = '\0';
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int log_sample(DIR *proc,
|
|
int sample,
|
|
struct ps_struct *ps_first,
|
|
struct list_sample_data **ptr,
|
|
int *pscount,
|
|
int *cpus) {
|
|
|
|
static int vmstat = -1;
|
|
_cleanup_free_ char *buf_schedstat = NULL;
|
|
char buf[4096];
|
|
char key[256];
|
|
char val[256];
|
|
char rt[256];
|
|
char wt[256];
|
|
char *m;
|
|
int r;
|
|
int c;
|
|
int p;
|
|
int mod;
|
|
static int e_fd = -1;
|
|
ssize_t s;
|
|
ssize_t n;
|
|
struct dirent *ent;
|
|
int fd;
|
|
struct list_sample_data *sampledata;
|
|
struct ps_sched_struct *ps_prev = NULL;
|
|
int procfd;
|
|
int taskfd = -1;
|
|
|
|
sampledata = *ptr;
|
|
|
|
procfd = dirfd(proc);
|
|
if (procfd < 0)
|
|
return -errno;
|
|
|
|
if (vmstat < 0) {
|
|
/* block stuff */
|
|
vmstat = openat(procfd, "vmstat", O_RDONLY|O_CLOEXEC);
|
|
if (vmstat < 0)
|
|
return log_error_errno(errno, "Failed to open /proc/vmstat: %m");
|
|
}
|
|
|
|
n = pread(vmstat, buf, sizeof(buf) - 1, 0);
|
|
if (n <= 0) {
|
|
vmstat = safe_close(vmstat);
|
|
if (n < 0)
|
|
return -errno;
|
|
return -ENODATA;
|
|
}
|
|
|
|
buf[n] = '\0';
|
|
|
|
m = buf;
|
|
while (m) {
|
|
if (sscanf(m, "%s %s", key, val) < 2)
|
|
goto vmstat_next;
|
|
if (streq(key, "pgpgin"))
|
|
sampledata->blockstat.bi = atoi(val);
|
|
if (streq(key, "pgpgout")) {
|
|
sampledata->blockstat.bo = atoi(val);
|
|
break;
|
|
}
|
|
vmstat_next:
|
|
m = bufgetline(m);
|
|
if (!m)
|
|
break;
|
|
}
|
|
|
|
/* Parse "/proc/schedstat" for overall CPU utilization */
|
|
r = read_full_file("/proc/schedstat", &buf_schedstat, NULL);
|
|
if (r < 0)
|
|
return log_error_errno(r, "Unable to read schedstat: %m");
|
|
|
|
m = buf_schedstat;
|
|
while (m) {
|
|
if (sscanf(m, "%s %*s %*s %*s %*s %*s %*s %s %s", key, rt, wt) < 3)
|
|
goto schedstat_next;
|
|
|
|
if (strstr(key, "cpu")) {
|
|
r = safe_atoi((const char*)(key+3), &c);
|
|
if (r < 0 || c > MAXCPUS -1)
|
|
/* Oops, we only have room for MAXCPUS data */
|
|
break;
|
|
sampledata->runtime[c] = atoll(rt);
|
|
sampledata->waittime[c] = atoll(wt);
|
|
|
|
if (c == *cpus)
|
|
*cpus = c + 1;
|
|
}
|
|
schedstat_next:
|
|
m = bufgetline(m);
|
|
if (!m)
|
|
break;
|
|
}
|
|
|
|
if (arg_entropy) {
|
|
if (e_fd < 0) {
|
|
e_fd = openat(procfd, "sys/kernel/random/entropy_avail", O_RDONLY|O_CLOEXEC);
|
|
if (e_fd < 0)
|
|
return log_error_errno(errno, "Failed to open /proc/sys/kernel/random/entropy_avail: %m");
|
|
}
|
|
|
|
n = pread(e_fd, buf, sizeof(buf) - 1, 0);
|
|
if (n <= 0) {
|
|
e_fd = safe_close(e_fd);
|
|
} else {
|
|
buf[n] = '\0';
|
|
sampledata->entropy_avail = atoi(buf);
|
|
}
|
|
}
|
|
|
|
while ((ent = readdir(proc)) != NULL) {
|
|
char filename[PATH_MAX];
|
|
int pid;
|
|
struct ps_struct *ps;
|
|
|
|
if ((ent->d_name[0] < '0') || (ent->d_name[0] > '9'))
|
|
continue;
|
|
|
|
pid = atoi(ent->d_name);
|
|
|
|
if (pid >= MAXPIDS)
|
|
continue;
|
|
|
|
ps = ps_first;
|
|
while (ps->next_ps) {
|
|
ps = ps->next_ps;
|
|
if (ps->pid == pid)
|
|
break;
|
|
}
|
|
|
|
/* end of our LL? then append a new record */
|
|
if (ps->pid != pid) {
|
|
_cleanup_fclose_ FILE *st = NULL;
|
|
char t[32];
|
|
struct ps_struct *parent;
|
|
|
|
ps->next_ps = new0(struct ps_struct, 1);
|
|
if (!ps->next_ps)
|
|
return log_oom();
|
|
|
|
ps = ps->next_ps;
|
|
ps->pid = pid;
|
|
ps->sched = -1;
|
|
ps->schedstat = -1;
|
|
|
|
ps->sample = new0(struct ps_sched_struct, 1);
|
|
if (!ps->sample)
|
|
return log_oom();
|
|
|
|
ps->sample->sampledata = sampledata;
|
|
|
|
(*pscount)++;
|
|
|
|
/* mark our first sample */
|
|
ps->first = ps->last = ps->sample;
|
|
ps->sample->runtime = atoll(rt);
|
|
ps->sample->waittime = atoll(wt);
|
|
|
|
/* get name, start time */
|
|
if (ps->sched < 0) {
|
|
sprintf(filename, "%d/sched", pid);
|
|
ps->sched = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
|
|
if (ps->sched < 0)
|
|
continue;
|
|
}
|
|
|
|
s = pread(ps->sched, buf, sizeof(buf) - 1, 0);
|
|
if (s <= 0) {
|
|
ps->sched = safe_close(ps->sched);
|
|
continue;
|
|
}
|
|
buf[s] = '\0';
|
|
|
|
if (!sscanf(buf, "%s %*s %*s", key))
|
|
continue;
|
|
|
|
strscpy(ps->name, sizeof(ps->name), key);
|
|
|
|
/* cmdline */
|
|
if (arg_show_cmdline)
|
|
pid_cmdline_strscpy(procfd, ps->name, sizeof(ps->name), pid);
|
|
|
|
/* discard line 2 */
|
|
m = bufgetline(buf);
|
|
if (!m)
|
|
continue;
|
|
|
|
m = bufgetline(m);
|
|
if (!m)
|
|
continue;
|
|
|
|
if (!sscanf(m, "%*s %*s %s", t))
|
|
continue;
|
|
|
|
r = safe_atod(t, &ps->starttime);
|
|
if (r < 0)
|
|
continue;
|
|
|
|
ps->starttime /= 1000.0;
|
|
|
|
if (arg_show_cgroup)
|
|
/* if this fails, that's OK */
|
|
cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER,
|
|
ps->pid, &ps->cgroup);
|
|
|
|
/* ppid */
|
|
sprintf(filename, "%d/stat", pid);
|
|
fd = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
|
|
if (fd < 0)
|
|
continue;
|
|
|
|
st = fdopen(fd, "re");
|
|
if (!st) {
|
|
close(fd);
|
|
continue;
|
|
}
|
|
|
|
if (!fscanf(st, "%*s %*s %*s %i", &p))
|
|
continue;
|
|
|
|
ps->ppid = p;
|
|
|
|
/*
|
|
* setup child pointers
|
|
*
|
|
* these are used to paint the tree coherently later
|
|
* each parent has a LL of children, and a LL of siblings
|
|
*/
|
|
if (pid == 1)
|
|
continue; /* nothing to do for init atm */
|
|
|
|
/* kthreadd has ppid=0, which breaks our tree ordering */
|
|
if (ps->ppid == 0)
|
|
ps->ppid = 1;
|
|
|
|
parent = ps_first;
|
|
while ((parent->next_ps && parent->pid != ps->ppid))
|
|
parent = parent->next_ps;
|
|
|
|
if (parent->pid != ps->ppid) {
|
|
/* orphan */
|
|
ps->ppid = 1;
|
|
parent = ps_first->next_ps;
|
|
}
|
|
|
|
ps->parent = parent;
|
|
|
|
if (!parent->children) {
|
|
/* it's the first child */
|
|
parent->children = ps;
|
|
} else {
|
|
/* walk all children and append */
|
|
struct ps_struct *children;
|
|
children = parent->children;
|
|
while (children->next)
|
|
children = children->next;
|
|
|
|
children->next = ps;
|
|
}
|
|
}
|
|
|
|
/* else -> found pid, append data in ps */
|
|
|
|
/* below here is all continuous logging parts - we get here on every
|
|
* iteration */
|
|
|
|
/* rt, wt */
|
|
if (ps->schedstat < 0) {
|
|
sprintf(filename, "%d/schedstat", pid);
|
|
ps->schedstat = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
|
|
if (ps->schedstat < 0)
|
|
continue;
|
|
}
|
|
|
|
s = pread(ps->schedstat, buf, sizeof(buf) - 1, 0);
|
|
if (s <= 0) {
|
|
/* clean up our file descriptors - assume that the process exited */
|
|
close(ps->schedstat);
|
|
ps->schedstat = -1;
|
|
ps->sched = safe_close(ps->sched);
|
|
continue;
|
|
}
|
|
|
|
buf[s] = '\0';
|
|
|
|
if (!sscanf(buf, "%s %s %*s", rt, wt))
|
|
continue;
|
|
|
|
ps->sample->next = new0(struct ps_sched_struct, 1);
|
|
if (!ps->sample->next)
|
|
return log_oom();
|
|
|
|
ps->sample->next->prev = ps->sample;
|
|
ps->sample = ps->sample->next;
|
|
ps->last = ps->sample;
|
|
ps->sample->runtime = atoll(rt);
|
|
ps->sample->waittime = atoll(wt);
|
|
ps->sample->sampledata = sampledata;
|
|
ps->sample->ps_new = ps;
|
|
if (ps_prev)
|
|
ps_prev->cross = ps->sample;
|
|
|
|
ps_prev = ps->sample;
|
|
ps->total = (ps->last->runtime - ps->first->runtime)
|
|
/ 1000000000.0;
|
|
|
|
/* Take into account CPU runtime/waittime spent in non-main threads of the process
|
|
* by parsing "/proc/[pid]/task/[tid]/schedstat" for all [tid] != [pid]
|
|
* See https://github.com/systemd/systemd/issues/139
|
|
*/
|
|
|
|
/* Browse directory "/proc/[pid]/task" to know the thread ids of process [pid] */
|
|
snprintf(filename, sizeof(filename), PID_FMT "/task", pid);
|
|
taskfd = openat(procfd, filename, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
|
|
if (taskfd >= 0) {
|
|
_cleanup_closedir_ DIR *taskdir = NULL;
|
|
|
|
taskdir = fdopendir(taskfd);
|
|
if (!taskdir) {
|
|
safe_close(taskfd);
|
|
return -errno;
|
|
}
|
|
FOREACH_DIRENT(ent, taskdir, break) {
|
|
int tid = -1;
|
|
_cleanup_close_ int tid_schedstat = -1;
|
|
long long delta_rt;
|
|
long long delta_wt;
|
|
|
|
if ((ent->d_name[0] < '0') || (ent->d_name[0] > '9'))
|
|
continue;
|
|
|
|
/* Skip main thread as it was already accounted */
|
|
r = safe_atoi(ent->d_name, &tid);
|
|
if (r < 0 || tid == pid)
|
|
continue;
|
|
|
|
/* Parse "/proc/[pid]/task/[tid]/schedstat" */
|
|
snprintf(filename, sizeof(filename), PID_FMT "/schedstat", tid);
|
|
tid_schedstat = openat(taskfd, filename, O_RDONLY|O_CLOEXEC);
|
|
|
|
if (tid_schedstat == -1)
|
|
continue;
|
|
|
|
s = pread(tid_schedstat, buf, sizeof(buf) - 1, 0);
|
|
if (s <= 0)
|
|
continue;
|
|
buf[s] = '\0';
|
|
|
|
if (!sscanf(buf, "%s %s %*s", rt, wt))
|
|
continue;
|
|
|
|
r = safe_atolli(rt, &delta_rt);
|
|
if (r < 0)
|
|
continue;
|
|
r = safe_atolli(rt, &delta_wt);
|
|
if (r < 0)
|
|
continue;
|
|
ps->sample->runtime += delta_rt;
|
|
ps->sample->waittime += delta_wt;
|
|
}
|
|
}
|
|
|
|
if (!arg_pss)
|
|
goto catch_rename;
|
|
|
|
/* Pss */
|
|
if (!ps->smaps) {
|
|
sprintf(filename, "%d/smaps", pid);
|
|
fd = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
|
|
if (fd < 0)
|
|
continue;
|
|
ps->smaps = fdopen(fd, "re");
|
|
if (!ps->smaps) {
|
|
close(fd);
|
|
continue;
|
|
}
|
|
setvbuf(ps->smaps, smaps_buf, _IOFBF, sizeof(smaps_buf));
|
|
} else {
|
|
rewind(ps->smaps);
|
|
}
|
|
|
|
/* test to see if we need to skip another field */
|
|
if (skip == 0) {
|
|
if (fgets(buf, sizeof(buf), ps->smaps) == NULL) {
|
|
continue;
|
|
}
|
|
if (fread(buf, 1, 28 * 15, ps->smaps) != (28 * 15)) {
|
|
continue;
|
|
}
|
|
if (buf[392] == 'V') {
|
|
skip = 2;
|
|
}
|
|
else {
|
|
skip = 1;
|
|
}
|
|
rewind(ps->smaps);
|
|
}
|
|
|
|
while (1) {
|
|
int pss_kb;
|
|
|
|
/* skip one line, this contains the object mapped. */
|
|
if (fgets(buf, sizeof(buf), ps->smaps) == NULL) {
|
|
break;
|
|
}
|
|
/* then there's a 28 char 14 line block */
|
|
if (fread(buf, 1, 28 * 14, ps->smaps) != 28 * 14) {
|
|
break;
|
|
}
|
|
pss_kb = atoi(&buf[61]);
|
|
ps->sample->pss += pss_kb;
|
|
|
|
/* skip one more line if this is a newer kernel */
|
|
if (skip == 2) {
|
|
if (fgets(buf, sizeof(buf), ps->smaps) == NULL)
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (ps->sample->pss > ps->pss_max)
|
|
ps->pss_max = ps->sample->pss;
|
|
|
|
catch_rename:
|
|
/* catch process rename, try to randomize time */
|
|
mod = (arg_hz < 4.0) ? 4.0 : (arg_hz / 4.0);
|
|
if (((sample - ps->pid) + pid) % (int)(mod) == 0) {
|
|
|
|
/* re-fetch name */
|
|
/* get name, start time */
|
|
if (ps->sched < 0) {
|
|
sprintf(filename, "%d/sched", pid);
|
|
ps->sched = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
|
|
if (ps->sched < 0)
|
|
continue;
|
|
}
|
|
|
|
s = pread(ps->sched, buf, sizeof(buf) - 1, 0);
|
|
if (s <= 0) {
|
|
/* clean up file descriptors */
|
|
ps->sched = safe_close(ps->sched);
|
|
ps->schedstat = safe_close(ps->schedstat);
|
|
continue;
|
|
}
|
|
|
|
buf[s] = '\0';
|
|
|
|
if (!sscanf(buf, "%s %*s %*s", key))
|
|
continue;
|
|
|
|
strscpy(ps->name, sizeof(ps->name), key);
|
|
|
|
/* cmdline */
|
|
if (arg_show_cmdline)
|
|
pid_cmdline_strscpy(procfd, ps->name, sizeof(ps->name), pid);
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|