1
0
mirror of https://github.com/systemd/systemd.git synced 2024-11-13 23:21:08 +03:00
systemd/src/bootchart/store.c
Gianpaolo Macario c91d0fd2f4 bootchart: Ensure that /proc/schedstat is read entirely
On multi-core systems file /proc/schedstat may be
larger than 4096 bytes and pread() will only read part of it.

Fix issue https://github.com/systemd/systemd/issues/404
2015-06-30 15:09:02 +00:00

553 lines
19 KiB
C

/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
This file is part of systemd.
Copyright (C) 2009-2013 Intel Corporation
Authors:
Auke Kok <auke-jan.h.kok@intel.com>
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <unistd.h>
#include <stdlib.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <dirent.h>
#include <fcntl.h>
#include <time.h>
#include "util.h"
#include "time-util.h"
#include "strxcpyx.h"
#include "store.h"
#include "bootchart.h"
#include "cgroup-util.h"
#include "fileio.h"
/*
 * Static 4k buffer for stdio - primarily used to increase PSS
 * buffering from the default 1k stdio buffer to reduce read()
 * overhead. log_sample() hands this same buffer to setvbuf() for
 * every per-process smaps stream it opens.
 */
static char smaps_buf[4096];
/*
 * smaps layout probe state, shared by all processes:
 *   0 - layout not probed yet (log_sample() probes on first use),
 *   1 - records are a name line followed by a 14-line block,
 *   2 - as above, plus one extra line per record that must be skipped
 *       (selected when byte 392 of the probed block is 'V').
 */
static int skip = 0;
/*
 * Read CLOCK_MONOTONIC and return it as a single floating point value:
 * the whole seconds plus the nanosecond part divided by NSEC_PER_SEC.
 * (Despite the "_ns" suffix the result is therefore expressed in
 * seconds, assuming NSEC_PER_SEC is the number of nanoseconds per
 * second as its name suggests.)
 *
 * The return value of clock_gettime() is not checked.
 */
double gettime_ns(void) {
        struct timespec now;
        double seconds;

        clock_gettime(CLOCK_MONOTONIC, &now);

        seconds = now.tv_sec;
        seconds += now.tv_nsec / (double) NSEC_PER_SEC;

        return seconds;
}
/*
 * Advance to the start of the next line in a NUL-terminated buffer.
 *
 * Returns a pointer to the character following the first '\n' found in
 * buf, or NULL when buf is NULL or contains no further newline.
 */
static char *bufgetline(char *buf) {
        char *newline;

        if (buf == NULL)
                return NULL;

        newline = strchr(buf, '\n');

        return newline ? newline + 1 : NULL;
}
static int pid_cmdline_strscpy(int procfd, char *buffer, size_t buf_len, int pid) {
char filename[PATH_MAX];
_cleanup_close_ int fd = -1;
ssize_t n;
sprintf(filename, "%d/cmdline", pid);
fd = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
if (fd < 0)
return -errno;
n = read(fd, buffer, buf_len-1);
if (n > 0) {
int i;
for (i = 0; i < n; i++)
if (buffer[i] == '\0')
buffer[i] = ' ';
buffer[n] = '\0';
}
return 0;
}
int log_sample(DIR *proc,
int sample,
struct ps_struct *ps_first,
struct list_sample_data **ptr,
int *pscount,
int *cpus) {
static int vmstat = -1;
_cleanup_free_ char *buf_schedstat = NULL;
char buf[4096];
char key[256];
char val[256];
char rt[256];
char wt[256];
char *m;
int r;
int c;
int p;
int mod;
static int e_fd = -1;
ssize_t s;
ssize_t n;
struct dirent *ent;
int fd;
struct list_sample_data *sampledata;
struct ps_sched_struct *ps_prev = NULL;
int procfd;
int taskfd = -1;
sampledata = *ptr;
procfd = dirfd(proc);
if (procfd < 0)
return -errno;
if (vmstat < 0) {
/* block stuff */
vmstat = openat(procfd, "vmstat", O_RDONLY|O_CLOEXEC);
if (vmstat < 0)
return log_error_errno(errno, "Failed to open /proc/vmstat: %m");
}
n = pread(vmstat, buf, sizeof(buf) - 1, 0);
if (n <= 0) {
vmstat = safe_close(vmstat);
if (n < 0)
return -errno;
return -ENODATA;
}
buf[n] = '\0';
m = buf;
while (m) {
if (sscanf(m, "%s %s", key, val) < 2)
goto vmstat_next;
if (streq(key, "pgpgin"))
sampledata->blockstat.bi = atoi(val);
if (streq(key, "pgpgout")) {
sampledata->blockstat.bo = atoi(val);
break;
}
vmstat_next:
m = bufgetline(m);
if (!m)
break;
}
/* Parse "/proc/schedstat" for overall CPU utilization */
r = read_full_file("/proc/schedstat", &buf_schedstat, NULL);
if (r < 0)
return log_error_errno(r, "Unable to read schedstat: %m");
m = buf_schedstat;
while (m) {
if (sscanf(m, "%s %*s %*s %*s %*s %*s %*s %s %s", key, rt, wt) < 3)
goto schedstat_next;
if (strstr(key, "cpu")) {
r = safe_atoi((const char*)(key+3), &c);
if (r < 0 || c > MAXCPUS -1)
/* Oops, we only have room for MAXCPUS data */
break;
sampledata->runtime[c] = atoll(rt);
sampledata->waittime[c] = atoll(wt);
if (c == *cpus)
*cpus = c + 1;
}
schedstat_next:
m = bufgetline(m);
if (!m)
break;
}
if (arg_entropy) {
if (e_fd < 0) {
e_fd = openat(procfd, "sys/kernel/random/entropy_avail", O_RDONLY|O_CLOEXEC);
if (e_fd < 0)
return log_error_errno(errno, "Failed to open /proc/sys/kernel/random/entropy_avail: %m");
}
n = pread(e_fd, buf, sizeof(buf) - 1, 0);
if (n <= 0) {
e_fd = safe_close(e_fd);
} else {
buf[n] = '\0';
sampledata->entropy_avail = atoi(buf);
}
}
while ((ent = readdir(proc)) != NULL) {
char filename[PATH_MAX];
int pid;
struct ps_struct *ps;
if ((ent->d_name[0] < '0') || (ent->d_name[0] > '9'))
continue;
pid = atoi(ent->d_name);
if (pid >= MAXPIDS)
continue;
ps = ps_first;
while (ps->next_ps) {
ps = ps->next_ps;
if (ps->pid == pid)
break;
}
/* end of our LL? then append a new record */
if (ps->pid != pid) {
_cleanup_fclose_ FILE *st = NULL;
char t[32];
struct ps_struct *parent;
ps->next_ps = new0(struct ps_struct, 1);
if (!ps->next_ps)
return log_oom();
ps = ps->next_ps;
ps->pid = pid;
ps->sched = -1;
ps->schedstat = -1;
ps->sample = new0(struct ps_sched_struct, 1);
if (!ps->sample)
return log_oom();
ps->sample->sampledata = sampledata;
(*pscount)++;
/* mark our first sample */
ps->first = ps->last = ps->sample;
ps->sample->runtime = atoll(rt);
ps->sample->waittime = atoll(wt);
/* get name, start time */
if (ps->sched < 0) {
sprintf(filename, "%d/sched", pid);
ps->sched = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
if (ps->sched < 0)
continue;
}
s = pread(ps->sched, buf, sizeof(buf) - 1, 0);
if (s <= 0) {
ps->sched = safe_close(ps->sched);
continue;
}
buf[s] = '\0';
if (!sscanf(buf, "%s %*s %*s", key))
continue;
strscpy(ps->name, sizeof(ps->name), key);
/* cmdline */
if (arg_show_cmdline)
pid_cmdline_strscpy(procfd, ps->name, sizeof(ps->name), pid);
/* discard line 2 */
m = bufgetline(buf);
if (!m)
continue;
m = bufgetline(m);
if (!m)
continue;
if (!sscanf(m, "%*s %*s %s", t))
continue;
r = safe_atod(t, &ps->starttime);
if (r < 0)
continue;
ps->starttime /= 1000.0;
if (arg_show_cgroup)
/* if this fails, that's OK */
cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER,
ps->pid, &ps->cgroup);
/* ppid */
sprintf(filename, "%d/stat", pid);
fd = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
if (fd < 0)
continue;
st = fdopen(fd, "re");
if (!st) {
close(fd);
continue;
}
if (!fscanf(st, "%*s %*s %*s %i", &p))
continue;
ps->ppid = p;
/*
* setup child pointers
*
* these are used to paint the tree coherently later
* each parent has a LL of children, and a LL of siblings
*/
if (pid == 1)
continue; /* nothing to do for init atm */
/* kthreadd has ppid=0, which breaks our tree ordering */
if (ps->ppid == 0)
ps->ppid = 1;
parent = ps_first;
while ((parent->next_ps && parent->pid != ps->ppid))
parent = parent->next_ps;
if (parent->pid != ps->ppid) {
/* orphan */
ps->ppid = 1;
parent = ps_first->next_ps;
}
ps->parent = parent;
if (!parent->children) {
/* it's the first child */
parent->children = ps;
} else {
/* walk all children and append */
struct ps_struct *children;
children = parent->children;
while (children->next)
children = children->next;
children->next = ps;
}
}
/* else -> found pid, append data in ps */
/* below here is all continuous logging parts - we get here on every
* iteration */
/* rt, wt */
if (ps->schedstat < 0) {
sprintf(filename, "%d/schedstat", pid);
ps->schedstat = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
if (ps->schedstat < 0)
continue;
}
s = pread(ps->schedstat, buf, sizeof(buf) - 1, 0);
if (s <= 0) {
/* clean up our file descriptors - assume that the process exited */
close(ps->schedstat);
ps->schedstat = -1;
ps->sched = safe_close(ps->sched);
continue;
}
buf[s] = '\0';
if (!sscanf(buf, "%s %s %*s", rt, wt))
continue;
ps->sample->next = new0(struct ps_sched_struct, 1);
if (!ps->sample->next)
return log_oom();
ps->sample->next->prev = ps->sample;
ps->sample = ps->sample->next;
ps->last = ps->sample;
ps->sample->runtime = atoll(rt);
ps->sample->waittime = atoll(wt);
ps->sample->sampledata = sampledata;
ps->sample->ps_new = ps;
if (ps_prev)
ps_prev->cross = ps->sample;
ps_prev = ps->sample;
ps->total = (ps->last->runtime - ps->first->runtime)
/ 1000000000.0;
/* Take into account CPU runtime/waittime spent in non-main threads of the process
* by parsing "/proc/[pid]/task/[tid]/schedstat" for all [tid] != [pid]
* See https://github.com/systemd/systemd/issues/139
*/
/* Browse directory "/proc/[pid]/task" to know the thread ids of process [pid] */
snprintf(filename, sizeof(filename), PID_FMT "/task", pid);
taskfd = openat(procfd, filename, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
if (taskfd >= 0) {
_cleanup_closedir_ DIR *taskdir = NULL;
taskdir = fdopendir(taskfd);
if (!taskdir) {
safe_close(taskfd);
return -errno;
}
FOREACH_DIRENT(ent, taskdir, break) {
int tid = -1;
_cleanup_close_ int tid_schedstat = -1;
long long delta_rt;
long long delta_wt;
if ((ent->d_name[0] < '0') || (ent->d_name[0] > '9'))
continue;
/* Skip main thread as it was already accounted */
r = safe_atoi(ent->d_name, &tid);
if (r < 0 || tid == pid)
continue;
/* Parse "/proc/[pid]/task/[tid]/schedstat" */
snprintf(filename, sizeof(filename), PID_FMT "/schedstat", tid);
tid_schedstat = openat(taskfd, filename, O_RDONLY|O_CLOEXEC);
if (tid_schedstat == -1)
continue;
s = pread(tid_schedstat, buf, sizeof(buf) - 1, 0);
if (s <= 0)
continue;
buf[s] = '\0';
if (!sscanf(buf, "%s %s %*s", rt, wt))
continue;
r = safe_atolli(rt, &delta_rt);
if (r < 0)
continue;
r = safe_atolli(rt, &delta_wt);
if (r < 0)
continue;
ps->sample->runtime += delta_rt;
ps->sample->waittime += delta_wt;
}
}
if (!arg_pss)
goto catch_rename;
/* Pss */
if (!ps->smaps) {
sprintf(filename, "%d/smaps", pid);
fd = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
if (fd < 0)
continue;
ps->smaps = fdopen(fd, "re");
if (!ps->smaps) {
close(fd);
continue;
}
setvbuf(ps->smaps, smaps_buf, _IOFBF, sizeof(smaps_buf));
} else {
rewind(ps->smaps);
}
/* test to see if we need to skip another field */
if (skip == 0) {
if (fgets(buf, sizeof(buf), ps->smaps) == NULL) {
continue;
}
if (fread(buf, 1, 28 * 15, ps->smaps) != (28 * 15)) {
continue;
}
if (buf[392] == 'V') {
skip = 2;
}
else {
skip = 1;
}
rewind(ps->smaps);
}
while (1) {
int pss_kb;
/* skip one line, this contains the object mapped. */
if (fgets(buf, sizeof(buf), ps->smaps) == NULL) {
break;
}
/* then there's a 28 char 14 line block */
if (fread(buf, 1, 28 * 14, ps->smaps) != 28 * 14) {
break;
}
pss_kb = atoi(&buf[61]);
ps->sample->pss += pss_kb;
/* skip one more line if this is a newer kernel */
if (skip == 2) {
if (fgets(buf, sizeof(buf), ps->smaps) == NULL)
break;
}
}
if (ps->sample->pss > ps->pss_max)
ps->pss_max = ps->sample->pss;
catch_rename:
/* catch process rename, try to randomize time */
mod = (arg_hz < 4.0) ? 4.0 : (arg_hz / 4.0);
if (((sample - ps->pid) + pid) % (int)(mod) == 0) {
/* re-fetch name */
/* get name, start time */
if (ps->sched < 0) {
sprintf(filename, "%d/sched", pid);
ps->sched = openat(procfd, filename, O_RDONLY|O_CLOEXEC);
if (ps->sched < 0)
continue;
}
s = pread(ps->sched, buf, sizeof(buf) - 1, 0);
if (s <= 0) {
/* clean up file descriptors */
ps->sched = safe_close(ps->sched);
ps->schedstat = safe_close(ps->schedstat);
continue;
}
buf[s] = '\0';
if (!sscanf(buf, "%s %*s %*s", key))
continue;
strscpy(ps->name, sizeof(ps->name), key);
/* cmdline */
if (arg_show_cmdline)
pid_cmdline_strscpy(procfd, ps->name, sizeof(ps->name), pid);
}
}
return 0;
}