mirror of
https://github.com/systemd/systemd-stable.git
synced 2024-12-22 13:33:56 +03:00
Merge pull request #7915 from poettering/pids-max-tweak
This commit is contained in:
commit
f26f5b60d0
9
TODO
9
TODO
@ -43,6 +43,15 @@ Features:
|
||||
sd_id128_get_machine_app_specific(). After all on long-running systems both
|
||||
IDs have similar properties.
|
||||
|
||||
* emulate properties of the root cgroup on controllers that don't support such
|
||||
properties natively on cpu/io/memory, the way we already do it for
|
||||
"pids". Also, add the same logic to cgtop.
|
||||
|
||||
* set TasksAccounting=1 on the root slice if we are running on the root cgroup,
|
||||
and similar for the others, as soon as we emulate them properly. After all,
|
||||
Linux keeps these system-wide stats anyway, and it costs nothing to expose
|
||||
them.
|
||||
|
||||
* sd-bus: add vtable flag, that may be used to request client creds implicitly
|
||||
and asynchronously before dispatching the operation
|
||||
|
||||
|
@ -150,6 +150,8 @@ basic_sources = files('''
|
||||
proc-cmdline.h
|
||||
process-util.c
|
||||
process-util.h
|
||||
procfs-util.c
|
||||
procfs-util.h
|
||||
random-util.c
|
||||
random-util.h
|
||||
ratelimit.c
|
||||
|
@ -184,3 +184,22 @@ static inline int safe_fork(const char *name, ForkFlags flags, pid_t *ret_pid) {
|
||||
}
|
||||
|
||||
int fork_agent(const char *name, const int except[], unsigned n_except, pid_t *pid, const char *path, ...);
|
||||
|
||||
#if SIZEOF_PID_T == 4
|
||||
/* The highest possible (theoretical) pid_t value on this architecture. */
|
||||
#define PID_T_MAX ((pid_t) INT32_MAX)
|
||||
/* The maximum number of concurrent processes Linux allows on this architecture, as well as the highest valid PID value
|
||||
* the kernel will potentially assign. This reflects a value compiled into the kernel (PID_MAX_LIMIT), and sets the
|
||||
* upper boundary on what may be written to the /proc/sys/kernel/pid_max sysctl (but do note that the sysctl is off by
|
||||
* 1, since PID 0 can never exist and there can hence only be one process less than the limit would suggest). Since
|
||||
* these values are documented in proc(5) we feel quite confident that they are stable enough for the near future at
|
||||
* least to define them here too. */
|
||||
#define TASKS_MAX 4194303U
|
||||
#elif SIZEOF_PID_T == 2
|
||||
#define PID_T_MAX ((pid_t) INT16_MAX)
|
||||
#define TASKS_MAX 32767U
|
||||
#else
|
||||
#error "Unknown pid_t size"
|
||||
#endif
|
||||
|
||||
assert_cc(TASKS_MAX <= (unsigned long) PID_T_MAX)
|
||||
|
138
src/basic/procfs-util.c
Normal file
138
src/basic/procfs-util.c
Normal file
@ -0,0 +1,138 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1+ */
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
#include "alloc-util.h"
|
||||
#include "fileio.h"
|
||||
#include "parse-util.h"
|
||||
#include "process-util.h"
|
||||
#include "procfs-util.h"
|
||||
#include "stdio-util.h"
|
||||
#include "string-util.h"
|
||||
|
||||
int procfs_tasks_get_limit(uint64_t *ret) {
|
||||
_cleanup_free_ char *value = NULL;
|
||||
uint64_t pid_max, threads_max;
|
||||
int r;
|
||||
|
||||
assert(ret);
|
||||
|
||||
/* So there are two sysctl files that control the system limit of processes:
|
||||
*
|
||||
* 1. kernel.threads-max: this is probably the sysctl that makes more sense, as it directly puts a limit on
|
||||
* concurrent tasks.
|
||||
*
|
||||
* 2. kernel.pid_max: this limits the numeric range PIDs can take, and thus indirectly also limits the number
|
||||
* of concurrent threads. AFAICS it's primarily a compatibility concept: some crappy old code used a signed
|
||||
* 16bit type for PIDs, hence the kernel provides a way to ensure the PIDs never go beyond INT16_MAX by
|
||||
* default.
|
||||
*
|
||||
* By default #2 is set to much lower values than #1, hence the limit people come into contact with first, as
|
||||
* it's the lowest boundary they need to bump when they want higher number of processes.
|
||||
*
|
||||
* Also note the weird definition of #2: PIDs assigned will be kept below this value, which means the number of
|
||||
* tasks that can be created is one lower, as PID 0 is not a valid process ID. */
|
||||
|
||||
r = read_one_line_file("/proc/sys/kernel/pid_max", &value);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = safe_atou64(value, &pid_max);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
value = mfree(value);
|
||||
r = read_one_line_file("/proc/sys/kernel/threads-max", &value);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = safe_atou64(value, &threads_max);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* Subtract one from pid_max, since PID 0 is not a valid PID */
|
||||
*ret = MIN(pid_max-1, threads_max);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int procfs_tasks_set_limit(uint64_t limit) {
|
||||
char buffer[DECIMAL_STR_MAX(uint64_t)+1];
|
||||
_cleanup_free_ char *value = NULL;
|
||||
uint64_t pid_max;
|
||||
int r;
|
||||
|
||||
if (limit == 0) /* This makes no sense, we are userspace and hence count as tasks too, and we want to live,
|
||||
* hence the limit conceptually has to be above 0. Also, most likely if anyone asks for a zero
|
||||
* limit he/she probably means "no limit", hence let's better refuse this to avoid
|
||||
* confusion. */
|
||||
return -EINVAL;
|
||||
|
||||
/* The Linux kernel doesn't allow this value to go below 20, hence don't allow this either, higher values than
|
||||
* TASKS_MAX are not accepted by the pid_max sysctl. We'll treat anything this high as "unbounded" and hence
|
||||
* set it to the maximum. */
|
||||
limit = CLAMP(limit, 20U, TASKS_MAX);
|
||||
|
||||
r = read_one_line_file("/proc/sys/kernel/pid_max", &value);
|
||||
if (r < 0)
|
||||
return r;
|
||||
r = safe_atou64(value, &pid_max);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* As pid_max is about the numeric pid_t range we'll bump it if necessary, but only ever increase it, never
|
||||
* decrease it, as threads-max is the much more relevant sysctl. */
|
||||
if (limit > pid_max-1) {
|
||||
sprintf(buffer, "%" PRIu64, limit+1); /* Add one, since PID 0 is not a valid PID */
|
||||
r = write_string_file("/proc/sys/kernel/pid_max", buffer, WRITE_STRING_FILE_DISABLE_BUFFER);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
sprintf(buffer, "%" PRIu64, limit);
|
||||
r = write_string_file("/proc/sys/kernel/threads-max", buffer, WRITE_STRING_FILE_DISABLE_BUFFER);
|
||||
if (r < 0) {
|
||||
uint64_t threads_max;
|
||||
|
||||
/* Hmm, we couldn't write this? If so, maybe it was already set properly? In that case let's not
|
||||
* generate an error */
|
||||
|
||||
value = mfree(value);
|
||||
if (read_one_line_file("/proc/sys/kernel/threads-max", &value) < 0)
|
||||
return r; /* return original error */
|
||||
|
||||
if (safe_atou64(value, &threads_max) < 0)
|
||||
return r; /* return original error */
|
||||
|
||||
if (MIN(pid_max-1, threads_max) != limit)
|
||||
return r; /* return original error */
|
||||
|
||||
/* Yay! Value set already matches what we were trying to set, hence consider this a success. */
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int procfs_tasks_get_current(uint64_t *ret) {
|
||||
_cleanup_free_ char *value = NULL;
|
||||
const char *p, *nr;
|
||||
size_t n;
|
||||
int r;
|
||||
|
||||
assert(ret);
|
||||
|
||||
r = read_one_line_file("/proc/loadavg", &value);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* Look for the second part of the fourth field, which is separated by a slash from the first part. None of the
|
||||
* earlier fields use a slash, hence let's use this to find the right spot. */
|
||||
p = strchr(value, '/');
|
||||
if (!p)
|
||||
return -EINVAL;
|
||||
|
||||
p++;
|
||||
n = strspn(p, DIGITS);
|
||||
nr = strndupa(p, n);
|
||||
|
||||
return safe_atou64(nr, ret);
|
||||
}
|
8
src/basic/procfs-util.h
Normal file
8
src/basic/procfs-util.h
Normal file
@ -0,0 +1,8 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1+ */
|
||||
#pragma once
|
||||
|
||||
#include <inttypes.h>
|
||||
|
||||
int procfs_tasks_get_limit(uint64_t *ret);
|
||||
int procfs_tasks_set_limit(uint64_t limit);
|
||||
int procfs_tasks_get_current(uint64_t *ret);
|
@ -52,6 +52,7 @@
|
||||
#include "parse-util.h"
|
||||
#include "path-util.h"
|
||||
#include "process-util.h"
|
||||
#include "procfs-util.h"
|
||||
#include "set.h"
|
||||
#include "signal-util.h"
|
||||
#include "stat-util.h"
|
||||
@ -473,31 +474,22 @@ uint64_t physical_memory_scale(uint64_t v, uint64_t max) {
|
||||
|
||||
uint64_t system_tasks_max(void) {
|
||||
|
||||
#if SIZEOF_PID_T == 4
|
||||
#define TASKS_MAX ((uint64_t) (INT32_MAX-1))
|
||||
#elif SIZEOF_PID_T == 2
|
||||
#define TASKS_MAX ((uint64_t) (INT16_MAX-1))
|
||||
#else
|
||||
#error "Unknown pid_t size"
|
||||
#endif
|
||||
|
||||
_cleanup_free_ char *value = NULL, *root = NULL;
|
||||
uint64_t a = TASKS_MAX, b = TASKS_MAX;
|
||||
_cleanup_free_ char *root = NULL;
|
||||
|
||||
/* Determine the maximum number of tasks that may run on this system. We check three sources to determine this
|
||||
* limit:
|
||||
*
|
||||
* a) the maximum value for the pid_t type
|
||||
* a) the maximum tasks value the kernel allows on this architecture
|
||||
* b) the cgroups pids_max attribute for the system
|
||||
* c) the kernel's configure maximum PID value
|
||||
* c) the kernel's configured maximum PID value
|
||||
*
|
||||
* And then pick the smallest of the three */
|
||||
|
||||
if (read_one_line_file("/proc/sys/kernel/pid_max", &value) >= 0)
|
||||
(void) safe_atou64(value, &a);
|
||||
(void) procfs_tasks_get_limit(&a);
|
||||
|
||||
if (cg_get_root_path(&root) >= 0) {
|
||||
value = mfree(value);
|
||||
_cleanup_free_ char *value = NULL;
|
||||
|
||||
if (cg_get_attribute("pids", root, "pids.max", &value) >= 0)
|
||||
(void) safe_atou64(value, &b);
|
||||
|
@ -40,7 +40,9 @@
|
||||
#include "parse-util.h"
|
||||
#include "path-util.h"
|
||||
#include "process-util.h"
|
||||
#include "procfs-util.h"
|
||||
#include "stdio-util.h"
|
||||
#include "strv.h"
|
||||
#include "terminal-util.h"
|
||||
#include "unit-name.h"
|
||||
#include "util.h"
|
||||
@ -193,26 +195,33 @@ static int process(
|
||||
g->n_tasks_valid = true;
|
||||
|
||||
} else if (streq(controller, "pids") && arg_count == COUNT_PIDS) {
|
||||
_cleanup_free_ char *p = NULL, *v = NULL;
|
||||
|
||||
r = cg_get_path(controller, path, "pids.current", &p);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (isempty(path) || path_equal(path, "/")) {
|
||||
r = procfs_tasks_get_current(&g->n_tasks);
|
||||
if (r < 0)
|
||||
return r;
|
||||
} else {
|
||||
_cleanup_free_ char *p = NULL, *v = NULL;
|
||||
|
||||
r = read_one_line_file(p, &v);
|
||||
if (r == -ENOENT)
|
||||
return 0;
|
||||
if (r < 0)
|
||||
return r;
|
||||
r = cg_get_path(controller, path, "pids.current", &p);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = safe_atou64(v, &g->n_tasks);
|
||||
if (r < 0)
|
||||
return r;
|
||||
r = read_one_line_file(p, &v);
|
||||
if (r == -ENOENT)
|
||||
return 0;
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = safe_atou64(v, &g->n_tasks);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
if (g->n_tasks > 0)
|
||||
g->n_tasks_valid = true;
|
||||
|
||||
} else if (streq(controller, "cpu") || streq(controller, "cpuacct")) {
|
||||
} else if (STR_IN_SET(controller, "cpu", "cpuacct")) {
|
||||
_cleanup_free_ char *p = NULL, *v = NULL;
|
||||
uint64_t new_usage;
|
||||
nsec_t timestamp;
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include "parse-util.h"
|
||||
#include "path-util.h"
|
||||
#include "process-util.h"
|
||||
#include "procfs-util.h"
|
||||
#include "special.h"
|
||||
#include "stdio-util.h"
|
||||
#include "string-table.h"
|
||||
@ -39,6 +40,18 @@
|
||||
|
||||
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
|
||||
|
||||
/* Tells whether the specified unit manages the root cgroup. */
bool unit_has_root_cgroup(Unit *u) {
        const char *path;

        assert(u);

        /* This is not the same as the unit being named "-.slice": inside of containers the root slice is
         * not identical to the root cgroup. */

        path = u->cgroup_path;
        if (!path)
                return false;

        /* Both the empty string and "/" denote the root cgroup. */
        if (isempty(path))
                return true;

        return path_equal(path, "/");
}
|
||||
|
||||
static void cgroup_compat_warn(void) {
|
||||
static bool cgroup_compat_warned = false;
|
||||
|
||||
@ -708,21 +721,17 @@ static void cgroup_context_apply(
|
||||
|
||||
assert(u);
|
||||
|
||||
c = unit_get_cgroup_context(u);
|
||||
path = u->cgroup_path;
|
||||
|
||||
assert(c);
|
||||
assert(path);
|
||||
|
||||
/* Nothing to do? Exit early! */
|
||||
if (apply_mask == 0 && !apply_bpf)
|
||||
return;
|
||||
|
||||
/* Some cgroup attributes are not supported on the root cgroup,
|
||||
* hence silently ignore */
|
||||
is_root = isempty(path) || path_equal(path, "/");
|
||||
if (is_root)
|
||||
/* Make sure we don't try to display messages with an empty path. */
|
||||
/* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
|
||||
is_root = unit_has_root_cgroup(u);
|
||||
|
||||
assert_se(c = unit_get_cgroup_context(u));
|
||||
assert_se(path = u->cgroup_path);
|
||||
|
||||
if (is_root) /* Make sure we don't try to display messages with an empty path. */
|
||||
path = "/";
|
||||
|
||||
/* We generally ignore errors caused by read-only mounted
|
||||
@ -1019,19 +1028,46 @@ static void cgroup_context_apply(
|
||||
}
|
||||
}
|
||||
|
||||
if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {
|
||||
if (apply_mask & CGROUP_MASK_PIDS) {
|
||||
|
||||
if (c->tasks_max != CGROUP_LIMIT_MAX) {
|
||||
char buf[DECIMAL_STR_MAX(uint64_t) + 2];
|
||||
if (is_root) {
|
||||
/* So, the "pids" controller does not expose anything on the root cgroup, in order not to
|
||||
* replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
|
||||
* the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
|
||||
* non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
|
||||
* exclusive ownership of the sysctls, but we still want to honour things if the user sets
|
||||
* limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
|
||||
* through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
|
||||
* it also counts. But if the user never set a limit through us (i.e. we are the default of
|
||||
* "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
|
||||
* the first time we set a limit. Note that this boolean is flushed out on manager reload,
|
||||
* which is desirable so that there's an official way to release control of the sysctl from
|
||||
* systemd: set the limit to unbounded and reload. */
|
||||
|
||||
sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
|
||||
r = cg_set_attribute("pids", path, "pids.max", buf);
|
||||
} else
|
||||
r = cg_set_attribute("pids", path, "pids.max", "max");
|
||||
if (c->tasks_max != CGROUP_LIMIT_MAX) {
|
||||
u->manager->sysctl_pid_max_changed = true;
|
||||
r = procfs_tasks_set_limit(c->tasks_max);
|
||||
} else if (u->manager->sysctl_pid_max_changed)
|
||||
r = procfs_tasks_set_limit(TASKS_MAX);
|
||||
else
|
||||
r = 0;
|
||||
|
||||
if (r < 0)
|
||||
log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
|
||||
"Failed to set pids.max: %m");
|
||||
if (r < 0)
|
||||
log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
|
||||
"Failed to write to tasks limit sysctls: %m");
|
||||
|
||||
} else {
|
||||
if (c->tasks_max != CGROUP_LIMIT_MAX) {
|
||||
char buf[DECIMAL_STR_MAX(uint64_t) + 2];
|
||||
|
||||
sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
|
||||
r = cg_set_attribute("pids", path, "pids.max", buf);
|
||||
} else
|
||||
r = cg_set_attribute("pids", path, "pids.max", "max");
|
||||
if (r < 0)
|
||||
log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
|
||||
"Failed to set pids.max: %m");
|
||||
}
|
||||
}
|
||||
|
||||
if (apply_bpf)
|
||||
@ -1062,7 +1098,7 @@ CGroupMask cgroup_context_get_mask(CGroupContext *c) {
|
||||
mask |= CGROUP_MASK_DEVICES;
|
||||
|
||||
if (c->tasks_accounting ||
|
||||
c->tasks_max != (uint64_t) -1)
|
||||
c->tasks_max != CGROUP_LIMIT_MAX)
|
||||
mask |= CGROUP_MASK_PIDS;
|
||||
|
||||
return mask;
|
||||
@ -2294,6 +2330,10 @@ int unit_get_tasks_current(Unit *u, uint64_t *ret) {
|
||||
if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
|
||||
return -ENODATA;
|
||||
|
||||
/* The root cgroup doesn't expose this information, let's get it from /proc instead */
|
||||
if (unit_has_root_cgroup(u))
|
||||
return procfs_tasks_get_current(ret);
|
||||
|
||||
r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
|
||||
if (r == -ENOENT)
|
||||
return -ENODATA;
|
||||
|
@ -208,6 +208,8 @@ int unit_reset_ip_accounting(Unit *u);
|
||||
cc ? cc->name : false; \
|
||||
})
|
||||
|
||||
bool unit_has_root_cgroup(Unit *u);
|
||||
|
||||
int manager_notify_cgroup_empty(Manager *m, const char *group);
|
||||
|
||||
void unit_invalidate_cgroup(Unit *u, CGroupMask m);
|
||||
|
@ -269,6 +269,9 @@ struct Manager {
|
||||
/* Have we already printed the taint line if necessary? */
|
||||
bool taint_logged:1;
|
||||
|
||||
/* Have we ever changed the "kernel.pid_max" sysctl? */
|
||||
bool sysctl_pid_max_changed:1;
|
||||
|
||||
unsigned test_run_flags:8;
|
||||
|
||||
/* If non-zero, exit with the following value when the systemd
|
||||
|
@ -411,6 +411,10 @@ tests += [
|
||||
[],
|
||||
[]],
|
||||
|
||||
[['src/test/test-procfs-util.c'],
|
||||
[],
|
||||
[]],
|
||||
|
||||
[['src/test/test-unaligned.c'],
|
||||
[],
|
||||
[]],
|
||||
|
38
src/test/test-procfs-util.c
Normal file
38
src/test/test-procfs-util.c
Normal file
@ -0,0 +1,38 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1+ */
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
#include "log.h"
|
||||
#include "procfs-util.h"
|
||||
|
||||
/* Exercises the procfs task helpers: queries the current task count and the task limit, then checks that
 * the limit can be re-applied, lowered by one (privileges permitting) and restored again. */
int main(int argc, char *argv[]) {
        uint64_t current, limit, readback;
        int r;

        log_parse_environment();
        log_open();

        assert_se(procfs_tasks_get_current(&current) >= 0);
        log_info("Current number of tasks: %" PRIu64, current);

        assert_se(procfs_tasks_get_limit(&limit) >= 0);
        log_info("Limit of tasks: %" PRIu64, limit);
        assert_se(limit > 0);

        /* Setting the limit to what is already in effect must succeed. */
        assert_se(procfs_tasks_set_limit(limit) >= 0);

        /* Only play around with lowering the limit if it is above 100. */
        if (limit <= 100)
                return 0;

        /* Lowering by one either works, or fails with one of the listed permission/read-only errors. */
        r = procfs_tasks_set_limit(limit-1);
        assert_se(IN_SET(r, 0, -EPERM, -EACCES, -EROFS));

        /* The limit read back must reflect which of the two happened. */
        assert_se(procfs_tasks_get_limit(&readback) >= 0);
        assert_se((r == 0 && readback == limit - 1) || (r < 0 && readback == limit));

        /* Put the original limit back and verify it is in effect again. */
        assert_se(procfs_tasks_set_limit(limit) >= 0);

        assert_se(procfs_tasks_get_limit(&readback) >= 0);
        assert_se(limit == readback);

        return 0;
}
|
Loading…
Reference in New Issue
Block a user