6b3e0e2e04
Extend error messages to mention CAP_PERFMON capability as an option to substitute CAP_SYS_ADMIN capability for secure system performance monitoring and observability operations. Make perf_event_paranoid_check() and __cmd_ftrace() to be aware of CAP_PERFMON capability. CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) For backward compatibility reasons access to perf_events subsystem remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure perf_events monitoring is discouraged with respect to CAP_PERFMON capability. Committer testing: Using a libcap with this patch: diff --git a/libcap/include/uapi/linux/capability.h b/libcap/include/uapi/linux/capability.h index 78b2fd4c8a95..89b5b0279b60 100644 --- a/libcap/include/uapi/linux/capability.h +++ b/libcap/include/uapi/linux/capability.h @@ -366,8 +366,9 @@ struct vfs_ns_cap_data { #define CAP_AUDIT_READ 37 +#define CAP_PERFMON 38 -#define CAP_LAST_CAP CAP_AUDIT_READ +#define CAP_LAST_CAP CAP_PERFMON #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) Note that using '38' in place of 'cap_perfmon' works to some degree with an old libcap, its only when cap_get_flag() is called that libcap performs an error check based on the maximum value known for capabilities that it will fail. This makes determining the default of perf_event_attr.exclude_kernel to fail, as it can't determine if CAP_PERFMON is in place. Using 'perf top -e cycles' avoids the default check and sets perf_event_attr.exclude_kernel to 1. 
As root, with a libcap supporting CAP_PERFMON: # groupadd perf_users # adduser perf -g perf_users # mkdir ~perf/bin # cp ~acme/bin/perf ~perf/bin/ # chgrp perf_users ~perf/bin/perf # setcap "cap_perfmon,cap_sys_ptrace,cap_syslog=ep" ~perf/bin/perf # getcap ~perf/bin/perf /home/perf/bin/perf = cap_sys_ptrace,cap_syslog,cap_perfmon+ep # ls -la ~perf/bin/perf -rwxr-xr-x. 1 root perf_users 16968552 Apr 9 13:10 /home/perf/bin/perf As the 'perf' user in the 'perf_users' group: $ perf top -a --stdio Error: Failed to mmap with 1 (Operation not permitted) $ Either add the cap_ipc_lock capability to the perf binary or reduce the ring buffer size to some smaller value: $ perf top -m10 -a --stdio rounding mmap pages size to 64K (16 pages) Error: Failed to mmap with 1 (Operation not permitted) $ perf top -m4 -a --stdio Error: Failed to mmap with 1 (Operation not permitted) $ perf top -m2 -a --stdio PerfTop: 762 irqs/sec kernel:49.7% exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles], (all, 4 CPUs) ------------------------------------------------------------------------------------------------------ 9.83% perf [.] __symbols__insert 8.58% perf [.] rb_next 5.91% [kernel] [k] module_get_kallsym 5.66% [kernel] [k] kallsyms_expand_symbol.constprop.0 3.98% libc-2.29.so [.] __GI_____strtoull_l_internal 3.66% perf [.] rb_insert_color 2.34% [kernel] [k] vsnprintf 2.30% [kernel] [k] string_nocheck 2.16% libc-2.29.so [.] _IO_getdelim 2.15% [kernel] [k] number 2.13% [kernel] [k] format_decode 1.58% libc-2.29.so [.] _IO_feof 1.52% libc-2.29.so [.] __strcmp_avx2 1.50% perf [.] rb_set_parent_color 1.47% libc-2.29.so [.] 
__libc_calloc 1.24% [kernel] [k] do_syscall_64 1.17% [kernel] [k] __x86_indirect_thunk_rax $ perf record -a sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.552 MB perf.data (74 samples) ] $ perf evlist cycles $ perf evlist -v cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 $ perf report | head -20 # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 74 of event 'cycles' # Event count (approx.): 15694834 # # Overhead Command Shared Object Symbol # ........ ............... .......................... ...................................... # 19.62% perf [kernel.vmlinux] [k] strnlen_user 13.88% swapper [kernel.vmlinux] [k] intel_idle 13.83% ksoftirqd/0 [kernel.vmlinux] [k] pfifo_fast_dequeue 13.51% swapper [kernel.vmlinux] [k] kmem_cache_free 6.31% gnome-shell [kernel.vmlinux] [k] kmem_cache_free 5.66% kworker/u8:3+ix [kernel.vmlinux] [k] delay_tsc 4.42% perf [kernel.vmlinux] [k] __set_cpus_allowed_ptr 3.45% kworker/2:1-eve [kernel.vmlinux] [k] shmem_truncate_range 2.29% gnome-shell libgobject-2.0.so.0.6000.7 [.] 
g_closure_ref $ Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com> Reviewed-by: James Morris <jamorris@linux.microsoft.com> Acked-by: Jiri Olsa <jolsa@redhat.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Andi Kleen <ak@linux.intel.com> Cc: Igor Lubashev <ilubashe@akamai.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Serge Hallyn <serge@hallyn.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-man@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Link: http://lore.kernel.org/lkml/a66d5648-2b8e-577e-e1f2-1d56c017ab5e@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
419 lines
7.8 KiB
C
419 lines
7.8 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include "util.h"
|
|
#include "debug.h"
|
|
#include "event.h"
|
|
#include <api/fs/fs.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/utsname.h>
|
|
#include <dirent.h>
|
|
#include <fcntl.h>
|
|
#include <inttypes.h>
|
|
#include <signal.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <errno.h>
|
|
#include <limits.h>
|
|
#include <linux/capability.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/log2.h>
|
|
#include <linux/time64.h>
|
|
#include <unistd.h>
|
|
#include "cap.h"
|
|
#include "strlist.h"
|
|
#include "string2.h"
|
|
|
|
/*
|
|
* XXX We need to find a better place for these things...
|
|
*/
|
|
|
|
/*
 * Process-wide threading flag. It starts out true and is only changed
 * through the two setters below.
 */
bool perf_singlethreaded = true;

/* Record that perf is running single-threaded. */
void perf_set_singlethreaded(void)
{
	perf_singlethreaded = true;
}

/* Record that perf is running multi-threaded. */
void perf_set_multithreaded(void)
{
	perf_singlethreaded = false;
}
|
|
|
|
int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH;
|
|
int sysctl_perf_event_max_contexts_per_stack = PERF_MAX_CONTEXTS_PER_STACK;
|
|
|
|
int sysctl__max_stack(void)
|
|
{
|
|
int value;
|
|
|
|
if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0)
|
|
sysctl_perf_event_max_stack = value;
|
|
|
|
if (sysctl__read_int("kernel/perf_event_max_contexts_per_stack", &value) == 0)
|
|
sysctl_perf_event_max_contexts_per_stack = value;
|
|
|
|
return sysctl_perf_event_max_stack;
|
|
}
|
|
|
|
/*
 * Report whether the kernel/nmi_watchdog sysctl holds a positive value.
 *
 * The first successful read is remembered in function-local statics and
 * reused by later calls. A failed read returns false and caches nothing,
 * so the next call tries again.
 */
bool sysctl__nmi_watchdog_enabled(void)
{
	static bool probed;
	static bool enabled;
	int raw;

	if (!probed) {
		if (sysctl__read_int("kernel/nmi_watchdog", &raw) < 0)
			return false;

		enabled = raw > 0;
		probed = true;
	}

	return enabled;
}
|
|
|
|
bool test_attr__enabled;
|
|
|
|
bool perf_host = true;
|
|
bool perf_guest = false;
|
|
|
|
void event_attr_init(struct perf_event_attr *attr)
|
|
{
|
|
if (!perf_host)
|
|
attr->exclude_host = 1;
|
|
if (!perf_guest)
|
|
attr->exclude_guest = 1;
|
|
/* to capture ABI version */
|
|
attr->size = sizeof(*attr);
|
|
}
|
|
|
|
/*
 * Create 'path' and any missing parent directories, each with 'mode'.
 *
 * Only absolute paths are accepted. The buffer behind 'path' is written
 * to while walking the components (each separator is temporarily replaced
 * by a NUL) and is restored before returning.
 *
 * Returns 0 when the directory exists or was created, -1 otherwise.
 */
int mkdir_p(char *path, mode_t mode)
{
	struct stat sb;
	char *sep;
	int failed;

	if (path[0] != '/')
		return -1;

	/* Nothing to do when the whole path is already present. */
	if (!stat(path, &sb))
		return 0;

	/* Step over any further leading slashes. */
	sep = path + 1;
	while (*sep == '/')
		sep++;

	for (sep = strchr(sep, '/'); sep != NULL; sep = strchr(sep, '/')) {
		/* Cut the string at this separator to address the parent. */
		*sep = '\0';
		failed = stat(path, &sb) && mkdir(path, mode);
		*sep = '/';

		if (failed)
			return -1;

		/* Move past this separator and any duplicates after it. */
		do {
			sep++;
		} while (*sep == '/');
	}

	if (stat(path, &sb) == 0)
		return 0;

	return mkdir(path, mode) ? -1 : 0;
}
|
|
|
|
/*
 * Check 'file' against a NULL-terminated array of glob patterns.
 * A NULL 'pat' means "no filtering" and matches everything.
 */
static bool match_pat(char *file, const char **pat)
{
	const char **cur;

	if (pat == NULL)
		return true;

	for (cur = pat; *cur != NULL; cur++) {
		if (strglobmatch(file, *cur))
			return true;
	}

	return false;
}
|
|
|
|
/*
|
|
* The depth specify how deep the removal will go.
|
|
* 0 - will remove only files under the 'path' directory
|
|
* 1 .. x - will dive in x-level deep under the 'path' directory
|
|
*
|
|
* If specified the pat is array of string patterns ended with NULL,
|
|
* which are checked upon every file/directory found. Only matching
|
|
* ones are removed.
|
|
*
|
|
* The function returns:
|
|
* 0 on success
|
|
* -1 on removal failure with errno set
|
|
* -2 on pattern failure
|
|
*/
|
|
static int rm_rf_depth_pat(const char *path, int depth, const char **pat)
|
|
{
|
|
DIR *dir;
|
|
int ret;
|
|
struct dirent *d;
|
|
char namebuf[PATH_MAX];
|
|
struct stat statbuf;
|
|
|
|
/* Do not fail if there's no file. */
|
|
ret = lstat(path, &statbuf);
|
|
if (ret)
|
|
return 0;
|
|
|
|
/* Try to remove any file we get. */
|
|
if (!(statbuf.st_mode & S_IFDIR))
|
|
return unlink(path);
|
|
|
|
/* We have directory in path. */
|
|
dir = opendir(path);
|
|
if (dir == NULL)
|
|
return -1;
|
|
|
|
while ((d = readdir(dir)) != NULL && !ret) {
|
|
|
|
if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
|
|
continue;
|
|
|
|
if (!match_pat(d->d_name, pat)) {
|
|
ret = -2;
|
|
break;
|
|
}
|
|
|
|
scnprintf(namebuf, sizeof(namebuf), "%s/%s",
|
|
path, d->d_name);
|
|
|
|
/* We have to check symbolic link itself */
|
|
ret = lstat(namebuf, &statbuf);
|
|
if (ret < 0) {
|
|
pr_debug("stat failed: %s\n", namebuf);
|
|
break;
|
|
}
|
|
|
|
if (S_ISDIR(statbuf.st_mode))
|
|
ret = depth ? rm_rf_depth_pat(namebuf, depth - 1, pat) : 0;
|
|
else
|
|
ret = unlink(namebuf);
|
|
}
|
|
closedir(dir);
|
|
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
return rmdir(path);
|
|
}
|
|
|
|
static int rm_rf_kcore_dir(const char *path)
|
|
{
|
|
char kcore_dir_path[PATH_MAX];
|
|
const char *pat[] = {
|
|
"kcore",
|
|
"kallsyms",
|
|
"modules",
|
|
NULL,
|
|
};
|
|
|
|
snprintf(kcore_dir_path, sizeof(kcore_dir_path), "%s/kcore_dir", path);
|
|
|
|
return rm_rf_depth_pat(kcore_dir_path, 0, pat);
|
|
}
|
|
|
|
/*
 * Remove a perf data directory: first its kcore_dir (result ignored),
 * then the directory itself, accepting only "data" and "data.*" entries.
 * Returns the result of the second removal.
 */
int rm_rf_perf_data(const char *path)
{
	const char *data_files[] = { "data", "data.*", NULL };

	rm_rf_kcore_dir(path);

	return rm_rf_depth_pat(path, 0, data_files);
}
|
|
|
|
/* Remove 'path' recursively, with no pattern filter and INT_MAX depth. */
int rm_rf(const char *path)
{
	const char **no_filter = NULL;

	return rm_rf_depth_pat(path, INT_MAX, no_filter);
}
|
|
|
|
/* A filter which removes dot files */
|
|
bool lsdir_no_dot_filter(const char *name __maybe_unused, struct dirent *d)
|
|
{
|
|
return d->d_name[0] != '.';
|
|
}
|
|
|
|
/*
 * lsdir reads the directory 'name' and stores its entry names in a strlist.
 *
 * When 'filter' is non-NULL only entries for which it returns true are
 * added. Returns NULL if the directory cannot be opened, or if the list
 * cannot be created (errno is set to ENOMEM before the directory is closed).
 */
struct strlist *lsdir(const char *name,
		      bool (*filter)(const char *, struct dirent *))
{
	struct strlist *entries;
	struct dirent *ent;
	DIR *handle = opendir(name);

	if (handle == NULL)
		return NULL;

	entries = strlist__new(NULL, NULL);
	if (entries == NULL) {
		errno = ENOMEM;
	} else {
		for (ent = readdir(handle); ent != NULL; ent = readdir(handle)) {
			if (filter != NULL && !filter(name, ent))
				continue;
			strlist__add(entries, ent->d_name);
		}
	}

	closedir(handle);
	return entries;
}
|
|
|
|
/* Number of hexadecimal digits needed to print 'v' (at least 1, even for 0). */
size_t hex_width(u64 v)
{
	size_t digits;

	/* Each value above 0xf needs one more digit after dropping a nibble. */
	for (digits = 1; v > 0xf; v >>= 4)
		digits++;

	return digits;
}
|
|
|
|
/*
 * Read kernel/perf_event_paranoid. If the sysctl cannot be read, INT_MAX
 * is returned instead.
 */
int perf_event_paranoid(void)
{
	int level;

	return sysctl__read_int("kernel/perf_event_paranoid", &level) ?
		INT_MAX : level;
}
|
|
|
|
bool perf_event_paranoid_check(int max_level)
|
|
{
|
|
return perf_cap__capable(CAP_SYS_ADMIN) ||
|
|
perf_cap__capable(CAP_PERFMON) ||
|
|
perf_event_paranoid() <= max_level;
|
|
}
|
|
|
|
/*
 * Parse the kernel version out of /proc/version_signature.
 *
 * The version is taken from the text after the last space of the first
 * line, which must look like "<version>.<patchlevel>.<sublevel>". On
 * success *puint receives (version << 16) + (patchlevel << 8) + sublevel.
 *
 * Returns 0 on success (or when puint is NULL, in which case nothing is
 * read) and -1 on any failure; *puint is only written on success.
 */
static int
fetch_ubuntu_kernel_version(unsigned int *puint)
{
	ssize_t len;
	size_t line_len = 0;
	char *ptr, *line = NULL;
	int version, patchlevel, sublevel, err;
	FILE *vsig;

	if (!puint)
		return 0;

	vsig = fopen("/proc/version_signature", "r");
	if (!vsig) {
		pr_debug("Open /proc/version_signature failed: %s\n",
			 strerror(errno));
		return -1;
	}

	len = getline(&line, &line_len, vsig);
	fclose(vsig);
	err = -1;
	if (len <= 0) {
		pr_debug("Reading from /proc/version_signature failed: %s\n",
			 strerror(errno));
		goto errout;
	}

	ptr = strrchr(line, ' ');
	if (!ptr) {
		pr_debug("Parsing /proc/version_signature failed: %s\n", line);
		goto errout;
	}

	/*
	 * Keep the sscanf() count out of 'err': a count of 0 would otherwise
	 * be returned as-is and read as success while *puint is still unset.
	 */
	if (sscanf(ptr + 1, "%d.%d.%d",
		   &version, &patchlevel, &sublevel) != 3) {
		pr_debug("Unable to get kernel version from /proc/version_signature '%s'\n",
			 line);
		goto errout;
	}

	*puint = (version << 16) + (patchlevel << 8) + sublevel;
	err = 0;
errout:
	/* getline() may have allocated the buffer even when it failed. */
	free(line);
	return err;
}
|
|
|
|
/*
 * Report the running kernel's version.
 *
 * str/str_size: optional buffer that receives the uname release string,
 *               truncated if needed and always NUL-terminated.
 * puint:        optional; receives
 *               (version << 16) + (patchlevel << 8) + sublevel.
 *
 * When /proc/version_signature is readable and parses, its version is used
 * for *puint; otherwise the uname release string is parsed.
 *
 * Returns 0 on success, -1 if uname() fails or the release string does not
 * start with three dot-separated integers.
 */
int
fetch_kernel_version(unsigned int *puint, char *str,
		     size_t str_size)
{
	struct utsname uts;
	int major, minor, patch;
	bool have_int_version = false;

	if (access("/proc/version_signature", R_OK) == 0 &&
	    fetch_ubuntu_kernel_version(puint) == 0)
		have_int_version = true;

	if (uname(&uts) != 0)
		return -1;

	if (str != NULL && str_size != 0) {
		strncpy(str, uts.release, str_size);
		/* strncpy() does not terminate on truncation. */
		str[str_size - 1] = '\0';
	}

	if (puint == NULL || have_int_version)
		return 0;

	if (sscanf(uts.release, "%d.%d.%d", &major, &minor, &patch) != 3) {
		pr_debug("Unable to get kernel version from uname '%s'\n",
			 uts.release);
		return -1;
	}

	*puint = (major << 16) + (minor << 8) + patch;
	return 0;
}
|
|
|
|
const char *perf_tip(const char *dirpath)
|
|
{
|
|
struct strlist *tips;
|
|
struct str_node *node;
|
|
char *tip = NULL;
|
|
struct strlist_config conf = {
|
|
.dirname = dirpath,
|
|
.file_only = true,
|
|
};
|
|
|
|
tips = strlist__new("tips.txt", &conf);
|
|
if (tips == NULL)
|
|
return errno == ENOENT ? NULL :
|
|
"Tip: check path of tips.txt or get more memory! ;-p";
|
|
|
|
if (strlist__nr_entries(tips) == 0)
|
|
goto out;
|
|
|
|
node = strlist__entry(tips, random() % strlist__nr_entries(tips));
|
|
if (asprintf(&tip, "Tip: %s", node->s) < 0)
|
|
tip = (char *)"Tip: get more memory! ;-)";
|
|
|
|
out:
|
|
strlist__delete(tips);
|
|
|
|
return tip;
|
|
}
|
|
|
|
/*
 * Store the target of /proc/self/exe in 'buf' as a NUL-terminated string.
 *
 * buf: destination buffer
 * len: size of 'buf' in bytes, including room for the terminator
 *
 * If the link cannot be read, the string "perf" is stored instead. In both
 * cases the result is truncated to fit and never written past buf[len - 1].
 * With len <= 0 the buffer is left untouched.
 *
 * Returns 'buf'.
 */
char *perf_exe(char *buf, int len)
{
	ssize_t n;

	if (len <= 0)
		return buf;

	/*
	 * readlink() does not NUL-terminate and may fill the whole buffer,
	 * so keep one byte back for the terminator.
	 */
	n = readlink("/proc/self/exe", buf, len - 1);
	if (n > 0) {
		buf[n] = '\0';
		return buf;
	}

	/* Bounded fallback: the caller's buffer may be shorter than "perf". */
	snprintf(buf, len, "%s", "perf");
	return buf;
}
|