1
0
mirror of git://sourceware.org/git/lvm2.git synced 2025-01-06 17:18:29 +03:00
lvm2/lib/mm/memlock.c
Zdenek Kabelac 964012fdb9 memlock: use value of 0 to disable memory locking
In cases where the user is sure that neither the 'rootfs' nor 'swap' is on LVs
managed by the command, it is possible to completely bypass pinning of the
process to RAM, which may slightly speed up command execution
(at the risk that the process can be delayed by swapping).
Basically, use this only at your own risk...

TODO: add some dmeventd support for this.
2024-09-27 13:42:45 +02:00

732 lines
20 KiB
C

/*
* Copyright (C) 2003-2004 Sistina Software, Inc. All rights reserved.
* Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
*
* This file is part of LVM2.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU Lesser General Public License v.2.1.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "lib/misc/lib.h"
#include "lib/mm/memlock.h"
#include "lib/config/defaults.h"
#include "lib/config/config.h"
#include "lib/commands/toolcontext.h"
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <malloc.h>
#ifndef DEVMAPPER_SUPPORT
/* Stub used without device-mapper support: there is nothing to lock. */
void memlock_inc_daemon(struct cmd_context *cmd)
{
	(void) cmd;
}
/* Stub used without device-mapper support: there is nothing to unlock. */
void memlock_dec_daemon(struct cmd_context *cmd)
{
	(void) cmd;
}
/* Stub used without device-mapper support: entering a section is a no-op. */
void critical_section_inc(struct cmd_context *cmd, const char *reason)
{
	(void) cmd;
	(void) reason;
}
/* Stub used without device-mapper support: leaving a section is a no-op. */
void critical_section_dec(struct cmd_context *cmd, const char *reason)
{
	(void) cmd;
	(void) reason;
}
/* Stub used without device-mapper support: never inside a critical section. */
int critical_section(void)
{
	return 0;
}

/*
 * Stub used without device-mapper support: never inside a prioritized
 * section. Provided so this branch defines the same set of public functions
 * as the DEVMAPPER_SUPPORT branch below.
 */
int prioritized_section(void)
{
	return 0;
}
/* Stub used without device-mapper support: no settings to read. */
void memlock_init(struct cmd_context *cmd)
{
	(void) cmd;
}
/* Stub used without device-mapper support: memory is never locked. */
void memlock_unlock(struct cmd_context *cmd)
{
	(void) cmd;
}
/* Stub used without device-mapper support: there is no state to reset. */
void memlock_reset(void)
{
}
/* Stub used without device-mapper support: the daemon count is always 0. */
int memlock_count_daemon(void)
{
	const int daemon_count = 0;

	return daemon_count;
}
#else /* DEVMAPPER_SUPPORT */
/* Stack size in bytes to preallocate and touch; set by memlock_init(). */
static size_t _size_stack;
/*
 * Heap reserve in bytes; set by memlock_init() and possibly halved by
 * _allocate_memory() while it retries smaller allocations.
 */
static size_t _size_malloc_tmp;
/* Size in bytes of the block stored in _malloc_mem. */
static const size_t _size_malloc = 2000000;
/* Block allocated by _allocate_memory() and freed by _release_memory(). */
static void *_malloc_mem = NULL;
/* Set by _lock_mem_if_needed(), cleared by _unlock_mem_if_possible(). */
static int _mem_locked = 0;
/* Set once _raise_priority() has run; cleared when priority is restored. */
static int _priority_raised = 0;
/* Non-zero while inside a critical ("suspending") section. */
static int _critical_section = 0;
/* Counter incremented by critical_section_inc(), decremented by _dec(). */
static int _prioritized_section = 0;
/* Counter maintained by memlock_inc_daemon() / memlock_dec_daemon(). */
static int _memlock_count_daemon = 0;
/* Priority obtained from getpriority() before it was raised. */
static int _priority;
/* Priority to switch to in _raise_priority(); set by memlock_init(). */
static int _default_priority;
/*
 * Maps that are unconditionally ignored: _maps_line() skips every maps line
 * containing one of these substrings, whatever filter is configured.
 */
static const char _ignore_maps[][16] = {
"[vdso]",
"[vsyscall]",
"[vectors]",
};
/*
 * Default blacklist for maps: substrings of map names that _maps_line()
 * skips when no mlock filter (activation_mlock_filter_CFG) is configured.
 */
static const char * const _blacklist_maps[] = {
"locale/locale-archive",
"/LC_MESSAGES/",
"gconv/gconv-modules.cache",
"/ld-2.", /* not using dlopen,dlsym during mlock */
"/libattr.so.", /* not using during mlock (udev) */
"/libblkid.so.", /* not using blkid during mlock (udev) */
"/libbz2.so.", /* not using during mlock (udev) */
"/libcap.so.", /* not using during mlock (systemd) */
"/libdl-", /* not using dlopen,dlsym during mlock */
"/libdw-", /* not using during mlock (udev) */
"/libedit.so.", /* not using editline during mlock */
"/libelf-", /* not using during mlock (udev) */
"/libgcrypt.so.", /* not using during mlock (systemd) */
"/libgpg-error.so.", /* not using gpg-error during mlock (systemd) */
"/liblz4.so.", /* not using lz4 during mlock (systemd) */
"/liblzma.so.", /* not using lzma during mlock (systemd) */
"/libmount.so.", /* not using mount during mlock (udev) */
"/libncurses.so.", /* not using ncurses during mlock */
"/libpcre.so.", /* not using pcre during mlock (selinux) */
"/libpcre2-", /* not using pcre during mlock (selinux) */
"/libreadline.so.", /* not using readline during mlock */
"/libresolv-", /* not using during mlock (udev) */
"/libselinux.so.", /* not using selinux during mlock */
"/libsepol.so.", /* not using sepol during mlock */
"/libsystemd.so.", /* not using systemd during mlock */
"/libtinfo.so.", /* not using tinfo during mlock */
"/libudev.so.", /* not using udev during mlock */
"/libuuid.so.", /* not using uuid during mlock (blkid) */
"/libz.so.", /* not using during mlock (udev) */
"/libzstd.so.", /* not using zstd during mlock (systemd) */
"/etc/selinux", /* not using selinux during mlock */
/* "/libdevmapper-event.so" */
};
/* Operation applied to the memory maps: lock or unlock. */
typedef enum { LVM_MLOCK, LVM_MUNLOCK } lvmlock_t;
/* Non-zero when mlockall()/munlockall() is used instead of per-map locking. */
static unsigned _use_mlockall;
/* Descriptor of the maps file opened in _lock_mem(), closed in _unlock_mem(). */
static int _maps_fd;
static size_t _maps_len = 8192; /* Initial buffer size for reading /proc/self/maps */
/* Buffer holding the content of the maps file; grown by doubling _maps_len. */
static char *_maps_buffer;
/* Path of the maps file: cmd->proc_dir followed by SELF_MAPS. */
static char _procselfmaps[PATH_MAX] = "";
#define SELF_MAPS "/self/maps"
static size_t _mstats; /* statistic for maps locking */
/*
 * Touch a memory area by writing a long at every page-sized step.
 *
 * mem  - start of the area (NULL is tolerated and ignored)
 * size - length of the area in bytes
 *
 * Areas smaller than a long are left untouched; this also avoids computing
 * an end pointer that lies before 'mem'.
 */
static void _touch_memory(void *mem, size_t size)
{
	size_t pagesize = lvm_getpagesize();
	char *pos = mem;
	char *end;

	if (!mem || size < sizeof(long))
		return;

	/* Last position at which a whole long still fits into the area. */
	end = pos + size - sizeof(long);

	while (pos < end) {
		*(long *) pos = 1;
		pos += pagesize;
	}
}
/*
 * Preallocate and touch stack and heap memory (glibc builds only).
 *
 * The stack part uses alloca() limited to half of the RLIMIT_STACK soft limit.
 * The heap part allocates up to 32 areas of _size_malloc_tmp bytes, touches
 * them, allocates the _malloc_mem block and finally frees the areas again.
 * On non-glibc builds the function body is empty.
 */
static void _allocate_memory(void)
{
#if defined(__GLIBC__)
/* Memory allocation is currently only tested with glibc
 * for different C libraries, some other mechanisms might be needed
 * meanwhile let users use lvm2 code without memory preallocation.
 * Compilation for VALGRIND tracing also goes without preallocation.
 */
void *stack_mem;
struct rlimit limit;
/* NOTE(review): 'missing' is an int initialised from a size_t value. */
int i, area = 0, missing = _size_malloc_tmp, max_areas = 32;
size_t hblks;
char *areas[max_areas];
/* Check if we could preallocate requested stack */
if (getrlimit(RLIMIT_STACK, &limit) == 0) {
limit.rlim_cur /= 2;
/* The clamped value is stored back into the file-scope _size_stack. */
if (_size_stack > limit.rlim_cur)
_size_stack = limit.rlim_cur;
if ((stack_mem = alloca(_size_stack)))
_touch_memory(stack_mem, _size_stack);
}
/* FIXME else warn user setting got ignored */
#ifdef HAVE_MALLINFO2
/* Prefer mallinfo2 call when available with newer glibc */
#define MALLINFO mallinfo2
#else
#define MALLINFO mallinfo
#endif
/*
 * When a brk() fails due to fragmented address space (which sometimes
 * happens when we try to grab 8M or so), glibc will make a new
 * arena. In this arena, the rules for using "direct" mmap are relaxed,
 * circumventing the MAX_MMAPs and MMAP_THRESHOLD settings. We can,
 * however, detect when this happens with mallinfo() and try to co-opt
 * malloc into using MMAP as a MORECORE substitute instead of returning
 * MMAP'd memory directly. Since MMAP-as-MORECORE does not munmap the
 * memory on free(), this is good enough for our purposes.
 */
while (missing > 0) {
/* Compare the hblks counter before and after malloc() to detect mmap use. */
struct MALLINFO inf = MALLINFO();
hblks = inf.hblks;
if ((areas[area] = malloc(_size_malloc_tmp)))
_touch_memory(areas[area], _size_malloc_tmp);
inf = MALLINFO();
if (hblks < inf.hblks) {
/* malloc cheated and used mmap, even though we told it
 * not to; we try with twice as many areas, each half
 * the size, to circumvent the faulty logic in glibc */
free(areas[area]);
/* The halved size persists in the file-scope variable. */
_size_malloc_tmp /= 2;
} else {
++ area;
missing -= _size_malloc_tmp;
}
if (area == max_areas && missing > 0) {
/* Too bad. Warn the user and proceed, as things are
 * most likely going to work out anyway. */
log_warn("WARNING: Failed to reserve memory, %d bytes missing.", missing);
break;
}
}
/* This block stays allocated until _release_memory() frees it. */
if ((_malloc_mem = malloc(_size_malloc)))
_touch_memory(_malloc_mem, _size_malloc);
/* free up the reserves so subsequent malloc's can use that memory */
for (i = 0; i < area; ++i)
free(areas[i]);
#endif
}
/*
 * Free the block kept in _malloc_mem and forget the pointer, so that a
 * repeated call (or a call when nothing was allocated) is harmless.
 */
static void _release_memory(void)
{
	free(_malloc_mem);
	_malloc_mem = NULL;
}
/*
 * mlock/munlock one memory area described by a line of /proc/self/maps
 * (format described in kernel/Documentation/filesystem/proc.txt).
 *
 * cn     - configured filter list, or NULL to use the built-in blacklist
 * lock   - LVM_MLOCK or LVM_MUNLOCK
 * line   - single maps line without the trailing newline
 * mstats - increased by the size of the area when it is processed
 *
 * Returns 1 when the area was processed or deliberately skipped,
 * 0 when the line could not be parsed or mlock()/munlock() failed.
 */
static int _maps_line(const struct dm_config_node *cn, lvmlock_t lock,
		      const char *line, size_t *mstats)
{
	const char *op = (lock == LVM_MLOCK) ? "mlock" : "munlock";
	const struct dm_config_value *filter;
	const char *rest;
	unsigned long start, end;
	char perm_r, perm_w, perm_x, perm_p;
	size_t length;
	unsigned idx;
	int consumed;

	if (sscanf(line, "%lx-%lx %c%c%c%c%n",
		   &start, &end, &perm_r, &perm_w, &perm_x, &perm_p, &consumed) != 6) {
		log_debug_mem("Failed to parse maps line: %s", line);
		return 0;
	}

	/* Only readable maps are candidates */
	if (perm_r != 'r') {
		log_debug_mem("%s area unreadable %s : Skipping.", op, line);
		return 1;
	}

	/* Everything after the permission flags is matched against the filters */
	rest = line + consumed;

	/* Areas that are always ignored */
	for (idx = 0; idx < DM_ARRAY_SIZE(_ignore_maps); ++idx) {
		if (!strstr(rest, _ignore_maps[idx]))
			continue;

		log_debug_mem("%s ignore filter '%s' matches '%s': Skipping.",
			      op, _ignore_maps[idx], line);
		return 1;
	}

	length = end - start;

	if (cn) {
		/* Configured filter: only non-empty string values are used */
		for (filter = cn->v; filter; filter = filter->next) {
			if ((filter->type == DM_CFG_STRING) && filter->v.str[0] &&
			    strstr(rest, filter->v.str)) {
				log_debug_mem("%s_filter '%s' matches '%s': Skipping.",
					      op, filter->v.str, line);
				return 1;
			}
		}
	} else {
		/* No blacklist configured: fall back to the internal set */
		for (idx = 0; idx < DM_ARRAY_SIZE(_blacklist_maps); ++idx) {
			if (!strstr(rest, _blacklist_maps[idx]))
				continue;

			log_debug_mem("%s default filter '%s' matches '%s': Skipping.",
				      op, _blacklist_maps[idx], line);
			return 1;
		}
	}

	*mstats += length;
	log_debug_mem("%s %10ldKiB %12lx - %12lx %c%c%c%c%s", op,
		      ((long)length + 1023) / 1024, start, end,
		      perm_r, perm_w, perm_x, perm_p, rest);

	if (lock == LVM_MLOCK) {
		if (mlock((const void*)start, length) < 0) {
			log_sys_error("mlock", line);
			return 0;
		}
	} else if (munlock((const void*)start, length) < 0) {
		log_sys_error("munlock", line);
		return 0;
	}

	return 1;
}
/*
 * Lock or unlock process memory, either with mlockall()/munlockall()
 * (when _use_mlockall is set) or map by map using the maps file open
 * on _maps_fd.
 *
 * cmd    - command context (valgrind flag, mlock filter configuration)
 * lock   - LVM_MLOCK or LVM_MUNLOCK
 * mstats - on the per-map path reset to 0 and then increased for every
 *          processed map; not modified on the valgrind and mlockall paths
 *
 * Returns 1 on success, 0 when the (un)lock call or reading of the maps
 * failed, or when any single maps line failed.
 */
static int _memlock_maps(struct cmd_context *cmd, lvmlock_t lock, size_t *mstats)
{
const struct dm_config_node *cn;
char *line, *line_end;
size_t len;
ssize_t n;
int ret = 1;
if (cmd->running_on_valgrind) {
log_debug_mem("Skipping %slocking of memory maps (running in VALGRIND).",
(lock == LVM_MLOCK) ? "" : "un") ;
return 1;
}
if (_use_mlockall) {
#ifdef MCL_CURRENT
if (lock == LVM_MLOCK) {
if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
log_sys_debug("mlockall", "");
return 0;
}
} else {
if (munlockall()) {
log_sys_debug("munlockall", "");
return 0;
}
}
return 1;
#else
/* MCL_CURRENT is not defined, so mlockall() cannot be used here. */
return 0;
#endif
}
/* Reset statistic counters */
*mstats = 0;
/* read mapping into a single memory chunk without reallocation
 * in the middle of reading maps file */
for (len = 0;;) {
/* First pass allocates the buffer; later passes double it when it was filled. */
if (!_maps_buffer || len >= _maps_len) {
if (_maps_buffer)
_maps_len *= 2;
/* 'line' is a temporary so _maps_buffer survives a failed realloc(). */
if (!(line = realloc(_maps_buffer, _maps_len))) {
log_debug_mem("Allocation of maps buffer failed.");
return 0;
}
_maps_buffer = line;
}
/* Every pass reads the file again from its beginning. */
if (lseek(_maps_fd, 0, SEEK_SET))
log_sys_debug("lseek", _procselfmaps);
for (len = 0 ; len < _maps_len; len += n) {
if (!(n = read(_maps_fd, _maps_buffer + len, _maps_len - len)))
break; /* EOF */
if ((n < 0) || (len >= (size_t)(SSIZE_MAX - n))) {
log_sys_debug("read", _procselfmaps);
return 0;
}
}
/* A completely filled buffer may be truncated: retry with a larger one. */
if (len < _maps_len) { /* fits in buffer */
_maps_buffer[len] = '\0';
break;
}
}
line = _maps_buffer;
cn = find_config_tree_array(cmd, activation_mlock_filter_CFG, NULL);
/* Only newline-terminated lines are processed; a failing line does not stop the walk. */
while ((line_end = strchr(line, '\n'))) {
*line_end = '\0'; /* remove \n */
if (!_maps_line(cn, lock, line, mstats))
ret = 0;
line = line_end + 1;
}
log_debug_mem("%socked %ld bytes",
(lock == LVM_MLOCK) ? "L" : "Unl", (long)*mstats);
return ret;
}
#ifdef DEBUG_MEMLOCK
/*
 * LVM is not supposed to use mmap while devices are suspended.
 * This code causes a core dump if mmap gets called.
 * ARCH_X86 is defined only for i386 and x86_64 builds with DEBUG_MEMLOCK.
 */
# ifdef __i386__
# define ARCH_X86
# endif /* __i386__ */
# ifdef __x86_64__
# ifndef ARCH_X86
# define ARCH_X86
# endif /* ARCH_X86 */
# endif /* __x86_64__ */
#endif /* DEBUG_MEMLOCK */
#ifdef ARCH_X86
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <dlfcn.h>
/*
 * Byte written over the first byte of the mmap entry point.
 * NOTE(review): the x86 HLT opcode is 0xf4, while 0x94 encodes an xchg
 * instruction - confirm which value is intended.
 */
static const unsigned char _instruction_hlt = 0x94;
/* Saved original first byte of mmap and the address it was read from. */
static char _mmap_orig;
static unsigned char *_mmap_addr;
#ifdef __i386__
/* The same pair for mmap64, which is patched on i386 only. */
static char _mmap64_orig;
static unsigned char *_mmap64_addr;
#endif /* __i386__ */
#endif /* ARCH_X86 */
/*
 * Debugging aid, active only when ARCH_X86 is defined: overwrite the first
 * byte of the mmap entry point (and of mmap64 on i386) with
 * _instruction_hlt, remembering the original byte for _restore_mmap().
 *
 * The address is looked up with dlsym() on first use; when it points to a
 * PLT jump (ff 25) the jump target is followed, and the page is made
 * writable with mprotect().
 *
 * Returns 1 on success (always when ARCH_X86 is not defined), 0 when the
 * symbol cannot be found or mprotect() fails.
 */
static int _disable_mmap(void)
{
#ifdef ARCH_X86
	volatile unsigned char *abs_addr;

	if (!_mmap_addr) {
		_mmap_addr = (unsigned char *) dlsym(RTLD_NEXT, "mmap");
		if (!_mmap_addr) {
			/* Nothing to patch; do not dereference a NULL symbol. */
			log_debug_mem("Can't find mmap symbol.");
			return 0;
		}
		if (_mmap_addr[0] == 0xff && _mmap_addr[1] == 0x25) { /* plt */
#ifdef __x86_64__
			abs_addr = _mmap_addr + 6 + *(int32_t *)(_mmap_addr + 2);
#endif /* __x86_64__ */
#ifdef __i386__
			abs_addr = *(void **)(_mmap_addr + 2);
#endif /* __i386__ */
			_mmap_addr = *(void **)abs_addr;
		} else
			log_debug_mem("Can't find PLT jump entry assuming -fPIE linkage.");

		if (mprotect((void *)((unsigned long)_mmap_addr & ~4095UL), 4096, PROT_READ|PROT_WRITE|PROT_EXEC)) {
			log_sys_error("mprotect", "");
			_mmap_addr = NULL;
			return 0;
		}
		_mmap_orig = *_mmap_addr;
	}
	log_debug_mem("Remapping mmap entry %02x to %02x.", _mmap_orig, _instruction_hlt);
	*_mmap_addr = _instruction_hlt;

#ifdef __i386__
	if (!_mmap64_addr) {
		_mmap64_addr = (unsigned char *) dlsym(RTLD_NEXT, "mmap64");
		if (!_mmap64_addr) {
			log_debug_mem("Can't find mmap64 symbol.");
			return 0;
		}
		if (_mmap64_addr[0] == 0xff && _mmap64_addr[1] == 0x25) {
			abs_addr = *(void **)(_mmap64_addr + 2);
			_mmap64_addr = *(void **)abs_addr;
		} /* Can't find PLT jump entry assuming -fPIE linkage */

		if (mprotect((void *)((unsigned long)_mmap64_addr & ~4095UL), 4096, PROT_READ|PROT_WRITE|PROT_EXEC)) {
			log_sys_error("mprotect", "");
			_mmap64_addr = NULL;
			return 0;
		}
		_mmap64_orig = *_mmap64_addr;
	}
	/* Use the constant that is actually defined in this file. */
	*_mmap64_addr = _instruction_hlt;
#endif /* __i386__ */
#endif /* ARCH_X86 */
	return 1;
}
/*
 * Undo _disable_mmap(): put the saved first byte back at every entry point
 * that was patched. Without ARCH_X86 there is nothing to restore.
 *
 * Always returns 1.
 */
static int _restore_mmap(void)
{
#ifdef ARCH_X86
	if (_mmap_addr != NULL) {
		*_mmap_addr = _mmap_orig;
	}
#ifdef __i386__
	if (_mmap64_addr != NULL) {
		*_mmap64_addr = _mmap64_orig;
	}
#endif /* __i386__ */
	log_debug_mem("Restored mmap entry.");
#endif /* ARCH_X86 */

	return 1;
}
/*
 * Remember the current process priority in _priority and, when
 * _default_priority is numerically lower, switch to it.
 *
 * Runs its body only once until _priority_raised is cleared again.
 * 'cmd' is not used.
 */
static void _raise_priority(struct cmd_context *cmd)
{
	if (_priority_raised)
		return;

	_priority_raised = 1;

	/* -1 is a valid priority, so errno tells a failure apart */
	errno = 0;
	_priority = getpriority(PRIO_PROCESS, 0);
	if ((_priority == -1) && errno) {
		log_sys_debug("getpriority", "");
		return;
	}

	/* Already running with the wanted (or a better) priority */
	if (_default_priority >= _priority)
		return;

	if (setpriority(PRIO_PROCESS, 0, _default_priority) != 0) {
		log_warn("WARNING: setpriority %d failed: %s.",
			 _default_priority, strerror(errno));
		return;
	}

	log_debug_activation("Raised task priority %d -> %d.",
			     _priority, _default_priority);
}
/*
 * Set the process priority back to the value saved in _priority.
 *
 * Does nothing unless the priority was raised and neither a critical
 * section nor a daemon memlock is active. _priority_raised is cleared
 * whether or not setpriority() succeeds. 'cmd' is not used.
 */
static void _restore_priority_if_possible(struct cmd_context *cmd)
{
	if (!_priority_raised || _critical_section || _memlock_count_daemon)
		return;

	if (setpriority(PRIO_PROCESS, 0, _priority) == 0)
		log_debug_activation("Restoring original task priority %d.", _priority);
	else
		/* _priority is a signed int and may be negative: print it with %d */
		log_warn("WARNING: setpriority %d failed: %s.",
			 _priority, strerror(errno));

	_priority_raised = 0;
}
/*
 * Stop memory getting swapped out.
 *
 * Locking is skipped when the reserved stack or the reserved memory size
 * is 0. Otherwise memory is preallocated (unless running on valgrind) and
 * locked either with mlockall() or map by map through the maps file, which
 * is opened here and stays open until _unlock_mem().
 */
static void _lock_mem(struct cmd_context *cmd)
{
	/* A size of 0 disables locking; non-zero sizes must not skip it. */
	if (!_size_stack || !_size_malloc_tmp) {
		log_debug_mem("Skipping memory locking (reserved memory: "
			      FMTsize_t " stack: " FMTsize_t ").",
			      _size_malloc_tmp, _size_stack);
		return;
	}

	if (!cmd->running_on_valgrind)
		_allocate_memory();

	(void)strerror(0); /* Force libc.mo load */
	(void)dm_udev_get_sync_support(); /* udev is initialized */

	log_very_verbose("Locking memory");

	/*
	 * For daemon we need to use mlockall()
	 * so even future addition of thread which may not even use lvm lib
	 * will not block memory locked thread
	 * Note: assuming _memlock_count_daemon is updated before _memlock_count
	 */
	_use_mlockall = _memlock_count_daemon ? 1 :
		find_config_tree_bool(cmd, activation_use_mlockall_CFG, NULL);

	if (!_use_mlockall) {
		/* The path is built once and then reused */
		if (!*_procselfmaps &&
		    dm_snprintf(_procselfmaps, sizeof(_procselfmaps),
				"%s" SELF_MAPS, cmd->proc_dir) < 0) {
			log_debug_mem("proc_dir too long");
			return;
		}

		/* open() reports failure with -1; 0 is a valid descriptor */
		if ((_maps_fd = open(_procselfmaps, O_RDONLY)) < 0) {
			log_sys_debug("open", _procselfmaps);
			return;
		}

		if (!_disable_mmap())
			stack;
	}

	if (!_memlock_maps(cmd, LVM_MLOCK, &_mstats))
		stack;
}

/*
 * Counterpart of _lock_mem(): unlock memory, release the maps file and
 * buffer, restore the priority when possible and free the preallocated
 * block.
 *
 * Skipped under the same condition as _lock_mem() (a reserve size of 0).
 */
static void _unlock_mem(struct cmd_context *cmd)
{
	size_t unlock_mstats = 0;

	/* Must mirror the check in _lock_mem() */
	if (!_size_stack || !_size_malloc_tmp) {
		log_debug_mem("Skipping memory unlocking (reserved memory: "
			      FMTsize_t " stack: " FMTsize_t ").",
			      _size_malloc_tmp, _size_stack);
		return;
	}

	log_very_verbose("Unlocking memory");

	if (!_memlock_maps(cmd, LVM_MUNLOCK, &unlock_mstats))
		stack;

	if (!_use_mlockall) {
		_restore_mmap();
		if (close(_maps_fd))
			log_sys_debug("close", _procselfmaps);
		free(_maps_buffer);
		_maps_buffer = NULL;

		/* More memory unlocked than locked: maps grew while locked */
		if (_mstats < unlock_mstats) {
			if ((_mstats + lvm_getpagesize()) < unlock_mstats)
				log_warn(INTERNAL_ERROR
					 "Reserved memory (%ld) not enough: used %ld. Increase activation/reserved_memory?",
					 (long)_mstats, (long)unlock_mstats);
			else
				/* FIXME Believed due to incorrect use of yes_no_prompt while locks held */
				log_debug_mem("Suppressed internal error: Maps lock %ld < unlock %ld, a one-page difference.",
					      (long)_mstats, (long)unlock_mstats);
		}
	}

	_restore_priority_if_possible(cmd);
	_release_memory();
}
/*
 * Lock memory when it is not locked yet and the sum of the critical
 * section flag and the daemon counter is exactly 1.
 */
static void _lock_mem_if_needed(struct cmd_context *cmd)
{
	log_debug_mem("Lock: Memlock counters: prioritized:%d locked:%d critical:%d daemon:%d suspended:%d",
		      _priority_raised, _mem_locked, _critical_section, _memlock_count_daemon, dm_get_suspended_counter());

	if (_mem_locked)
		return;

	if ((_critical_section + _memlock_count_daemon) != 1)
		return;

	/* The flag is set before the locking itself is attempted */
	_mem_locked = 1;
	_lock_mem(cmd);
}
/*
 * Unlock memory when it is locked and neither a critical section nor a
 * daemon memlock is active any more.
 */
static void _unlock_mem_if_possible(struct cmd_context *cmd)
{
	log_debug_mem("Unlock: Memlock counters: prioritized:%d locked:%d critical:%d daemon:%d suspended:%d",
		      _priority_raised, _mem_locked, _critical_section, _memlock_count_daemon, dm_get_suspended_counter());

	if (!_mem_locked || _critical_section || _memlock_count_daemon)
		return;

	_unlock_mem(cmd);
	_mem_locked = 0;
}
/*
 * Critical section is only triggered with suspending reason.
 * Other reasons only raise process priority so the table manipulation
 * remains fast.
 *
 * Memory stays locked until 'memlock_unlock()' is called so when possible
 * it may stay locked across multiple critical section entrances.
 *
 * Every call raises the priority (once) and increments the prioritized
 * section counter.
 */
void critical_section_inc(struct cmd_context *cmd, const char *reason)
{
	const int entering_critical = !_critical_section &&
				      !strcmp(reason, "suspending");

	if (!entering_critical) {
		log_debug_activation("Entering prioritized section (%s).", reason);
	} else {
		/*
		 * Profiles are loaded on-demand so make sure that before
		 * entering the critical section all needed profiles are
		 * loaded to avoid the disk access later.
		 */
		(void) load_pending_profiles(cmd);
		_critical_section = 1;
		log_debug_activation("Entering critical section (%s).", reason);
		_lock_mem_if_needed(cmd);
	}

	_raise_priority(cmd);
	++_prioritized_section;
}
/*
 * Leave a section entered with critical_section_inc().
 *
 * The critical section flag is cleared only when no device is reported as
 * suspended; the prioritized section counter is decremented but never
 * drops below 0. Memory is not unlocked here.
 */
void critical_section_dec(struct cmd_context *cmd, const char *reason)
{
	if (!_critical_section || dm_get_suspended_counter())
		log_debug_activation("Leaving section (%s).", reason);
	else {
		_critical_section = 0;
		log_debug_activation("Leaving critical section (%s).", reason);
	}

	if (_prioritized_section > 0)
		--_prioritized_section;
}
/* Report whether a critical section is active (non-zero) or not (0). */
int critical_section(void)
{
	const int active = _critical_section;

	return active;
}
/* Report the current value of the prioritized section counter. */
int prioritized_section(void)
{
	const int depth = _prioritized_section;

	return depth;
}
/*
 * The memlock_*_daemon functions will force the mlockall() call that we need
 * to stay in memory, but they will have no effect on device scans (unlike
 * normal critical_section_inc/dec). Memory is kept locked as long as either
 * of critical_section or memlock_daemon is in effect.
 *
 * Increments the daemon counter, locks memory if needed and raises the
 * process priority.
 */
void memlock_inc_daemon(struct cmd_context *cmd)
{
	/* The first daemon lock is not expected inside a critical section */
	if ((++_memlock_count_daemon == 1) && (_critical_section > 0))
		log_debug_mem(INTERNAL_ERROR "_memlock_inc_daemon used in critical section.");

	log_debug_mem("memlock_count_daemon inc to %d", _memlock_count_daemon);

	_lock_mem_if_needed(cmd);
	_raise_priority(cmd);
}
/*
 * Decrement the daemon counter and unlock memory when possible.
 *
 * An unbalanced call (counter already 0) is reported and the counter is
 * left at 0: a negative value would make _unlock_mem_if_possible() treat
 * the daemon lock as still held and would stop a later
 * memlock_inc_daemon() from reaching the "== 1" locking condition.
 */
void memlock_dec_daemon(struct cmd_context *cmd)
{
	if (!_memlock_count_daemon)
		log_debug_mem(INTERNAL_ERROR "_memlock_count_daemon has dropped below 0.");
	else
		--_memlock_count_daemon;

	log_debug_mem("memlock_count_daemon dec to %d", _memlock_count_daemon);
	_unlock_mem_if_possible(cmd);
}
/*
 * Read the memory locking settings for 'cmd': reserved stack and reserved
 * memory (both configured in KiB and stored in bytes) and the process
 * priority used while locked.
 */
void memlock_init(struct cmd_context *cmd)
{
	/* When threaded, caller already limited stack size so just use the default. */
	if (cmd->threaded)
		_size_stack = 1024ULL * DEFAULT_RESERVED_STACK;
	else
		_size_stack = 1024ULL *
			find_config_tree_int(cmd, activation_reserved_stack_CFG, NULL);

	_size_malloc_tmp = 1024ULL *
		find_config_tree_int(cmd, activation_reserved_memory_CFG, NULL);

	_default_priority = find_config_tree_int(cmd, activation_process_priority_CFG, NULL);
}
/*
 * Clear all lock, section and daemon counters. Only the bookkeeping is
 * reset; no memory is unlocked and no priority is changed here.
 */
void memlock_reset(void)
{
	log_debug_mem("memlock reset.");

	_mem_locked = _priority_raised = 0;
	_critical_section = _prioritized_section = 0;
	_memlock_count_daemon = 0;
}
/*
 * Unlock memory and restore the process priority, each only when no
 * critical section or daemon memlock prevents it.
 */
void memlock_unlock(struct cmd_context *cmd)
{
	/* Memory first, then the priority */
	_unlock_mem_if_possible(cmd);

	_restore_priority_if_possible(cmd);
}
/* Report the current value of the daemon memlock counter. */
int memlock_count_daemon(void)
{
	const int daemon_count = _memlock_count_daemon;

	return daemon_count;
}
#endif