1
0
mirror of git://sourceware.org/git/lvm2.git synced 2024-12-21 13:34:40 +03:00
lvm2/lib/mm/memlock.c
2024-05-14 17:59:26 +02:00

718 lines
19 KiB
C

/*
* Copyright (C) 2003-2004 Sistina Software, Inc. All rights reserved.
* Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
*
* This file is part of LVM2.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU Lesser General Public License v.2.1.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "lib/misc/lib.h"
#include "lib/mm/memlock.h"
#include "lib/config/defaults.h"
#include "lib/config/config.h"
#include "lib/commands/toolcontext.h"
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <malloc.h>
#ifndef DEVMAPPER_SUPPORT
void memlock_inc_daemon(struct cmd_context *cmd)
{
return;
}
void memlock_dec_daemon(struct cmd_context *cmd)
{
return;
}
void critical_section_inc(struct cmd_context *cmd, const char *reason)
{
return;
}
void critical_section_dec(struct cmd_context *cmd, const char *reason)
{
return;
}
int critical_section(void)
{
return 0;
}
void memlock_init(struct cmd_context *cmd)
{
return;
}
void memlock_unlock(struct cmd_context *cmd)
{
return;
}
void memlock_reset(void)
{
return;
}
int memlock_count_daemon(void)
{
return 0;
}
#else /* DEVMAPPER_SUPPORT */
static size_t _size_stack;
static size_t _size_malloc_tmp;
static const size_t _size_malloc = 2000000;
static void *_malloc_mem = NULL;
static int _mem_locked = 0;
static int _priority_raised = 0;
static int _critical_section = 0;
static int _prioritized_section = 0;
static int _memlock_count_daemon = 0;
static int _priority;
static int _default_priority;
/* list of maps, that are unconditionaly ignored */
static const char _ignore_maps[][16] = {
"[vdso]",
"[vsyscall]",
"[vectors]",
};
/* default blacklist for maps */
static const char * const _blacklist_maps[] = {
"locale/locale-archive",
"/LC_MESSAGES/",
"gconv/gconv-modules.cache",
"/ld-2.", /* not using dlopen,dlsym during mlock */
"/libattr.so.", /* not using during mlock (udev) */
"/libblkid.so.", /* not using blkid during mlock (udev) */
"/libbz2.so.", /* not using during mlock (udev) */
"/libcap.so.", /* not using during mlock (systemd) */
"/libdl-", /* not using dlopen,dlsym during mlock */
"/libdw-", /* not using during mlock (udev) */
"/libedit.so.", /* not using editline during mlock */
"/libelf-", /* not using during mlock (udev) */
"/libgcrypt.so.", /* not using during mlock (systemd) */
"/libgpg-error.so.", /* not using gpg-error during mlock (systemd) */
"/liblz4.so.", /* not using lz4 during mlock (systemd) */
"/liblzma.so.", /* not using lzma during mlock (systemd) */
"/libmount.so.", /* not using mount during mlock (udev) */
"/libncurses.so.", /* not using ncurses during mlock */
"/libpcre.so.", /* not using pcre during mlock (selinux) */
"/libpcre2-", /* not using pcre during mlock (selinux) */
"/libreadline.so.", /* not using readline during mlock */
"/libresolv-", /* not using during mlock (udev) */
"/libselinux.so.", /* not using selinux during mlock */
"/libsepol.so.", /* not using sepol during mlock */
"/libsystemd.so.", /* not using systemd during mlock */
"/libtinfo.so.", /* not using tinfo during mlock */
"/libudev.so.", /* not using udev during mlock */
"/libuuid.so.", /* not using uuid during mlock (blkid) */
"/libz.so.", /* not using during mlock (udev) */
"/libzstd.so.", /* not using zstd during mlock (systemd) */
"/etc/selinux", /* not using selinux during mlock */
/* "/libdevmapper-event.so" */
};
typedef enum { LVM_MLOCK, LVM_MUNLOCK } lvmlock_t;
static unsigned _use_mlockall;
static int _maps_fd;
static size_t _maps_len = 8192; /* Initial buffer size for reading /proc/self/maps */
static char *_maps_buffer;
static char _procselfmaps[PATH_MAX] = "";
#define SELF_MAPS "/self/maps"
static size_t _mstats; /* statistic for maps locking */
static void _touch_memory(void *mem, size_t size)
{
size_t pagesize = lvm_getpagesize();
char *pos = mem;
char *end = pos + size - sizeof(long);
while (pos < end) {
*(long *) pos = 1;
pos += pagesize;
}
}
static void _allocate_memory(void)
{
#if defined(__GLIBC__)
/* Memory allocation is currently only tested with glibc
* for different C libraries, some other mechanisms might be needed
* meanwhile let users use lvm2 code without memory preallocation.
* Compilation for VALGRIND tracing also goes without preallocation.
*/
void *stack_mem;
struct rlimit limit;
int i, area = 0, missing = _size_malloc_tmp, max_areas = 32;
size_t hblks;
char *areas[max_areas];
/* Check if we could preallocate requested stack */
if (getrlimit(RLIMIT_STACK, &limit) == 0) {
limit.rlim_cur /= 2;
if (_size_stack > limit.rlim_cur)
_size_stack = limit.rlim_cur;
if ((stack_mem = alloca(_size_stack)))
_touch_memory(stack_mem, _size_stack);
}
/* FIXME else warn user setting got ignored */
#ifdef HAVE_MALLINFO2
/* Prefer mallinfo2 call when avaialble with newer glibc */
#define MALLINFO mallinfo2
#else
#define MALLINFO mallinfo
#endif
/*
* When a brk() fails due to fragmented address space (which sometimes
* happens when we try to grab 8M or so), glibc will make a new
* arena. In this arena, the rules for using "direct" mmap are relaxed,
* circumventing the MAX_MMAPs and MMAP_THRESHOLD settings. We can,
* however, detect when this happens with mallinfo() and try to co-opt
* malloc into using MMAP as a MORECORE substitute instead of returning
* MMAP'd memory directly. Since MMAP-as-MORECORE does not munmap the
* memory on free(), this is good enough for our purposes.
*/
while (missing > 0) {
struct MALLINFO inf = MALLINFO();
hblks = inf.hblks;
if ((areas[area] = malloc(_size_malloc_tmp)))
_touch_memory(areas[area], _size_malloc_tmp);
inf = MALLINFO();
if (hblks < inf.hblks) {
/* malloc cheated and used mmap, even though we told it
not to; we try with twice as many areas, each half
the size, to circumvent the faulty logic in glibc */
free(areas[area]);
_size_malloc_tmp /= 2;
} else {
++ area;
missing -= _size_malloc_tmp;
}
if (area == max_areas && missing > 0) {
/* Too bad. Warn the user and proceed, as things are
* most likely going to work out anyway. */
log_warn("WARNING: Failed to reserve memory, %d bytes missing.", missing);
break;
}
}
if ((_malloc_mem = malloc(_size_malloc)))
_touch_memory(_malloc_mem, _size_malloc);
/* free up the reserves so subsequent malloc's can use that memory */
for (i = 0; i < area; ++i)
free(areas[i]);
#endif
}
static void _release_memory(void)
{
free(_malloc_mem);
}
/*
* mlock/munlock memory areas from /proc/self/maps
* format described in kernel/Documentation/filesystem/proc.txt
*/
static int _maps_line(const struct dm_config_node *cn, lvmlock_t lock,
const char *line, size_t *mstats)
{
const struct dm_config_value *cv;
unsigned long from, to;
int pos;
unsigned i;
char fr, fw, fx, fp;
size_t sz;
const char *lock_str = (lock == LVM_MLOCK) ? "mlock" : "munlock";
if (sscanf(line, "%lx-%lx %c%c%c%c%n",
&from, &to, &fr, &fw, &fx, &fp, &pos) != 6) {
log_debug_mem("Failed to parse maps line: %s", line);
return 0;
}
/* Select readable maps */
if (fr != 'r') {
log_debug_mem("%s area unreadable %s : Skipping.", lock_str, line);
return 1;
}
/* always ignored areas */
for (i = 0; i < DM_ARRAY_SIZE(_ignore_maps); ++i)
if (strstr(line + pos, _ignore_maps[i])) {
log_debug_mem("%s ignore filter '%s' matches '%s': Skipping.",
lock_str, _ignore_maps[i], line);
return 1;
}
sz = to - from;
if (!cn) {
/* If no blacklist configured, use an internal set */
for (i = 0; i < DM_ARRAY_SIZE(_blacklist_maps); ++i)
if (strstr(line + pos, _blacklist_maps[i])) {
log_debug_mem("%s default filter '%s' matches '%s': Skipping.",
lock_str, _blacklist_maps[i], line);
return 1;
}
} else {
for (cv = cn->v; cv; cv = cv->next) {
if ((cv->type != DM_CFG_STRING) || !cv->v.str[0])
continue;
if (strstr(line + pos, cv->v.str)) {
log_debug_mem("%s_filter '%s' matches '%s': Skipping.",
lock_str, cv->v.str, line);
return 1;
}
}
}
*mstats += sz;
log_debug_mem("%s %10ldKiB %12lx - %12lx %c%c%c%c%s", lock_str,
((long)sz + 1023) / 1024, from, to, fr, fw, fx, fp, line + pos);
if (lock == LVM_MLOCK) {
if (mlock((const void*)from, sz) < 0) {
log_sys_error("mlock", line);
return 0;
}
} else {
if (munlock((const void*)from, sz) < 0) {
log_sys_error("munlock", line);
return 0;
}
}
return 1;
}
static int _memlock_maps(struct cmd_context *cmd, lvmlock_t lock, size_t *mstats)
{
const struct dm_config_node *cn;
char *line, *line_end;
size_t len;
ssize_t n;
int ret = 1;
if (cmd->running_on_valgrind) {
log_debug_mem("Skipping %slocking of memory maps (running in VALGRIND).",
(lock == LVM_MLOCK) ? "" : "un") ;
return 1;
}
if (_use_mlockall) {
#ifdef MCL_CURRENT
if (lock == LVM_MLOCK) {
if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
log_sys_debug("mlockall", "");
return 0;
}
} else {
if (munlockall()) {
log_sys_debug("munlockall", "");
return 0;
}
}
return 1;
#else
return 0;
#endif
}
/* Reset statistic counters */
*mstats = 0;
/* read mapping into a single memory chunk without reallocation
* in the middle of reading maps file */
for (len = 0;;) {
if (!_maps_buffer || len >= _maps_len) {
if (_maps_buffer)
_maps_len *= 2;
if (!(line = realloc(_maps_buffer, _maps_len))) {
log_debug_mem("Allocation of maps buffer failed.");
return 0;
}
_maps_buffer = line;
}
if (lseek(_maps_fd, 0, SEEK_SET))
log_sys_debug("lseek", _procselfmaps);
for (len = 0 ; len < _maps_len; len += n) {
if (!(n = read(_maps_fd, _maps_buffer + len, _maps_len - len)))
break; /* EOF */
if ((n < 0) || (len >= (size_t)(SSIZE_MAX - n))) {
log_sys_debug("read", _procselfmaps);
return 0;
}
}
if (len < _maps_len) { /* fits in buffer */
_maps_buffer[len] = '\0';
break;
}
}
line = _maps_buffer;
cn = find_config_tree_array(cmd, activation_mlock_filter_CFG, NULL);
while ((line_end = strchr(line, '\n'))) {
*line_end = '\0'; /* remove \n */
if (!_maps_line(cn, lock, line, mstats))
ret = 0;
line = line_end + 1;
}
log_debug_mem("%socked %ld bytes",
(lock == LVM_MLOCK) ? "L" : "Unl", (long)*mstats);
return ret;
}
#ifdef DEBUG_MEMLOCK
/*
* LVM is not supposed to use mmap while devices are suspended.
* This code causes a core dump if gets called."
*/
# ifdef __i386__
# define ARCH_X86
# endif /* __i386__ */
# ifdef __x86_64__
# ifndef ARCH_X86
# define ARCH_X86
# endif /* ARCH_X86 */
# endif /* __x86_64__ */
#endif /* DEBUG_MEMLOCK */
#ifdef ARCH_X86
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <dlfcn.h>
static const unsigned char _instruction_hlt = 0x94;
static char _mmap_orig;
static unsigned char *_mmap_addr;
#ifdef __i386__
static char _mmap64_orig;
static unsigned char *_mmap64_addr;
#endif /* __i386__ */
#endif /* ARCH_X86 */
static int _disable_mmap(void)
{
#ifdef ARCH_X86
volatile unsigned char *abs_addr;
if (!_mmap_addr) {
_mmap_addr = (unsigned char *) dlsym(RTLD_NEXT, "mmap");
if (_mmap_addr[0] == 0xff && _mmap_addr[1] == 0x25) { /* plt */
#ifdef __x86_64__
abs_addr = _mmap_addr + 6 + *(int32_t *)(_mmap_addr + 2);
#endif /* __x86_64__ */
#ifdef __i386__
abs_addr = *(void **)(_mmap_addr + 2);
#endif /* __i386__ */
_mmap_addr = *(void **)abs_addr;
} else
log_debug_mem("Can't find PLT jump entry assuming -fPIE linkage.");
if (mprotect((void *)((unsigned long)_mmap_addr & ~4095UL), 4096, PROT_READ|PROT_WRITE|PROT_EXEC)) {
log_sys_error("mprotect", "");
_mmap_addr = NULL;
return 0;
}
_mmap_orig = *_mmap_addr;
}
log_debug_mem("Remapping mmap entry %02x to %02x.", _mmap_orig, _instruction_hlt);
*_mmap_addr = _instruction_hlt;
#ifdef __i386__
if (!_mmap64_addr) {
_mmap64_addr = (unsigned char *) dlsym(RTLD_NEXT, "mmap64");
if (_mmap64_addr[0] == 0xff && _mmap64_addr[1] == 0x25) {
abs_addr = *(void **)(_mmap64_addr + 2);
_mmap64_addr = *(void **)abs_addr;
} /* Can't find PLT jump entry assuming -fPIE linkage */
if (mprotect((void *)((unsigned long)_mmap64_addr & ~4095UL), 4096, PROT_READ|PROT_WRITE|PROT_EXEC)) {
log_sys_error("mprotect", "");
_mmap64_addr = NULL;
return 0;
}
_mmap64_orig = *_mmap64_addr;
}
*_mmap64_addr = INSTRUCTION_HLT;
#endif /* __i386__ */
#endif /* ARCH_X86 */
return 1;
}
static int _restore_mmap(void)
{
#ifdef ARCH_X86
if (_mmap_addr)
*_mmap_addr = _mmap_orig;
#ifdef __i386__
if (_mmap64_addr)
*_mmap64_addr = _mmap64_orig;
#endif /* __i386__ */
log_debug_mem("Restored mmap entry.");
#endif /* ARCH_X86 */
return 1;
}
static void _raise_priority(struct cmd_context *cmd)
{
if (_priority_raised)
return;
_priority_raised = 1;
errno = 0;
if (((_priority = getpriority(PRIO_PROCESS, 0)) == -1) && errno)
log_sys_debug("getpriority", "");
else if (_default_priority < _priority) {
if (setpriority(PRIO_PROCESS, 0, _default_priority) == 0)
log_debug_activation("Raised task priority %d -> %d.",
_priority, _default_priority);
else
log_warn("WARNING: setpriority %d failed: %s.",
_default_priority, strerror(errno));
}
}
static void _restore_priority_if_possible(struct cmd_context *cmd)
{
if (!_priority_raised || _critical_section || _memlock_count_daemon)
return;
if (setpriority(PRIO_PROCESS, 0, _priority) == 0)
log_debug_activation("Restoring original task priority %d.", _priority);
else
log_warn("WARNING: setpriority %u failed: %s.",
_priority, strerror(errno));
_priority_raised = 0;
}
/* Stop memory getting swapped out */
static void _lock_mem(struct cmd_context *cmd)
{
if (!cmd->running_on_valgrind)
_allocate_memory();
(void)strerror(0); /* Force libc.mo load */
(void)dm_udev_get_sync_support(); /* udev is initialized */
log_very_verbose("Locking memory");
/*
* For daemon we need to use mlockall()
* so even future adition of thread which may not even use lvm lib
* will not block memory locked thread
* Note: assuming _memlock_count_daemon is updated before _memlock_count
*/
_use_mlockall = _memlock_count_daemon ? 1 :
find_config_tree_bool(cmd, activation_use_mlockall_CFG, NULL);
if (!_use_mlockall) {
if (!*_procselfmaps &&
dm_snprintf(_procselfmaps, sizeof(_procselfmaps),
"%s" SELF_MAPS, cmd->proc_dir) < 0) {
log_debug_mem("proc_dir too long");
return;
}
if (!(_maps_fd = open(_procselfmaps, O_RDONLY))) {
log_sys_debug("open", _procselfmaps);
return;
}
if (!_disable_mmap())
stack;
}
if (!_memlock_maps(cmd, LVM_MLOCK, &_mstats))
stack;
}
static void _unlock_mem(struct cmd_context *cmd)
{
size_t unlock_mstats = 0;
log_very_verbose("Unlocking memory");
if (!_memlock_maps(cmd, LVM_MUNLOCK, &unlock_mstats))
stack;
if (!_use_mlockall) {
_restore_mmap();
if (close(_maps_fd))
log_sys_debug("close", _procselfmaps);
free(_maps_buffer);
_maps_buffer = NULL;
if (_mstats < unlock_mstats) {
if ((_mstats + lvm_getpagesize()) < unlock_mstats)
log_warn(INTERNAL_ERROR
"Reserved memory (%ld) not enough: used %ld. Increase activation/reserved_memory?",
(long)_mstats, (long)unlock_mstats);
else
/* FIXME Believed due to incorrect use of yes_no_prompt while locks held */
log_debug_mem("Suppressed internal error: Maps lock %ld < unlock %ld, a one-page difference.",
(long)_mstats, (long)unlock_mstats);
}
}
_restore_priority_if_possible(cmd);
_release_memory();
}
static void _lock_mem_if_needed(struct cmd_context *cmd)
{
log_debug_mem("Lock: Memlock counters: prioritized:%d locked:%d critical:%d daemon:%d suspended:%d",
_priority_raised, _mem_locked, _critical_section, _memlock_count_daemon, dm_get_suspended_counter());
if (!_mem_locked &&
((_critical_section + _memlock_count_daemon) == 1)) {
_mem_locked = 1;
_lock_mem(cmd);
}
}
static void _unlock_mem_if_possible(struct cmd_context *cmd)
{
log_debug_mem("Unlock: Memlock counters: prioritized:%d locked:%d critical:%d daemon:%d suspended:%d",
_priority_raised, _mem_locked, _critical_section, _memlock_count_daemon, dm_get_suspended_counter());
if (_mem_locked &&
!_critical_section &&
!_memlock_count_daemon) {
_unlock_mem(cmd);
_mem_locked = 0;
}
}
/*
* Critical section is only triggered with suspending reason.
* Other reasons only raise process priority so the table manipulation
* remains fast.
*
* Memory stays locked until 'memlock_unlock()' is called so when possible
* it may stay locked across multiple crictical section entrances.
*/
void critical_section_inc(struct cmd_context *cmd, const char *reason)
{
if (!_critical_section &&
(strcmp(reason, "suspending") == 0)) {
/*
* Profiles are loaded on-demand so make sure that before
* entering the critical section all needed profiles are
* loaded to avoid the disk access later.
*/
(void) load_pending_profiles(cmd);
_critical_section = 1;
log_debug_activation("Entering critical section (%s).", reason);
_lock_mem_if_needed(cmd);
} else
log_debug_activation("Entering prioritized section (%s).", reason);
_raise_priority(cmd);
_prioritized_section++;
}
void critical_section_dec(struct cmd_context *cmd, const char *reason)
{
if (_critical_section && !dm_get_suspended_counter()) {
_critical_section = 0;
log_debug_activation("Leaving critical section (%s).", reason);
} else
log_debug_activation("Leaving section (%s).", reason);
if (_prioritized_section > 0)
_prioritized_section--;
}
int critical_section(void)
{
return _critical_section;
}
int prioritized_section(void)
{
return _prioritized_section;
}
/*
* The memlock_*_daemon functions will force the mlockall() call that we need
* to stay in memory, but they will have no effect on device scans (unlike
* normal critical_section_inc/dec). Memory is kept locked as long as either
* of critical_section or memlock_daemon is in effect.
*/
void memlock_inc_daemon(struct cmd_context *cmd)
{
++_memlock_count_daemon;
if (_memlock_count_daemon == 1 && _critical_section > 0)
log_debug_mem(INTERNAL_ERROR "_memlock_inc_daemon used in critical section.");
log_debug_mem("memlock_count_daemon inc to %d", _memlock_count_daemon);
_lock_mem_if_needed(cmd);
_raise_priority(cmd);
}
void memlock_dec_daemon(struct cmd_context *cmd)
{
if (!_memlock_count_daemon)
log_debug_mem(INTERNAL_ERROR "_memlock_count_daemon has dropped below 0.");
--_memlock_count_daemon;
log_debug_mem("memlock_count_daemon dec to %d", _memlock_count_daemon);
_unlock_mem_if_possible(cmd);
}
void memlock_init(struct cmd_context *cmd)
{
/* When threaded, caller already limited stack size so just use the default. */
_size_stack = 1024ULL * (cmd->threaded ? DEFAULT_RESERVED_STACK :
find_config_tree_int(cmd, activation_reserved_stack_CFG, NULL));
_size_malloc_tmp = find_config_tree_int(cmd, activation_reserved_memory_CFG, NULL) * 1024ULL;
_default_priority = find_config_tree_int(cmd, activation_process_priority_CFG, NULL);
}
void memlock_reset(void)
{
log_debug_mem("memlock reset.");
_mem_locked = 0;
_priority_raised = 0;
_critical_section = 0;
_prioritized_section = 0;
_memlock_count_daemon = 0;
}
void memlock_unlock(struct cmd_context *cmd)
{
_unlock_mem_if_possible(cmd);
_restore_priority_if_possible(cmd);
}
int memlock_count_daemon(void)
{
return _memlock_count_daemon;
}
#endif