Linus Torvalds 98931dd95f Yang Shi has improved the behaviour of khugepaged collapsing of readonly
file-backed transparent hugepages.
 
 Johannes Weiner has arranged for zswap memory use to be tracked and
 managed on a per-cgroup basis.
 
 Muchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for runtime
 enablement of the recent huge page vmemmap optimization feature.
 
 Baolin Wang contributes a series to fix some issues around hugetlb
 pagetable invalidation.
 
 Zhenwei Pi has fixed some interactions between hwpoisoned pages and
 virtualization.
 
 Tong Tiangen has enabled the use of the presently x86-only
 page_table_check debugging feature on arm64 and riscv.
 
 David Vernet has done some fixup work on the memcg selftests.
 
 Peter Xu has taught userfaultfd to handle write protection faults against
 shmem- and hugetlbfs-backed files.
 
 More DAMON development from SeongJae Park - adding online tuning of the
 feature and support for monitoring of fixed virtual address ranges.  Also
 easier discovery of which monitoring operations are available.
 
 Nadav Amit has done some optimization of TLB flushing during mprotect().
 
 Neil Brown continues to labor away at improving our swap-over-NFS support.
 
 David Hildenbrand has some fixes to anon page COWing versus
 get_user_pages().
 
 Peng Liu fixed some errors in the core hugetlb code.
 
 Joao Martins has reduced the amount of memory consumed by device-dax's
 compound devmaps.
 
 Some cleanups of the arch-specific pagemap code from Anshuman Khandual.
 
 Muchun Song has found and fixed some errors in the TLB flushing of
 transparent hugepages.
 
 Roman Gushchin has done more work on the memcg selftests.
 
 And, of course, many smaller fixes and cleanups.  Notably, the customary
 million cleanup serieses from Miaohe Lin.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCYo52xQAKCRDdBJ7gKXxA
 jtJFAQD238KoeI9z5SkPMaeBRYSRQmNll85mxs25KapcEgWgGQD9FAb7DJkqsIVk
 PzE+d9hEfirUGdL6cujatwJ6ejYR8Q8=
 =nFe6
 -----END PGP SIGNATURE-----

Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull MM updates from Andrew Morton:
 "Almost all of MM here. A few things are still getting finished off,
  reviewed, etc.

   - Yang Shi has improved the behaviour of khugepaged collapsing of
     readonly file-backed transparent hugepages.

   - Johannes Weiner has arranged for zswap memory use to be tracked and
     managed on a per-cgroup basis.

   - Muchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for
     runtime enablement of the recent huge page vmemmap optimization
     feature.

   - Baolin Wang contributes a series to fix some issues around hugetlb
     pagetable invalidation.

   - Zhenwei Pi has fixed some interactions between hwpoisoned pages and
     virtualization.

   - Tong Tiangen has enabled the use of the presently x86-only
     page_table_check debugging feature on arm64 and riscv.

   - David Vernet has done some fixup work on the memcg selftests.

   - Peter Xu has taught userfaultfd to handle write protection faults
     against shmem- and hugetlbfs-backed files.

   - More DAMON development from SeongJae Park - adding online tuning of
     the feature and support for monitoring of fixed virtual address
     ranges. Also easier discovery of which monitoring operations are
     available.

   - Nadav Amit has done some optimization of TLB flushing during
     mprotect().

   - Neil Brown continues to labor away at improving our swap-over-NFS
     support.

   - David Hildenbrand has some fixes to anon page COWing versus
     get_user_pages().

   - Peng Liu fixed some errors in the core hugetlb code.

   - Joao Martins has reduced the amount of memory consumed by
     device-dax's compound devmaps.

   - Some cleanups of the arch-specific pagemap code from Anshuman
     Khandual.

   - Muchun Song has found and fixed some errors in the TLB flushing of
     transparent hugepages.

   - Roman Gushchin has done more work on the memcg selftests.

  ... and, of course, many smaller fixes and cleanups. Notably, the
  customary million cleanup serieses from Miaohe Lin"

* tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits)
  mm: kfence: use PAGE_ALIGNED helper
  selftests: vm: add the "settings" file with timeout variable
  selftests: vm: add "test_hmm.sh" to TEST_FILES
  selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests
  selftests: vm: add migration to the .gitignore
  selftests/vm/pkeys: fix typo in comment
  ksm: fix typo in comment
  selftests: vm: add process_mrelease tests
  Revert "mm/vmscan: never demote for memcg reclaim"
  mm/kfence: print disabling or re-enabling message
  include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace"
  include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion"
  mm: fix a potential infinite loop in start_isolate_page_range()
  MAINTAINERS: add Muchun as co-maintainer for HugeTLB
  zram: fix Kconfig dependency warning
  mm/shmem: fix shmem folio swapoff hang
  cgroup: fix an error handling path in alloc_pagecache_max_30M()
  mm: damon: use HPAGE_PMD_SIZE
  tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate
  nodemask.h: fix compilation error with GCC12
  ...
2022-05-26 12:32:41 -07:00

657 lines
12 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <linux/limits.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/inotify.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include "cgroup_util.h"
#include "../clone3/clone3_selftests.h"
/*
 * Read at most max_len - 1 bytes from the file at @path into @buf and
 * NUL-terminate the result.
 *
 * Returns the number of bytes read on success, or -errno on failure.
 * A @max_len of 0 is rejected with -EINVAL: there would be no room for the
 * terminator and "max_len - 1" would wrap around.
 */
static ssize_t read_text(const char *path, char *buf, size_t max_len)
{
	ssize_t len;
	int fd, err;

	if (!max_len)
		return -EINVAL;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -errno;

	len = read(fd, buf, max_len - 1);
	if (len < 0) {
		/* Save errno before close() gets a chance to overwrite it. */
		err = errno;
		close(fd);
		return -err;
	}

	buf[len] = 0;
	close(fd);
	return len;
}
/*
 * Append @len bytes from @buf to the existing file at @path (the file is
 * opened with O_WRONLY | O_APPEND and is not created).
 *
 * Returns the number of bytes written on success, or -errno on failure.
 */
static ssize_t write_text(const char *path, char *buf, ssize_t len)
{
	ssize_t written;
	int fd, err;

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return -errno;

	written = write(fd, buf, len);
	/* Save errno now: close() below may overwrite it. */
	err = errno;
	close(fd);

	return written < 0 ? -err : written;
}
/*
 * Build the string "<root>/<name>" in freshly allocated memory.
 *
 * Returns the new string, which the caller must free(), or NULL if the
 * allocation fails.
 */
char *cg_name(const char *root, const char *name)
{
	/* +2: one byte for the '/' separator, one for the terminating NUL. */
	size_t len = strlen(root) + strlen(name) + 2;
	char *ret = malloc(len);

	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s", root, name);
	return ret;
}
/*
 * Build the string "<root>/<name>_<index>" in freshly allocated memory.
 *
 * The required size is measured with snprintf() first, so any int value of
 * @index (including negative ones) fits without truncation.
 *
 * Returns the new string, which the caller must free(), or NULL on
 * formatting or allocation failure.
 */
char *cg_name_indexed(const char *root, const char *name, int index)
{
	size_t len;
	char *ret;
	int need;

	need = snprintf(NULL, 0, "%s/%s_%d", root, name, index);
	if (need < 0)
		return NULL;

	len = (size_t)need + 1;
	ret = malloc(len);
	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s_%d", root, name, index);
	return ret;
}
/*
 * Build the string "<cgroup>/<control>" in freshly allocated memory.
 *
 * Returns the new string, which the caller must free(), or NULL if the
 * allocation fails.
 */
char *cg_control(const char *cgroup, const char *control)
{
	/* +2: one byte for the '/' separator, one for the terminating NUL. */
	size_t len = strlen(cgroup) + strlen(control) + 2;
	char *ret = malloc(len);

	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s", cgroup, control);
	return ret;
}
/*
 * Read the file "<cgroup>/<control>" into @buf (of @len bytes) as text.
 *
 * Returns 0 on success, or the negative value reported by read_text() on
 * failure.
 */
int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
{
	char file[PATH_MAX];
	ssize_t nread;

	snprintf(file, sizeof(file), "%s/%s", cgroup, control);

	nread = read_text(file, buf, len);
	if (nread < 0)
		return nread;

	return 0;
}
/*
 * Compare the contents of "<cgroup>/<control>" with @expected.
 *
 * The file is read into a buffer of strlen(expected) + 1 bytes, so only the
 * part of the file that fits in that buffer takes part in the comparison.
 *
 * Returns the strcmp() result (0 when equal). Returns -1 when @expected is
 * NULL, when the buffer cannot be allocated, or when the read fails.
 */
int cg_read_strcmp(const char *cgroup, const char *control,
		   const char *expected)
{
	char *contents;
	size_t capacity;
	int cmp = -1;

	/* A NULL reference string can never match. */
	if (!expected)
		return -1;

	capacity = strlen(expected) + 1;
	contents = malloc(capacity);
	if (!contents)
		return -1;

	if (cg_read(cgroup, control, contents, capacity) == 0)
		cmp = strcmp(expected, contents);

	free(contents);
	return cmp;
}
/*
 * Check whether "<cgroup>/<control>" contains the substring @needle.
 * At most one PAGE_SIZE buffer's worth of the file is examined.
 *
 * Returns 0 if @needle is found, -1 if it is not or if the read fails.
 */
int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
{
	char contents[PAGE_SIZE];

	if (cg_read(cgroup, control, contents, sizeof(contents)) != 0)
		return -1;

	if (!strstr(contents, needle))
		return -1;

	return 0;
}
/*
 * Read "<cgroup>/<control>" and convert its leading text to a long with
 * atol().
 *
 * Returns the converted value, or -1 if the read fails (which is
 * indistinguishable from a stored value of -1).
 */
long cg_read_long(const char *cgroup, const char *control)
{
	char text[128];
	int err;

	err = cg_read(cgroup, control, text, sizeof(text));
	if (err)
		return -1;

	return atol(text);
}
/*
 * Find the first occurrence of @key in "<cgroup>/<control>" and convert the
 * text that immediately follows it to a long with atol().
 *
 * Returns the converted value, or -1 if the read fails or @key is absent.
 */
long cg_read_key_long(const char *cgroup, const char *control, const char *key)
{
	char contents[PAGE_SIZE];
	char *match;

	if (cg_read(cgroup, control, contents, sizeof(contents)) != 0)
		return -1;

	match = strstr(contents, key);
	if (match == NULL)
		return -1;

	/* Skip over the key itself; the number starts right after it. */
	match += strlen(key);
	return atol(match);
}
/*
 * Count the newline-separated tokens in "<cgroup>/<control>". Tokenising is
 * done with strtok(), so empty lines do not contribute to the count.
 *
 * Returns the count, or -1 if the read fails.
 */
long cg_read_lc(const char *cgroup, const char *control)
{
	char contents[PAGE_SIZE];
	const char delim[] = "\n";
	long lines = 0;
	char *tok;

	if (cg_read(cgroup, control, contents, sizeof(contents)))
		return -1;

	tok = strtok(contents, delim);
	while (tok) {
		lines++;
		tok = strtok(NULL, delim);
	}

	return lines;
}
/*
 * Write the NUL-terminated string @buf to the file "<cgroup>/<control>".
 *
 * Returns 0 when the whole string was written. Otherwise returns whatever
 * write_text() reported: a negative value on error, or the short byte count.
 */
int cg_write(const char *cgroup, const char *control, char *buf)
{
	char file[PATH_MAX];
	ssize_t want = strlen(buf);
	ssize_t done;

	snprintf(file, sizeof(file), "%s/%s", cgroup, control);

	done = write_text(file, buf, want);
	if (done == want)
		return 0;

	return done;
}
/*
 * Format @value as a decimal number and write it to "<cgroup>/<control>".
 *
 * @value is a signed long, so it is formatted with "%ld"; non-negative
 * values produce the same text as before.
 *
 * Returns 0 on success, a negative value if formatting fails, or the
 * result of cg_write() otherwise.
 */
int cg_write_numeric(const char *cgroup, const char *control, long value)
{
	char buf[64];
	int ret;

	ret = snprintf(buf, sizeof(buf), "%ld", value);
	if (ret < 0)
		return ret;

	return cg_write(cgroup, control, buf);
}
/*
 * Locate the first "cgroup2" mount listed in /proc/self/mounts and copy its
 * mount point into @root (a buffer of @len bytes), NUL-terminated.
 *
 * Returns 0 on success. Returns -1 if the mounts file cannot be read, no
 * cgroup2 entry is found, or @root is too small to hold the mount point.
 */
int cg_find_unified_root(char *root, size_t len)
{
	char buf[10 * PAGE_SIZE];
	char *fs, *mount, *type;
	const char delim[] = "\n\t ";

	if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
		return -1;

	/*
	 * Example:
	 * cgroup /sys/fs/cgroup cgroup2 rw,seclabel,noexec,relatime 0 0
	 */
	for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
		mount = strtok(NULL, delim);
		type = strtok(NULL, delim);
		/* Skip the options, dump and pass fields. */
		strtok(NULL, delim);
		strtok(NULL, delim);
		strtok(NULL, delim);

		/*
		 * The last entry may be incomplete, e.g. when the file was
		 * larger than buf and got cut mid-line.
		 */
		if (!mount || !type)
			break;

		if (strcmp(type, "cgroup2") == 0) {
			/* Refuse to hand back a truncated path. */
			if (strlen(mount) >= len)
				return -1;
			snprintf(root, len, "%s", mount);
			return 0;
		}
	}

	return -1;
}
/*
 * Create the directory @cgroup with mode 0755.
 *
 * Returns the mkdir() result: 0 on success, -1 with errno set on failure.
 */
int cg_create(const char *cgroup)
{
	const mode_t mode = 0755;

	return mkdir(cgroup, mode);
}
/*
 * Poll "<cgroup>/cgroup.procs" until it contains at least @count newline
 * characters, sleeping 100 ms after each unsuccessful check.
 *
 * Returns 0 once the threshold is reached. Returns -1 if reading the file
 * fails or the threshold is still not reached after 11 checks.
 */
int cg_wait_for_proc_count(const char *cgroup, int count)
{
	char procs[10 * PAGE_SIZE] = {0};
	int tries_left = 10;

	while (tries_left >= 0) {
		const char *c;
		int newlines = 0;

		if (cg_read(cgroup, "cgroup.procs", procs, sizeof(procs)))
			return -1;

		for (c = procs; *c; c++)
			newlines += (*c == '\n');

		if (newlines >= count)
			return 0;

		usleep(100000);
		tries_left--;
	}

	return -1;
}
/*
 * Kill the processes of @cgroup.
 *
 * First tries writing "1" to "cgroup.kill". If that write fails, falls back
 * to reading "cgroup.procs" and sending SIGKILL to each pid parsed from it.
 *
 * Returns 0 on success, -1 if "cgroup.procs" cannot be read or a kill()
 * call fails.
 */
int cg_killall(const char *cgroup)
{
	char procs[PAGE_SIZE];
	char *cursor = procs;
	char *const end = procs + sizeof(procs);

	/* If cgroup.kill exists use it. */
	if (cg_write(cgroup, "cgroup.kill", "1") == 0)
		return 0;

	if (cg_read(cgroup, "cgroup.procs", procs, sizeof(procs)) != 0)
		return -1;

	while (cursor < end) {
		int pid = strtol(cursor, &cursor, 10);

		/* Stop at a zero/unparsable pid or at the end of the text. */
		if (pid == 0 || *cursor == '\0')
			break;

		/* Step over the separator that follows the pid. */
		cursor++;

		if (kill(pid, SIGKILL))
			return -1;
	}

	return 0;
}
/*
 * Remove the directory @cgroup.
 *
 * While rmdir() fails with EBUSY, the cgroup's processes are killed via
 * cg_killall() and the removal is retried after a 100 us pause; there is no
 * upper bound on the number of retries. A directory that does not exist
 * (ENOENT) counts as success.
 *
 * Returns 0 on success, otherwise the failing rmdir() result.
 */
int cg_destroy(const char *cgroup)
{
	int ret;

	for (;;) {
		ret = rmdir(cgroup);
		if (!ret || errno != EBUSY)
			break;

		cg_killall(cgroup);
		usleep(100);
	}

	if (ret && errno == ENOENT)
		ret = 0;

	return ret;
}
/*
 * Write @pid, formatted as decimal text, to "<cgroup>/cgroup.procs".
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter(const char *cgroup, int pid)
{
	char text[64];
	int rc;

	snprintf(text, sizeof(text), "%d", pid);
	rc = cg_write(cgroup, "cgroup.procs", text);

	return rc;
}
/*
 * Write "0" to "<cgroup>/cgroup.procs".
 * NOTE(review): "0" is presumably the kernel's shorthand for the writing
 * process - confirm against the cgroup documentation.
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter_current(const char *cgroup)
{
	char self[] = "0";

	return cg_write(cgroup, "cgroup.procs", self);
}
/*
 * Write "0" to "<cgroup>/cgroup.threads".
 * NOTE(review): "0" is presumably the kernel's shorthand for the writing
 * thread - confirm against the cgroup documentation.
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter_current_thread(const char *cgroup)
{
	char self[] = "0";

	return cg_write(cgroup, "cgroup.threads", self);
}
/*
 * Fork a child, have it write its own pid to "<cgroup>/cgroup.procs", run
 * @fn(@cgroup, @arg) in it and wait for the child to finish.
 *
 * The child exits with EXIT_FAILURE if the cgroup.procs write fails,
 * otherwise with the value returned by @fn.
 *
 * Returns the child's exit status if it exited normally. Returns a negative
 * value if fork() or waitpid() fails, or if the child did not exit normally.
 */
int cg_run(const char *cgroup,
	   int (*fn)(const char *cgroup, void *arg),
	   void *arg)
{
	int pid, waited, retcode = 0;

	pid = fork();
	if (pid < 0)
		return pid;

	if (pid == 0) {
		char buf[64];

		snprintf(buf, sizeof(buf), "%d", getpid());
		if (cg_write(cgroup, "cgroup.procs", buf))
			exit(EXIT_FAILURE);
		exit(fn(cgroup, arg));
	}

	/* A signal may interrupt the wait; retry rather than lose the child. */
	do {
		waited = waitpid(pid, &retcode, 0);
	} while (waited < 0 && errno == EINTR);

	/* Without a successful wait, retcode holds nothing meaningful. */
	if (waited < 0)
		return -1;

	if (WIFEXITED(retcode))
		return WEXITSTATUS(retcode);

	return -1;
}
/*
 * Create a child with sys_clone3() and CLONE_INTO_CGROUP, using @cgroup_fd
 * as the target cgroup.
 *
 * Returns the sys_clone3() result when the call succeeds or fails for a
 * reason other than missing kernel support. When support is missing - or
 * when CLONE_ARGS_SIZE_VER2 is not defined at build time - sets errno to
 * ENOSYS and returns -ENOSYS.
 */
pid_t clone_into_cgroup(int cgroup_fd)
{
#ifdef CLONE_ARGS_SIZE_VER2
	struct __clone_args args = {
		.flags = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup = cgroup_fd,
	};
	pid_t child = sys_clone3(&args, sizeof(struct __clone_args));

	if (child >= 0)
		return child;

	/*
	 * Only these two errors mean "not supported"; anything else is a
	 * genuine failure to report as-is:
	 * ENOSYS -> clone3() not available
	 * E2BIG  -> CLONE_INTO_CGROUP not available
	 */
	if (errno != ENOSYS && errno != E2BIG)
		return child;
#endif
	errno = ENOSYS;
	return -ENOSYS;
}
/*
 * Wait for @pid with waitid(), using @options plus __WALL | __WNOTHREAD,
 * restarting the wait whenever it is interrupted (EINTR).
 *
 * The reported si_status is then decoded with the wait-status macros, for
 * each state requested in @options, in this order:
 *   WEXITED    -> WEXITSTATUS() if WIFEXITED()
 *   WSTOPPED   -> WSTOPSIG() if WIFSTOPPED()
 *   WCONTINUED -> 0 if WIFCONTINUED()
 *
 * Returns -1 if waitid() fails or none of the above matches.
 */
int clone_reap(pid_t pid, int options)
{
	siginfo_t info = {
		.si_signo = 0,
	};
	int rc;

	do {
		rc = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
	} while (rc < 0 && errno == EINTR);

	if (rc < 0)
		return -1;

	if ((options & WEXITED) && WIFEXITED(info.si_status))
		return WEXITSTATUS(info.si_status);

	if ((options & WSTOPPED) && WIFSTOPPED(info.si_status))
		return WSTOPSIG(info.si_status);

	if ((options & WCONTINUED) && WIFCONTINUED(info.si_status))
		return 0;

	return -1;
}
/*
 * Open @dir with O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH.
 *
 * Returns the open() result: a file descriptor, or -1 with errno set.
 */
int dirfd_open_opath(const char *dir)
{
	const int flags = O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH;

	return open(dir, flags);
}
/*
 * Close @fd if it is non-negative, leaving errno exactly as it was before
 * the call. Wrapped in do { } while (0) so the macro behaves as a single
 * statement, including as the body of an if/else. @fd is evaluated twice,
 * so pass a plain variable.
 */
#define close_prot_errno(fd)				\
	do {						\
		if ((fd) >= 0) {			\
			int _e_ = errno;		\
			close(fd);			\
			errno = _e_;			\
		}					\
	} while (0)
/*
 * Open @cgroup as a directory fd, clone a child directly into it and have
 * the child exit with the value of @fn(@cgroup, @arg). Does not wait for
 * the child.
 *
 * Returns, in the parent, the clone_into_cgroup() result (child pid or a
 * negative value), or -1 if the cgroup directory cannot be opened.
 */
static int clone_into_cgroup_run_nowait(const char *cgroup,
					int (*fn)(const char *cgroup, void *arg),
					void *arg)
{
	pid_t child;
	int dirfd = dirfd_open_opath(cgroup);

	if (dirfd < 0)
		return -1;

	child = clone_into_cgroup(dirfd);
	/* Close without disturbing errno: callers inspect it afterwards. */
	close_prot_errno(dirfd);

	if (child != 0)
		return child;

	/* Child side. */
	exit(fn(cgroup, arg));
}
/*
 * Start @fn(@cgroup, @arg) in a child process placed in @cgroup, without
 * waiting for it.
 *
 * The child is first created with clone_into_cgroup_run_nowait(). If that
 * reports ENOSYS, a plain fork() is used instead and the child writes its
 * own pid to "cgroup.procs" (exiting with EXIT_FAILURE if that fails).
 *
 * Returns the child's pid in the parent, or -1 on failure.
 */
int cg_run_nowait(const char *cgroup,
		  int (*fn)(const char *cgroup, void *arg),
		  void *arg)
{
	char self[64];
	int pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);

	if (pid > 0)
		return pid;

	/* Genuine test failure. */
	if (pid < 0 && errno != ENOSYS)
		return -1;

	pid = fork();
	if (pid != 0)
		return pid;

	/* Child side of the fork() fallback. */
	snprintf(self, sizeof(self), "%d", getpid());
	if (cg_write(cgroup, "cgroup.procs", self))
		exit(EXIT_FAILURE);
	exit(fn(cgroup, arg));
}
/*
 * Open an unnamed temporary file in the current directory using
 * O_TMPFILE | O_RDWR | O_EXCL.
 *
 * Returns the open() result: a file descriptor, or -1 with errno set.
 */
int get_temp_fd(void)
{
	const int flags = O_TMPFILE | O_RDWR | O_EXCL;

	return open(".", flags);
}
/*
 * Grow the file behind @fd by @size bytes with ftruncate() and then issue
 * one PAGE_SIZE read() per PAGE_SIZE of the new total length, starting at
 * the descriptor's current offset.
 *
 * The read results are deliberately ignored: the data itself is not needed.
 *
 * Returns 0 on success, -1 if fstat() or ftruncate() fails.
 */
int alloc_pagecache(int fd, size_t size)
{
	char buf[PAGE_SIZE];
	struct stat st;
	size_t off;	/* size_t, so totals above INT_MAX do not overflow */

	if (fstat(fd, &st))
		return -1;

	size += st.st_size;

	if (ftruncate(fd, size))
		return -1;

	for (off = 0; off < size; off += sizeof(buf))
		read(fd, buf, sizeof(buf));

	return 0;
}
/*
 * Allocate a heap buffer whose size in bytes is passed through @arg, write
 * one byte at every PAGE_SIZE step of it, then free it. @cgroup is unused;
 * the signature matches the callback type taken by cg_run().
 *
 * Returns 0 on success, -1 if a non-empty buffer cannot be allocated.
 */
int alloc_anon(const char *cgroup, void *arg)
{
	size_t size = (unsigned long)arg;
	size_t off;
	char *buf;

	buf = malloc(size);
	if (!buf)
		/* malloc(0) may legitimately return NULL. */
		return size ? -1 : 0;

	/* Index by offset so no pointer is formed beyond the allocation. */
	for (off = 0; off < size; off += PAGE_SIZE)
		buf[off] = 0;

	free(buf);
	return 0;
}
/*
 * Count the non-empty lines of /proc/swaps.
 *
 * Returns 1 if there is more than one line, 0 if there is exactly one, and
 * -1 if the file cannot be read or is empty.
 * NOTE(review): the first line is presumably a column header, making
 * "more than one line" mean "at least one swap area" - confirm.
 */
int is_swap_enabled(void)
{
	char swaps[PAGE_SIZE];
	const char delim[] = "\n";
	int lines = 0;
	char *tok;

	if (read_text("/proc/swaps", swaps, sizeof(swaps)) <= 0)
		return -1;

	tok = strtok(swaps, delim);
	while (tok) {
		lines++;
		tok = strtok(NULL, delim);
	}

	return lines > 1;
}
/*
 * Write @score, as decimal text, to /proc/<pid>/oom_score_adj.
 *
 * Returns 0 on success, or the negative result of the failing open() or
 * dprintf() call.
 */
int set_oom_adj_score(int pid, int score)
{
	char file[PATH_MAX];
	int fd, written;

	sprintf(file, "/proc/%d/oom_score_adj", pid);

	fd = open(file, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	written = dprintf(fd, "%d", score);
	close(fd);

	return written < 0 ? written : 0;
}
/*
 * Check whether the text of /proc/mounts (as much as fits in a
 * 4 * PAGE_SIZE buffer) contains the substring @option.
 *
 * Returns 1 if it does, 0 if it does not, or the negative read_text()
 * result if the file cannot be read.
 */
int proc_mount_contains(const char *option)
{
	char mounts[4 * PAGE_SIZE];
	ssize_t nread;

	nread = read_text("/proc/mounts", mounts, sizeof(mounts));
	if (nread < 0)
		return nread;

	return strstr(mounts, option) ? 1 : 0;
}
/*
 * Read a procfs entry into @buf (of @size bytes) as text.
 *
 * With a non-zero @pid the file is /proc/<pid>/<item>. With @pid == 0 it is
 * /proc/thread-self/<item> when @thread is true and /proc/self/<item>
 * otherwise.
 *
 * Returns the number of bytes read, or -1 on failure.
 */
ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
{
	char path[PATH_MAX];
	ssize_t ret;

	if (!pid)
		snprintf(path, sizeof(path), "/proc/%s/%s",
			 thread ? "thread-self" : "self", item);
	else
		snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);

	/*
	 * Keep the result in a signed variable: stored in the unsigned
	 * 'size' the error check below could never be true.
	 */
	ret = read_text(path, buf, size);
	return ret < 0 ? -1 : ret;
}
/*
 * Read the procfs entry selected by @pid, @thread and @item (see
 * proc_read_text()) and look for the substring @needle in it.
 *
 * Returns 0 if @needle is found, -1 if it is not or if the read fails.
 */
int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
{
	char contents[PAGE_SIZE];

	if (proc_read_text(pid, thread, item, contents, sizeof(contents)) < 0)
		return -1;

	if (strstr(contents, needle))
		return 0;

	return -1;
}
/*
 * Clone a child directly into @cgroup, let it exit immediately with
 * EXIT_SUCCESS, and reap it.
 *
 * Returns 0 if the clone succeeded, -1 if the cgroup directory cannot be
 * opened or the clone fails.
 */
int clone_into_cgroup_run_wait(const char *cgroup)
{
	pid_t child;
	int dirfd = dirfd_open_opath(cgroup);

	if (dirfd < 0)
		return -1;

	child = clone_into_cgroup(dirfd);
	close_prot_errno(dirfd);

	if (child < 0)
		return -1;

	if (!child)
		exit(EXIT_SUCCESS);

	/*
	 * The outcome of the reap is irrelevant here; only the success of the
	 * clone itself is reported to the caller.
	 */
	(void)clone_reap(child, WEXITED);
	return 0;
}
/*
 * Create an inotify instance watching "<cgroup>/<filename>" for IN_MODIFY
 * events.
 *
 * Returns the inotify file descriptor, or -1 if the instance cannot be
 * created, the path cannot be built, or the watch cannot be added.
 */
static int __prepare_for_wait(const char *cgroup, const char *filename)
{
	char *path;
	int fd, wd;

	fd = inotify_init1(0);
	if (fd == -1)
		return fd;

	path = cg_control(cgroup, filename);
	if (!path) {
		close(fd);
		return -1;
	}

	wd = inotify_add_watch(fd, path, IN_MODIFY);
	/* The path string is heap-allocated and no longer needed. */
	free(path);

	if (wd == -1) {
		close(fd);
		fd = -1;
	}

	return fd;
}
/*
 * Set up a watch on "<cgroup>/cgroup.events".
 *
 * Returns the __prepare_for_wait() result: a file descriptor, or -1.
 */
int cg_prepare_for_wait(const char *cgroup)
{
	static const char events_file[] = "cgroup.events";

	return __prepare_for_wait(cgroup, events_file);
}
/*
 * Set up a watch on "<cgroup>/memory.events".
 *
 * Returns the __prepare_for_wait() result: a file descriptor, or -1.
 */
int memcg_prepare_for_wait(const char *cgroup)
{
	static const char events_file[] = "memory.events";

	return __prepare_for_wait(cgroup, events_file);
}
/*
 * Block until @fd becomes readable.
 *
 * poll() is called with a 10 second timeout and simply called again when it
 * times out or is interrupted by a signal, so a valid descriptor is waited
 * on for as long as it takes.
 *
 * Returns 0 once POLLIN is reported. Returns -1 if @fd is negative, if
 * poll() fails, or if poll() reports an event without POLLIN (POLLERR,
 * POLLHUP or POLLNVAL), since retrying in that state would spin forever.
 */
int cg_wait_for(int fd)
{
	struct pollfd fds = {
		.fd = fd,
		.events = POLLIN,
	};

	/* poll() ignores negative descriptors, so this could never finish. */
	if (fd < 0)
		return -1;

	for (;;) {
		int ret = poll(&fds, 1, 10000);

		if (ret == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}

		/* Timed out without any event: keep waiting. */
		if (ret == 0)
			continue;

		if (fds.revents & POLLIN)
			return 0;

		/* Only an error/hangup condition was reported. */
		return -1;
	}
}