110860541f
When a secret memory region is active, memfd_secret disables hibernation. One of the goals is to keep the secret data from being written to persistent storage. It accomplishes this by maintaining a reference count in `secretmem_users`. While this reference is held, the system cannot be hibernated because of the check in `hibernation_available()`. However, because `secretmem_users` is of type `atomic_t`, reference counter overflows are possible. As you can see, there is an `atomic_inc` for each `memfd` that is opened in the `memfd_secret` syscall. If a local attacker succeeds in opening 2^32 memfds, the counter will wrap around to 0. This means the system can be hibernated again even though regions of secret memory still exist, thereby bypassing the security check. In an attempt to fix this I have used `refcount_t` instead of `atomic_t`, which prevents reference counter overflows. Link: https://lkml.kernel.org/r/20210820043339.2151352-1-jordy@pwning.systems Signed-off-by: Jordy Zomer <jordy@pwning.systems> Cc: Kees Cook <keescook@chromium.org> Cc: Jordy Zomer <jordy@jordyzomer.github.io> Cc: James Bottomley <James.Bottomley@HansenPartnership.com> Cc: Mike Rapoport <rppt@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
257 lines
5.5 KiB
C
257 lines
5.5 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Copyright IBM Corporation, 2021
|
|
*
|
|
* Author: Mike Rapoport <rppt@linux.ibm.com>
|
|
*/
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/mount.h>
|
|
#include <linux/memfd.h>
|
|
#include <linux/bitops.h>
|
|
#include <linux/printk.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/pseudo_fs.h>
|
|
#include <linux/secretmem.h>
|
|
#include <linux/set_memory.h>
|
|
#include <linux/sched/signal.h>
|
|
#include <linux/refcount.h>
|
|
|
|
#include <uapi/linux/magic.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
#include "internal.h"
|
|
|
|
/* Prefix every pr_*() message emitted from this file with "secretmem: ". */
#undef pr_fmt
#define pr_fmt(fmt) "secretmem: " fmt

/*
 * Masks used to validate the memfd_secret() system call argument.
 * No mode bits are defined yet, so both masks are currently zero.
 */
#define SECRETMEM_MODE_MASK	(0x0)
#define SECRETMEM_FLAGS_MASK	SECRETMEM_MODE_MASK
|
|
|
|
static bool secretmem_enable __ro_after_init;
|
|
module_param_named(enable, secretmem_enable, bool, 0400);
|
|
MODULE_PARM_DESC(secretmem_enable,
|
|
"Enable secretmem and memfd_secret(2) system call");
|
|
|
|
static refcount_t secretmem_users;
|
|
|
|
bool secretmem_active(void)
|
|
{
|
|
return !!refcount_read(&secretmem_users);
|
|
}
|
|
|
|
static vm_fault_t secretmem_fault(struct vm_fault *vmf)
|
|
{
|
|
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
|
|
struct inode *inode = file_inode(vmf->vma->vm_file);
|
|
pgoff_t offset = vmf->pgoff;
|
|
gfp_t gfp = vmf->gfp_mask;
|
|
unsigned long addr;
|
|
struct page *page;
|
|
int err;
|
|
|
|
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
|
|
return vmf_error(-EINVAL);
|
|
|
|
retry:
|
|
page = find_lock_page(mapping, offset);
|
|
if (!page) {
|
|
page = alloc_page(gfp | __GFP_ZERO);
|
|
if (!page)
|
|
return VM_FAULT_OOM;
|
|
|
|
err = set_direct_map_invalid_noflush(page);
|
|
if (err) {
|
|
put_page(page);
|
|
return vmf_error(err);
|
|
}
|
|
|
|
__SetPageUptodate(page);
|
|
err = add_to_page_cache_lru(page, mapping, offset, gfp);
|
|
if (unlikely(err)) {
|
|
put_page(page);
|
|
/*
|
|
* If a split of large page was required, it
|
|
* already happened when we marked the page invalid
|
|
* which guarantees that this call won't fail
|
|
*/
|
|
set_direct_map_default_noflush(page);
|
|
if (err == -EEXIST)
|
|
goto retry;
|
|
|
|
return vmf_error(err);
|
|
}
|
|
|
|
addr = (unsigned long)page_address(page);
|
|
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
|
}
|
|
|
|
vmf->page = page;
|
|
return VM_FAULT_LOCKED;
|
|
}
|
|
|
|
static const struct vm_operations_struct secretmem_vm_ops = {
|
|
.fault = secretmem_fault,
|
|
};
|
|
|
|
static int secretmem_release(struct inode *inode, struct file *file)
|
|
{
|
|
refcount_dec(&secretmem_users);
|
|
return 0;
|
|
}
|
|
|
|
static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
|
|
{
|
|
unsigned long len = vma->vm_end - vma->vm_start;
|
|
|
|
if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
|
|
return -EINVAL;
|
|
|
|
if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
|
|
return -EAGAIN;
|
|
|
|
vma->vm_flags |= VM_LOCKED | VM_DONTDUMP;
|
|
vma->vm_ops = &secretmem_vm_ops;
|
|
|
|
return 0;
|
|
}
|
|
|
|
bool vma_is_secretmem(struct vm_area_struct *vma)
|
|
{
|
|
return vma->vm_ops == &secretmem_vm_ops;
|
|
}
|
|
|
|
static const struct file_operations secretmem_fops = {
|
|
.release = secretmem_release,
|
|
.mmap = secretmem_mmap,
|
|
};
|
|
|
|
/*
 * ->isolate_page() hook: unconditionally returns false, i.e. a secretmem
 * page is never reported as isolated, whatever @mode is requested.
 */
static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode)
{
	return false;
}
|
|
|
|
static int secretmem_migratepage(struct address_space *mapping,
|
|
struct page *newpage, struct page *page,
|
|
enum migrate_mode mode)
|
|
{
|
|
return -EBUSY;
|
|
}
|
|
|
|
/*
 * ->freepage() hook: puts @page back to its default direct-map state and
 * then zeroes its contents.
 *
 * NOTE(review): the order looks deliberate - the direct map entry is restored
 * first, presumably so the page can be cleared through it; confirm before
 * reordering.
 */
static void secretmem_freepage(struct page *page)
{
	set_direct_map_default_noflush(page);
	clear_highpage(page);
}
|
|
|
|
const struct address_space_operations secretmem_aops = {
|
|
.set_page_dirty = __set_page_dirty_no_writeback,
|
|
.freepage = secretmem_freepage,
|
|
.migratepage = secretmem_migratepage,
|
|
.isolate_page = secretmem_isolate_page,
|
|
};
|
|
|
|
static struct vfsmount *secretmem_mnt;
|
|
|
|
static struct file *secretmem_file_create(unsigned long flags)
|
|
{
|
|
struct file *file = ERR_PTR(-ENOMEM);
|
|
struct inode *inode;
|
|
|
|
inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
|
|
if (IS_ERR(inode))
|
|
return ERR_CAST(inode);
|
|
|
|
file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
|
|
O_RDWR, &secretmem_fops);
|
|
if (IS_ERR(file))
|
|
goto err_free_inode;
|
|
|
|
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
|
|
mapping_set_unevictable(inode->i_mapping);
|
|
|
|
inode->i_mapping->a_ops = &secretmem_aops;
|
|
|
|
/* pretend we are a normal file with zero size */
|
|
inode->i_mode |= S_IFREG;
|
|
inode->i_size = 0;
|
|
|
|
return file;
|
|
|
|
err_free_inode:
|
|
iput(inode);
|
|
return file;
|
|
}
|
|
|
|
/*
 * memfd_secret() system call: create a secretmem file and return a file
 * descriptor for it.
 *
 * @flags: may contain O_CLOEXEC plus bits from SECRETMEM_FLAGS_MASK; any
 *         other bit is rejected.
 *
 * Return: the new file descriptor on success; -ENOSYS when secretmem is not
 * enabled, -EINVAL for unsupported flags, or the error from fd allocation or
 * file creation.
 */
SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
{
	struct file *file;
	int fd, err;

	/* make sure local flags do not conflict with global fcntl.h */
	BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);

	if (!secretmem_enable)
		return -ENOSYS;

	if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
		return -EINVAL;

	fd = get_unused_fd_flags(flags & O_CLOEXEC);
	if (fd < 0)
		return fd;

	file = secretmem_file_create(flags);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_put_fd;
	}

	file->f_flags |= O_LARGEFILE;

	/*
	 * Account the new user BEFORE publishing the descriptor. Once
	 * fd_install() has run, userspace can already use the fd (so secret
	 * data may exist while secretmem_active() still reports false) or
	 * close it (so secretmem_release() would drop a reference that has
	 * not been taken yet).
	 */
	refcount_inc(&secretmem_users);
	fd_install(fd, file);
	return fd;

err_put_fd:
	put_unused_fd(fd);
	return err;
}
|
|
|
|
static int secretmem_init_fs_context(struct fs_context *fc)
|
|
{
|
|
return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM;
|
|
}
|
|
|
|
static struct file_system_type secretmem_fs = {
|
|
.name = "secretmem",
|
|
.init_fs_context = secretmem_init_fs_context,
|
|
.kill_sb = kill_anon_super,
|
|
};
|
|
|
|
static int secretmem_init(void)
|
|
{
|
|
int ret = 0;
|
|
|
|
if (!secretmem_enable)
|
|
return ret;
|
|
|
|
secretmem_mnt = kern_mount(&secretmem_fs);
|
|
if (IS_ERR(secretmem_mnt))
|
|
ret = PTR_ERR(secretmem_mnt);
|
|
|
|
/* prevent secretmem mappings from ever getting PROT_EXEC */
|
|
secretmem_mnt->mnt_flags |= MNT_NOEXEC;
|
|
|
|
return ret;
|
|
}
|
|
fs_initcall(secretmem_init);
|