36ca7943ac
When the max pages (last_page in the swap header + 1) is smaller than the total pages (inode size) of the swapfile, iomap_swapfile_activate overwrites sis->max with total pages. However, frontswap_map is a swap page state bitmap allocated using the initial sis->max page count read from the swap header. If swapfile activation increases sis->max, it's possible for the frontswap code to walk off the end of the bitmap, thereby corrupting kernel memory. [djwong: modify the description a bit; the original paragraph reads: "However, frontswap_map is allocated using max pages. When test and clear the sis offset, which is larger than max pages, of frontswap_map in __frontswap_invalidate_page(), neighbors of frontswap_map may be overwritten, i.e., slab is polluted." Note also that this bug resulted in a behavioral change: activating a swap file that was formatted and later extended results in all pages being activated, not the number of pages recorded in the swap header.] This fixes the issue by considering the limitation of max pages of swap info in iomap_swapfile_add_extent(). To reproduce the case, compile kernel with slub RED ZONE, then run test: $ sudo stress-ng -a 1 -x softlockup,resources -t 72h --metrics --times \ --verify -v -Y /root/tmpdir/stress-ng/stress-statistic-12.yaml \ --log-file /root/tmpdir/stress-ng/stress-logfile-12.txt \ --temp-path /root/tmpdir/stress-ng/ We'll get the error log as below: [ 1151.015141] ============================================================================= [ 1151.016489] BUG kmalloc-16 (Not tainted): Right Redzone overwritten [ 1151.017486] ----------------------------------------------------------------------------- [ 1151.017486] [ 1151.018997] Disabling lock debugging due to kernel taint [ 1151.019873] INFO: 0x0000000084e43932-0x0000000098d17cae @offset=7392. First byte 0x0 instead of 0xcc [ 1151.021303] INFO: Allocated in __do_sys_swapon+0xcf6/0x1170 age=43417 cpu=9 pid=3816 [ 1151.022538] __slab_alloc+0xe/0x20 [ 1151.023069] __kmalloc_node+0xfd/0x4b0 [ 1151.023704] __do_sys_swapon+0xcf6/0x1170 [ 1151.024346] do_syscall_64+0x33/0x40 [ 1151.024925] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1151.025749] INFO: Freed in put_cred_rcu+0xa1/0xc0 age=43424 cpu=3 pid=2041 [ 1151.026889] kfree+0x276/0x2b0 [ 1151.027405] put_cred_rcu+0xa1/0xc0 [ 1151.027949] rcu_do_batch+0x17d/0x410 [ 1151.028566] rcu_core+0x14e/0x2b0 [ 1151.029084] __do_softirq+0x101/0x29e [ 1151.029645] asm_call_irq_on_stack+0x12/0x20 [ 1151.030381] do_softirq_own_stack+0x37/0x40 [ 1151.031037] do_softirq.part.15+0x2b/0x30 [ 1151.031710] __local_bh_enable_ip+0x4b/0x50 [ 1151.032412] copy_fpstate_to_sigframe+0x111/0x360 [ 1151.033197] __setup_rt_frame+0xce/0x480 [ 1151.033809] arch_do_signal+0x1a3/0x250 [ 1151.034463] exit_to_user_mode_prepare+0xcf/0x110 [ 1151.035242] syscall_exit_to_user_mode+0x27/0x190 [ 1151.035970] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1151.036795] INFO: Slab 0x000000003b9de4dc objects=44 used=9 fp=0x00000000539e349e flags=0xfffffc0010201 [ 1151.038323] INFO: Object 0x000000004855ba01 @offset=7376 fp=0x0000000000000000 [ 1151.038323] [ 1151.039683] Redzone 000000008d0afd3d: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ................ [ 1151.041180] Object 000000004855ba01: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [ 1151.042714] Redzone 0000000084e43932: 00 00 00 c0 cc cc cc cc ........ [ 1151.044120] Padding 000000000864c042: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ [ 1151.045615] CPU: 5 PID: 3816 Comm: stress-ng Tainted: G B 5.10.50+ #7 [ 1151.046846] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 [ 1151.048633] Call Trace: [ 1151.049072] dump_stack+0x57/0x6a [ 1151.049585] check_bytes_and_report+0xed/0x110 [ 1151.050320] check_object+0x1eb/0x290 [ 1151.050924] ? __x64_sys_swapoff+0x39a/0x540 [ 1151.051646] free_debug_processing+0x151/0x350 [ 1151.052333] __slab_free+0x21a/0x3a0 [ 1151.052938] ? _cond_resched+0x2d/0x40 [ 1151.053529] ? __vunmap+0x1de/0x220 [ 1151.054139] ? __x64_sys_swapoff+0x39a/0x540 [ 1151.054796] ? kfree+0x276/0x2b0 [ 1151.055307] kfree+0x276/0x2b0 [ 1151.055832] __x64_sys_swapoff+0x39a/0x540 [ 1151.056466] do_syscall_64+0x33/0x40 [ 1151.057084] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1151.057866] RIP: 0033:0x150340b0ffb7 [ 1151.058481] Code: Unable to access opcode bytes at RIP 0x150340b0ff8d. [ 1151.059537] RSP: 002b:00007fff7f4ee238 EFLAGS: 00000246 ORIG_RAX: 00000000000000a8 [ 1151.060768] RAX: ffffffffffffffda RBX: 00007fff7f4ee66c RCX: 0000150340b0ffb7 [ 1151.061904] RDX: 000000000000000a RSI: 0000000000018094 RDI: 00007fff7f4ee860 [ 1151.063033] RBP: 00007fff7f4ef980 R08: 0000000000000000 R09: 0000150340a672bd [ 1151.064135] R10: 00007fff7f4edca0 R11: 0000000000000246 R12: 0000000000018094 [ 1151.065253] R13: 0000000000000005 R14: 000000000160d930 R15: 00007fff7f4ee66c [ 1151.066413] FIX kmalloc-16: Restoring 0x0000000084e43932-0x0000000098d17cae=0xcc [ 1151.066413] [ 1151.067890] FIX kmalloc-16: Object at 0x000000004855ba01 not freed Fixes:67482129cd
("iomap: add a swapfile activation function") Fixes:a45c0eccc5
("iomap: move the swapfile code into a separate file") Signed-off-by: Gang Deng <gavin.dg@linux.alibaba.com> Signed-off-by: Xu Yu <xuyu@linux.alibaba.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de>
196 lines
5.6 KiB
C
196 lines
5.6 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Copyright (C) 2018 Oracle. All Rights Reserved.
|
|
* Author: Darrick J. Wong <darrick.wong@oracle.com>
|
|
*/
|
|
#include <linux/module.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/iomap.h>
|
|
#include <linux/swap.h>
|
|
|
|
/* Swapfile activation */
|
|
|
|
struct iomap_swapfile_info {
|
|
struct iomap iomap; /* accumulated iomap */
|
|
struct swap_info_struct *sis;
|
|
uint64_t lowest_ppage; /* lowest physical addr seen (pages) */
|
|
uint64_t highest_ppage; /* highest physical addr seen (pages) */
|
|
unsigned long nr_pages; /* number of pages collected */
|
|
int nr_extents; /* extent count */
|
|
struct file *file;
|
|
};
|
|
|
|
/*
|
|
* Collect physical extents for this swap file. Physical extents reported to
|
|
* the swap code must be trimmed to align to a page boundary. The logical
|
|
* offset within the file is irrelevant since the swapfile code maps logical
|
|
* page numbers of the swap device to the physical page-aligned extents.
|
|
*/
|
|
static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
|
|
{
|
|
struct iomap *iomap = &isi->iomap;
|
|
unsigned long nr_pages;
|
|
unsigned long max_pages;
|
|
uint64_t first_ppage;
|
|
uint64_t first_ppage_reported;
|
|
uint64_t next_ppage;
|
|
int error;
|
|
|
|
if (unlikely(isi->nr_pages >= isi->sis->max))
|
|
return 0;
|
|
max_pages = isi->sis->max - isi->nr_pages;
|
|
|
|
/*
|
|
* Round the start up and the end down so that the physical
|
|
* extent aligns to a page boundary.
|
|
*/
|
|
first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
|
|
next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
|
|
PAGE_SHIFT;
|
|
|
|
/* Skip too-short physical extents. */
|
|
if (first_ppage >= next_ppage)
|
|
return 0;
|
|
nr_pages = next_ppage - first_ppage;
|
|
nr_pages = min(nr_pages, max_pages);
|
|
|
|
/*
|
|
* Calculate how much swap space we're adding; the first page contains
|
|
* the swap header and doesn't count. The mm still wants that first
|
|
* page fed to add_swap_extent, however.
|
|
*/
|
|
first_ppage_reported = first_ppage;
|
|
if (iomap->offset == 0)
|
|
first_ppage_reported++;
|
|
if (isi->lowest_ppage > first_ppage_reported)
|
|
isi->lowest_ppage = first_ppage_reported;
|
|
if (isi->highest_ppage < (next_ppage - 1))
|
|
isi->highest_ppage = next_ppage - 1;
|
|
|
|
/* Add extent, set up for the next call. */
|
|
error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
|
|
if (error < 0)
|
|
return error;
|
|
isi->nr_extents += error;
|
|
isi->nr_pages += nr_pages;
|
|
return 0;
|
|
}
|
|
|
|
static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
|
|
{
|
|
char *buf, *p = ERR_PTR(-ENOMEM);
|
|
|
|
buf = kmalloc(PATH_MAX, GFP_KERNEL);
|
|
if (buf)
|
|
p = file_path(isi->file, buf, PATH_MAX);
|
|
pr_err("swapon: file %s %s\n", IS_ERR(p) ? "<unknown>" : p, str);
|
|
kfree(buf);
|
|
return -EINVAL;
|
|
}
|
|
|
|
/*
|
|
* Accumulate iomaps for this swap file. We have to accumulate iomaps because
|
|
* swap only cares about contiguous page-aligned physical extents and makes no
|
|
* distinction between written and unwritten extents.
|
|
*/
|
|
static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
|
|
struct iomap *iomap, struct iomap_swapfile_info *isi)
|
|
{
|
|
switch (iomap->type) {
|
|
case IOMAP_MAPPED:
|
|
case IOMAP_UNWRITTEN:
|
|
/* Only real or unwritten extents. */
|
|
break;
|
|
case IOMAP_INLINE:
|
|
/* No inline data. */
|
|
return iomap_swapfile_fail(isi, "is inline");
|
|
default:
|
|
return iomap_swapfile_fail(isi, "has unallocated extents");
|
|
}
|
|
|
|
/* No uncommitted metadata or shared blocks. */
|
|
if (iomap->flags & IOMAP_F_DIRTY)
|
|
return iomap_swapfile_fail(isi, "is not committed");
|
|
if (iomap->flags & IOMAP_F_SHARED)
|
|
return iomap_swapfile_fail(isi, "has shared extents");
|
|
|
|
/* Only one bdev per swap file. */
|
|
if (iomap->bdev != isi->sis->bdev)
|
|
return iomap_swapfile_fail(isi, "outside the main device");
|
|
|
|
if (isi->iomap.length == 0) {
|
|
/* No accumulated extent, so just store it. */
|
|
memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
|
|
} else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
|
|
/* Append this to the accumulated extent. */
|
|
isi->iomap.length += iomap->length;
|
|
} else {
|
|
/* Otherwise, add the retained iomap and store this one. */
|
|
int error = iomap_swapfile_add_extent(isi);
|
|
if (error)
|
|
return error;
|
|
memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
|
|
}
|
|
return iomap_length(iter);
|
|
}
|
|
|
|
/*
|
|
* Iterate a swap file's iomaps to construct physical extents that can be
|
|
* passed to the swapfile subsystem.
|
|
*/
|
|
int iomap_swapfile_activate(struct swap_info_struct *sis,
|
|
struct file *swap_file, sector_t *pagespan,
|
|
const struct iomap_ops *ops)
|
|
{
|
|
struct inode *inode = swap_file->f_mapping->host;
|
|
struct iomap_iter iter = {
|
|
.inode = inode,
|
|
.pos = 0,
|
|
.len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE),
|
|
.flags = IOMAP_REPORT,
|
|
};
|
|
struct iomap_swapfile_info isi = {
|
|
.sis = sis,
|
|
.lowest_ppage = (sector_t)-1ULL,
|
|
.file = swap_file,
|
|
};
|
|
int ret;
|
|
|
|
/*
|
|
* Persist all file mapping metadata so that we won't have any
|
|
* IOMAP_F_DIRTY iomaps.
|
|
*/
|
|
ret = vfs_fsync(swap_file, 1);
|
|
if (ret)
|
|
return ret;
|
|
|
|
while ((ret = iomap_iter(&iter, ops)) > 0)
|
|
iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
if (isi.iomap.length) {
|
|
ret = iomap_swapfile_add_extent(&isi);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* If this swapfile doesn't contain even a single page-aligned
|
|
* contiguous range of blocks, reject this useless swapfile to
|
|
* prevent confusion later on.
|
|
*/
|
|
if (isi.nr_pages == 0) {
|
|
pr_warn("swapon: Cannot find a single usable page in file.\n");
|
|
return -EINVAL;
|
|
}
|
|
|
|
*pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
|
|
sis->max = isi.nr_pages;
|
|
sis->pages = isi.nr_pages - 1;
|
|
sis->highest_bit = isi.nr_pages - 1;
|
|
return isi.nr_extents;
|
|
}
|
|
EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
|