linux/arch/s390/kernel/machine_kexec.c
Alexander Gordeev 4df29d2b90 s390/smp: rework absolute lowcore access
Temporary unsetting of the prefix page in memcpy_absolute() routine
poses a risk of executing code path with unexpectedly disabled prefix
page. This rework avoids the prefix page uninstalling and disabling
of normal and machine check interrupts when accessing the absolute
zero memory.

Although memcpy_absolute() routine can access the whole memory, it is
only used to update the absolute zero lowcore. This rework therefore
introduces a new mechanism for the absolute zero lowcore access and
scraps memcpy_absolute() routine for good.

Instead, an area is reserved in the virtual memory that is used for
the absolute lowcore access only. That area holds an array of 8KB
virtual mappings - one per CPU. Whenever a CPU is brought online, the
corresponding item is mapped to the real address of the previously
installed prefix page.

The absolute zero lowcore access works like this: a CPU calls the
new primitive get_abs_lowcore() to obtain its 8KB mapping as a
pointer to the struct lowcore. Virtual address references to that
pointer get translated to the real addresses of the prefix page,
which in turn gets swapped with the absolute zero memory addresses
due to prefixing. Once the pointer is not needed it must be released
with put_abs_lowcore() primitive:

	struct lowcore *abs_lc;
	unsigned long flags;

	abs_lc = get_abs_lowcore(&flags);
	abs_lc->... = ...;
	put_abs_lowcore(abs_lc, flags);

To ensure the described mechanism works large segment- and region-
table entries must be avoided for the 8KB mappings. Failure to do
so results in usage of Region-Frame Absolute Address (RFAA) or
Segment-Frame Absolute Address (SFAA) large page fields. In that
case absolute addresses would be used to address the prefix page
instead of the real ones and the prefixing would get bypassed.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
2022-09-14 16:46:00 +02:00

299 lines
7.1 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corp. 2005, 2011
*
* Author(s): Rolf Adelsberger,
* Michael Holzheu <holzheu@linux.vnet.ibm.com>
*/
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/debug_locks.h>
#include <asm/cio.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/ipl.h>
#include <asm/diag.h>
#include <asm/elf.h>
#include <asm/asm-offsets.h>
#include <asm/cacheflush.h>
#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
#include <asm/set_memory.h>
#include <asm/stacktrace.h>
#include <asm/switch_to.h>
#include <asm/nmi.h>
#include <asm/sclp.h>
typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long,
unsigned long);
extern const unsigned char relocate_kernel[];
extern const unsigned long long relocate_kernel_len;
#ifdef CONFIG_CRASH_DUMP
/*
* Reset the system, copy boot CPU registers to absolute zero,
* and jump to the kdump image
*/
static void __do_machine_kdump(void *image)
{
int (*start_kdump)(int);
unsigned long prefix;
/* store_status() saved the prefix register to lowcore */
prefix = (unsigned long) S390_lowcore.prefixreg_save_area;
/* Now do the reset */
s390_reset_system();
/*
* Copy dump CPU store status info to absolute zero.
* This need to be done *after* s390_reset_system set the
* prefix register of this CPU to zero
*/
memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA),
(void *)(prefix + __LC_FPREGS_SAVE_AREA), 512);
__load_psw_mask(PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA);
start_kdump = (void *)((struct kimage *) image)->start;
start_kdump(1);
/* Die if start_kdump returns */
disabled_wait();
}
/*
* Start kdump: create a LGR log entry, store status of all CPUs and
* branch to __do_machine_kdump.
*/
static noinline void __machine_kdump(void *image)
{
struct mcesa *mcesa;
union ctlreg2 cr2_old, cr2_new;
int this_cpu, cpu;
lgr_info_log();
/* Get status of the other CPUs */
this_cpu = smp_find_processor_id(stap());
for_each_online_cpu(cpu) {
if (cpu == this_cpu)
continue;
if (smp_store_status(cpu))
continue;
}
/* Store status of the boot CPU */
mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
if (MACHINE_HAS_VX)
save_vx_regs((__vector128 *) mcesa->vector_save_area);
if (MACHINE_HAS_GS) {
__ctl_store(cr2_old.val, 2, 2);
cr2_new = cr2_old;
cr2_new.gse = 1;
__ctl_load(cr2_new.val, 2, 2);
save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area);
__ctl_load(cr2_old.val, 2, 2);
}
/*
* To create a good backchain for this CPU in the dump store_status
* is passed the address of a function. The address is saved into
* the PSW save area of the boot CPU and the function is invoked as
* a tail call of store_status. The backchain in the dump will look
* like this:
* restart_int_handler -> __machine_kexec -> __do_machine_kdump
* The call to store_status() will not return.
*/
store_status(__do_machine_kdump, image);
}
static unsigned long do_start_kdump(unsigned long addr)
{
struct kimage *image = (struct kimage *) addr;
int (*start_kdump)(int) = (void *)image->start;
int rc;
__arch_local_irq_stnsm(0xfb); /* disable DAT */
rc = start_kdump(0);
__arch_local_irq_stosm(0x04); /* enable DAT */
return rc;
}
#endif /* CONFIG_CRASH_DUMP */
/*
* Check if kdump checksums are valid: We call purgatory with parameter "0"
*/
static bool kdump_csum_valid(struct kimage *image)
{
#ifdef CONFIG_CRASH_DUMP
int rc;
preempt_disable();
rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long, do_start_kdump,
unsigned long, (unsigned long)image);
preempt_enable();
return rc == 0;
#else
return false;
#endif
}
#ifdef CONFIG_CRASH_DUMP
void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
{
unsigned long addr, size;
for (addr = begin; addr < end; addr += PAGE_SIZE)
free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
size = begin - crashk_res.start;
if (size)
os_info_crashkernel_add(crashk_res.start, size);
else
os_info_crashkernel_add(0, 0);
}
static void crash_protect_pages(int protect)
{
unsigned long size;
if (!crashk_res.end)
return;
size = resource_size(&crashk_res);
if (protect)
set_memory_ro(crashk_res.start, size >> PAGE_SHIFT);
else
set_memory_rw(crashk_res.start, size >> PAGE_SHIFT);
}
void arch_kexec_protect_crashkres(void)
{
crash_protect_pages(1);
}
void arch_kexec_unprotect_crashkres(void)
{
crash_protect_pages(0);
}
#endif
/*
* Give back memory to hypervisor before new kdump is loaded
*/
static int machine_kexec_prepare_kdump(void)
{
#ifdef CONFIG_CRASH_DUMP
if (MACHINE_IS_VM)
diag10_range(PFN_DOWN(crashk_res.start),
PFN_DOWN(crashk_res.end - crashk_res.start + 1));
return 0;
#else
return -EINVAL;
#endif
}
int machine_kexec_prepare(struct kimage *image)
{
void *reboot_code_buffer;
if (image->type == KEXEC_TYPE_CRASH)
return machine_kexec_prepare_kdump();
/* We don't support anything but the default image type for now. */
if (image->type != KEXEC_TYPE_DEFAULT)
return -EINVAL;
/* Get the destination where the assembler code should be copied to.*/
reboot_code_buffer = (void *) page_to_phys(image->control_code_page);
/* Then copy it */
memcpy(reboot_code_buffer, relocate_kernel, relocate_kernel_len);
return 0;
}
void machine_kexec_cleanup(struct kimage *image)
{
}
void arch_crash_save_vmcoreinfo(void)
{
struct lowcore *abs_lc;
unsigned long flags;
VMCOREINFO_SYMBOL(lowcore_ptr);
VMCOREINFO_SYMBOL(high_memory);
VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
vmcoreinfo_append_str("SAMODE31=%lx\n", __samode31);
vmcoreinfo_append_str("EAMODE31=%lx\n", __eamode31);
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
abs_lc = get_abs_lowcore(&flags);
abs_lc->vmcore_info = paddr_vmcoreinfo_note();
put_abs_lowcore(abs_lc, flags);
}
void machine_shutdown(void)
{
}
void machine_crash_shutdown(struct pt_regs *regs)
{
set_os_info_reipl_block();
}
/*
* Do normal kexec
*/
static void __do_machine_kexec(void *data)
{
unsigned long diag308_subcode;
relocate_kernel_t data_mover;
struct kimage *image = data;
s390_reset_system();
data_mover = (relocate_kernel_t) page_to_phys(image->control_code_page);
__arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */
/* Call the moving routine */
diag308_subcode = DIAG308_CLEAR_RESET;
if (sclp.has_iplcc)
diag308_subcode |= DIAG308_FLAG_EI;
(*data_mover)(&image->head, image->start, diag308_subcode);
/* Die if kexec returns */
disabled_wait();
}
/*
* Reset system and call either kdump or normal kexec
*/
static void __machine_kexec(void *data)
{
pfault_fini();
tracing_off();
debug_locks_off();
#ifdef CONFIG_CRASH_DUMP
if (((struct kimage *) data)->type == KEXEC_TYPE_CRASH)
__machine_kdump(data);
#endif
__do_machine_kexec(data);
}
/*
* Do either kdump or normal kexec. In case of kdump we first ask
* purgatory, if kdump checksums are valid.
*/
void machine_kexec(struct kimage *image)
{
if (image->type == KEXEC_TYPE_CRASH && !kdump_csum_valid(image))
return;
tracer_disable();
smp_send_stop();
smp_call_ipl_cpu(__machine_kexec, image);
}