Merge branch 'x86/asm' into x86/apic
Pick up dependent changes to avoid merge conflicts
This commit is contained in:
commit
c0bb80cfa3
179
Documentation/x86/orc-unwinder.txt
Normal file
179
Documentation/x86/orc-unwinder.txt
Normal file
@ -0,0 +1,179 @@
|
||||
ORC unwinder
|
||||
============
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is
|
||||
similar in concept to a DWARF unwinder. The difference is that the
|
||||
format of the ORC data is much simpler than DWARF, which in turn allows
|
||||
the ORC unwinder to be much simpler and faster.
|
||||
|
||||
The ORC data consists of unwind tables which are generated by objtool.
|
||||
They contain out-of-band data which is used by the in-kernel ORC
|
||||
unwinder. Objtool generates the ORC data by first doing compile-time
|
||||
stack metadata validation (CONFIG_STACK_VALIDATION). After analyzing
|
||||
all the code paths of a .o file, it determines information about the
|
||||
stack state at each instruction address in the file and outputs that
|
||||
information to the .orc_unwind and .orc_unwind_ip sections.
|
||||
|
||||
The per-object ORC sections are combined at link time and are sorted and
|
||||
post-processed at boot time. The unwinder uses the resulting data to
|
||||
correlate instruction addresses with their stack states at run time.
|
||||
|
||||
|
||||
ORC vs frame pointers
|
||||
---------------------
|
||||
|
||||
With frame pointers enabled, GCC adds instrumentation code to every
|
||||
function in the kernel. The kernel's .text size increases by about
|
||||
3.2%, resulting in a broad kernel-wide slowdown. Measurements by Mel
|
||||
Gorman [1] have shown a slowdown of 5-10% for some workloads.
|
||||
|
||||
In contrast, the ORC unwinder has no effect on text size or runtime
|
||||
performance, because the debuginfo is out of band. So if you disable
|
||||
frame pointers and enable the ORC unwinder, you get a nice performance
|
||||
improvement across the board, and still have reliable stack traces.
|
||||
|
||||
Ingo Molnar says:
|
||||
|
||||
"Note that it's not just a performance improvement, but also an
|
||||
instruction cache locality improvement: 3.2% .text savings almost
|
||||
directly transform into a similarly sized reduction in cache
|
||||
footprint. That can transform to even higher speedups for workloads
|
||||
whose cache locality is borderline."
|
||||
|
||||
Another benefit of ORC compared to frame pointers is that it can
|
||||
reliably unwind across interrupts and exceptions. Frame pointer based
|
||||
unwinds can sometimes skip the caller of the interrupted function, if it
|
||||
was a leaf function or if the interrupt hit before the frame pointer was
|
||||
saved.
|
||||
|
||||
The main disadvantage of the ORC unwinder compared to frame pointers is
|
||||
that it needs more memory to store the ORC unwind tables: roughly 2-4MB
|
||||
depending on the kernel config.
|
||||
|
||||
|
||||
ORC vs DWARF
|
||||
------------
|
||||
|
||||
ORC debuginfo's advantage over DWARF itself is that it's much simpler.
|
||||
It gets rid of the complex DWARF CFI state machine and also gets rid of
|
||||
the tracking of unnecessary registers. This allows the unwinder to be
|
||||
much simpler, meaning fewer bugs, which is especially important for
|
||||
mission critical oops code.
|
||||
|
||||
The simpler debuginfo format also enables the unwinder to be much faster
|
||||
than DWARF, which is important for perf and lockdep. In a basic
|
||||
performance test by Jiri Slaby [2], the ORC unwinder was about 20x
|
||||
faster than an out-of-tree DWARF unwinder. (Note: That measurement was
|
||||
taken before some performance tweaks were added, which doubled
|
||||
performance, so the speedup over DWARF may be closer to 40x.)
|
||||
|
||||
The ORC data format does have a few downsides compared to DWARF. ORC
|
||||
unwind tables take up ~50% more RAM (+1.3MB on an x86 defconfig kernel)
|
||||
than DWARF-based eh_frame tables.
|
||||
|
||||
Another potential downside is that, as GCC evolves, it's conceivable
|
||||
that the ORC data may end up being *too* simple to describe the state of
|
||||
the stack for certain optimizations. But IMO this is unlikely because
|
||||
GCC saves the frame pointer for any unusual stack adjustments it does,
|
||||
so I suspect we'll really only ever need to keep track of the stack
|
||||
pointer and the frame pointer between call frames. But even if we do
|
||||
end up having to track all the registers DWARF tracks, at least we will
|
||||
still be able to control the format, e.g. no complex state machines.
|
||||
|
||||
|
||||
ORC unwind table generation
|
||||
---------------------------
|
||||
|
||||
The ORC data is generated by objtool. With the existing compile-time
|
||||
stack metadata validation feature, objtool already follows all code
|
||||
paths, and so it already has all the information it needs to be able to
|
||||
generate ORC data from scratch. So it's an easy step to go from stack
|
||||
validation to ORC data generation.
|
||||
|
||||
It should be possible to instead generate the ORC data with a simple
|
||||
tool which converts DWARF to ORC data. However, such a solution would
|
||||
be incomplete due to the kernel's extensive use of asm, inline asm, and
|
||||
special sections like exception tables.
|
||||
|
||||
That could be rectified by manually annotating those special code paths
|
||||
using GNU assembler .cfi annotations in .S files, and homegrown
|
||||
annotations for inline asm in .c files. But asm annotations were tried
|
||||
in the past and were found to be unmaintainable. They were often
|
||||
incorrect/incomplete and made the code harder to read and keep updated.
|
||||
And based on looking at glibc code, annotating inline asm in .c files
|
||||
might be even worse.
|
||||
|
||||
Objtool still needs a few annotations, but only in code which does
|
||||
unusual things to the stack like entry code. And even then, far fewer
|
||||
annotations are needed than what DWARF would need, so they're much more
|
||||
maintainable than DWARF CFI annotations.
|
||||
|
||||
So the advantages of using objtool to generate ORC data are that it
|
||||
gives more accurate debuginfo, with very few annotations. It also
|
||||
insulates the kernel from toolchain bugs which can be very painful to
|
||||
deal with in the kernel since we often have to workaround issues in
|
||||
older versions of the toolchain for years.
|
||||
|
||||
The downside is that the unwinder now becomes dependent on objtool's
|
||||
ability to reverse engineer GCC code flow. If GCC optimizations become
|
||||
too complicated for objtool to follow, the ORC data generation might
|
||||
stop working or become incomplete. (It's worth noting that livepatch
|
||||
already has such a dependency on objtool's ability to follow GCC code
|
||||
flow.)
|
||||
|
||||
If newer versions of GCC come up with some optimizations which break
|
||||
objtool, we may need to revisit the current implementation. Some
|
||||
possible solutions would be asking GCC to make the optimizations more
|
||||
palatable, or having objtool use DWARF as an additional input, or
|
||||
creating a GCC plugin to assist objtool with its analysis. But for now,
|
||||
objtool follows GCC code quite well.
|
||||
|
||||
|
||||
Unwinder implementation details
|
||||
-------------------------------
|
||||
|
||||
Objtool generates the ORC data by integrating with the compile-time
|
||||
stack metadata validation feature, which is described in detail in
|
||||
tools/objtool/Documentation/stack-validation.txt. After analyzing all
|
||||
the code paths of a .o file, it creates an array of orc_entry structs,
|
||||
and a parallel array of instruction addresses associated with those
|
||||
structs, and writes them to the .orc_unwind and .orc_unwind_ip sections
|
||||
respectively.
|
||||
|
||||
The ORC data is split into the two arrays for performance reasons, to
|
||||
make the searchable part of the data (.orc_unwind_ip) more compact. The
|
||||
arrays are sorted in parallel at boot time.
|
||||
|
||||
Performance is further improved by the use of a fast lookup table which
|
||||
is created at runtime. The fast lookup table associates a given address
|
||||
with a range of indices for the .orc_unwind table, so that only a small
|
||||
subset of the table needs to be searched.
|
||||
|
||||
|
||||
Etymology
|
||||
---------
|
||||
|
||||
Orcs, fearsome creatures of medieval folklore, are the Dwarves' natural
|
||||
enemies. Similarly, the ORC unwinder was created in opposition to the
|
||||
complexity and slowness of DWARF.
|
||||
|
||||
"Although Orcs rarely consider multiple solutions to a problem, they do
|
||||
excel at getting things done because they are creatures of action, not
|
||||
thought." [3] Similarly, unlike the esoteric DWARF unwinder, the
|
||||
veracious ORC unwinder wastes no time or siloconic effort decoding
|
||||
variable-length zero-extended unsigned-integer byte-coded
|
||||
state-machine-based debug information entries.
|
||||
|
||||
Similar to how Orcs frequently unravel the well-intentioned plans of
|
||||
their adversaries, the ORC unwinder frequently unravels stacks with
|
||||
brutal, unyielding efficiency.
|
||||
|
||||
ORC stands for Oops Rewind Capability.
|
||||
|
||||
|
||||
[1] https://lkml.kernel.org/r/20170602104048.jkkzssljsompjdwy@suse.de
|
||||
[2] https://lkml.kernel.org/r/d2ca5435-6386-29b8-db87-7f227c2b713a@suse.cz
|
||||
[3] http://dustin.wikidot.com/half-orcs-and-orcs
|
11
MAINTAINERS
11
MAINTAINERS
@ -7639,17 +7639,6 @@ T: git git://linuxtv.org/mkrufky/tuners.git
|
||||
S: Maintained
|
||||
F: drivers/media/dvb-frontends/lgdt3305.*
|
||||
|
||||
LGUEST
|
||||
M: Rusty Russell <rusty@rustcorp.com.au>
|
||||
L: lguest@lists.ozlabs.org
|
||||
W: http://lguest.ozlabs.org/
|
||||
S: Odd Fixes
|
||||
F: arch/x86/include/asm/lguest*.h
|
||||
F: arch/x86/lguest/
|
||||
F: drivers/lguest/
|
||||
F: include/linux/lguest*.h
|
||||
F: tools/lguest/
|
||||
|
||||
LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
|
||||
M: Viresh Kumar <vireshk@kernel.org>
|
||||
L: linux-ide@vger.kernel.org
|
||||
|
8
arch/um/include/asm/unwind.h
Normal file
8
arch/um/include/asm/unwind.h
Normal file
@ -0,0 +1,8 @@
|
||||
#ifndef _ASM_UML_UNWIND_H
|
||||
#define _ASM_UML_UNWIND_H
|
||||
|
||||
static inline void
|
||||
unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
|
||||
void *orc, size_t orc_size) {}
|
||||
|
||||
#endif /* _ASM_UML_UNWIND_H */
|
@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/
|
||||
# Hyper-V paravirtualization support
|
||||
obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/
|
||||
|
||||
# lguest paravirtualization support
|
||||
obj-$(CONFIG_LGUEST_GUEST) += lguest/
|
||||
|
||||
obj-y += realmode/
|
||||
obj-y += kernel/
|
||||
obj-y += mm/
|
||||
|
@ -73,7 +73,6 @@ config X86
|
||||
select ARCH_USE_QUEUED_RWLOCKS
|
||||
select ARCH_USE_QUEUED_SPINLOCKS
|
||||
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
||||
select ARCH_WANT_FRAME_POINTERS
|
||||
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
|
||||
select ARCH_WANTS_THP_SWAP if X86_64
|
||||
select BUILDTIME_EXTABLE_SORT
|
||||
@ -158,6 +157,7 @@ config X86
|
||||
select HAVE_MEMBLOCK
|
||||
select HAVE_MEMBLOCK_NODE_MAP
|
||||
select HAVE_MIXED_BREAKPOINTS_REGS
|
||||
select HAVE_MOD_ARCH_SPECIFIC
|
||||
select HAVE_NMI
|
||||
select HAVE_OPROFILE
|
||||
select HAVE_OPTPROBES
|
||||
@ -168,7 +168,7 @@ config X86
|
||||
select HAVE_PERF_REGS
|
||||
select HAVE_PERF_USER_STACK_DUMP
|
||||
select HAVE_REGS_AND_STACK_ACCESS_API
|
||||
select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION
|
||||
select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
|
||||
select HAVE_STACK_VALIDATION if X86_64
|
||||
select HAVE_SYSCALL_TRACEPOINTS
|
||||
select HAVE_UNSTABLE_SCHED_CLOCK
|
||||
@ -778,8 +778,6 @@ config KVM_DEBUG_FS
|
||||
Statistics are displayed in debugfs filesystem. Enabling this option
|
||||
may incur significant overhead.
|
||||
|
||||
source "arch/x86/lguest/Kconfig"
|
||||
|
||||
config PARAVIRT_TIME_ACCOUNTING
|
||||
bool "Paravirtual steal time accounting"
|
||||
depends on PARAVIRT
|
||||
|
@ -305,8 +305,6 @@ config DEBUG_ENTRY
|
||||
Some of these sanity checks may slow down kernel entries and
|
||||
exits or otherwise impact performance.
|
||||
|
||||
This is currently used to help test NMI code.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config DEBUG_NMI_SELFTEST
|
||||
@ -358,4 +356,61 @@ config PUNIT_ATOM_DEBUG
|
||||
The current power state can be read from
|
||||
/sys/kernel/debug/punit_atom/dev_power_state
|
||||
|
||||
choice
|
||||
prompt "Choose kernel unwinder"
|
||||
default FRAME_POINTER_UNWINDER
|
||||
---help---
|
||||
This determines which method will be used for unwinding kernel stack
|
||||
traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
|
||||
livepatch, lockdep, and more.
|
||||
|
||||
config FRAME_POINTER_UNWINDER
|
||||
bool "Frame pointer unwinder"
|
||||
select FRAME_POINTER
|
||||
---help---
|
||||
This option enables the frame pointer unwinder for unwinding kernel
|
||||
stack traces.
|
||||
|
||||
The unwinder itself is fast and it uses less RAM than the ORC
|
||||
unwinder, but the kernel text size will grow by ~3% and the kernel's
|
||||
overall performance will degrade by roughly 5-10%.
|
||||
|
||||
This option is recommended if you want to use the livepatch
|
||||
consistency model, as this is currently the only way to get a
|
||||
reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
|
||||
|
||||
config ORC_UNWINDER
|
||||
bool "ORC unwinder"
|
||||
depends on X86_64
|
||||
select STACK_VALIDATION
|
||||
---help---
|
||||
This option enables the ORC (Oops Rewind Capability) unwinder for
|
||||
unwinding kernel stack traces. It uses a custom data format which is
|
||||
a simplified version of the DWARF Call Frame Information standard.
|
||||
|
||||
This unwinder is more accurate across interrupt entry frames than the
|
||||
frame pointer unwinder. It also enables a 5-10% performance
|
||||
improvement across the entire kernel compared to frame pointers.
|
||||
|
||||
Enabling this option will increase the kernel's runtime memory usage
|
||||
by roughly 2-4MB, depending on your kernel config.
|
||||
|
||||
config GUESS_UNWINDER
|
||||
bool "Guess unwinder"
|
||||
depends on EXPERT
|
||||
---help---
|
||||
This option enables the "guess" unwinder for unwinding kernel stack
|
||||
traces. It scans the stack and reports every kernel text address it
|
||||
finds. Some of the addresses it reports may be incorrect.
|
||||
|
||||
While this option often produces false positives, it can still be
|
||||
useful in many cases. Unlike the other unwinders, it has no runtime
|
||||
overhead.
|
||||
|
||||
endchoice
|
||||
|
||||
config FRAME_POINTER
|
||||
depends on !ORC_UNWINDER && !GUESS_UNWINDER
|
||||
bool
|
||||
|
||||
endmenu
|
||||
|
@ -1,3 +1,5 @@
|
||||
CONFIG_NOHIGHMEM=y
|
||||
# CONFIG_HIGHMEM4G is not set
|
||||
# CONFIG_HIGHMEM64G is not set
|
||||
CONFIG_GUESS_UNWINDER=y
|
||||
# CONFIG_FRAME_POINTER_UNWINDER is not set
|
||||
|
@ -2,7 +2,6 @@
|
||||
# Makefile for the x86 low level entry code
|
||||
#
|
||||
|
||||
OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y
|
||||
OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
|
||||
|
||||
CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <linux/jump_label.h>
|
||||
#include <asm/unwind_hints.h>
|
||||
|
||||
/*
|
||||
|
||||
@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
|
||||
movq %rdx, 12*8+\offset(%rsp)
|
||||
movq %rsi, 13*8+\offset(%rsp)
|
||||
movq %rdi, 14*8+\offset(%rsp)
|
||||
UNWIND_HINT_REGS offset=\offset extra=0
|
||||
.endm
|
||||
.macro SAVE_C_REGS offset=0
|
||||
SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
|
||||
@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with
|
||||
movq %r12, 3*8+\offset(%rsp)
|
||||
movq %rbp, 4*8+\offset(%rsp)
|
||||
movq %rbx, 5*8+\offset(%rsp)
|
||||
UNWIND_HINT_REGS offset=\offset
|
||||
.endm
|
||||
|
||||
.macro RESTORE_EXTRA_REGS offset=0
|
||||
@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with
|
||||
movq 3*8+\offset(%rsp), %r12
|
||||
movq 4*8+\offset(%rsp), %rbp
|
||||
movq 5*8+\offset(%rsp), %rbx
|
||||
UNWIND_HINT_REGS offset=\offset extra=0
|
||||
.endm
|
||||
|
||||
.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
|
||||
@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
|
||||
.endif
|
||||
movq 13*8(%rsp), %rsi
|
||||
movq 14*8(%rsp), %rdi
|
||||
UNWIND_HINT_IRET_REGS offset=16*8
|
||||
.endm
|
||||
.macro RESTORE_C_REGS
|
||||
RESTORE_C_REGS_HELPER 1,1,1,1,1
|
||||
|
@ -36,6 +36,7 @@
|
||||
#include <asm/smap.h>
|
||||
#include <asm/pgtable_types.h>
|
||||
#include <asm/export.h>
|
||||
#include <asm/frame.h>
|
||||
#include <linux/err.h>
|
||||
|
||||
.code64
|
||||
@ -43,9 +44,10 @@
|
||||
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
ENTRY(native_usergs_sysret64)
|
||||
UNWIND_HINT_EMPTY
|
||||
swapgs
|
||||
sysretq
|
||||
ENDPROC(native_usergs_sysret64)
|
||||
END(native_usergs_sysret64)
|
||||
#endif /* CONFIG_PARAVIRT */
|
||||
|
||||
.macro TRACE_IRQS_IRETQ
|
||||
@ -134,19 +136,14 @@ ENDPROC(native_usergs_sysret64)
|
||||
*/
|
||||
|
||||
ENTRY(entry_SYSCALL_64)
|
||||
UNWIND_HINT_EMPTY
|
||||
/*
|
||||
* Interrupts are off on entry.
|
||||
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
|
||||
* it is too small to ever cause noticeable irq latency.
|
||||
*/
|
||||
SWAPGS_UNSAFE_STACK
|
||||
/*
|
||||
* A hypervisor implementation might want to use a label
|
||||
* after the swapgs, so that it can do the swapgs
|
||||
* for the guest and jump here on syscall.
|
||||
*/
|
||||
GLOBAL(entry_SYSCALL_64_after_swapgs)
|
||||
|
||||
swapgs
|
||||
movq %rsp, PER_CPU_VAR(rsp_scratch)
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
|
||||
@ -158,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
|
||||
pushq %r11 /* pt_regs->flags */
|
||||
pushq $__USER_CS /* pt_regs->cs */
|
||||
pushq %rcx /* pt_regs->ip */
|
||||
GLOBAL(entry_SYSCALL_64_after_hwframe)
|
||||
pushq %rax /* pt_regs->orig_ax */
|
||||
pushq %rdi /* pt_regs->di */
|
||||
pushq %rsi /* pt_regs->si */
|
||||
@ -169,6 +167,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
|
||||
pushq %r10 /* pt_regs->r10 */
|
||||
pushq %r11 /* pt_regs->r11 */
|
||||
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
|
||||
UNWIND_HINT_REGS extra=0
|
||||
|
||||
/*
|
||||
* If we need to do entry work or if we guess we'll need to do
|
||||
@ -223,6 +222,7 @@ entry_SYSCALL_64_fastpath:
|
||||
movq EFLAGS(%rsp), %r11
|
||||
RESTORE_C_REGS_EXCEPT_RCX_R11
|
||||
movq RSP(%rsp), %rsp
|
||||
UNWIND_HINT_EMPTY
|
||||
USERGS_SYSRET64
|
||||
|
||||
1:
|
||||
@ -316,6 +316,7 @@ syscall_return_via_sysret:
|
||||
/* rcx and r11 are already restored (see code above) */
|
||||
RESTORE_C_REGS_EXCEPT_RCX_R11
|
||||
movq RSP(%rsp), %rsp
|
||||
UNWIND_HINT_EMPTY
|
||||
USERGS_SYSRET64
|
||||
|
||||
opportunistic_sysret_failed:
|
||||
@ -343,6 +344,7 @@ ENTRY(stub_ptregs_64)
|
||||
DISABLE_INTERRUPTS(CLBR_ANY)
|
||||
TRACE_IRQS_OFF
|
||||
popq %rax
|
||||
UNWIND_HINT_REGS extra=0
|
||||
jmp entry_SYSCALL64_slow_path
|
||||
|
||||
1:
|
||||
@ -351,6 +353,7 @@ END(stub_ptregs_64)
|
||||
|
||||
.macro ptregs_stub func
|
||||
ENTRY(ptregs_\func)
|
||||
UNWIND_HINT_FUNC
|
||||
leaq \func(%rip), %rax
|
||||
jmp stub_ptregs_64
|
||||
END(ptregs_\func)
|
||||
@ -367,6 +370,7 @@ END(ptregs_\func)
|
||||
* %rsi: next task
|
||||
*/
|
||||
ENTRY(__switch_to_asm)
|
||||
UNWIND_HINT_FUNC
|
||||
/*
|
||||
* Save callee-saved registers
|
||||
* This must match the order in inactive_task_frame
|
||||
@ -406,6 +410,7 @@ END(__switch_to_asm)
|
||||
* r12: kernel thread arg
|
||||
*/
|
||||
ENTRY(ret_from_fork)
|
||||
UNWIND_HINT_EMPTY
|
||||
movq %rax, %rdi
|
||||
call schedule_tail /* rdi: 'prev' task parameter */
|
||||
|
||||
@ -413,6 +418,7 @@ ENTRY(ret_from_fork)
|
||||
jnz 1f /* kernel threads are uncommon */
|
||||
|
||||
2:
|
||||
UNWIND_HINT_REGS
|
||||
movq %rsp, %rdi
|
||||
call syscall_return_slowpath /* returns with IRQs disabled */
|
||||
TRACE_IRQS_ON /* user mode is traced as IRQS on */
|
||||
@ -440,13 +446,102 @@ END(ret_from_fork)
|
||||
ENTRY(irq_entries_start)
|
||||
vector=FIRST_EXTERNAL_VECTOR
|
||||
.rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
|
||||
UNWIND_HINT_IRET_REGS
|
||||
pushq $(~vector+0x80) /* Note: always in signed byte range */
|
||||
vector=vector+1
|
||||
jmp common_interrupt
|
||||
.align 8
|
||||
vector=vector+1
|
||||
.endr
|
||||
END(irq_entries_start)
|
||||
|
||||
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
|
||||
#ifdef CONFIG_DEBUG_ENTRY
|
||||
pushfq
|
||||
testl $X86_EFLAGS_IF, (%rsp)
|
||||
jz .Lokay_\@
|
||||
ud2
|
||||
.Lokay_\@:
|
||||
addq $8, %rsp
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
|
||||
* flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
|
||||
* Requires kernel GSBASE.
|
||||
*
|
||||
* The invariant is that, if irq_count != -1, then the IRQ stack is in use.
|
||||
*/
|
||||
.macro ENTER_IRQ_STACK regs=1 old_rsp
|
||||
DEBUG_ENTRY_ASSERT_IRQS_OFF
|
||||
movq %rsp, \old_rsp
|
||||
|
||||
.if \regs
|
||||
UNWIND_HINT_REGS base=\old_rsp
|
||||
.endif
|
||||
|
||||
incl PER_CPU_VAR(irq_count)
|
||||
jnz .Lirq_stack_push_old_rsp_\@
|
||||
|
||||
/*
|
||||
* Right now, if we just incremented irq_count to zero, we've
|
||||
* claimed the IRQ stack but we haven't switched to it yet.
|
||||
*
|
||||
* If anything is added that can interrupt us here without using IST,
|
||||
* it must be *extremely* careful to limit its stack usage. This
|
||||
* could include kprobes and a hypothetical future IST-less #DB
|
||||
* handler.
|
||||
*
|
||||
* The OOPS unwinder relies on the word at the top of the IRQ
|
||||
* stack linking back to the previous RSP for the entire time we're
|
||||
* on the IRQ stack. For this to work reliably, we need to write
|
||||
* it before we actually move ourselves to the IRQ stack.
|
||||
*/
|
||||
|
||||
movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
|
||||
movq PER_CPU_VAR(irq_stack_ptr), %rsp
|
||||
|
||||
#ifdef CONFIG_DEBUG_ENTRY
|
||||
/*
|
||||
* If the first movq above becomes wrong due to IRQ stack layout
|
||||
* changes, the only way we'll notice is if we try to unwind right
|
||||
* here. Assert that we set up the stack right to catch this type
|
||||
* of bug quickly.
|
||||
*/
|
||||
cmpq -8(%rsp), \old_rsp
|
||||
je .Lirq_stack_okay\@
|
||||
ud2
|
||||
.Lirq_stack_okay\@:
|
||||
#endif
|
||||
|
||||
.Lirq_stack_push_old_rsp_\@:
|
||||
pushq \old_rsp
|
||||
|
||||
.if \regs
|
||||
UNWIND_HINT_REGS indirect=1
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Undoes ENTER_IRQ_STACK.
|
||||
*/
|
||||
.macro LEAVE_IRQ_STACK regs=1
|
||||
DEBUG_ENTRY_ASSERT_IRQS_OFF
|
||||
/* We need to be off the IRQ stack before decrementing irq_count. */
|
||||
popq %rsp
|
||||
|
||||
.if \regs
|
||||
UNWIND_HINT_REGS
|
||||
.endif
|
||||
|
||||
/*
|
||||
* As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
|
||||
* the irq stack but we're not on it.
|
||||
*/
|
||||
|
||||
decl PER_CPU_VAR(irq_count)
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Interrupt entry/exit.
|
||||
*
|
||||
@ -485,17 +580,7 @@ END(irq_entries_start)
|
||||
CALL_enter_from_user_mode
|
||||
|
||||
1:
|
||||
/*
|
||||
* Save previous stack pointer, optionally switch to interrupt stack.
|
||||
* irq_count is used to check if a CPU is already on an interrupt stack
|
||||
* or not. While this is essentially redundant with preempt_count it is
|
||||
* a little cheaper to use a separate counter in the PDA (short of
|
||||
* moving irq_enter into assembly, which would be too much work)
|
||||
*/
|
||||
movq %rsp, %rdi
|
||||
incl PER_CPU_VAR(irq_count)
|
||||
cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
|
||||
pushq %rdi
|
||||
ENTER_IRQ_STACK old_rsp=%rdi
|
||||
/* We entered an interrupt context - irqs are off: */
|
||||
TRACE_IRQS_OFF
|
||||
|
||||
@ -515,10 +600,8 @@ common_interrupt:
|
||||
ret_from_intr:
|
||||
DISABLE_INTERRUPTS(CLBR_ANY)
|
||||
TRACE_IRQS_OFF
|
||||
decl PER_CPU_VAR(irq_count)
|
||||
|
||||
/* Restore saved previous stack */
|
||||
popq %rsp
|
||||
LEAVE_IRQ_STACK
|
||||
|
||||
testb $3, CS(%rsp)
|
||||
jz retint_kernel
|
||||
@ -561,6 +644,7 @@ restore_c_regs_and_iret:
|
||||
INTERRUPT_RETURN
|
||||
|
||||
ENTRY(native_iret)
|
||||
UNWIND_HINT_IRET_REGS
|
||||
/*
|
||||
* Are we returning to a stack segment from the LDT? Note: in
|
||||
* 64-bit mode SS:RSP on the exception stack is always valid.
|
||||
@ -633,6 +717,7 @@ native_irq_return_ldt:
|
||||
orq PER_CPU_VAR(espfix_stack), %rax
|
||||
SWAPGS
|
||||
movq %rax, %rsp
|
||||
UNWIND_HINT_IRET_REGS offset=8
|
||||
|
||||
/*
|
||||
* At this point, we cannot write to the stack any more, but we can
|
||||
@ -654,6 +739,7 @@ END(common_interrupt)
|
||||
*/
|
||||
.macro apicinterrupt3 num sym do_sym
|
||||
ENTRY(\sym)
|
||||
UNWIND_HINT_IRET_REGS
|
||||
ASM_CLAC
|
||||
pushq $~(\num)
|
||||
.Lcommon_\sym:
|
||||
@ -740,6 +826,8 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
|
||||
|
||||
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
|
||||
ENTRY(\sym)
|
||||
UNWIND_HINT_IRET_REGS offset=8
|
||||
|
||||
/* Sanity check */
|
||||
.if \shift_ist != -1 && \paranoid == 0
|
||||
.error "using shift_ist requires paranoid=1"
|
||||
@ -763,6 +851,7 @@ ENTRY(\sym)
|
||||
.else
|
||||
call error_entry
|
||||
.endif
|
||||
UNWIND_HINT_REGS
|
||||
/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
|
||||
|
||||
.if \paranoid
|
||||
@ -860,6 +949,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
|
||||
* edi: new selector
|
||||
*/
|
||||
ENTRY(native_load_gs_index)
|
||||
FRAME_BEGIN
|
||||
pushfq
|
||||
DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
|
||||
SWAPGS
|
||||
@ -868,8 +958,9 @@ ENTRY(native_load_gs_index)
|
||||
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
|
||||
SWAPGS
|
||||
popfq
|
||||
FRAME_END
|
||||
ret
|
||||
END(native_load_gs_index)
|
||||
ENDPROC(native_load_gs_index)
|
||||
EXPORT_SYMBOL(native_load_gs_index)
|
||||
|
||||
_ASM_EXTABLE(.Lgs_change, bad_gs)
|
||||
@ -892,14 +983,12 @@ bad_gs:
|
||||
ENTRY(do_softirq_own_stack)
|
||||
pushq %rbp
|
||||
mov %rsp, %rbp
|
||||
incl PER_CPU_VAR(irq_count)
|
||||
cmove PER_CPU_VAR(irq_stack_ptr), %rsp
|
||||
push %rbp /* frame pointer backlink */
|
||||
ENTER_IRQ_STACK regs=0 old_rsp=%r11
|
||||
call __do_softirq
|
||||
LEAVE_IRQ_STACK regs=0
|
||||
leaveq
|
||||
decl PER_CPU_VAR(irq_count)
|
||||
ret
|
||||
END(do_softirq_own_stack)
|
||||
ENDPROC(do_softirq_own_stack)
|
||||
|
||||
#ifdef CONFIG_XEN
|
||||
idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
|
||||
@ -923,14 +1012,14 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
|
||||
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
|
||||
* see the correct pointer to the pt_regs
|
||||
*/
|
||||
UNWIND_HINT_FUNC
|
||||
movq %rdi, %rsp /* we don't return, adjust the stack frame */
|
||||
11: incl PER_CPU_VAR(irq_count)
|
||||
movq %rsp, %rbp
|
||||
cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
|
||||
pushq %rbp /* frame pointer backlink */
|
||||
UNWIND_HINT_REGS
|
||||
|
||||
ENTER_IRQ_STACK old_rsp=%r10
|
||||
call xen_evtchn_do_upcall
|
||||
popq %rsp
|
||||
decl PER_CPU_VAR(irq_count)
|
||||
LEAVE_IRQ_STACK
|
||||
|
||||
#ifndef CONFIG_PREEMPT
|
||||
call xen_maybe_preempt_hcall
|
||||
#endif
|
||||
@ -951,6 +1040,7 @@ END(xen_do_hypervisor_callback)
|
||||
* with its current contents: any discrepancy means we in category 1.
|
||||
*/
|
||||
ENTRY(xen_failsafe_callback)
|
||||
UNWIND_HINT_EMPTY
|
||||
movl %ds, %ecx
|
||||
cmpw %cx, 0x10(%rsp)
|
||||
jne 1f
|
||||
@ -970,11 +1060,13 @@ ENTRY(xen_failsafe_callback)
|
||||
pushq $0 /* RIP */
|
||||
pushq %r11
|
||||
pushq %rcx
|
||||
UNWIND_HINT_IRET_REGS offset=8
|
||||
jmp general_protection
|
||||
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
|
||||
movq (%rsp), %rcx
|
||||
movq 8(%rsp), %r11
|
||||
addq $0x30, %rsp
|
||||
UNWIND_HINT_IRET_REGS
|
||||
pushq $-1 /* orig_ax = -1 => not a system call */
|
||||
ALLOC_PT_GPREGS_ON_STACK
|
||||
SAVE_C_REGS
|
||||
@ -1020,6 +1112,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
|
||||
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
|
||||
*/
|
||||
ENTRY(paranoid_entry)
|
||||
UNWIND_HINT_FUNC
|
||||
cld
|
||||
SAVE_C_REGS 8
|
||||
SAVE_EXTRA_REGS 8
|
||||
@ -1047,6 +1140,7 @@ END(paranoid_entry)
|
||||
* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
|
||||
*/
|
||||
ENTRY(paranoid_exit)
|
||||
UNWIND_HINT_REGS
|
||||
DISABLE_INTERRUPTS(CLBR_ANY)
|
||||
TRACE_IRQS_OFF_DEBUG
|
||||
testl %ebx, %ebx /* swapgs needed? */
|
||||
@ -1068,6 +1162,7 @@ END(paranoid_exit)
|
||||
* Return: EBX=0: came from user mode; EBX=1: otherwise
|
||||
*/
|
||||
ENTRY(error_entry)
|
||||
UNWIND_HINT_FUNC
|
||||
cld
|
||||
SAVE_C_REGS 8
|
||||
SAVE_EXTRA_REGS 8
|
||||
@ -1152,6 +1247,7 @@ END(error_entry)
|
||||
* 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
|
||||
*/
|
||||
ENTRY(error_exit)
|
||||
UNWIND_HINT_REGS
|
||||
DISABLE_INTERRUPTS(CLBR_ANY)
|
||||
TRACE_IRQS_OFF
|
||||
testl %ebx, %ebx
|
||||
@ -1161,6 +1257,7 @@ END(error_exit)
|
||||
|
||||
/* Runs on exception stack */
|
||||
ENTRY(nmi)
|
||||
UNWIND_HINT_IRET_REGS
|
||||
/*
|
||||
* Fix up the exception frame if we're on Xen.
|
||||
* PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
|
||||
@ -1234,11 +1331,13 @@ ENTRY(nmi)
|
||||
cld
|
||||
movq %rsp, %rdx
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
UNWIND_HINT_IRET_REGS base=%rdx offset=8
|
||||
pushq 5*8(%rdx) /* pt_regs->ss */
|
||||
pushq 4*8(%rdx) /* pt_regs->rsp */
|
||||
pushq 3*8(%rdx) /* pt_regs->flags */
|
||||
pushq 2*8(%rdx) /* pt_regs->cs */
|
||||
pushq 1*8(%rdx) /* pt_regs->rip */
|
||||
UNWIND_HINT_IRET_REGS
|
||||
pushq $-1 /* pt_regs->orig_ax */
|
||||
pushq %rdi /* pt_regs->di */
|
||||
pushq %rsi /* pt_regs->si */
|
||||
@ -1255,6 +1354,7 @@ ENTRY(nmi)
|
||||
pushq %r13 /* pt_regs->r13 */
|
||||
pushq %r14 /* pt_regs->r14 */
|
||||
pushq %r15 /* pt_regs->r15 */
|
||||
UNWIND_HINT_REGS
|
||||
ENCODE_FRAME_POINTER
|
||||
|
||||
/*
|
||||
@ -1409,6 +1509,7 @@ first_nmi:
|
||||
.rept 5
|
||||
pushq 11*8(%rsp)
|
||||
.endr
|
||||
UNWIND_HINT_IRET_REGS
|
||||
|
||||
/* Everything up to here is safe from nested NMIs */
|
||||
|
||||
@ -1424,6 +1525,7 @@ first_nmi:
|
||||
pushq $__KERNEL_CS /* CS */
|
||||
pushq $1f /* RIP */
|
||||
INTERRUPT_RETURN /* continues at repeat_nmi below */
|
||||
UNWIND_HINT_IRET_REGS
|
||||
1:
|
||||
#endif
|
||||
|
||||
@ -1473,6 +1575,7 @@ end_repeat_nmi:
|
||||
* exceptions might do.
|
||||
*/
|
||||
call paranoid_entry
|
||||
UNWIND_HINT_REGS
|
||||
|
||||
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
|
||||
movq %rsp, %rdi
|
||||
@ -1510,17 +1613,19 @@ nmi_restore:
|
||||
END(nmi)
|
||||
|
||||
ENTRY(ignore_sysret)
|
||||
UNWIND_HINT_EMPTY
|
||||
mov $-ENOSYS, %eax
|
||||
sysret
|
||||
END(ignore_sysret)
|
||||
|
||||
ENTRY(rewind_stack_do_exit)
|
||||
UNWIND_HINT_FUNC
|
||||
/* Prevent any naive code from trying to unwind to our caller. */
|
||||
xorl %ebp, %ebp
|
||||
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
|
||||
leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
|
||||
leaq -PTREGS_SIZE(%rax), %rsp
|
||||
UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
|
||||
|
||||
call do_exit
|
||||
1: jmp 1b
|
||||
END(rewind_stack_do_exit)
|
||||
|
@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat)
|
||||
*/
|
||||
ENTRY(entry_SYSCALL_compat)
|
||||
/* Interrupts are off on entry. */
|
||||
SWAPGS_UNSAFE_STACK
|
||||
swapgs
|
||||
|
||||
/* Stash user ESP and switch to the kernel stack. */
|
||||
movl %esp, %r8d
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
|
||||
/* Zero-extending 32-bit regs, do not remove */
|
||||
movl %eax, %eax
|
||||
|
||||
/* Construct struct pt_regs on stack */
|
||||
pushq $__USER32_DS /* pt_regs->ss */
|
||||
pushq %r8 /* pt_regs->sp */
|
||||
pushq %r11 /* pt_regs->flags */
|
||||
pushq $__USER32_CS /* pt_regs->cs */
|
||||
pushq %rcx /* pt_regs->ip */
|
||||
GLOBAL(entry_SYSCALL_compat_after_hwframe)
|
||||
movl %eax, %eax /* discard orig_ax high bits */
|
||||
pushq %rax /* pt_regs->orig_ax */
|
||||
pushq %rdi /* pt_regs->di */
|
||||
pushq %rsi /* pt_regs->si */
|
||||
|
@ -226,7 +226,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
|
||||
if (ksig->ka.sa.sa_flags & SA_ONSTACK)
|
||||
sp = sigsp(sp, ksig);
|
||||
/* This is the legacy signal stack switching. */
|
||||
else if ((regs->ss & 0xffff) != __USER32_DS &&
|
||||
else if (regs->ss != __USER32_DS &&
|
||||
!(ksig->ka.sa.sa_flags & SA_RESTORER) &&
|
||||
ksig->ka.sa.sa_restorer)
|
||||
sp = (unsigned long) ksig->ka.sa.sa_restorer;
|
||||
|
@ -126,15 +126,15 @@ do { \
|
||||
pr_reg[4] = regs->di; \
|
||||
pr_reg[5] = regs->bp; \
|
||||
pr_reg[6] = regs->ax; \
|
||||
pr_reg[7] = regs->ds & 0xffff; \
|
||||
pr_reg[8] = regs->es & 0xffff; \
|
||||
pr_reg[9] = regs->fs & 0xffff; \
|
||||
pr_reg[7] = regs->ds; \
|
||||
pr_reg[8] = regs->es; \
|
||||
pr_reg[9] = regs->fs; \
|
||||
pr_reg[11] = regs->orig_ax; \
|
||||
pr_reg[12] = regs->ip; \
|
||||
pr_reg[13] = regs->cs & 0xffff; \
|
||||
pr_reg[13] = regs->cs; \
|
||||
pr_reg[14] = regs->flags; \
|
||||
pr_reg[15] = regs->sp; \
|
||||
pr_reg[16] = regs->ss & 0xffff; \
|
||||
pr_reg[16] = regs->ss; \
|
||||
} while (0);
|
||||
|
||||
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
|
||||
@ -204,6 +204,7 @@ void set_personality_ia32(bool);
|
||||
|
||||
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
|
||||
do { \
|
||||
unsigned long base; \
|
||||
unsigned v; \
|
||||
(pr_reg)[0] = (regs)->r15; \
|
||||
(pr_reg)[1] = (regs)->r14; \
|
||||
@ -226,8 +227,8 @@ do { \
|
||||
(pr_reg)[18] = (regs)->flags; \
|
||||
(pr_reg)[19] = (regs)->sp; \
|
||||
(pr_reg)[20] = (regs)->ss; \
|
||||
(pr_reg)[21] = current->thread.fsbase; \
|
||||
(pr_reg)[22] = current->thread.gsbase; \
|
||||
rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
|
||||
rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
|
||||
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
|
||||
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
|
||||
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
|
||||
|
@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", )
|
||||
build_mmio_write(__writew, "w", unsigned short, "r", )
|
||||
build_mmio_write(__writel, "l", unsigned int, "r", )
|
||||
|
||||
#define readb readb
|
||||
#define readw readw
|
||||
#define readl readl
|
||||
#define readb_relaxed(a) __readb(a)
|
||||
#define readw_relaxed(a) __readw(a)
|
||||
#define readl_relaxed(a) __readl(a)
|
||||
@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
|
||||
#define __raw_readw __readw
|
||||
#define __raw_readl __readl
|
||||
|
||||
#define writeb writeb
|
||||
#define writew writew
|
||||
#define writel writel
|
||||
#define writeb_relaxed(v, a) __writeb(v, a)
|
||||
#define writew_relaxed(v, a) __writew(v, a)
|
||||
#define writel_relaxed(v, a) __writel(v, a)
|
||||
@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
|
||||
#ifdef CONFIG_X86_64
|
||||
|
||||
build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
|
||||
build_mmio_read(__readq, "q", unsigned long, "=r", )
|
||||
build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
|
||||
build_mmio_write(__writeq, "q", unsigned long, "r", )
|
||||
|
||||
#define readq_relaxed(a) readq(a)
|
||||
#define writeq_relaxed(v, a) writeq(v, a)
|
||||
#define readq_relaxed(a) __readq(a)
|
||||
#define writeq_relaxed(v, a) __writeq(v, a)
|
||||
|
||||
#define __raw_readq(a) readq(a)
|
||||
#define __raw_writeq(val, addr) writeq(val, addr)
|
||||
#define __raw_readq __readq
|
||||
#define __raw_writeq __writeq
|
||||
|
||||
/* Let people know that we have them */
|
||||
#define readq readq
|
||||
@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address)
|
||||
{
|
||||
return __pa(address);
|
||||
}
|
||||
#define virt_to_phys virt_to_phys
|
||||
|
||||
/**
|
||||
* phys_to_virt - map physical address to virtual
|
||||
@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address)
|
||||
{
|
||||
return __va(address);
|
||||
}
|
||||
#define phys_to_virt phys_to_virt
|
||||
|
||||
/*
|
||||
* Change "struct page" to physical address.
|
||||
@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
|
||||
* else, you probably want one of the following.
|
||||
*/
|
||||
extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
|
||||
#define ioremap_nocache ioremap_nocache
|
||||
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
|
||||
#define ioremap_uc ioremap_uc
|
||||
|
||||
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
|
||||
#define ioremap_cache ioremap_cache
|
||||
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
|
||||
#define ioremap_prot ioremap_prot
|
||||
|
||||
/**
|
||||
* ioremap - map bus memory into CPU space
|
||||
@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
|
||||
{
|
||||
return ioremap_nocache(offset, size);
|
||||
}
|
||||
#define ioremap ioremap
|
||||
|
||||
extern void iounmap(volatile void __iomem *addr);
|
||||
#define iounmap iounmap
|
||||
|
||||
extern void set_iounmap_nonlazy(void);
|
||||
|
||||
@ -202,53 +217,6 @@ extern void set_iounmap_nonlazy(void);
|
||||
|
||||
#include <asm-generic/iomap.h>
|
||||
|
||||
/*
|
||||
* Convert a virtual cached pointer to an uncached pointer
|
||||
*/
|
||||
#define xlate_dev_kmem_ptr(p) p
|
||||
|
||||
/**
|
||||
* memset_io Set a range of I/O memory to a constant value
|
||||
* @addr: The beginning of the I/O-memory range to set
|
||||
* @val: The value to set the memory to
|
||||
* @count: The number of bytes to set
|
||||
*
|
||||
* Set a range of I/O memory to a given value.
|
||||
*/
|
||||
static inline void
|
||||
memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
|
||||
{
|
||||
memset((void __force *)addr, val, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* memcpy_fromio Copy a block of data from I/O memory
|
||||
* @dst: The (RAM) destination for the copy
|
||||
* @src: The (I/O memory) source for the data
|
||||
* @count: The number of bytes to copy
|
||||
*
|
||||
* Copy a block of data from I/O memory.
|
||||
*/
|
||||
static inline void
|
||||
memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
|
||||
{
|
||||
memcpy(dst, (const void __force *)src, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* memcpy_toio Copy a block of data into I/O memory
|
||||
* @dst: The (I/O memory) destination for the copy
|
||||
* @src: The (RAM) source for the data
|
||||
* @count: The number of bytes to copy
|
||||
*
|
||||
* Copy a block of data to I/O memory.
|
||||
*/
|
||||
static inline void
|
||||
memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
|
||||
{
|
||||
memcpy((void __force *)dst, src, count);
|
||||
}
|
||||
|
||||
/*
|
||||
* ISA space is 'always mapped' on a typical x86 system, no need to
|
||||
* explicitly ioremap() it. The fact that the ISA IO space is mapped
|
||||
@ -341,13 +309,38 @@ BUILDIO(b, b, char)
|
||||
BUILDIO(w, w, short)
|
||||
BUILDIO(l, , int)
|
||||
|
||||
#define inb inb
|
||||
#define inw inw
|
||||
#define inl inl
|
||||
#define inb_p inb_p
|
||||
#define inw_p inw_p
|
||||
#define inl_p inl_p
|
||||
#define insb insb
|
||||
#define insw insw
|
||||
#define insl insl
|
||||
|
||||
#define outb outb
|
||||
#define outw outw
|
||||
#define outl outl
|
||||
#define outb_p outb_p
|
||||
#define outw_p outw_p
|
||||
#define outl_p outl_p
|
||||
#define outsb outsb
|
||||
#define outsw outsw
|
||||
#define outsl outsl
|
||||
|
||||
extern void *xlate_dev_mem_ptr(phys_addr_t phys);
|
||||
extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
|
||||
|
||||
#define xlate_dev_mem_ptr xlate_dev_mem_ptr
|
||||
#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
|
||||
|
||||
extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
|
||||
enum page_cache_mode pcm);
|
||||
extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
|
||||
#define ioremap_wc ioremap_wc
|
||||
extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
|
||||
#define ioremap_wt ioremap_wt
|
||||
|
||||
extern bool is_early_ioremap_ptep(pte_t *ptep);
|
||||
|
||||
@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
|
||||
|
||||
#define IO_SPACE_LIMIT 0xffff
|
||||
|
||||
#include <asm-generic/io.h>
|
||||
#undef PCI_IOBASE
|
||||
|
||||
#ifdef CONFIG_MTRR
|
||||
extern int __must_check arch_phys_wc_index(int handle);
|
||||
#define arch_phys_wc_index arch_phys_wc_index
|
||||
|
@ -1,91 +0,0 @@
|
||||
#ifndef _ASM_X86_LGUEST_H
|
||||
#define _ASM_X86_LGUEST_H
|
||||
|
||||
#define GDT_ENTRY_LGUEST_CS 10
|
||||
#define GDT_ENTRY_LGUEST_DS 11
|
||||
#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
|
||||
#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
#include <asm/desc.h>
|
||||
|
||||
#define GUEST_PL 1
|
||||
|
||||
/* Page for Switcher text itself, then two pages per cpu */
|
||||
#define SWITCHER_TEXT_PAGES (1)
|
||||
#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
|
||||
#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
|
||||
|
||||
/* Where we map the Switcher, in both Host and Guest. */
|
||||
extern unsigned long switcher_addr;
|
||||
|
||||
/* Found in switcher.S */
|
||||
extern unsigned long default_idt_entries[];
|
||||
|
||||
/* Declarations for definitions in arch/x86/lguest/head_32.S */
|
||||
extern char lguest_noirq_iret[];
|
||||
extern const char lgstart_cli[], lgend_cli[];
|
||||
extern const char lgstart_pushf[], lgend_pushf[];
|
||||
|
||||
extern void lguest_iret(void);
|
||||
extern void lguest_init(void);
|
||||
|
||||
struct lguest_regs {
|
||||
/* Manually saved part. */
|
||||
unsigned long eax, ebx, ecx, edx;
|
||||
unsigned long esi, edi, ebp;
|
||||
unsigned long gs;
|
||||
unsigned long fs, ds, es;
|
||||
unsigned long trapnum, errcode;
|
||||
/* Trap pushed part */
|
||||
unsigned long eip;
|
||||
unsigned long cs;
|
||||
unsigned long eflags;
|
||||
unsigned long esp;
|
||||
unsigned long ss;
|
||||
};
|
||||
|
||||
/* This is a guest-specific page (mapped ro) into the guest. */
|
||||
struct lguest_ro_state {
|
||||
/* Host information we need to restore when we switch back. */
|
||||
u32 host_cr3;
|
||||
struct desc_ptr host_idt_desc;
|
||||
struct desc_ptr host_gdt_desc;
|
||||
u32 host_sp;
|
||||
|
||||
/* Fields which are used when guest is running. */
|
||||
struct desc_ptr guest_idt_desc;
|
||||
struct desc_ptr guest_gdt_desc;
|
||||
struct x86_hw_tss guest_tss;
|
||||
struct desc_struct guest_idt[IDT_ENTRIES];
|
||||
struct desc_struct guest_gdt[GDT_ENTRIES];
|
||||
};
|
||||
|
||||
struct lg_cpu_arch {
|
||||
/* The GDT entries copied into lguest_ro_state when running. */
|
||||
struct desc_struct gdt[GDT_ENTRIES];
|
||||
|
||||
/* The IDT entries: some copied into lguest_ro_state when running. */
|
||||
struct desc_struct idt[IDT_ENTRIES];
|
||||
|
||||
/* The address of the last guest-visible pagefault (ie. cr2). */
|
||||
unsigned long last_pagefault;
|
||||
};
|
||||
|
||||
static inline void lguest_set_ts(void)
|
||||
{
|
||||
u32 cr0;
|
||||
|
||||
cr0 = read_cr0();
|
||||
if (!(cr0 & 8))
|
||||
write_cr0(cr0 | 8);
|
||||
}
|
||||
|
||||
/* Full 4G segment descriptors, suitable for CS and DS. */
|
||||
#define FULL_EXEC_SEGMENT \
|
||||
((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
|
||||
#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
#endif /* _ASM_X86_LGUEST_H */
|
@ -1,74 +0,0 @@
|
||||
/* Architecture specific portion of the lguest hypercalls */
|
||||
#ifndef _ASM_X86_LGUEST_HCALL_H
|
||||
#define _ASM_X86_LGUEST_HCALL_H
|
||||
|
||||
#define LHCALL_FLUSH_ASYNC 0
|
||||
#define LHCALL_LGUEST_INIT 1
|
||||
#define LHCALL_SHUTDOWN 2
|
||||
#define LHCALL_NEW_PGTABLE 4
|
||||
#define LHCALL_FLUSH_TLB 5
|
||||
#define LHCALL_LOAD_IDT_ENTRY 6
|
||||
#define LHCALL_SET_STACK 7
|
||||
#define LHCALL_SET_CLOCKEVENT 9
|
||||
#define LHCALL_HALT 10
|
||||
#define LHCALL_SET_PMD 13
|
||||
#define LHCALL_SET_PTE 14
|
||||
#define LHCALL_SET_PGD 15
|
||||
#define LHCALL_LOAD_TLS 16
|
||||
#define LHCALL_LOAD_GDT_ENTRY 18
|
||||
#define LHCALL_SEND_INTERRUPTS 19
|
||||
|
||||
#define LGUEST_TRAP_ENTRY 0x1F
|
||||
|
||||
/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
|
||||
#define LGUEST_SHUTDOWN_POWEROFF 1
|
||||
#define LGUEST_SHUTDOWN_RESTART 2
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
#include <asm/hw_irq.h>
|
||||
|
||||
/*G:030
|
||||
* But first, how does our Guest contact the Host to ask for privileged
|
||||
* operations? There are two ways: the direct way is to make a "hypercall",
|
||||
* to make requests of the Host Itself.
|
||||
*
|
||||
* Our hypercall mechanism uses the highest unused trap code (traps 32 and
|
||||
* above are used by real hardware interrupts). Seventeen hypercalls are
|
||||
* available: the hypercall number is put in the %eax register, and the
|
||||
* arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
|
||||
* If a return value makes sense, it's returned in %eax.
|
||||
*
|
||||
* Grossly invalid calls result in Sudden Death at the hands of the vengeful
|
||||
* Host, rather than returning failure. This reflects Winston Churchill's
|
||||
* definition of a gentleman: "someone who is only rude intentionally".
|
||||
*/
|
||||
static inline unsigned long
|
||||
hcall(unsigned long call,
|
||||
unsigned long arg1, unsigned long arg2, unsigned long arg3,
|
||||
unsigned long arg4)
|
||||
{
|
||||
/* "int" is the Intel instruction to trigger a trap. */
|
||||
asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
|
||||
/* The call in %eax (aka "a") might be overwritten */
|
||||
: "=a"(call)
|
||||
/* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
|
||||
: "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
|
||||
/* "memory" means this might write somewhere in memory.
|
||||
* This isn't true for all calls, but it's safe to tell
|
||||
* gcc that it might happen so it doesn't get clever. */
|
||||
: "memory");
|
||||
return call;
|
||||
}
|
||||
/*:*/
|
||||
|
||||
/* Can't use our min() macro here: needs to be a constant */
|
||||
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
|
||||
|
||||
#define LHCALL_RING_SIZE 64
|
||||
struct hcall_args {
|
||||
/* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
|
||||
unsigned long arg0, arg1, arg2, arg3, arg4;
|
||||
};
|
||||
|
||||
#endif /* !__ASSEMBLY__ */
|
||||
#endif /* _ASM_X86_LGUEST_HCALL_H */
|
@ -2,6 +2,15 @@
|
||||
#define _ASM_X86_MODULE_H
|
||||
|
||||
#include <asm-generic/module.h>
|
||||
#include <asm/orc_types.h>
|
||||
|
||||
struct mod_arch_specific {
|
||||
#ifdef CONFIG_ORC_UNWINDER
|
||||
unsigned int num_orcs;
|
||||
int *orc_unwind_ip;
|
||||
struct orc_entry *orc_unwind;
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* X86_64 does not define MODULE_PROC_FAMILY */
|
||||
|
46
arch/x86/include/asm/orc_lookup.h
Normal file
46
arch/x86/include/asm/orc_lookup.h
Normal file
@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#ifndef _ORC_LOOKUP_H
|
||||
#define _ORC_LOOKUP_H
|
||||
|
||||
/*
|
||||
* This is a lookup table for speeding up access to the .orc_unwind table.
|
||||
* Given an input address offset, the corresponding lookup table entry
|
||||
* specifies a subset of the .orc_unwind table to search.
|
||||
*
|
||||
* Each block represents the end of the previous range and the start of the
|
||||
* next range. An extra block is added to give the last range an end.
|
||||
*
|
||||
* The block size should be a power of 2 to avoid a costly 'div' instruction.
|
||||
*
|
||||
* A block size of 256 was chosen because it roughly doubles unwinder
|
||||
* performance while only adding ~5% to the ORC data footprint.
|
||||
*/
|
||||
#define LOOKUP_BLOCK_ORDER 8
|
||||
#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER)
|
||||
|
||||
#ifndef LINKER_SCRIPT
|
||||
|
||||
extern unsigned int orc_lookup[];
|
||||
extern unsigned int orc_lookup_end[];
|
||||
|
||||
#define LOOKUP_START_IP (unsigned long)_stext
|
||||
#define LOOKUP_STOP_IP (unsigned long)_etext
|
||||
|
||||
#endif /* LINKER_SCRIPT */
|
||||
|
||||
#endif /* _ORC_LOOKUP_H */
|
107
arch/x86/include/asm/orc_types.h
Normal file
107
arch/x86/include/asm/orc_types.h
Normal file
@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef _ORC_TYPES_H
|
||||
#define _ORC_TYPES_H
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/compiler.h>
|
||||
|
||||
/*
|
||||
* The ORC_REG_* registers are base registers which are used to find other
|
||||
* registers on the stack.
|
||||
*
|
||||
* ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
|
||||
* address of the previous frame: the caller's SP before it called the current
|
||||
* function.
|
||||
*
|
||||
* ORC_REG_UNDEFINED means the corresponding register's value didn't change in
|
||||
* the current frame.
|
||||
*
|
||||
* The most commonly used base registers are SP and BP -- which the previous SP
|
||||
* is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
|
||||
* usually based on.
|
||||
*
|
||||
* The rest of the base registers are needed for special cases like entry code
|
||||
* and GCC realigned stacks.
|
||||
*/
|
||||
#define ORC_REG_UNDEFINED 0
|
||||
#define ORC_REG_PREV_SP 1
|
||||
#define ORC_REG_DX 2
|
||||
#define ORC_REG_DI 3
|
||||
#define ORC_REG_BP 4
|
||||
#define ORC_REG_SP 5
|
||||
#define ORC_REG_R10 6
|
||||
#define ORC_REG_R13 7
|
||||
#define ORC_REG_BP_INDIRECT 8
|
||||
#define ORC_REG_SP_INDIRECT 9
|
||||
#define ORC_REG_MAX 15
|
||||
|
||||
/*
|
||||
* ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the
|
||||
* caller's SP right before it made the call). Used for all callable
|
||||
* functions, i.e. all C code and all callable asm functions.
|
||||
*
|
||||
* ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points
|
||||
* to a fully populated pt_regs from a syscall, interrupt, or exception.
|
||||
*
|
||||
* ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
|
||||
* points to the iret return frame.
|
||||
*
|
||||
* The UNWIND_HINT macros are used only for the unwind_hint struct. They
|
||||
* aren't used in struct orc_entry due to size and complexity constraints.
|
||||
* Objtool converts them to real types when it converts the hints to orc
|
||||
* entries.
|
||||
*/
|
||||
#define ORC_TYPE_CALL 0
|
||||
#define ORC_TYPE_REGS 1
|
||||
#define ORC_TYPE_REGS_IRET 2
|
||||
#define UNWIND_HINT_TYPE_SAVE 3
|
||||
#define UNWIND_HINT_TYPE_RESTORE 4
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
/*
|
||||
* This struct is more or less a vastly simplified version of the DWARF Call
|
||||
* Frame Information standard. It contains only the necessary parts of DWARF
|
||||
* CFI, simplified for ease of access by the in-kernel unwinder. It tells the
|
||||
* unwinder how to find the previous SP and BP (and sometimes entry regs) on
|
||||
* the stack for a given code address. Each instance of the struct corresponds
|
||||
* to one or more code locations.
|
||||
*/
|
||||
struct orc_entry {
|
||||
s16 sp_offset;
|
||||
s16 bp_offset;
|
||||
unsigned sp_reg:4;
|
||||
unsigned bp_reg:4;
|
||||
unsigned type:2;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* This struct is used by asm and inline asm code to manually annotate the
|
||||
* location of registers on the stack for the ORC unwinder.
|
||||
*
|
||||
* Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
|
||||
*/
|
||||
struct unwind_hint {
|
||||
u32 ip;
|
||||
s16 sp_offset;
|
||||
u8 sp_reg;
|
||||
u8 type;
|
||||
};
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
#endif /* _ORC_TYPES_H */
|
@ -22,6 +22,7 @@ struct vm86;
|
||||
#include <asm/nops.h>
|
||||
#include <asm/special_insns.h>
|
||||
#include <asm/fpu/types.h>
|
||||
#include <asm/unwind_hints.h>
|
||||
|
||||
#include <linux/personality.h>
|
||||
#include <linux/cache.h>
|
||||
@ -661,7 +662,7 @@ static inline void sync_core(void)
|
||||
* In case NMI unmasking or performance ever becomes a problem,
|
||||
* the next best option appears to be MOV-to-CR2 and an
|
||||
* unconditional jump. That sequence also works on all CPUs,
|
||||
* but it will fault at CPL3 (i.e. Xen PV and lguest).
|
||||
* but it will fault at CPL3 (i.e. Xen PV).
|
||||
*
|
||||
* CPUID is the conventional way, but it's nasty: it doesn't
|
||||
* exist on some 486-like CPUs, and it usually exits to a
|
||||
@ -684,6 +685,7 @@ static inline void sync_core(void)
|
||||
unsigned int tmp;
|
||||
|
||||
asm volatile (
|
||||
UNWIND_HINT_SAVE
|
||||
"mov %%ss, %0\n\t"
|
||||
"pushq %q0\n\t"
|
||||
"pushq %%rsp\n\t"
|
||||
@ -693,6 +695,7 @@ static inline void sync_core(void)
|
||||
"pushq %q0\n\t"
|
||||
"pushq $1f\n\t"
|
||||
"iretq\n\t"
|
||||
UNWIND_HINT_RESTORE
|
||||
"1:"
|
||||
: "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
|
||||
#endif
|
||||
|
@ -9,6 +9,20 @@
|
||||
#ifdef __i386__
|
||||
|
||||
struct pt_regs {
|
||||
/*
|
||||
* NB: 32-bit x86 CPUs are inconsistent as what happens in the
|
||||
* following cases (where %seg represents a segment register):
|
||||
*
|
||||
* - pushl %seg: some do a 16-bit write and leave the high
|
||||
* bits alone
|
||||
* - movl %seg, [mem]: some do a 16-bit write despite the movl
|
||||
* - IDT entry: some (e.g. 486) will leave the high bits of CS
|
||||
* and (if applicable) SS undefined.
|
||||
*
|
||||
* Fortunately, x86-32 doesn't read the high bits on POP or IRET,
|
||||
* so we can just treat all of the segment registers as 16-bit
|
||||
* values.
|
||||
*/
|
||||
unsigned long bx;
|
||||
unsigned long cx;
|
||||
unsigned long dx;
|
||||
@ -16,16 +30,22 @@ struct pt_regs {
|
||||
unsigned long di;
|
||||
unsigned long bp;
|
||||
unsigned long ax;
|
||||
unsigned long ds;
|
||||
unsigned long es;
|
||||
unsigned long fs;
|
||||
unsigned long gs;
|
||||
unsigned short ds;
|
||||
unsigned short __dsh;
|
||||
unsigned short es;
|
||||
unsigned short __esh;
|
||||
unsigned short fs;
|
||||
unsigned short __fsh;
|
||||
unsigned short gs;
|
||||
unsigned short __gsh;
|
||||
unsigned long orig_ax;
|
||||
unsigned long ip;
|
||||
unsigned long cs;
|
||||
unsigned short cs;
|
||||
unsigned short __csh;
|
||||
unsigned long flags;
|
||||
unsigned long sp;
|
||||
unsigned long ss;
|
||||
unsigned short ss;
|
||||
unsigned short __ssh;
|
||||
};
|
||||
|
||||
#else /* __i386__ */
|
||||
@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs,
|
||||
if (offset == offsetof(struct pt_regs, sp) &&
|
||||
regs->cs == __KERNEL_CS)
|
||||
return kernel_stack_pointer(regs);
|
||||
|
||||
/* The selector fields are 16-bit. */
|
||||
if (offset == offsetof(struct pt_regs, cs) ||
|
||||
offset == offsetof(struct pt_regs, ss) ||
|
||||
offset == offsetof(struct pt_regs, ds) ||
|
||||
offset == offsetof(struct pt_regs, es) ||
|
||||
offset == offsetof(struct pt_regs, fs) ||
|
||||
offset == offsetof(struct pt_regs, gs)) {
|
||||
return *(u16 *)((unsigned long)regs + offset);
|
||||
|
||||
}
|
||||
#endif
|
||||
return *(unsigned long *)((unsigned long)regs + offset);
|
||||
}
|
||||
|
@ -1,45 +1,56 @@
|
||||
#ifndef _ASM_X86_RMWcc
|
||||
#define _ASM_X86_RMWcc
|
||||
|
||||
#define __CLOBBERS_MEM "memory"
|
||||
#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx"
|
||||
|
||||
#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
|
||||
|
||||
/* Use asm goto */
|
||||
|
||||
#define __GEN_RMWcc(fullop, var, cc, ...) \
|
||||
#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
|
||||
do { \
|
||||
asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \
|
||||
: : "m" (var), ## __VA_ARGS__ \
|
||||
: "memory" : cc_label); \
|
||||
: : [counter] "m" (var), ## __VA_ARGS__ \
|
||||
: clobbers : cc_label); \
|
||||
return 0; \
|
||||
cc_label: \
|
||||
return 1; \
|
||||
} while (0)
|
||||
|
||||
#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
|
||||
__GEN_RMWcc(op " " arg0, var, cc)
|
||||
#define __BINARY_RMWcc_ARG " %1, "
|
||||
|
||||
#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
|
||||
__GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
|
||||
|
||||
#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
|
||||
|
||||
/* Use flags output or a set instruction */
|
||||
|
||||
#define __GEN_RMWcc(fullop, var, cc, ...) \
|
||||
#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
|
||||
do { \
|
||||
bool c; \
|
||||
asm volatile (fullop ";" CC_SET(cc) \
|
||||
: "+m" (var), CC_OUT(cc) (c) \
|
||||
: __VA_ARGS__ : "memory"); \
|
||||
: [counter] "+m" (var), CC_OUT(cc) (c) \
|
||||
: __VA_ARGS__ : clobbers); \
|
||||
return c; \
|
||||
} while (0)
|
||||
|
||||
#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
|
||||
__GEN_RMWcc(op " " arg0, var, cc)
|
||||
|
||||
#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
|
||||
__GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
|
||||
#define __BINARY_RMWcc_ARG " %2, "
|
||||
|
||||
#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
|
||||
|
||||
#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
|
||||
__GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM)
|
||||
|
||||
#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \
|
||||
__GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \
|
||||
__CLOBBERS_MEM_CC_CX)
|
||||
|
||||
#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
|
||||
__GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \
|
||||
__CLOBBERS_MEM, vcon (val))
|
||||
|
||||
#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \
|
||||
__GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \
|
||||
__CLOBBERS_MEM_CC_CX, vcon (val))
|
||||
|
||||
#endif /* _ASM_X86_RMWcc */
|
||||
|
@ -12,11 +12,14 @@ struct unwind_state {
|
||||
struct task_struct *task;
|
||||
int graph_idx;
|
||||
bool error;
|
||||
#ifdef CONFIG_FRAME_POINTER
|
||||
bool got_irq;
|
||||
unsigned long *bp, *orig_sp;
|
||||
#if defined(CONFIG_ORC_UNWINDER)
|
||||
bool signal, full_regs;
|
||||
unsigned long sp, bp, ip;
|
||||
struct pt_regs *regs;
|
||||
#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
|
||||
bool got_irq;
|
||||
unsigned long *bp, *orig_sp, ip;
|
||||
struct pt_regs *regs;
|
||||
unsigned long ip;
|
||||
#else
|
||||
unsigned long *sp;
|
||||
#endif
|
||||
@ -24,16 +27,20 @@ struct unwind_state {
|
||||
|
||||
void __unwind_start(struct unwind_state *state, struct task_struct *task,
|
||||
struct pt_regs *regs, unsigned long *first_frame);
|
||||
|
||||
bool unwind_next_frame(struct unwind_state *state);
|
||||
|
||||
unsigned long unwind_get_return_address(struct unwind_state *state);
|
||||
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);
|
||||
|
||||
static inline bool unwind_done(struct unwind_state *state)
|
||||
{
|
||||
return state->stack_info.type == STACK_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
static inline bool unwind_error(struct unwind_state *state)
|
||||
{
|
||||
return state->error;
|
||||
}
|
||||
|
||||
static inline
|
||||
void unwind_start(struct unwind_state *state, struct task_struct *task,
|
||||
struct pt_regs *regs, unsigned long *first_frame)
|
||||
@ -43,22 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
|
||||
__unwind_start(state, task, regs, first_frame);
|
||||
}
|
||||
|
||||
static inline bool unwind_error(struct unwind_state *state)
|
||||
{
|
||||
return state->error;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FRAME_POINTER
|
||||
|
||||
static inline
|
||||
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
|
||||
{
|
||||
if (unwind_done(state))
|
||||
return NULL;
|
||||
|
||||
return state->regs ? &state->regs->ip : state->bp + 1;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
|
||||
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
|
||||
{
|
||||
if (unwind_done(state))
|
||||
@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
|
||||
|
||||
return state->regs;
|
||||
}
|
||||
|
||||
#else /* !CONFIG_FRAME_POINTER */
|
||||
|
||||
static inline
|
||||
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#else
|
||||
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* CONFIG_FRAME_POINTER */
|
||||
#ifdef CONFIG_ORC_UNWINDER
|
||||
void unwind_init(void);
|
||||
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
|
||||
void *orc, size_t orc_size);
|
||||
#else
|
||||
static inline void unwind_init(void) {}
|
||||
static inline
|
||||
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
|
||||
void *orc, size_t orc_size) {}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This disables KASAN checking when reading a value from another task's stack,
|
||||
* since the other task could be running on another CPU and could have poisoned
|
||||
* the stack in the meantime.
|
||||
*/
|
||||
#define READ_ONCE_TASK_STACK(task, x) \
|
||||
({ \
|
||||
unsigned long val; \
|
||||
if (task == current) \
|
||||
val = READ_ONCE(x); \
|
||||
else \
|
||||
val = READ_ONCE_NOCHECK(x); \
|
||||
val; \
|
||||
})
|
||||
|
||||
static inline bool task_on_another_cpu(struct task_struct *task)
|
||||
{
|
||||
#ifdef CONFIG_SMP
|
||||
return task != current && task->on_cpu;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* _ASM_X86_UNWIND_H */
|
||||
|
105
arch/x86/include/asm/unwind_hints.h
Normal file
105
arch/x86/include/asm/unwind_hints.h
Normal file
@ -0,0 +1,105 @@
|
||||
#ifndef _ASM_X86_UNWIND_HINTS_H
|
||||
#define _ASM_X86_UNWIND_HINTS_H
|
||||
|
||||
#include "orc_types.h"
|
||||
|
||||
#ifdef __ASSEMBLY__
|
||||
|
||||
/*
|
||||
* In asm, there are two kinds of code: normal C-type callable functions and
|
||||
* the rest. The normal callable functions can be called by other code, and
|
||||
* don't do anything unusual with the stack. Such normal callable functions
|
||||
* are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this
|
||||
* category. In this case, no special debugging annotations are needed because
|
||||
* objtool can automatically generate the ORC data for the ORC unwinder to read
|
||||
* at runtime.
|
||||
*
|
||||
* Anything which doesn't fall into the above category, such as syscall and
|
||||
* interrupt handlers, tends to not be called directly by other functions, and
|
||||
* often does unusual non-C-function-type things with the stack pointer. Such
|
||||
* code needs to be annotated such that objtool can understand it. The
|
||||
* following CFI hint macros are for this type of code.
|
||||
*
|
||||
* These macros provide hints to objtool about the state of the stack at each
|
||||
* instruction. Objtool starts from the hints and follows the code flow,
|
||||
* making automatic CFI adjustments when it sees pushes and pops, filling out
|
||||
* the debuginfo as necessary. It will also warn if it sees any
|
||||
* inconsistencies.
|
||||
*/
|
||||
.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL
|
||||
#ifdef CONFIG_STACK_VALIDATION
|
||||
.Lunwind_hint_ip_\@:
|
||||
.pushsection .discard.unwind_hints
|
||||
/* struct unwind_hint */
|
||||
.long .Lunwind_hint_ip_\@ - .
|
||||
.short \sp_offset
|
||||
.byte \sp_reg
|
||||
.byte \type
|
||||
.popsection
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro UNWIND_HINT_EMPTY
|
||||
UNWIND_HINT sp_reg=ORC_REG_UNDEFINED
|
||||
.endm
|
||||
|
||||
.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0
|
||||
.if \base == %rsp
|
||||
.if \indirect
|
||||
.set sp_reg, ORC_REG_SP_INDIRECT
|
||||
.else
|
||||
.set sp_reg, ORC_REG_SP
|
||||
.endif
|
||||
.elseif \base == %rbp
|
||||
.set sp_reg, ORC_REG_BP
|
||||
.elseif \base == %rdi
|
||||
.set sp_reg, ORC_REG_DI
|
||||
.elseif \base == %rdx
|
||||
.set sp_reg, ORC_REG_DX
|
||||
.elseif \base == %r10
|
||||
.set sp_reg, ORC_REG_R10
|
||||
.else
|
||||
.error "UNWIND_HINT_REGS: bad base register"
|
||||
.endif
|
||||
|
||||
.set sp_offset, \offset
|
||||
|
||||
.if \iret
|
||||
.set type, ORC_TYPE_REGS_IRET
|
||||
.elseif \extra == 0
|
||||
.set type, ORC_TYPE_REGS_IRET
|
||||
.set sp_offset, \offset + (16*8)
|
||||
.else
|
||||
.set type, ORC_TYPE_REGS
|
||||
.endif
|
||||
|
||||
UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
|
||||
.endm
|
||||
|
||||
.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
|
||||
UNWIND_HINT_REGS base=\base offset=\offset iret=1
|
||||
.endm
|
||||
|
||||
.macro UNWIND_HINT_FUNC sp_offset=8
|
||||
UNWIND_HINT sp_offset=\sp_offset
|
||||
.endm
|
||||
|
||||
#else /* !__ASSEMBLY__ */
|
||||
|
||||
#define UNWIND_HINT(sp_reg, sp_offset, type) \
|
||||
"987: \n\t" \
|
||||
".pushsection .discard.unwind_hints\n\t" \
|
||||
/* struct unwind_hint */ \
|
||||
".long 987b - .\n\t" \
|
||||
".short " __stringify(sp_offset) "\n\t" \
|
||||
".byte " __stringify(sp_reg) "\n\t" \
|
||||
".byte " __stringify(type) "\n\t" \
|
||||
".popsection\n\t"
|
||||
|
||||
#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE)
|
||||
|
||||
#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE)
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
#endif /* _ASM_X86_UNWIND_HINTS_H */
|
@ -201,7 +201,7 @@ struct boot_params {
|
||||
*
|
||||
* @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
|
||||
* PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
|
||||
* @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
|
||||
* @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
|
||||
* @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
|
||||
* which start at asm startup_xen() entry point and later jump to the C
|
||||
* xen_start_kernel() entry point. Both domU and dom0 type of guests are
|
||||
|
@ -126,11 +126,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
|
||||
obj-$(CONFIG_TRACING) += tracepoint.o
|
||||
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
|
||||
|
||||
ifdef CONFIG_FRAME_POINTER
|
||||
obj-y += unwind_frame.o
|
||||
else
|
||||
obj-y += unwind_guess.o
|
||||
endif
|
||||
obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o
|
||||
obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o
|
||||
obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o
|
||||
|
||||
###
|
||||
# 64 bit specific files
|
||||
|
@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr;
|
||||
|
||||
int poke_int3_handler(struct pt_regs *regs)
|
||||
{
|
||||
/* bp_patching_in_progress */
|
||||
/*
|
||||
* Having observed our INT3 instruction, we now must observe
|
||||
* bp_patching_in_progress.
|
||||
*
|
||||
* in_progress = TRUE INT3
|
||||
* WMB RMB
|
||||
* write INT3 if (in_progress)
|
||||
*
|
||||
* Idem for bp_int3_handler.
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
if (likely(!bp_patching_in_progress))
|
||||
@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
|
||||
bp_int3_addr = (u8 *)addr + sizeof(int3);
|
||||
bp_patching_in_progress = true;
|
||||
/*
|
||||
* Corresponding read barrier in int3 notifier for
|
||||
* making sure the in_progress flags is correctly ordered wrt.
|
||||
* patching
|
||||
* Corresponding read barrier in int3 notifier for making sure the
|
||||
* in_progress and handler are correctly ordered wrt. patching.
|
||||
*/
|
||||
smp_wmb();
|
||||
|
||||
@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
|
||||
text_poke(addr, opcode, sizeof(int3));
|
||||
|
||||
on_each_cpu(do_sync_core, NULL, 1);
|
||||
|
||||
/*
|
||||
* sync_core() implies an smp_mb() and orders this store against
|
||||
* the writing of the new instruction.
|
||||
*/
|
||||
bp_patching_in_progress = false;
|
||||
smp_wmb();
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
@ -4,9 +4,6 @@
|
||||
|
||||
#include <asm/ucontext.h>
|
||||
|
||||
#include <linux/lguest.h>
|
||||
#include "../../../drivers/lguest/lg.h"
|
||||
|
||||
#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
|
||||
static char syscalls[] = {
|
||||
#include <asm/syscalls_32.h>
|
||||
@ -62,23 +59,6 @@ void foo(void)
|
||||
OFFSET(stack_canary_offset, stack_canary, canary);
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
|
||||
BLANK();
|
||||
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
|
||||
OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
|
||||
|
||||
BLANK();
|
||||
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
|
||||
OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
|
||||
OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
|
||||
OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
|
||||
OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
|
||||
OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
|
||||
OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
|
||||
OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
|
||||
OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
|
||||
OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
|
||||
#endif
|
||||
BLANK();
|
||||
DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
|
||||
DEFINE(NR_syscalls, sizeof(syscalls));
|
||||
|
@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
|
||||
if (stack_name)
|
||||
printk("%s <%s>\n", log_lvl, stack_name);
|
||||
|
||||
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
|
||||
__show_regs(regs, 0);
|
||||
|
||||
/*
|
||||
* Scan the stack, printing any text addresses we find. At the
|
||||
* same time, follow proper stack frames with the unwinder.
|
||||
@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
|
||||
* Don't print regs->ip again if it was already printed
|
||||
* by __show_regs() below.
|
||||
*/
|
||||
if (regs && stack == ®s->ip) {
|
||||
unwind_next_frame(&state);
|
||||
continue;
|
||||
}
|
||||
if (regs && stack == ®s->ip)
|
||||
goto next;
|
||||
|
||||
if (stack == ret_addr_p)
|
||||
reliable = 1;
|
||||
@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
|
||||
if (!reliable)
|
||||
continue;
|
||||
|
||||
next:
|
||||
/*
|
||||
* Get the next frame from the unwinder. No need to
|
||||
* check for an error: if anything goes wrong, the rest
|
||||
@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
|
||||
|
||||
/* if the frame has entry regs, print them */
|
||||
regs = unwind_get_entry_regs(&state);
|
||||
if (regs)
|
||||
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
|
||||
__show_regs(regs, 0);
|
||||
}
|
||||
|
||||
@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
|
||||
#ifdef CONFIG_X86_32
|
||||
if (user_mode(regs)) {
|
||||
sp = regs->sp;
|
||||
ss = regs->ss & 0xffff;
|
||||
ss = regs->ss;
|
||||
} else {
|
||||
sp = kernel_stack_pointer(regs);
|
||||
savesegment(ss, ss);
|
||||
|
@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
|
||||
* This is a software stack, so 'end' can be a valid stack pointer.
|
||||
* It just means the stack is empty.
|
||||
*/
|
||||
if (stack < begin || stack > end)
|
||||
if (stack <= begin || stack > end)
|
||||
return false;
|
||||
|
||||
info->type = STACK_TYPE_IRQ;
|
||||
@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
|
||||
* This is a software stack, so 'end' can be a valid stack pointer.
|
||||
* It just means the stack is empty.
|
||||
*/
|
||||
if (stack < begin || stack > end)
|
||||
if (stack <= begin || stack > end)
|
||||
return false;
|
||||
|
||||
info->type = STACK_TYPE_SOFTIRQ;
|
||||
|
@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
|
||||
begin = end - (exception_stack_sizes[k] / sizeof(long));
|
||||
regs = (struct pt_regs *)end - 1;
|
||||
|
||||
if (stack < begin || stack >= end)
|
||||
if (stack <= begin || stack >= end)
|
||||
continue;
|
||||
|
||||
info->type = STACK_TYPE_EXCEPTION + k;
|
||||
@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
|
||||
* This is a software stack, so 'end' can be a valid stack pointer.
|
||||
* It just means the stack is empty.
|
||||
*/
|
||||
if (stack < begin || stack > end)
|
||||
if (stack <= begin || stack > end)
|
||||
return false;
|
||||
|
||||
info->type = STACK_TYPE_IRQ;
|
||||
|
@ -155,7 +155,6 @@ ENTRY(startup_32)
|
||||
jmp *%eax
|
||||
|
||||
.Lbad_subarch:
|
||||
WEAK(lguest_entry)
|
||||
WEAK(xen_entry)
|
||||
/* Unknown implementation; there's really
|
||||
nothing we can do at this point. */
|
||||
@ -165,7 +164,6 @@ WEAK(xen_entry)
|
||||
|
||||
subarch_entries:
|
||||
.long .Ldefault_entry /* normal x86/PC */
|
||||
.long lguest_entry /* lguest hypervisor */
|
||||
.long xen_entry /* Xen hypervisor */
|
||||
.long .Ldefault_entry /* Moorestown MID */
|
||||
num_subarch_entries = (. - subarch_entries) / 4
|
||||
@ -457,12 +455,9 @@ early_idt_handler_common:
|
||||
/* The vector number is in pt_regs->gs */
|
||||
|
||||
cld
|
||||
pushl %fs /* pt_regs->fs */
|
||||
movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
|
||||
pushl %es /* pt_regs->es */
|
||||
movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
|
||||
pushl %ds /* pt_regs->ds */
|
||||
movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
|
||||
pushl %fs /* pt_regs->fs (__fsh varies by model) */
|
||||
pushl %es /* pt_regs->es (__esh varies by model) */
|
||||
pushl %ds /* pt_regs->ds (__dsh varies by model) */
|
||||
pushl %eax /* pt_regs->ax */
|
||||
pushl %ebp /* pt_regs->bp */
|
||||
pushl %edi /* pt_regs->di */
|
||||
@ -479,9 +474,8 @@ early_idt_handler_common:
|
||||
/* Load the vector number into EDX */
|
||||
movl PT_GS(%esp), %edx
|
||||
|
||||
/* Load GS into pt_regs->gs and clear high bits */
|
||||
/* Load GS into pt_regs->gs (and maybe clobber __gsh) */
|
||||
movw %gs, PT_GS(%esp)
|
||||
movw $0, PT_GS+2(%esp)
|
||||
|
||||
movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */
|
||||
call early_fixup_exception
|
||||
@ -493,10 +487,10 @@ early_idt_handler_common:
|
||||
popl %edi /* pt_regs->di */
|
||||
popl %ebp /* pt_regs->bp */
|
||||
popl %eax /* pt_regs->ax */
|
||||
popl %ds /* pt_regs->ds */
|
||||
popl %es /* pt_regs->es */
|
||||
popl %fs /* pt_regs->fs */
|
||||
popl %gs /* pt_regs->gs */
|
||||
popl %ds /* pt_regs->ds (always ignores __dsh) */
|
||||
popl %es /* pt_regs->es (always ignores __esh) */
|
||||
popl %fs /* pt_regs->fs (always ignores __fsh) */
|
||||
popl %gs /* pt_regs->gs (always ignores __gsh) */
|
||||
decl %ss:early_recursion_flag
|
||||
addl $4, %esp /* pop pt_regs->orig_ax */
|
||||
iret
|
||||
|
@ -21,6 +21,25 @@
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/syscalls.h>
|
||||
|
||||
static void refresh_ldt_segments(void)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
unsigned short sel;
|
||||
|
||||
/*
|
||||
* Make sure that the cached DS and ES descriptors match the updated
|
||||
* LDT.
|
||||
*/
|
||||
savesegment(ds, sel);
|
||||
if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
|
||||
loadsegment(ds, sel);
|
||||
|
||||
savesegment(es, sel);
|
||||
if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
|
||||
loadsegment(es, sel);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* context.lock is held for us, so we don't need any locking. */
|
||||
static void flush_ldt(void *__mm)
|
||||
{
|
||||
@ -32,6 +51,8 @@ static void flush_ldt(void *__mm)
|
||||
|
||||
pc = &mm->context;
|
||||
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
|
||||
|
||||
refresh_ldt_segments();
|
||||
}
|
||||
|
||||
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include <asm/page.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/setup.h>
|
||||
#include <asm/unwind.h>
|
||||
|
||||
#if 0
|
||||
#define DEBUGP(fmt, ...) \
|
||||
@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr,
|
||||
struct module *me)
|
||||
{
|
||||
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
|
||||
*para = NULL;
|
||||
*para = NULL, *orc = NULL, *orc_ip = NULL;
|
||||
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
|
||||
|
||||
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
|
||||
@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr,
|
||||
locks = s;
|
||||
if (!strcmp(".parainstructions", secstrings + s->sh_name))
|
||||
para = s;
|
||||
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
|
||||
orc = s;
|
||||
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
|
||||
orc_ip = s;
|
||||
}
|
||||
|
||||
if (alt) {
|
||||
@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr,
|
||||
/* make jump label nops */
|
||||
jump_label_apply_nops(me);
|
||||
|
||||
if (orc && orc_ip)
|
||||
unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
|
||||
(void *)orc->sh_addr, orc->sh_size);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
|
||||
x86_platform.legacy.reserve_bios_regions = 1;
|
||||
break;
|
||||
case X86_SUBARCH_XEN:
|
||||
case X86_SUBARCH_LGUEST:
|
||||
x86_platform.legacy.devices.pnpbios = 0;
|
||||
x86_platform.legacy.rtc = 0;
|
||||
break;
|
||||
|
@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all)
|
||||
|
||||
if (user_mode(regs)) {
|
||||
sp = regs->sp;
|
||||
ss = regs->ss & 0xffff;
|
||||
ss = regs->ss;
|
||||
gs = get_user_gs(regs);
|
||||
} else {
|
||||
sp = kernel_stack_pointer(regs);
|
||||
|
@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all)
|
||||
unsigned int fsindex, gsindex;
|
||||
unsigned int ds, cs, es;
|
||||
|
||||
printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
|
||||
(void *)regs->ip);
|
||||
printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
|
||||
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
|
||||
regs->sp, regs->flags);
|
||||
if (regs->orig_ax != -1)
|
||||
@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task)
|
||||
}
|
||||
}
|
||||
|
||||
enum which_selector {
|
||||
FS,
|
||||
GS
|
||||
};
|
||||
|
||||
/*
|
||||
* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
|
||||
* not available. The goal is to be reasonably fast on non-FSGSBASE systems.
|
||||
* It's forcibly inlined because it'll generate better code and this function
|
||||
* is hot.
|
||||
*/
|
||||
static __always_inline void save_base_legacy(struct task_struct *prev_p,
|
||||
unsigned short selector,
|
||||
enum which_selector which)
|
||||
{
|
||||
if (likely(selector == 0)) {
|
||||
/*
|
||||
* On Intel (without X86_BUG_NULL_SEG), the segment base could
|
||||
* be the pre-existing saved base or it could be zero. On AMD
|
||||
* (with X86_BUG_NULL_SEG), the segment base could be almost
|
||||
* anything.
|
||||
*
|
||||
* This branch is very hot (it's hit twice on almost every
|
||||
* context switch between 64-bit programs), and avoiding
|
||||
* the RDMSR helps a lot, so we just assume that whatever
|
||||
* value is already saved is correct. This matches historical
|
||||
* Linux behavior, so it won't break existing applications.
|
||||
*
|
||||
* To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
|
||||
* report that the base is zero, it needs to actually be zero:
|
||||
* see the corresponding logic in load_seg_legacy.
|
||||
*/
|
||||
} else {
|
||||
/*
|
||||
* If the selector is 1, 2, or 3, then the base is zero on
|
||||
* !X86_BUG_NULL_SEG CPUs and could be anything on
|
||||
* X86_BUG_NULL_SEG CPUs. In the latter case, Linux
|
||||
* has never attempted to preserve the base across context
|
||||
* switches.
|
||||
*
|
||||
* If selector > 3, then it refers to a real segment, and
|
||||
* saving the base isn't necessary.
|
||||
*/
|
||||
if (which == FS)
|
||||
prev_p->thread.fsbase = 0;
|
||||
else
|
||||
prev_p->thread.gsbase = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static __always_inline void save_fsgs(struct task_struct *task)
|
||||
{
|
||||
savesegment(fs, task->thread.fsindex);
|
||||
savesegment(gs, task->thread.gsindex);
|
||||
save_base_legacy(task, task->thread.fsindex, FS);
|
||||
save_base_legacy(task, task->thread.gsindex, GS);
|
||||
}
|
||||
|
||||
static __always_inline void loadseg(enum which_selector which,
|
||||
unsigned short sel)
|
||||
{
|
||||
if (which == FS)
|
||||
loadsegment(fs, sel);
|
||||
else
|
||||
load_gs_index(sel);
|
||||
}
|
||||
|
||||
static __always_inline void load_seg_legacy(unsigned short prev_index,
|
||||
unsigned long prev_base,
|
||||
unsigned short next_index,
|
||||
unsigned long next_base,
|
||||
enum which_selector which)
|
||||
{
|
||||
if (likely(next_index <= 3)) {
|
||||
/*
|
||||
* The next task is using 64-bit TLS, is not using this
|
||||
* segment at all, or is having fun with arcane CPU features.
|
||||
*/
|
||||
if (next_base == 0) {
|
||||
/*
|
||||
* Nasty case: on AMD CPUs, we need to forcibly zero
|
||||
* the base.
|
||||
*/
|
||||
if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
|
||||
loadseg(which, __USER_DS);
|
||||
loadseg(which, next_index);
|
||||
} else {
|
||||
/*
|
||||
* We could try to exhaustively detect cases
|
||||
* under which we can skip the segment load,
|
||||
* but there's really only one case that matters
|
||||
* for performance: if both the previous and
|
||||
* next states are fully zeroed, we can skip
|
||||
* the load.
|
||||
*
|
||||
* (This assumes that prev_base == 0 has no
|
||||
* false positives. This is the case on
|
||||
* Intel-style CPUs.)
|
||||
*/
|
||||
if (likely(prev_index | next_index | prev_base))
|
||||
loadseg(which, next_index);
|
||||
}
|
||||
} else {
|
||||
if (prev_index != next_index)
|
||||
loadseg(which, next_index);
|
||||
wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
|
||||
next_base);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* The next task is using a real segment. Loading the selector
|
||||
* is sufficient.
|
||||
*/
|
||||
loadseg(which, next_index);
|
||||
}
|
||||
}
|
||||
|
||||
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
|
||||
unsigned long arg, struct task_struct *p, unsigned long tls)
|
||||
{
|
||||
@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
|
||||
unsigned long new_sp,
|
||||
unsigned int _cs, unsigned int _ss, unsigned int _ds)
|
||||
{
|
||||
WARN_ON_ONCE(regs != current_pt_regs());
|
||||
|
||||
if (static_cpu_has(X86_BUG_NULL_SEG)) {
|
||||
/* Loading zero below won't clear the base. */
|
||||
loadsegment(fs, __USER_DS);
|
||||
load_gs_index(__USER_DS);
|
||||
}
|
||||
|
||||
loadsegment(fs, 0);
|
||||
loadsegment(es, _ds);
|
||||
loadsegment(ds, _ds);
|
||||
load_gs_index(0);
|
||||
|
||||
regs->ip = new_ip;
|
||||
regs->sp = new_sp;
|
||||
regs->cs = _cs;
|
||||
@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
||||
struct fpu *next_fpu = &next->fpu;
|
||||
int cpu = smp_processor_id();
|
||||
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
|
||||
unsigned prev_fsindex, prev_gsindex;
|
||||
|
||||
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
|
||||
this_cpu_read(irq_count) != -1);
|
||||
|
||||
switch_fpu_prepare(prev_fpu, cpu);
|
||||
|
||||
@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
||||
*
|
||||
* (e.g. xen_load_tls())
|
||||
*/
|
||||
savesegment(fs, prev_fsindex);
|
||||
savesegment(gs, prev_gsindex);
|
||||
save_fsgs(prev_p);
|
||||
|
||||
/*
|
||||
* Load TLS before restoring any segments so that segment loads
|
||||
@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
||||
if (unlikely(next->ds | prev->ds))
|
||||
loadsegment(ds, next->ds);
|
||||
|
||||
/*
|
||||
* Switch FS and GS.
|
||||
*
|
||||
* These are even more complicated than DS and ES: they have
|
||||
* 64-bit bases are that controlled by arch_prctl. The bases
|
||||
* don't necessarily match the selectors, as user code can do
|
||||
* any number of things to cause them to be inconsistent.
|
||||
*
|
||||
* We don't promise to preserve the bases if the selectors are
|
||||
* nonzero. We also don't promise to preserve the base if the
|
||||
* selector is zero and the base doesn't match whatever was
|
||||
* most recently passed to ARCH_SET_FS/GS. (If/when the
|
||||
* FSGSBASE instructions are enabled, we'll need to offer
|
||||
* stronger guarantees.)
|
||||
*
|
||||
* As an invariant,
|
||||
* (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
|
||||
* impossible.
|
||||
*/
|
||||
if (next->fsindex) {
|
||||
/* Loading a nonzero value into FS sets the index and base. */
|
||||
loadsegment(fs, next->fsindex);
|
||||
} else {
|
||||
if (next->fsbase) {
|
||||
/* Next index is zero but next base is nonzero. */
|
||||
if (prev_fsindex)
|
||||
loadsegment(fs, 0);
|
||||
wrmsrl(MSR_FS_BASE, next->fsbase);
|
||||
} else {
|
||||
/* Next base and index are both zero. */
|
||||
if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
|
||||
/*
|
||||
* We don't know the previous base and can't
|
||||
* find out without RDMSR. Forcibly clear it.
|
||||
*/
|
||||
loadsegment(fs, __USER_DS);
|
||||
loadsegment(fs, 0);
|
||||
} else {
|
||||
/*
|
||||
* If the previous index is zero and ARCH_SET_FS
|
||||
* didn't change the base, then the base is
|
||||
* also zero and we don't need to do anything.
|
||||
*/
|
||||
if (prev->fsbase || prev_fsindex)
|
||||
loadsegment(fs, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Save the old state and preserve the invariant.
|
||||
* NB: if prev_fsindex == 0, then we can't reliably learn the base
|
||||
* without RDMSR because Intel user code can zero it without telling
|
||||
* us and AMD user code can program any 32-bit value without telling
|
||||
* us.
|
||||
*/
|
||||
if (prev_fsindex)
|
||||
prev->fsbase = 0;
|
||||
prev->fsindex = prev_fsindex;
|
||||
|
||||
if (next->gsindex) {
|
||||
/* Loading a nonzero value into GS sets the index and base. */
|
||||
load_gs_index(next->gsindex);
|
||||
} else {
|
||||
if (next->gsbase) {
|
||||
/* Next index is zero but next base is nonzero. */
|
||||
if (prev_gsindex)
|
||||
load_gs_index(0);
|
||||
wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
|
||||
} else {
|
||||
/* Next base and index are both zero. */
|
||||
if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
|
||||
/*
|
||||
* We don't know the previous base and can't
|
||||
* find out without RDMSR. Forcibly clear it.
|
||||
*
|
||||
* This contains a pointless SWAPGS pair.
|
||||
* Fixing it would involve an explicit check
|
||||
* for Xen or a new pvop.
|
||||
*/
|
||||
load_gs_index(__USER_DS);
|
||||
load_gs_index(0);
|
||||
} else {
|
||||
/*
|
||||
* If the previous index is zero and ARCH_SET_GS
|
||||
* didn't change the base, then the base is
|
||||
* also zero and we don't need to do anything.
|
||||
*/
|
||||
if (prev->gsbase || prev_gsindex)
|
||||
load_gs_index(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Save the old state and preserve the invariant.
|
||||
* NB: if prev_gsindex == 0, then we can't reliably learn the base
|
||||
* without RDMSR because Intel user code can zero it without telling
|
||||
* us and AMD user code can program any 32-bit value without telling
|
||||
* us.
|
||||
*/
|
||||
if (prev_gsindex)
|
||||
prev->gsbase = 0;
|
||||
prev->gsindex = prev_gsindex;
|
||||
load_seg_legacy(prev->fsindex, prev->fsbase,
|
||||
next->fsindex, next->fsbase, FS);
|
||||
load_seg_legacy(prev->gsindex, prev->gsbase,
|
||||
next->gsindex, next->gsbase, GS);
|
||||
|
||||
switch_fpu_finish(next_fpu, cpu);
|
||||
|
||||
|
@ -115,6 +115,7 @@
|
||||
#include <asm/microcode.h>
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/kaslr.h>
|
||||
#include <asm/unwind.h>
|
||||
|
||||
/*
|
||||
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
|
||||
@ -1310,6 +1311,8 @@ void __init setup_arch(char **cmdline_p)
|
||||
if (efi_enabled(EFI_BOOT))
|
||||
efi_apply_memmap_quirks();
|
||||
#endif
|
||||
|
||||
unwind_init();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
|
@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
|
||||
sp = current->sas_ss_sp + current->sas_ss_size;
|
||||
} else if (IS_ENABLED(CONFIG_X86_32) &&
|
||||
!onsigstack &&
|
||||
(regs->ss & 0xffff) != __USER_DS &&
|
||||
regs->ss != __USER_DS &&
|
||||
!(ka->sa.sa_flags & SA_RESTORER) &&
|
||||
ka->sa.sa_restorer) {
|
||||
/* This is the legacy signal stack switching. */
|
||||
|
@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
|
||||
unsigned long addr, seg;
|
||||
|
||||
addr = regs->ip;
|
||||
seg = regs->cs & 0xffff;
|
||||
seg = regs->cs;
|
||||
if (v8086_mode(regs)) {
|
||||
addr = (addr & 0xffff) + (seg << 4);
|
||||
return addr;
|
||||
|
@ -10,20 +10,22 @@
|
||||
|
||||
#define FRAME_HEADER_SIZE (sizeof(long) * 2)
|
||||
|
||||
/*
|
||||
* This disables KASAN checking when reading a value from another task's stack,
|
||||
* since the other task could be running on another CPU and could have poisoned
|
||||
* the stack in the meantime.
|
||||
*/
|
||||
#define READ_ONCE_TASK_STACK(task, x) \
|
||||
({ \
|
||||
unsigned long val; \
|
||||
if (task == current) \
|
||||
val = READ_ONCE(x); \
|
||||
else \
|
||||
val = READ_ONCE_NOCHECK(x); \
|
||||
val; \
|
||||
})
|
||||
unsigned long unwind_get_return_address(struct unwind_state *state)
|
||||
{
|
||||
if (unwind_done(state))
|
||||
return 0;
|
||||
|
||||
return __kernel_text_address(state->ip) ? state->ip : 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unwind_get_return_address);
|
||||
|
||||
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
|
||||
{
|
||||
if (unwind_done(state))
|
||||
return NULL;
|
||||
|
||||
return state->regs ? &state->regs->ip : state->bp + 1;
|
||||
}
|
||||
|
||||
static void unwind_dump(struct unwind_state *state)
|
||||
{
|
||||
@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state)
|
||||
}
|
||||
}
|
||||
|
||||
unsigned long unwind_get_return_address(struct unwind_state *state)
|
||||
{
|
||||
if (unwind_done(state))
|
||||
return 0;
|
||||
|
||||
return __kernel_text_address(state->ip) ? state->ip : 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unwind_get_return_address);
|
||||
|
||||
static size_t regs_size(struct pt_regs *regs)
|
||||
{
|
||||
/* x86_32 regs from kernel mode are two words shorter: */
|
||||
|
@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unwind_get_return_address);
|
||||
|
||||
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bool unwind_next_frame(struct unwind_state *state)
|
||||
{
|
||||
struct stack_info *info = &state->stack_info;
|
||||
|
582
arch/x86/kernel/unwind_orc.c
Normal file
582
arch/x86/kernel/unwind_orc.c
Normal file
@ -0,0 +1,582 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/sort.h>
|
||||
#include <asm/ptrace.h>
|
||||
#include <asm/stacktrace.h>
|
||||
#include <asm/unwind.h>
|
||||
#include <asm/orc_types.h>
|
||||
#include <asm/orc_lookup.h>
|
||||
#include <asm/sections.h>
|
||||
|
||||
#define orc_warn(fmt, ...) \
|
||||
printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
|
||||
|
||||
extern int __start_orc_unwind_ip[];
|
||||
extern int __stop_orc_unwind_ip[];
|
||||
extern struct orc_entry __start_orc_unwind[];
|
||||
extern struct orc_entry __stop_orc_unwind[];
|
||||
|
||||
static DEFINE_MUTEX(sort_mutex);
|
||||
int *cur_orc_ip_table = __start_orc_unwind_ip;
|
||||
struct orc_entry *cur_orc_table = __start_orc_unwind;
|
||||
|
||||
unsigned int lookup_num_blocks;
|
||||
bool orc_init;
|
||||
|
||||
static inline unsigned long orc_ip(const int *ip)
|
||||
{
|
||||
return (unsigned long)ip + *ip;
|
||||
}
|
||||
|
||||
static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
|
||||
unsigned int num_entries, unsigned long ip)
|
||||
{
|
||||
int *first = ip_table;
|
||||
int *last = ip_table + num_entries - 1;
|
||||
int *mid = first, *found = first;
|
||||
|
||||
if (!num_entries)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Do a binary range search to find the rightmost duplicate of a given
|
||||
* starting address. Some entries are section terminators which are
|
||||
* "weak" entries for ensuring there are no gaps. They should be
|
||||
* ignored when they conflict with a real entry.
|
||||
*/
|
||||
while (first <= last) {
|
||||
mid = first + ((last - first) / 2);
|
||||
|
||||
if (orc_ip(mid) <= ip) {
|
||||
found = mid;
|
||||
first = mid + 1;
|
||||
} else
|
||||
last = mid - 1;
|
||||
}
|
||||
|
||||
return u_table + (found - ip_table);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MODULES
|
||||
static struct orc_entry *orc_module_find(unsigned long ip)
|
||||
{
|
||||
struct module *mod;
|
||||
|
||||
mod = __module_address(ip);
|
||||
if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
|
||||
return NULL;
|
||||
return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
|
||||
mod->arch.num_orcs, ip);
|
||||
}
|
||||
#else
|
||||
static struct orc_entry *orc_module_find(unsigned long ip)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
static struct orc_entry *orc_find(unsigned long ip)
|
||||
{
|
||||
if (!orc_init)
|
||||
return NULL;
|
||||
|
||||
/* For non-init vmlinux addresses, use the fast lookup table: */
|
||||
if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
|
||||
unsigned int idx, start, stop;
|
||||
|
||||
idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
|
||||
|
||||
if (unlikely((idx >= lookup_num_blocks-1))) {
|
||||
orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n",
|
||||
idx, lookup_num_blocks, ip);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
start = orc_lookup[idx];
|
||||
stop = orc_lookup[idx + 1] + 1;
|
||||
|
||||
if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
|
||||
(__start_orc_unwind + stop > __stop_orc_unwind))) {
|
||||
orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n",
|
||||
idx, lookup_num_blocks, start, stop, ip);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return __orc_find(__start_orc_unwind_ip + start,
|
||||
__start_orc_unwind + start, stop - start, ip);
|
||||
}
|
||||
|
||||
/* vmlinux .init slow lookup: */
|
||||
if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext)
|
||||
return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
|
||||
__stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
|
||||
|
||||
/* Module lookup: */
|
||||
return orc_module_find(ip);
|
||||
}
|
||||
|
||||
static void orc_sort_swap(void *_a, void *_b, int size)
|
||||
{
|
||||
struct orc_entry *orc_a, *orc_b;
|
||||
struct orc_entry orc_tmp;
|
||||
int *a = _a, *b = _b, tmp;
|
||||
int delta = _b - _a;
|
||||
|
||||
/* Swap the .orc_unwind_ip entries: */
|
||||
tmp = *a;
|
||||
*a = *b + delta;
|
||||
*b = tmp - delta;
|
||||
|
||||
/* Swap the corresponding .orc_unwind entries: */
|
||||
orc_a = cur_orc_table + (a - cur_orc_ip_table);
|
||||
orc_b = cur_orc_table + (b - cur_orc_ip_table);
|
||||
orc_tmp = *orc_a;
|
||||
*orc_a = *orc_b;
|
||||
*orc_b = orc_tmp;
|
||||
}
|
||||
|
||||
static int orc_sort_cmp(const void *_a, const void *_b)
|
||||
{
|
||||
struct orc_entry *orc_a;
|
||||
const int *a = _a, *b = _b;
|
||||
unsigned long a_val = orc_ip(a);
|
||||
unsigned long b_val = orc_ip(b);
|
||||
|
||||
if (a_val > b_val)
|
||||
return 1;
|
||||
if (a_val < b_val)
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* The "weak" section terminator entries need to always be on the left
|
||||
* to ensure the lookup code skips them in favor of real entries.
|
||||
* These terminator entries exist to handle any gaps created by
|
||||
* whitelisted .o files which didn't get objtool generation.
|
||||
*/
|
||||
orc_a = cur_orc_table + (a - cur_orc_ip_table);
|
||||
return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MODULES
|
||||
void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
|
||||
void *_orc, size_t orc_size)
|
||||
{
|
||||
int *orc_ip = _orc_ip;
|
||||
struct orc_entry *orc = _orc;
|
||||
unsigned int num_entries = orc_ip_size / sizeof(int);
|
||||
|
||||
WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
|
||||
orc_size % sizeof(*orc) != 0 ||
|
||||
num_entries != orc_size / sizeof(*orc));
|
||||
|
||||
/*
|
||||
* The 'cur_orc_*' globals allow the orc_sort_swap() callback to
|
||||
* associate an .orc_unwind_ip table entry with its corresponding
|
||||
* .orc_unwind entry so they can both be swapped.
|
||||
*/
|
||||
mutex_lock(&sort_mutex);
|
||||
cur_orc_ip_table = orc_ip;
|
||||
cur_orc_table = orc;
|
||||
sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
|
||||
mutex_unlock(&sort_mutex);
|
||||
|
||||
mod->arch.orc_unwind_ip = orc_ip;
|
||||
mod->arch.orc_unwind = orc;
|
||||
mod->arch.num_orcs = num_entries;
|
||||
}
|
||||
#endif
|
||||
|
||||
void __init unwind_init(void)
|
||||
{
|
||||
size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
|
||||
size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
|
||||
size_t num_entries = orc_ip_size / sizeof(int);
|
||||
struct orc_entry *orc;
|
||||
int i;
|
||||
|
||||
if (!num_entries || orc_ip_size % sizeof(int) != 0 ||
|
||||
orc_size % sizeof(struct orc_entry) != 0 ||
|
||||
num_entries != orc_size / sizeof(struct orc_entry)) {
|
||||
orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Sort the .orc_unwind and .orc_unwind_ip tables: */
|
||||
sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp,
|
||||
orc_sort_swap);
|
||||
|
||||
/* Initialize the fast lookup table: */
|
||||
lookup_num_blocks = orc_lookup_end - orc_lookup;
|
||||
for (i = 0; i < lookup_num_blocks-1; i++) {
|
||||
orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
|
||||
num_entries,
|
||||
LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i));
|
||||
if (!orc) {
|
||||
orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
orc_lookup[i] = orc - __start_orc_unwind;
|
||||
}
|
||||
|
||||
/* Initialize the ending block: */
|
||||
orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries,
|
||||
LOOKUP_STOP_IP);
|
||||
if (!orc) {
|
||||
orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
|
||||
return;
|
||||
}
|
||||
orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind;
|
||||
|
||||
orc_init = true;
|
||||
}
|
||||
|
||||
unsigned long unwind_get_return_address(struct unwind_state *state)
|
||||
{
|
||||
if (unwind_done(state))
|
||||
return 0;
|
||||
|
||||
return __kernel_text_address(state->ip) ? state->ip : 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unwind_get_return_address);
|
||||
|
||||
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
|
||||
{
|
||||
if (unwind_done(state))
|
||||
return NULL;
|
||||
|
||||
if (state->regs)
|
||||
return &state->regs->ip;
|
||||
|
||||
if (state->sp)
|
||||
return (unsigned long *)state->sp - 1;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
|
||||
size_t len)
|
||||
{
|
||||
struct stack_info *info = &state->stack_info;
|
||||
|
||||
/*
|
||||
* If the address isn't on the current stack, switch to the next one.
|
||||
*
|
||||
* We may have to traverse multiple stacks to deal with the possibility
|
||||
* that info->next_sp could point to an empty stack and the address
|
||||
* could be on a subsequent stack.
|
||||
*/
|
||||
while (!on_stack(info, (void *)addr, len))
|
||||
if (get_stack_info(info->next_sp, state->task, info,
|
||||
&state->stack_mask))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
|
||||
unsigned long *val)
|
||||
{
|
||||
if (!stack_access_ok(state, addr, sizeof(long)))
|
||||
return false;
|
||||
|
||||
*val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr);
|
||||
return true;
|
||||
}
|
||||
|
||||
#define REGS_SIZE (sizeof(struct pt_regs))
|
||||
#define SP_OFFSET (offsetof(struct pt_regs, sp))
|
||||
#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
|
||||
#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
|
||||
|
||||
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
|
||||
unsigned long *ip, unsigned long *sp, bool full)
|
||||
{
|
||||
size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
|
||||
size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
|
||||
struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
|
||||
|
||||
if (IS_ENABLED(CONFIG_X86_64)) {
|
||||
if (!stack_access_ok(state, addr, regs_size))
|
||||
return false;
|
||||
|
||||
*ip = regs->ip;
|
||||
*sp = regs->sp;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!stack_access_ok(state, addr, sp_offset))
|
||||
return false;
|
||||
|
||||
*ip = regs->ip;
|
||||
|
||||
if (user_mode(regs)) {
|
||||
if (!stack_access_ok(state, addr + sp_offset,
|
||||
REGS_SIZE - SP_OFFSET))
|
||||
return false;
|
||||
|
||||
*sp = regs->sp;
|
||||
} else
|
||||
*sp = (unsigned long)®s->sp;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool unwind_next_frame(struct unwind_state *state)
|
||||
{
|
||||
unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
|
||||
enum stack_type prev_type = state->stack_info.type;
|
||||
struct orc_entry *orc;
|
||||
struct pt_regs *ptregs;
|
||||
bool indirect = false;
|
||||
|
||||
if (unwind_done(state))
|
||||
return false;
|
||||
|
||||
/* Don't let modules unload while we're reading their ORC data. */
|
||||
preempt_disable();
|
||||
|
||||
/* Have we reached the end? */
|
||||
if (state->regs && user_mode(state->regs))
|
||||
goto done;
|
||||
|
||||
/*
|
||||
* Find the orc_entry associated with the text address.
|
||||
*
|
||||
* Decrement call return addresses by one so they work for sibling
|
||||
* calls and calls to noreturn functions.
|
||||
*/
|
||||
orc = orc_find(state->signal ? state->ip : state->ip - 1);
|
||||
if (!orc || orc->sp_reg == ORC_REG_UNDEFINED)
|
||||
goto done;
|
||||
orig_ip = state->ip;
|
||||
|
||||
/* Find the previous frame's stack: */
|
||||
switch (orc->sp_reg) {
|
||||
case ORC_REG_SP:
|
||||
sp = state->sp + orc->sp_offset;
|
||||
break;
|
||||
|
||||
case ORC_REG_BP:
|
||||
sp = state->bp + orc->sp_offset;
|
||||
break;
|
||||
|
||||
case ORC_REG_SP_INDIRECT:
|
||||
sp = state->sp + orc->sp_offset;
|
||||
indirect = true;
|
||||
break;
|
||||
|
||||
case ORC_REG_BP_INDIRECT:
|
||||
sp = state->bp + orc->sp_offset;
|
||||
indirect = true;
|
||||
break;
|
||||
|
||||
case ORC_REG_R10:
|
||||
if (!state->regs || !state->full_regs) {
|
||||
orc_warn("missing regs for base reg R10 at ip %p\n",
|
||||
(void *)state->ip);
|
||||
goto done;
|
||||
}
|
||||
sp = state->regs->r10;
|
||||
break;
|
||||
|
||||
case ORC_REG_R13:
|
||||
if (!state->regs || !state->full_regs) {
|
||||
orc_warn("missing regs for base reg R13 at ip %p\n",
|
||||
(void *)state->ip);
|
||||
goto done;
|
||||
}
|
||||
sp = state->regs->r13;
|
||||
break;
|
||||
|
||||
case ORC_REG_DI:
|
||||
if (!state->regs || !state->full_regs) {
|
||||
orc_warn("missing regs for base reg DI at ip %p\n",
|
||||
(void *)state->ip);
|
||||
goto done;
|
||||
}
|
||||
sp = state->regs->di;
|
||||
break;
|
||||
|
||||
case ORC_REG_DX:
|
||||
if (!state->regs || !state->full_regs) {
|
||||
orc_warn("missing regs for base reg DX at ip %p\n",
|
||||
(void *)state->ip);
|
||||
goto done;
|
||||
}
|
||||
sp = state->regs->dx;
|
||||
break;
|
||||
|
||||
default:
|
||||
orc_warn("unknown SP base reg %d for ip %p\n",
|
||||
orc->sp_reg, (void *)state->ip);
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (indirect) {
|
||||
if (!deref_stack_reg(state, sp, &sp))
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Find IP, SP and possibly regs: */
|
||||
switch (orc->type) {
|
||||
case ORC_TYPE_CALL:
|
||||
ip_p = sp - sizeof(long);
|
||||
|
||||
if (!deref_stack_reg(state, ip_p, &state->ip))
|
||||
goto done;
|
||||
|
||||
state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
|
||||
state->ip, (void *)ip_p);
|
||||
|
||||
state->sp = sp;
|
||||
state->regs = NULL;
|
||||
state->signal = false;
|
||||
break;
|
||||
|
||||
case ORC_TYPE_REGS:
|
||||
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
|
||||
orc_warn("can't dereference registers at %p for ip %p\n",
|
||||
(void *)sp, (void *)orig_ip);
|
||||
goto done;
|
||||
}
|
||||
|
||||
state->regs = (struct pt_regs *)sp;
|
||||
state->full_regs = true;
|
||||
state->signal = true;
|
||||
break;
|
||||
|
||||
case ORC_TYPE_REGS_IRET:
|
||||
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
|
||||
orc_warn("can't dereference iret registers at %p for ip %p\n",
|
||||
(void *)sp, (void *)orig_ip);
|
||||
goto done;
|
||||
}
|
||||
|
||||
ptregs = container_of((void *)sp, struct pt_regs, ip);
|
||||
if ((unsigned long)ptregs >= prev_sp &&
|
||||
on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
|
||||
state->regs = ptregs;
|
||||
state->full_regs = false;
|
||||
} else
|
||||
state->regs = NULL;
|
||||
|
||||
state->signal = true;
|
||||
break;
|
||||
|
||||
default:
|
||||
orc_warn("unknown .orc_unwind entry type %d\n", orc->type);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Find BP: */
|
||||
switch (orc->bp_reg) {
|
||||
case ORC_REG_UNDEFINED:
|
||||
if (state->regs && state->full_regs)
|
||||
state->bp = state->regs->bp;
|
||||
break;
|
||||
|
||||
case ORC_REG_PREV_SP:
|
||||
if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
|
||||
goto done;
|
||||
break;
|
||||
|
||||
case ORC_REG_BP:
|
||||
if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
|
||||
goto done;
|
||||
break;
|
||||
|
||||
default:
|
||||
orc_warn("unknown BP base reg %d for ip %p\n",
|
||||
orc->bp_reg, (void *)orig_ip);
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Prevent a recursive loop due to bad ORC data: */
|
||||
if (state->stack_info.type == prev_type &&
|
||||
on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
|
||||
state->sp <= prev_sp) {
|
||||
orc_warn("stack going in the wrong direction? ip=%p\n",
|
||||
(void *)orig_ip);
|
||||
goto done;
|
||||
}
|
||||
|
||||
preempt_enable();
|
||||
return true;
|
||||
|
||||
done:
|
||||
preempt_enable();
|
||||
state->stack_info.type = STACK_TYPE_UNKNOWN;
|
||||
return false;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unwind_next_frame);
|
||||
|
||||
void __unwind_start(struct unwind_state *state, struct task_struct *task,
|
||||
struct pt_regs *regs, unsigned long *first_frame)
|
||||
{
|
||||
memset(state, 0, sizeof(*state));
|
||||
state->task = task;
|
||||
|
||||
/*
|
||||
* Refuse to unwind the stack of a task while it's executing on another
|
||||
* CPU. This check is racy, but that's ok: the unwinder has other
|
||||
* checks to prevent it from going off the rails.
|
||||
*/
|
||||
if (task_on_another_cpu(task))
|
||||
goto done;
|
||||
|
||||
if (regs) {
|
||||
if (user_mode(regs))
|
||||
goto done;
|
||||
|
||||
state->ip = regs->ip;
|
||||
state->sp = kernel_stack_pointer(regs);
|
||||
state->bp = regs->bp;
|
||||
state->regs = regs;
|
||||
state->full_regs = true;
|
||||
state->signal = true;
|
||||
|
||||
} else if (task == current) {
|
||||
asm volatile("lea (%%rip), %0\n\t"
|
||||
"mov %%rsp, %1\n\t"
|
||||
"mov %%rbp, %2\n\t"
|
||||
: "=r" (state->ip), "=r" (state->sp),
|
||||
"=r" (state->bp));
|
||||
|
||||
} else {
|
||||
struct inactive_task_frame *frame = (void *)task->thread.sp;
|
||||
|
||||
state->sp = task->thread.sp;
|
||||
state->bp = READ_ONCE_NOCHECK(frame->bp);
|
||||
state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
|
||||
}
|
||||
|
||||
if (get_stack_info((unsigned long *)state->sp, state->task,
|
||||
&state->stack_info, &state->stack_mask))
|
||||
return;
|
||||
|
||||
/*
|
||||
* The caller can provide the address of the first frame directly
|
||||
* (first_frame) or indirectly (regs->sp) to indicate which stack frame
|
||||
* to start unwinding at. Skip ahead until we reach it.
|
||||
*/
|
||||
|
||||
/* When starting from regs, skip the regs frame: */
|
||||
if (regs) {
|
||||
unwind_next_frame(state);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Otherwise, skip ahead to the user-specified starting frame: */
|
||||
while (!unwind_done(state) &&
|
||||
(!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
|
||||
state->sp <= (unsigned long)first_frame))
|
||||
unwind_next_frame(state);
|
||||
|
||||
return;
|
||||
|
||||
done:
|
||||
state->stack_info.type = STACK_TYPE_UNKNOWN;
|
||||
return;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__unwind_start);
|
@ -24,6 +24,7 @@
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/thread_info.h>
|
||||
#include <asm/page_types.h>
|
||||
#include <asm/orc_lookup.h>
|
||||
#include <asm/cache.h>
|
||||
#include <asm/boot.h>
|
||||
|
||||
@ -148,6 +149,8 @@ SECTIONS
|
||||
|
||||
BUG_TABLE
|
||||
|
||||
ORC_UNWIND_TABLE
|
||||
|
||||
. = ALIGN(PAGE_SIZE);
|
||||
__vvar_page = .;
|
||||
|
||||
|
@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
|
||||
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
|
||||
# the virtualization menu.
|
||||
source drivers/vhost/Kconfig
|
||||
source drivers/lguest/Kconfig
|
||||
|
||||
endif # VIRTUALIZATION
|
||||
|
@ -1,14 +0,0 @@
|
||||
config LGUEST_GUEST
|
||||
bool "Lguest guest support"
|
||||
depends on X86_32 && PARAVIRT && PCI
|
||||
select TTY
|
||||
select VIRTUALIZATION
|
||||
select VIRTIO
|
||||
select VIRTIO_CONSOLE
|
||||
help
|
||||
Lguest is a tiny in-kernel hypervisor. Selecting this will
|
||||
allow your kernel to boot under lguest. This option will increase
|
||||
your kernel size by about 10k. If in doubt, say N.
|
||||
|
||||
If you say Y here, make sure you say Y (or M) to the virtio block
|
||||
and net drivers which lguest needs.
|
@ -1,2 +0,0 @@
|
||||
obj-y := head_32.o boot.o
|
||||
CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
|
File diff suppressed because it is too large
Load Diff
@ -1,192 +0,0 @@
|
||||
#include <linux/linkage.h>
|
||||
#include <linux/lguest.h>
|
||||
#include <asm/lguest_hcall.h>
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/thread_info.h>
|
||||
#include <asm/processor-flags.h>
|
||||
|
||||
/*G:020
|
||||
|
||||
* Our story starts with the bzImage: booting starts at startup_32 in
|
||||
* arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
|
||||
* kernel in place and then jumps into it: startup_32 in
|
||||
* arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi
|
||||
* register, which is created by the bootloader (the Launcher in our case).
|
||||
*
|
||||
* The startup_32 function does very little: it clears the uninitialized global
|
||||
* C variables which we expect to be zero (ie. BSS) and then copies the boot
|
||||
* header and kernel command line somewhere safe, and populates some initial
|
||||
* page tables. Finally it checks the 'hardware_subarch' field. This was
|
||||
* introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
|
||||
* assigned number), then it calls us here.
|
||||
*
|
||||
* WARNING: be very careful here! We're running at addresses equal to physical
|
||||
* addresses (around 0), not above PAGE_OFFSET as most code expects
|
||||
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
|
||||
* data without remembering to subtract __PAGE_OFFSET!
|
||||
*
|
||||
* The .section line puts this code in .init.text so it will be discarded after
|
||||
* boot.
|
||||
*/
|
||||
.section .init.text, "ax", @progbits
|
||||
ENTRY(lguest_entry)
|
||||
/*
|
||||
* We make the "initialization" hypercall now to tell the Host where
|
||||
* our lguest_data struct is.
|
||||
*/
|
||||
movl $LHCALL_LGUEST_INIT, %eax
|
||||
movl $lguest_data - __PAGE_OFFSET, %ebx
|
||||
int $LGUEST_TRAP_ENTRY
|
||||
|
||||
/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
|
||||
movl $LHCALL_NEW_PGTABLE, %eax
|
||||
movl $(initial_page_table - __PAGE_OFFSET), %ebx
|
||||
int $LGUEST_TRAP_ENTRY
|
||||
|
||||
/* Set up the initial stack so we can run C code. */
|
||||
movl $(init_thread_union+THREAD_SIZE),%esp
|
||||
|
||||
/* Jumps are relative: we're running __PAGE_OFFSET too low. */
|
||||
jmp lguest_init+__PAGE_OFFSET
|
||||
|
||||
/*G:055
|
||||
* We create a macro which puts the assembler code between lgstart_ and lgend_
|
||||
* markers. These templates are put in the .text section: they can't be
|
||||
* discarded after boot as we may need to patch modules, too.
|
||||
*/
|
||||
.text
|
||||
#define LGUEST_PATCH(name, insns...) \
|
||||
lgstart_##name: insns; lgend_##name:; \
|
||||
.globl lgstart_##name; .globl lgend_##name
|
||||
|
||||
LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
|
||||
LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
|
||||
|
||||
/*G:033
|
||||
* But using those wrappers is inefficient (we'll see why that doesn't matter
|
||||
* for save_fl and irq_disable later). If we write our routines carefully in
|
||||
* assembler, we can avoid clobbering any registers and avoid jumping through
|
||||
* the wrapper functions.
|
||||
*
|
||||
* I skipped over our first piece of assembler, but this one is worth studying
|
||||
* in a bit more detail so I'll describe in easy stages. First, the routine to
|
||||
* enable interrupts:
|
||||
*/
|
||||
ENTRY(lg_irq_enable)
|
||||
/*
|
||||
* The reverse of irq_disable, this sets lguest_data.irq_enabled to
|
||||
* X86_EFLAGS_IF (ie. "Interrupts enabled").
|
||||
*/
|
||||
movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
|
||||
/*
|
||||
* But now we need to check if the Host wants to know: there might have
|
||||
* been interrupts waiting to be delivered, in which case it will have
|
||||
* set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
|
||||
* jump to send_interrupts, otherwise we're done.
|
||||
*/
|
||||
cmpl $0, lguest_data+LGUEST_DATA_irq_pending
|
||||
jnz send_interrupts
|
||||
/*
|
||||
* One cool thing about x86 is that you can do many things without using
|
||||
* a register. In this case, the normal path hasn't needed to save or
|
||||
* restore any registers at all!
|
||||
*/
|
||||
ret
|
||||
send_interrupts:
|
||||
/*
|
||||
* OK, now we need a register: eax is used for the hypercall number,
|
||||
* which is LHCALL_SEND_INTERRUPTS.
|
||||
*
|
||||
* We used not to bother with this pending detection at all, which was
|
||||
* much simpler. Sooner or later the Host would realize it had to
|
||||
* send us an interrupt. But that turns out to make performance 7
|
||||
* times worse on a simple tcp benchmark. So now we do this the hard
|
||||
* way.
|
||||
*/
|
||||
pushl %eax
|
||||
movl $LHCALL_SEND_INTERRUPTS, %eax
|
||||
/* This is the actual hypercall trap. */
|
||||
int $LGUEST_TRAP_ENTRY
|
||||
/* Put eax back the way we found it. */
|
||||
popl %eax
|
||||
ret
|
||||
|
||||
/*
|
||||
* Finally, the "popf" or "restore flags" routine. The %eax register holds the
|
||||
* flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
|
||||
* enabling interrupts again, if it's 0 we're leaving them off.
|
||||
*/
|
||||
ENTRY(lg_restore_fl)
|
||||
/* This is just "lguest_data.irq_enabled = flags;" */
|
||||
movl %eax, lguest_data+LGUEST_DATA_irq_enabled
|
||||
/*
|
||||
* Now, if the %eax value has enabled interrupts and
|
||||
* lguest_data.irq_pending is set, we want to tell the Host so it can
|
||||
* deliver any outstanding interrupts. Fortunately, both values will
|
||||
* be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
|
||||
* instruction will AND them together for us. If both are set, we
|
||||
* jump to send_interrupts.
|
||||
*/
|
||||
testl lguest_data+LGUEST_DATA_irq_pending, %eax
|
||||
jnz send_interrupts
|
||||
/* Again, the normal path has used no extra registers. Clever, huh? */
|
||||
ret
|
||||
/*:*/
|
||||
|
||||
/* These demark the EIP where host should never deliver interrupts. */
|
||||
.global lguest_noirq_iret
|
||||
|
||||
/*M:004
|
||||
* When the Host reflects a trap or injects an interrupt into the Guest, it
|
||||
* sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
|
||||
* so the Guest iret logic does the right thing when restoring it. However,
|
||||
* when the Host sets the Guest up for direct traps, such as system calls, the
|
||||
* processor is the one to push eflags onto the stack, and the interrupt bit
|
||||
* will be 1 (in reality, interrupts are always enabled in the Guest).
|
||||
*
|
||||
* This turns out to be harmless: the only trap which should happen under Linux
|
||||
* with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
|
||||
* regions), which has to be reflected through the Host anyway. If another
|
||||
* trap *does* go off when interrupts are disabled, the Guest will panic, and
|
||||
* we'll never get to this iret!
|
||||
:*/
|
||||
|
||||
/*G:045
|
||||
* There is one final paravirt_op that the Guest implements, and glancing at it
|
||||
* you can see why I left it to last. It's *cool*! It's in *assembler*!
|
||||
*
|
||||
* The "iret" instruction is used to return from an interrupt or trap. The
|
||||
* stack looks like this:
|
||||
* old address
|
||||
* old code segment & privilege level
|
||||
* old processor flags ("eflags")
|
||||
*
|
||||
* The "iret" instruction pops those values off the stack and restores them all
|
||||
* at once. The only problem is that eflags includes the Interrupt Flag which
|
||||
* the Guest can't change: the CPU will simply ignore it when we do an "iret".
|
||||
* So we have to copy eflags from the stack to lguest_data.irq_enabled before
|
||||
* we do the "iret".
|
||||
*
|
||||
* There are two problems with this: firstly, we can't clobber any registers
|
||||
* and secondly, the whole thing needs to be atomic. The first problem
|
||||
* is solved by using "push memory"/"pop memory" instruction pair for copying.
|
||||
*
|
||||
* The second is harder: copying eflags to lguest_data.irq_enabled will turn
|
||||
* interrupts on before we're finished, so we could be interrupted before we
|
||||
* return to userspace or wherever. Our solution to this is to tell the
|
||||
* Host that it is *never* to interrupt us there, even if interrupts seem to be
|
||||
* enabled. (It's not necessary to protect pop instruction, since
|
||||
* data gets updated only after it completes, so we only need to protect
|
||||
* one instruction, iret).
|
||||
*/
|
||||
ENTRY(lguest_iret)
|
||||
pushl 2*4(%esp)
|
||||
/*
|
||||
* Note the %ss: segment prefix here. Normal data accesses use the
|
||||
* "ds" segment, but that will have already been restored for whatever
|
||||
* we're returning to (such as userspace): we can't trust it. The %ss:
|
||||
* prefix makes sure we use the stack segment, which is still valid.
|
||||
*/
|
||||
popl %ss:lguest_data+LGUEST_DATA_irq_enabled
|
||||
lguest_noirq_iret:
|
||||
iret
|
@ -142,7 +142,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
|
||||
* undefined. I'm not sure which CPUs do this, but at least
|
||||
* the 486 DX works this way.
|
||||
*/
|
||||
if ((regs->cs & 0xFFFF) != __KERNEL_CS)
|
||||
if (regs->cs != __KERNEL_CS)
|
||||
goto fail;
|
||||
|
||||
/*
|
||||
|
@ -981,59 +981,6 @@ void __ref xen_setup_vcpu_info_placement(void)
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
|
||||
unsigned long addr, unsigned len)
|
||||
{
|
||||
char *start, *end, *reloc;
|
||||
unsigned ret;
|
||||
|
||||
start = end = reloc = NULL;
|
||||
|
||||
#define SITE(op, x) \
|
||||
case PARAVIRT_PATCH(op.x): \
|
||||
if (xen_have_vcpu_info_placement) { \
|
||||
start = (char *)xen_##x##_direct; \
|
||||
end = xen_##x##_direct_end; \
|
||||
reloc = xen_##x##_direct_reloc; \
|
||||
} \
|
||||
goto patch_site
|
||||
|
||||
switch (type) {
|
||||
SITE(pv_irq_ops, irq_enable);
|
||||
SITE(pv_irq_ops, irq_disable);
|
||||
SITE(pv_irq_ops, save_fl);
|
||||
SITE(pv_irq_ops, restore_fl);
|
||||
#undef SITE
|
||||
|
||||
patch_site:
|
||||
if (start == NULL || (end-start) > len)
|
||||
goto default_patch;
|
||||
|
||||
ret = paravirt_patch_insns(insnbuf, len, start, end);
|
||||
|
||||
/* Note: because reloc is assigned from something that
|
||||
appears to be an array, gcc assumes it's non-null,
|
||||
but doesn't know its relationship with start and
|
||||
end. */
|
||||
if (reloc > start && reloc < end) {
|
||||
int reloc_off = reloc - start;
|
||||
long *relocp = (long *)(insnbuf + reloc_off);
|
||||
long delta = start - (char *)addr;
|
||||
|
||||
*relocp += delta;
|
||||
}
|
||||
break;
|
||||
|
||||
default_patch:
|
||||
default:
|
||||
ret = paravirt_patch_default(type, clobbers, insnbuf,
|
||||
addr, len);
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct pv_info xen_info __initconst = {
|
||||
.shared_kernel_pmd = 0,
|
||||
|
||||
@ -1043,10 +990,6 @@ static const struct pv_info xen_info __initconst = {
|
||||
.name = "Xen",
|
||||
};
|
||||
|
||||
static const struct pv_init_ops xen_init_ops __initconst = {
|
||||
.patch = xen_patch,
|
||||
};
|
||||
|
||||
static const struct pv_cpu_ops xen_cpu_ops __initconst = {
|
||||
.cpuid = xen_cpuid,
|
||||
|
||||
@ -1244,7 +1187,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
|
||||
|
||||
/* Install Xen paravirt ops */
|
||||
pv_info = xen_info;
|
||||
pv_init_ops = xen_init_ops;
|
||||
pv_init_ops.patch = paravirt_patch_default;
|
||||
pv_cpu_ops = xen_cpu_ops;
|
||||
|
||||
x86_platform.get_nmi_reason = xen_get_nmi_reason;
|
||||
|
@ -1,14 +1,8 @@
|
||||
/*
|
||||
* Asm versions of Xen pv-ops, suitable for either direct use or
|
||||
* inlining. The inline versions are the same as the direct-use
|
||||
* versions, with the pre- and post-amble chopped off.
|
||||
*
|
||||
* This code is encoded for size rather than absolute efficiency, with
|
||||
* a view to being able to inline as much as possible.
|
||||
* Asm versions of Xen pv-ops, suitable for direct use.
|
||||
*
|
||||
* We only bother with direct forms (ie, vcpu in percpu data) of the
|
||||
* operations here; the indirect forms are better handled in C, since
|
||||
* they're generally too large to inline anyway.
|
||||
* operations here; the indirect forms are better handled in C.
|
||||
*/
|
||||
|
||||
#include <asm/asm-offsets.h>
|
||||
@ -16,7 +10,7 @@
|
||||
#include <asm/processor-flags.h>
|
||||
#include <asm/frame.h>
|
||||
|
||||
#include "xen-asm.h"
|
||||
#include <linux/linkage.h>
|
||||
|
||||
/*
|
||||
* Enable events. This clears the event mask and tests the pending
|
||||
@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct)
|
||||
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
|
||||
jz 1f
|
||||
|
||||
2: call check_events
|
||||
call check_events
|
||||
1:
|
||||
ENDPATCH(xen_irq_enable_direct)
|
||||
FRAME_END
|
||||
ret
|
||||
ENDPROC(xen_irq_enable_direct)
|
||||
RELOC(xen_irq_enable_direct, 2b+1)
|
||||
|
||||
|
||||
/*
|
||||
@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct)
|
||||
*/
|
||||
ENTRY(xen_irq_disable_direct)
|
||||
movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
|
||||
ENDPATCH(xen_irq_disable_direct)
|
||||
ret
|
||||
ENDPROC(xen_irq_disable_direct)
|
||||
RELOC(xen_irq_disable_direct, 0)
|
||||
ENDPROC(xen_irq_disable_direct)
|
||||
|
||||
/*
|
||||
* (xen_)save_fl is used to get the current interrupt enable status.
|
||||
@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct)
|
||||
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
|
||||
setz %ah
|
||||
addb %ah, %ah
|
||||
ENDPATCH(xen_save_fl_direct)
|
||||
ret
|
||||
ENDPROC(xen_save_fl_direct)
|
||||
RELOC(xen_save_fl_direct, 0)
|
||||
|
||||
|
||||
/*
|
||||
@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct)
|
||||
/* check for unmasked and pending */
|
||||
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
|
||||
jnz 1f
|
||||
2: call check_events
|
||||
call check_events
|
||||
1:
|
||||
ENDPATCH(xen_restore_fl_direct)
|
||||
FRAME_END
|
||||
ret
|
||||
ENDPROC(xen_restore_fl_direct)
|
||||
RELOC(xen_restore_fl_direct, 2b+1)
|
||||
|
||||
|
||||
/*
|
||||
|
@ -1,12 +0,0 @@
|
||||
#ifndef _XEN_XEN_ASM_H
|
||||
#define _XEN_XEN_ASM_H
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
|
||||
#define ENDPATCH(x) .globl x##_end; x##_end=.
|
||||
|
||||
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
|
||||
#define XEN_EFLAGS_NMI 0x80000000
|
||||
|
||||
#endif
|
@ -1,14 +1,8 @@
|
||||
/*
|
||||
* Asm versions of Xen pv-ops, suitable for either direct use or
|
||||
* inlining. The inline versions are the same as the direct-use
|
||||
* versions, with the pre- and post-amble chopped off.
|
||||
*
|
||||
* This code is encoded for size rather than absolute efficiency, with
|
||||
* a view to being able to inline as much as possible.
|
||||
* Asm versions of Xen pv-ops, suitable for direct use.
|
||||
*
|
||||
* We only bother with direct forms (ie, vcpu in pda) of the
|
||||
* operations here; the indirect forms are better handled in C, since
|
||||
* they're generally too large to inline anyway.
|
||||
* operations here; the indirect forms are better handled in C.
|
||||
*/
|
||||
|
||||
#include <asm/thread_info.h>
|
||||
@ -18,21 +12,10 @@
|
||||
|
||||
#include <xen/interface/xen.h>
|
||||
|
||||
#include "xen-asm.h"
|
||||
#include <linux/linkage.h>
|
||||
|
||||
/*
|
||||
* Force an event check by making a hypercall, but preserve regs
|
||||
* before making the call.
|
||||
*/
|
||||
check_events:
|
||||
push %eax
|
||||
push %ecx
|
||||
push %edx
|
||||
call xen_force_evtchn_callback
|
||||
pop %edx
|
||||
pop %ecx
|
||||
pop %eax
|
||||
ret
|
||||
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
|
||||
#define XEN_EFLAGS_NMI 0x80000000
|
||||
|
||||
/*
|
||||
* This is run where a normal iret would be run, with the same stack setup:
|
||||
|
@ -1,14 +1,8 @@
|
||||
/*
|
||||
* Asm versions of Xen pv-ops, suitable for either direct use or
|
||||
* inlining. The inline versions are the same as the direct-use
|
||||
* versions, with the pre- and post-amble chopped off.
|
||||
*
|
||||
* This code is encoded for size rather than absolute efficiency, with
|
||||
* a view to being able to inline as much as possible.
|
||||
* Asm versions of Xen pv-ops, suitable for direct use.
|
||||
*
|
||||
* We only bother with direct forms (ie, vcpu in pda) of the
|
||||
* operations here; the indirect forms are better handled in C, since
|
||||
* they're generally too large to inline anyway.
|
||||
* operations here; the indirect forms are better handled in C.
|
||||
*/
|
||||
|
||||
#include <asm/errno.h>
|
||||
@ -20,7 +14,7 @@
|
||||
|
||||
#include <xen/interface/xen.h>
|
||||
|
||||
#include "xen-asm.h"
|
||||
#include <linux/linkage.h>
|
||||
|
||||
ENTRY(xen_adjust_exception_frame)
|
||||
mov 8+0(%rsp), %rcx
|
||||
@ -46,9 +40,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
|
||||
*/
|
||||
ENTRY(xen_iret)
|
||||
pushq $0
|
||||
1: jmp hypercall_iret
|
||||
ENDPATCH(xen_iret)
|
||||
RELOC(xen_iret, 1b+1)
|
||||
jmp hypercall_iret
|
||||
|
||||
ENTRY(xen_sysret64)
|
||||
/*
|
||||
@ -65,9 +57,7 @@ ENTRY(xen_sysret64)
|
||||
pushq %rcx
|
||||
|
||||
pushq $VGCF_in_syscall
|
||||
1: jmp hypercall_iret
|
||||
ENDPATCH(xen_sysret64)
|
||||
RELOC(xen_sysret64, 1b+1)
|
||||
jmp hypercall_iret
|
||||
|
||||
/*
|
||||
* Xen handles syscall callbacks much like ordinary exceptions, which
|
||||
@ -82,34 +72,47 @@ RELOC(xen_sysret64, 1b+1)
|
||||
* rip
|
||||
* r11
|
||||
* rsp->rcx
|
||||
*
|
||||
* In all the entrypoints, we undo all that to make it look like a
|
||||
* CPU-generated syscall/sysenter and jump to the normal entrypoint.
|
||||
*/
|
||||
|
||||
.macro undo_xen_syscall
|
||||
mov 0*8(%rsp), %rcx
|
||||
mov 1*8(%rsp), %r11
|
||||
mov 5*8(%rsp), %rsp
|
||||
.endm
|
||||
|
||||
/* Normal 64-bit system call target */
|
||||
ENTRY(xen_syscall_target)
|
||||
undo_xen_syscall
|
||||
jmp entry_SYSCALL_64_after_swapgs
|
||||
popq %rcx
|
||||
popq %r11
|
||||
|
||||
/*
|
||||
* Neither Xen nor the kernel really knows what the old SS and
|
||||
* CS were. The kernel expects __USER_DS and __USER_CS, so
|
||||
* report those values even though Xen will guess its own values.
|
||||
*/
|
||||
movq $__USER_DS, 4*8(%rsp)
|
||||
movq $__USER_CS, 1*8(%rsp)
|
||||
|
||||
jmp entry_SYSCALL_64_after_hwframe
|
||||
ENDPROC(xen_syscall_target)
|
||||
|
||||
#ifdef CONFIG_IA32_EMULATION
|
||||
|
||||
/* 32-bit compat syscall target */
|
||||
ENTRY(xen_syscall32_target)
|
||||
undo_xen_syscall
|
||||
jmp entry_SYSCALL_compat
|
||||
popq %rcx
|
||||
popq %r11
|
||||
|
||||
/*
|
||||
* Neither Xen nor the kernel really knows what the old SS and
|
||||
* CS were. The kernel expects __USER32_DS and __USER32_CS, so
|
||||
* report those values even though Xen will guess its own values.
|
||||
*/
|
||||
movq $__USER32_DS, 4*8(%rsp)
|
||||
movq $__USER32_CS, 1*8(%rsp)
|
||||
|
||||
jmp entry_SYSCALL_compat_after_hwframe
|
||||
ENDPROC(xen_syscall32_target)
|
||||
|
||||
/* 32-bit compat sysenter target */
|
||||
ENTRY(xen_sysenter_target)
|
||||
undo_xen_syscall
|
||||
mov 0*8(%rsp), %rcx
|
||||
mov 1*8(%rsp), %r11
|
||||
mov 5*8(%rsp), %rsp
|
||||
jmp entry_SYSENTER_compat
|
||||
ENDPROC(xen_sysenter_target)
|
||||
|
||||
|
@ -129,17 +129,10 @@ static inline void __init xen_efi_init(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Declare an asm function, along with symbols needed to make it
|
||||
inlineable */
|
||||
#define DECL_ASM(ret, name, ...) \
|
||||
__visible ret name(__VA_ARGS__); \
|
||||
extern char name##_end[] __visible; \
|
||||
extern char name##_reloc[] __visible
|
||||
|
||||
DECL_ASM(void, xen_irq_enable_direct, void);
|
||||
DECL_ASM(void, xen_irq_disable_direct, void);
|
||||
DECL_ASM(unsigned long, xen_save_fl_direct, void);
|
||||
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
|
||||
__visible void xen_irq_enable_direct(void);
|
||||
__visible void xen_irq_disable_direct(void);
|
||||
__visible unsigned long xen_save_fl_direct(void);
|
||||
__visible void xen_restore_fl_direct(unsigned long);
|
||||
|
||||
/* These are not functions, and cannot be called normally */
|
||||
__visible void xen_iret(void);
|
||||
|
@ -125,7 +125,6 @@ obj-$(CONFIG_ACCESSIBILITY) += accessibility/
|
||||
obj-$(CONFIG_ISDN) += isdn/
|
||||
obj-$(CONFIG_EDAC) += edac/
|
||||
obj-$(CONFIG_EISA) += eisa/
|
||||
obj-y += lguest/
|
||||
obj-$(CONFIG_CPU_FREQ) += cpufreq/
|
||||
obj-$(CONFIG_CPU_IDLE) += cpuidle/
|
||||
obj-y += mmc/
|
||||
|
@ -470,7 +470,7 @@ config VIRTIO_BLK
|
||||
depends on VIRTIO
|
||||
---help---
|
||||
This is the virtual block driver for virtio. It can be used with
|
||||
lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
|
||||
QEMU based VMMs (like KVM or Xen). Say Y or M.
|
||||
|
||||
config VIRTIO_BLK_SCSI
|
||||
bool "SCSI passthrough request for the Virtio block driver"
|
||||
|
@ -161,7 +161,7 @@ config VIRTIO_CONSOLE
|
||||
depends on VIRTIO && TTY
|
||||
select HVC_DRIVER
|
||||
help
|
||||
Virtio console for use with lguest and other hypervisors.
|
||||
Virtio console for use with hypervisors.
|
||||
|
||||
Also serves as a general-purpose serial device for data
|
||||
transfer between the guest and host. Character devices at
|
||||
|
@ -1130,7 +1130,7 @@ static const struct file_operations port_fops = {
|
||||
* We turn the characters into a scatter-gather list, add it to the
|
||||
* output queue and then kick the Host. Then we sit here waiting for
|
||||
* it to finish: inefficient in theory, but in practice
|
||||
* implementations will do it immediately (lguest's Launcher does).
|
||||
* implementations will do it immediately.
|
||||
*/
|
||||
static int put_chars(u32 vtermno, const char *buf, int count)
|
||||
{
|
||||
|
@ -1,13 +0,0 @@
|
||||
config LGUEST
|
||||
tristate "Linux hypervisor example code"
|
||||
depends on X86_32 && EVENTFD && TTY && PCI_DIRECT
|
||||
select HVC_DRIVER
|
||||
---help---
|
||||
This is a very simple module which allows you to run
|
||||
multiple instances of the same Linux kernel, using the
|
||||
"lguest" command found in the tools/lguest directory.
|
||||
|
||||
Note that "lguest" is pronounced to rhyme with "fell quest",
|
||||
not "rustyvisor". See tools/lguest/lguest.txt.
|
||||
|
||||
If unsure, say N. If curious, say M. If masochistic, say Y.
|
@ -1,26 +0,0 @@
|
||||
# Host requires the other files, which can be a module.
|
||||
obj-$(CONFIG_LGUEST) += lg.o
|
||||
lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
|
||||
segments.o lguest_user.o
|
||||
|
||||
lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
|
||||
|
||||
Preparation Preparation!: PREFIX=P
|
||||
Guest: PREFIX=G
|
||||
Drivers: PREFIX=D
|
||||
Launcher: PREFIX=L
|
||||
Host: PREFIX=H
|
||||
Switcher: PREFIX=S
|
||||
Mastery: PREFIX=M
|
||||
Beer:
|
||||
@for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
|
||||
Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
|
||||
@sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
|
||||
Puppy:
|
||||
@clear
|
||||
@printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n"
|
||||
@sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear
|
||||
@printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n"
|
||||
@sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear
|
||||
@printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n"
|
||||
@sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear
|
@ -1,47 +0,0 @@
|
||||
Welcome, friend reader, to lguest.
|
||||
|
||||
Lguest is an adventure, with you, the reader, as Hero. I can't think of many
|
||||
5000-line projects which offer both such capability and glimpses of future
|
||||
potential; it is an exciting time to be delving into the source!
|
||||
|
||||
But be warned; this is an arduous journey of several hours or more! And as we
|
||||
know, all true Heroes are driven by a Noble Goal. Thus I offer a Beer (or
|
||||
equivalent) to anyone I meet who has completed this documentation.
|
||||
|
||||
So get comfortable and keep your wits about you (both quick and humorous).
|
||||
Along your way to the Noble Goal, you will also gain masterly insight into
|
||||
lguest, and hypervisors and x86 virtualization in general.
|
||||
|
||||
Our Quest is in seven parts: (best read with C highlighting turned on)
|
||||
|
||||
I) Preparation
|
||||
- In which our potential hero is flown quickly over the landscape for a
|
||||
taste of its scope. Suitable for the armchair coders and other such
|
||||
persons of faint constitution.
|
||||
|
||||
II) Guest
|
||||
- Where we encounter the first tantalising wisps of code, and come to
|
||||
understand the details of the life of a Guest kernel.
|
||||
|
||||
III) Drivers
|
||||
- Whereby the Guest finds its voice and become useful, and our
|
||||
understanding of the Guest is completed.
|
||||
|
||||
IV) Launcher
|
||||
- Where we trace back to the creation of the Guest, and thus begin our
|
||||
understanding of the Host.
|
||||
|
||||
V) Host
|
||||
- Where we master the Host code, through a long and tortuous journey.
|
||||
Indeed, it is here that our hero is tested in the Bit of Despair.
|
||||
|
||||
VI) Switcher
|
||||
- Where our understanding of the intertwined nature of Guests and Hosts
|
||||
is completed.
|
||||
|
||||
VII) Mastery
|
||||
- Where our fully fledged hero grapples with the Great Question:
|
||||
"What next?"
|
||||
|
||||
make Preparation!
|
||||
Rusty Russell.
|
@ -1,398 +0,0 @@
|
||||
/*P:400
|
||||
* This contains run_guest() which actually calls into the Host<->Guest
|
||||
* Switcher and analyzes the return, such as determining if the Guest wants the
|
||||
* Host to do something. This file also contains useful helper routines.
|
||||
:*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/stringify.h>
|
||||
#include <linux/stddef.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/slab.h>
|
||||
#include <asm/paravirt.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <asm/poll.h>
|
||||
#include <asm/asm-offsets.h>
|
||||
#include "lg.h"
|
||||
|
||||
unsigned long switcher_addr;
|
||||
struct page **lg_switcher_pages;
|
||||
static struct vm_struct *switcher_text_vma;
|
||||
static struct vm_struct *switcher_stacks_vma;
|
||||
|
||||
/* This One Big lock protects all inter-guest data structures. */
|
||||
DEFINE_MUTEX(lguest_lock);
|
||||
|
||||
/*H:010
|
||||
* We need to set up the Switcher at a high virtual address. Remember the
|
||||
* Switcher is a few hundred bytes of assembler code which actually changes the
|
||||
* CPU to run the Guest, and then changes back to the Host when a trap or
|
||||
* interrupt happens.
|
||||
*
|
||||
* The Switcher code must be at the same virtual address in the Guest as the
|
||||
* Host since it will be running as the switchover occurs.
|
||||
*
|
||||
* Trying to map memory at a particular address is an unusual thing to do, so
|
||||
* it's not a simple one-liner.
|
||||
*/
|
||||
static __init int map_switcher(void)
|
||||
{
|
||||
int i, err;
|
||||
|
||||
/*
|
||||
* Map the Switcher in to high memory.
|
||||
*
|
||||
* It turns out that if we choose the address 0xFFC00000 (4MB under the
|
||||
* top virtual address), it makes setting up the page tables really
|
||||
* easy.
|
||||
*/
|
||||
|
||||
/* We assume Switcher text fits into a single page. */
|
||||
if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
|
||||
printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
|
||||
end_switcher_text - start_switcher_text);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* We allocate an array of struct page pointers. map_vm_area() wants
|
||||
* this, rather than just an array of pages.
|
||||
*/
|
||||
lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
|
||||
* TOTAL_SWITCHER_PAGES,
|
||||
GFP_KERNEL);
|
||||
if (!lg_switcher_pages) {
|
||||
err = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we actually allocate the pages. The Guest will see these pages,
|
||||
* so we make sure they're zeroed.
|
||||
*/
|
||||
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
|
||||
lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
|
||||
if (!lg_switcher_pages[i]) {
|
||||
err = -ENOMEM;
|
||||
goto free_some_pages;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy in the compiled-in Switcher code (from x86/switcher_32.S).
|
||||
* It goes in the first page, which we map in momentarily.
|
||||
*/
|
||||
memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
|
||||
end_switcher_text - start_switcher_text);
|
||||
kunmap(lg_switcher_pages[0]);
|
||||
|
||||
/*
|
||||
* We place the Switcher underneath the fixmap area, which is the
|
||||
* highest virtual address we can get. This is important, since we
|
||||
* tell the Guest it can't access this memory, so we want its ceiling
|
||||
* as high as possible.
|
||||
*/
|
||||
switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
|
||||
|
||||
/*
|
||||
* Now we reserve the "virtual memory area"s we want. We might
|
||||
* not get them in theory, but in practice it's worked so far.
|
||||
*
|
||||
* We want the switcher text to be read-only and executable, and
|
||||
* the stacks to be read-write and non-executable.
|
||||
*/
|
||||
switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
|
||||
switcher_addr,
|
||||
switcher_addr + PAGE_SIZE);
|
||||
|
||||
if (!switcher_text_vma) {
|
||||
err = -ENOMEM;
|
||||
printk("lguest: could not map switcher pages high\n");
|
||||
goto free_pages;
|
||||
}
|
||||
|
||||
switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
|
||||
VM_ALLOC|VM_NO_GUARD,
|
||||
switcher_addr + PAGE_SIZE,
|
||||
switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
|
||||
if (!switcher_stacks_vma) {
|
||||
err = -ENOMEM;
|
||||
printk("lguest: could not map switcher pages high\n");
|
||||
goto free_text_vma;
|
||||
}
|
||||
|
||||
/*
|
||||
* This code actually sets up the pages we've allocated to appear at
|
||||
* switcher_addr. map_vm_area() takes the vma we allocated above, the
|
||||
* kind of pages we're mapping (kernel text pages and kernel writable
|
||||
* pages respectively), and a pointer to our array of struct pages.
|
||||
*/
|
||||
err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
|
||||
if (err) {
|
||||
printk("lguest: text map_vm_area failed: %i\n", err);
|
||||
goto free_vmas;
|
||||
}
|
||||
|
||||
err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
|
||||
lg_switcher_pages + SWITCHER_TEXT_PAGES);
|
||||
if (err) {
|
||||
printk("lguest: stacks map_vm_area failed: %i\n", err);
|
||||
goto free_vmas;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now the Switcher is mapped at the right address, we can't fail!
|
||||
*/
|
||||
printk(KERN_INFO "lguest: mapped switcher at %p\n",
|
||||
switcher_text_vma->addr);
|
||||
/* And we succeeded... */
|
||||
return 0;
|
||||
|
||||
free_vmas:
|
||||
/* Undoes map_vm_area and __get_vm_area */
|
||||
vunmap(switcher_stacks_vma->addr);
|
||||
free_text_vma:
|
||||
vunmap(switcher_text_vma->addr);
|
||||
free_pages:
|
||||
i = TOTAL_SWITCHER_PAGES;
|
||||
free_some_pages:
|
||||
for (--i; i >= 0; i--)
|
||||
__free_pages(lg_switcher_pages[i], 0);
|
||||
kfree(lg_switcher_pages);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
/*:*/
|
||||
|
||||
/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
|
||||
static void unmap_switcher(void)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
|
||||
vunmap(switcher_text_vma->addr);
|
||||
vunmap(switcher_stacks_vma->addr);
|
||||
/* Now we just need to free the pages we copied the switcher into */
|
||||
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
|
||||
__free_pages(lg_switcher_pages[i], 0);
|
||||
kfree(lg_switcher_pages);
|
||||
}
|
||||
|
||||
/*H:032
|
||||
* Dealing With Guest Memory.
|
||||
*
|
||||
* Before we go too much further into the Host, we need to grok the routines
|
||||
* we use to deal with Guest memory.
|
||||
*
|
||||
* When the Guest gives us (what it thinks is) a physical address, we can use
|
||||
* the normal copy_from_user() & copy_to_user() on the corresponding place in
|
||||
* the memory region allocated by the Launcher.
|
||||
*
|
||||
* But we can't trust the Guest: it might be trying to access the Launcher
|
||||
* code. We have to check that the range is below the pfn_limit the Launcher
|
||||
* gave us. We have to make sure that addr + len doesn't give us a false
|
||||
* positive by overflowing, too.
|
||||
*/
|
||||
bool lguest_address_ok(const struct lguest *lg,
|
||||
unsigned long addr, unsigned long len)
|
||||
{
|
||||
return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr);
|
||||
}
|
||||
|
||||
/*
|
||||
* This routine copies memory from the Guest. Here we can see how useful the
|
||||
* kill_lguest() routine we met in the Launcher can be: we return a random
|
||||
* value (all zeroes) instead of needing to return an error.
|
||||
*/
|
||||
void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
|
||||
{
|
||||
if (!lguest_address_ok(cpu->lg, addr, bytes)
|
||||
|| copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
|
||||
/* copy_from_user should do this, but as we rely on it... */
|
||||
memset(b, 0, bytes);
|
||||
kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
/* This is the write (copy into Guest) version. */
|
||||
void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
|
||||
unsigned bytes)
|
||||
{
|
||||
if (!lguest_address_ok(cpu->lg, addr, bytes)
|
||||
|| copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
|
||||
kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
|
||||
}
|
||||
/*:*/
|
||||
|
||||
/*H:030
|
||||
* Let's jump straight to the the main loop which runs the Guest.
|
||||
* Remember, this is called by the Launcher reading /dev/lguest, and we keep
|
||||
* going around and around until something interesting happens.
|
||||
*/
|
||||
int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
|
||||
{
|
||||
/* If the launcher asked for a register with LHREQ_GETREG */
|
||||
if (cpu->reg_read) {
|
||||
if (put_user(*cpu->reg_read, user))
|
||||
return -EFAULT;
|
||||
cpu->reg_read = NULL;
|
||||
return sizeof(*cpu->reg_read);
|
||||
}
|
||||
|
||||
/* We stop running once the Guest is dead. */
|
||||
while (!cpu->lg->dead) {
|
||||
unsigned int irq;
|
||||
bool more;
|
||||
|
||||
/* First we run any hypercalls the Guest wants done. */
|
||||
if (cpu->hcall)
|
||||
do_hypercalls(cpu);
|
||||
|
||||
/* Do we have to tell the Launcher about a trap? */
|
||||
if (cpu->pending.trap) {
|
||||
if (copy_to_user(user, &cpu->pending,
|
||||
sizeof(cpu->pending)))
|
||||
return -EFAULT;
|
||||
return sizeof(cpu->pending);
|
||||
}
|
||||
|
||||
/*
|
||||
* All long-lived kernel loops need to check with this horrible
|
||||
* thing called the freezer. If the Host is trying to suspend,
|
||||
* it stops us.
|
||||
*/
|
||||
try_to_freeze();
|
||||
|
||||
/* Check for signals */
|
||||
if (signal_pending(current))
|
||||
return -ERESTARTSYS;
|
||||
|
||||
/*
|
||||
* Check if there are any interrupts which can be delivered now:
|
||||
* if so, this sets up the hander to be executed when we next
|
||||
* run the Guest.
|
||||
*/
|
||||
irq = interrupt_pending(cpu, &more);
|
||||
if (irq < LGUEST_IRQS)
|
||||
try_deliver_interrupt(cpu, irq, more);
|
||||
|
||||
/*
|
||||
* Just make absolutely sure the Guest is still alive. One of
|
||||
* those hypercalls could have been fatal, for example.
|
||||
*/
|
||||
if (cpu->lg->dead)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If the Guest asked to be stopped, we sleep. The Guest's
|
||||
* clock timer will wake us.
|
||||
*/
|
||||
if (cpu->halted) {
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
/*
|
||||
* Just before we sleep, make sure no interrupt snuck in
|
||||
* which we should be doing.
|
||||
*/
|
||||
if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
|
||||
set_current_state(TASK_RUNNING);
|
||||
else
|
||||
schedule();
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* OK, now we're ready to jump into the Guest. First we put up
|
||||
* the "Do Not Disturb" sign:
|
||||
*/
|
||||
local_irq_disable();
|
||||
|
||||
/* Actually run the Guest until something happens. */
|
||||
lguest_arch_run_guest(cpu);
|
||||
|
||||
/* Now we're ready to be interrupted or moved to other CPUs */
|
||||
local_irq_enable();
|
||||
|
||||
/* Now we deal with whatever happened to the Guest. */
|
||||
lguest_arch_handle_trap(cpu);
|
||||
}
|
||||
|
||||
/* Special case: Guest is 'dead' but wants a reboot. */
|
||||
if (cpu->lg->dead == ERR_PTR(-ERESTART))
|
||||
return -ERESTART;
|
||||
|
||||
/* The Guest is dead => "No such file or directory" */
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
/*H:000
|
||||
* Welcome to the Host!
|
||||
*
|
||||
* By this point your brain has been tickled by the Guest code and numbed by
|
||||
* the Launcher code; prepare for it to be stretched by the Host code. This is
|
||||
* the heart. Let's begin at the initialization routine for the Host's lg
|
||||
* module.
|
||||
*/
|
||||
static int __init init(void)
|
||||
{
|
||||
int err;
|
||||
|
||||
/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
|
||||
if (get_kernel_rpl() != 0) {
|
||||
printk("lguest is afraid of being a guest\n");
|
||||
return -EPERM;
|
||||
}
|
||||
|
||||
/* First we put the Switcher up in very high virtual memory. */
|
||||
err = map_switcher();
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/* We might need to reserve an interrupt vector. */
|
||||
err = init_interrupts();
|
||||
if (err)
|
||||
goto unmap;
|
||||
|
||||
/* /dev/lguest needs to be registered. */
|
||||
err = lguest_device_init();
|
||||
if (err)
|
||||
goto free_interrupts;
|
||||
|
||||
/* Finally we do some architecture-specific setup. */
|
||||
lguest_arch_host_init();
|
||||
|
||||
/* All good! */
|
||||
return 0;
|
||||
|
||||
free_interrupts:
|
||||
free_interrupts();
|
||||
unmap:
|
||||
unmap_switcher();
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Cleaning up is just the same code, backwards. With a little French. */
|
||||
static void __exit fini(void)
|
||||
{
|
||||
lguest_device_remove();
|
||||
free_interrupts();
|
||||
unmap_switcher();
|
||||
|
||||
lguest_arch_host_fini();
|
||||
}
|
||||
/*:*/
|
||||
|
||||
/*
|
||||
* The Host side of lguest can be a module. This is a nice way for people to
|
||||
* play with it.
|
||||
*/
|
||||
module_init(init);
|
||||
module_exit(fini);
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
|
@ -1,304 +0,0 @@
|
||||
/*P:500
|
||||
* Just as userspace programs request kernel operations through a system
|
||||
* call, the Guest requests Host operations through a "hypercall". You might
|
||||
* notice this nomenclature doesn't really follow any logic, but the name has
|
||||
* been around for long enough that we're stuck with it. As you'd expect, this
|
||||
* code is basically a one big switch statement.
|
||||
:*/
|
||||
|
||||
/* Copyright (C) 2006 Rusty Russell IBM Corporation
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/ktime.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include "lg.h"
|
||||
|
||||
/*H:120
|
||||
* This is the core hypercall routine: where the Guest gets what it wants.
|
||||
* Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both.
|
||||
*/
|
||||
static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
|
||||
{
|
||||
switch (args->arg0) {
|
||||
case LHCALL_FLUSH_ASYNC:
|
||||
/*
|
||||
* This call does nothing, except by breaking out of the Guest
|
||||
* it makes us process all the asynchronous hypercalls.
|
||||
*/
|
||||
break;
|
||||
case LHCALL_SEND_INTERRUPTS:
|
||||
/*
|
||||
* This call does nothing too, but by breaking out of the Guest
|
||||
* it makes us process any pending interrupts.
|
||||
*/
|
||||
break;
|
||||
case LHCALL_LGUEST_INIT:
|
||||
/*
|
||||
* You can't get here unless you're already initialized. Don't
|
||||
* do that.
|
||||
*/
|
||||
kill_guest(cpu, "already have lguest_data");
|
||||
break;
|
||||
case LHCALL_SHUTDOWN: {
|
||||
char msg[128];
|
||||
/*
|
||||
* Shutdown is such a trivial hypercall that we do it in five
|
||||
* lines right here.
|
||||
*
|
||||
* If the lgread fails, it will call kill_guest() itself; the
|
||||
* kill_guest() with the message will be ignored.
|
||||
*/
|
||||
__lgread(cpu, msg, args->arg1, sizeof(msg));
|
||||
msg[sizeof(msg)-1] = '\0';
|
||||
kill_guest(cpu, "CRASH: %s", msg);
|
||||
if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
|
||||
cpu->lg->dead = ERR_PTR(-ERESTART);
|
||||
break;
|
||||
}
|
||||
case LHCALL_FLUSH_TLB:
|
||||
/* FLUSH_TLB comes in two flavors, depending on the argument: */
|
||||
if (args->arg1)
|
||||
guest_pagetable_clear_all(cpu);
|
||||
else
|
||||
guest_pagetable_flush_user(cpu);
|
||||
break;
|
||||
|
||||
/*
|
||||
* All these calls simply pass the arguments through to the right
|
||||
* routines.
|
||||
*/
|
||||
case LHCALL_NEW_PGTABLE:
|
||||
guest_new_pagetable(cpu, args->arg1);
|
||||
break;
|
||||
case LHCALL_SET_STACK:
|
||||
guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
|
||||
break;
|
||||
case LHCALL_SET_PTE:
|
||||
#ifdef CONFIG_X86_PAE
|
||||
guest_set_pte(cpu, args->arg1, args->arg2,
|
||||
__pte(args->arg3 | (u64)args->arg4 << 32));
|
||||
#else
|
||||
guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
|
||||
#endif
|
||||
break;
|
||||
case LHCALL_SET_PGD:
|
||||
guest_set_pgd(cpu->lg, args->arg1, args->arg2);
|
||||
break;
|
||||
#ifdef CONFIG_X86_PAE
|
||||
case LHCALL_SET_PMD:
|
||||
guest_set_pmd(cpu->lg, args->arg1, args->arg2);
|
||||
break;
|
||||
#endif
|
||||
case LHCALL_SET_CLOCKEVENT:
|
||||
guest_set_clockevent(cpu, args->arg1);
|
||||
break;
|
||||
case LHCALL_HALT:
|
||||
/* Similarly, this sets the halted flag for run_guest(). */
|
||||
cpu->halted = 1;
|
||||
break;
|
||||
default:
|
||||
/* It should be an architecture-specific hypercall. */
|
||||
if (lguest_arch_do_hcall(cpu, args))
|
||||
kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
|
||||
}
|
||||
}
|
||||
|
||||
/*H:124
|
||||
* Asynchronous hypercalls are easy: we just look in the array in the
|
||||
* Guest's "struct lguest_data" to see if any new ones are marked "ready".
|
||||
*
|
||||
* We are careful to do these in order: obviously we respect the order the
|
||||
* Guest put them in the ring, but we also promise the Guest that they will
|
||||
* happen before any normal hypercall (which is why we check this before
|
||||
* checking for a normal hcall).
|
||||
*/
|
||||
static void do_async_hcalls(struct lg_cpu *cpu)
|
||||
{
|
||||
unsigned int i;
|
||||
u8 st[LHCALL_RING_SIZE];
|
||||
|
||||
/* For simplicity, we copy the entire call status array in at once. */
|
||||
if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
|
||||
return;
|
||||
|
||||
/* We process "struct lguest_data"s hcalls[] ring once. */
|
||||
for (i = 0; i < ARRAY_SIZE(st); i++) {
|
||||
struct hcall_args args;
|
||||
/*
|
||||
* We remember where we were up to from last time. This makes
|
||||
* sure that the hypercalls are done in the order the Guest
|
||||
* places them in the ring.
|
||||
*/
|
||||
unsigned int n = cpu->next_hcall;
|
||||
|
||||
/* 0xFF means there's no call here (yet). */
|
||||
if (st[n] == 0xFF)
|
||||
break;
|
||||
|
||||
/*
|
||||
* OK, we have hypercall. Increment the "next_hcall" cursor,
|
||||
* and wrap back to 0 if we reach the end.
|
||||
*/
|
||||
if (++cpu->next_hcall == LHCALL_RING_SIZE)
|
||||
cpu->next_hcall = 0;
|
||||
|
||||
/*
|
||||
* Copy the hypercall arguments into a local copy of the
|
||||
* hcall_args struct.
|
||||
*/
|
||||
if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
|
||||
sizeof(struct hcall_args))) {
|
||||
kill_guest(cpu, "Fetching async hypercalls");
|
||||
break;
|
||||
}
|
||||
|
||||
/* Do the hypercall, same as a normal one. */
|
||||
do_hcall(cpu, &args);
|
||||
|
||||
/* Mark the hypercall done. */
|
||||
if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
|
||||
kill_guest(cpu, "Writing result for async hypercall");
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Stop doing hypercalls if they want to notify the Launcher:
|
||||
* it needs to service this first.
|
||||
*/
|
||||
if (cpu->pending.trap)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Last of all, we look at what happens first of all. The very first time the
|
||||
* Guest makes a hypercall, we end up here to set things up:
|
||||
*/
|
||||
static void initialize(struct lg_cpu *cpu)
|
||||
{
|
||||
/*
|
||||
* You can't do anything until you're initialized. The Guest knows the
|
||||
* rules, so we're unforgiving here.
|
||||
*/
|
||||
if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
|
||||
kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
|
||||
return;
|
||||
}
|
||||
|
||||
if (lguest_arch_init_hypercalls(cpu))
|
||||
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
|
||||
|
||||
/*
|
||||
* The Guest tells us where we're not to deliver interrupts by putting
|
||||
* the instruction address into "struct lguest_data".
|
||||
*/
|
||||
if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret))
|
||||
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
|
||||
|
||||
/*
|
||||
* We write the current time into the Guest's data page once so it can
|
||||
* set its clock.
|
||||
*/
|
||||
write_timestamp(cpu);
|
||||
|
||||
/* page_tables.c will also do some setup. */
|
||||
page_table_guest_data_init(cpu);
|
||||
|
||||
/*
|
||||
* This is the one case where the above accesses might have been the
|
||||
* first write to a Guest page. This may have caused a copy-on-write
|
||||
* fault, but the old page might be (read-only) in the Guest
|
||||
* pagetable.
|
||||
*/
|
||||
guest_pagetable_clear_all(cpu);
|
||||
}
|
||||
/*:*/
|
||||
|
||||
/*M:013
|
||||
* If a Guest reads from a page (so creates a mapping) that it has never
|
||||
* written to, and then the Launcher writes to it (ie. the output of a virtual
|
||||
* device), the Guest will still see the old page. In practice, this never
|
||||
* happens: why would the Guest read a page which it has never written to? But
|
||||
* a similar scenario might one day bite us, so it's worth mentioning.
|
||||
*
|
||||
* Note that if we used a shared anonymous mapping in the Launcher instead of
|
||||
* mapping /dev/zero private, we wouldn't worry about cop-on-write. And we
|
||||
* need that to switch the Launcher to processes (away from threads) anyway.
|
||||
:*/
|
||||
|
||||
/*H:100
|
||||
* Hypercalls
|
||||
*
|
||||
* Remember from the Guest, hypercalls come in two flavors: normal and
|
||||
* asynchronous. This file handles both of types.
|
||||
*/
|
||||
void do_hypercalls(struct lg_cpu *cpu)
|
||||
{
|
||||
/* Not initialized yet? This hypercall must do it. */
|
||||
if (unlikely(!cpu->lg->lguest_data)) {
|
||||
/* Set up the "struct lguest_data" */
|
||||
initialize(cpu);
|
||||
/* Hcall is done. */
|
||||
cpu->hcall = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* The Guest has initialized.
|
||||
*
|
||||
* Look in the hypercall ring for the async hypercalls:
|
||||
*/
|
||||
do_async_hcalls(cpu);
|
||||
|
||||
/*
|
||||
* If we stopped reading the hypercall ring because the Guest did a
|
||||
* NOTIFY to the Launcher, we want to return now. Otherwise we do
|
||||
* the hypercall.
|
||||
*/
|
||||
if (!cpu->pending.trap) {
|
||||
do_hcall(cpu, cpu->hcall);
|
||||
/*
|
||||
* Tricky point: we reset the hcall pointer to mark the
|
||||
* hypercall as "done". We use the hcall pointer rather than
|
||||
* the trap number to indicate a hypercall is pending.
|
||||
* Normally it doesn't matter: the Guest will run again and
|
||||
* update the trap number before we come back here.
|
||||
*
|
||||
* However, if we are signalled or the Guest sends I/O to the
|
||||
* Launcher, the run_guest() loop will exit without running the
|
||||
* Guest. When it comes back it would try to re-run the
|
||||
* hypercall. Finding that bug sucked.
|
||||
*/
|
||||
cpu->hcall = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This routine supplies the Guest with time: it's used for wallclock time at
|
||||
* initial boot and as a rough time source if the TSC isn't available.
|
||||
*/
|
||||
void write_timestamp(struct lg_cpu *cpu)
|
||||
{
|
||||
struct timespec now;
|
||||
ktime_get_real_ts(&now);
|
||||
if (copy_to_user(&cpu->lg->lguest_data->time,
|
||||
&now, sizeof(struct timespec)))
|
||||
kill_guest(cpu, "Writing timestamp");
|
||||
}
|
@ -1,706 +0,0 @@
|
||||
/*P:800
|
||||
* Interrupts (traps) are complicated enough to earn their own file.
|
||||
* There are three classes of interrupts:
|
||||
*
|
||||
* 1) Real hardware interrupts which occur while we're running the Guest,
|
||||
* 2) Interrupts for virtual devices attached to the Guest, and
|
||||
* 3) Traps and faults from the Guest.
|
||||
*
|
||||
* Real hardware interrupts must be delivered to the Host, not the Guest.
|
||||
* Virtual interrupts must be delivered to the Guest, but we make them look
|
||||
* just like real hardware would deliver them. Traps from the Guest can be set
|
||||
* up to go directly back into the Guest, but sometimes the Host wants to see
|
||||
* them first, so we also have a way of "reflecting" them into the Guest as if
|
||||
* they had been delivered to it directly.
|
||||
:*/
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sched.h>
|
||||
#include "lg.h"
|
||||
|
||||
/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
|
||||
static unsigned int syscall_vector = IA32_SYSCALL_VECTOR;
|
||||
module_param(syscall_vector, uint, 0444);
|
||||
|
||||
/* The address of the interrupt handler is split into two bits: */
|
||||
static unsigned long idt_address(u32 lo, u32 hi)
|
||||
{
|
||||
return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
|
||||
}
|
||||
|
||||
/*
|
||||
* The "type" of the interrupt handler is a 4 bit field: we only support a
|
||||
* couple of types.
|
||||
*/
|
||||
static int idt_type(u32 lo, u32 hi)
|
||||
{
|
||||
return (hi >> 8) & 0xF;
|
||||
}
|
||||
|
||||
/* An IDT entry can't be used unless the "present" bit is set. */
|
||||
static bool idt_present(u32 lo, u32 hi)
|
||||
{
|
||||
return (hi & 0x8000);
|
||||
}
|
||||
|
||||
/*
|
||||
* We need a helper to "push" a value onto the Guest's stack, since that's a
|
||||
* big part of what delivering an interrupt does.
|
||||
*/
|
||||
static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
|
||||
{
|
||||
/* Stack grows upwards: move stack then write value. */
|
||||
*gstack -= 4;
|
||||
lgwrite(cpu, *gstack, u32, val);
|
||||
}
|
||||
|
||||
/*H:210
|
||||
* The push_guest_interrupt_stack() routine saves Guest state on the stack for
|
||||
* an interrupt or trap. The mechanics of delivering traps and interrupts to
|
||||
* the Guest are the same, except some traps have an "error code" which gets
|
||||
* pushed onto the stack as well: the caller tells us if this is one.
|
||||
*
|
||||
* We set up the stack just like the CPU does for a real interrupt, so it's
|
||||
* identical for the Guest (and the standard "iret" instruction will undo
|
||||
* it).
|
||||
*/
|
||||
static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err)
|
||||
{
|
||||
unsigned long gstack, origstack;
|
||||
u32 eflags, ss, irq_enable;
|
||||
unsigned long virtstack;
|
||||
|
||||
/*
|
||||
* There are two cases for interrupts: one where the Guest is already
|
||||
* in the kernel, and a more complex one where the Guest is in
|
||||
* userspace. We check the privilege level to find out.
|
||||
*/
|
||||
if ((cpu->regs->ss&0x3) != GUEST_PL) {
|
||||
/*
|
||||
* The Guest told us their kernel stack with the SET_STACK
|
||||
* hypercall: both the virtual address and the segment.
|
||||
*/
|
||||
virtstack = cpu->esp1;
|
||||
ss = cpu->ss1;
|
||||
|
||||
origstack = gstack = guest_pa(cpu, virtstack);
|
||||
/*
|
||||
* We push the old stack segment and pointer onto the new
|
||||
* stack: when the Guest does an "iret" back from the interrupt
|
||||
* handler the CPU will notice they're dropping privilege
|
||||
* levels and expect these here.
|
||||
*/
|
||||
push_guest_stack(cpu, &gstack, cpu->regs->ss);
|
||||
push_guest_stack(cpu, &gstack, cpu->regs->esp);
|
||||
} else {
|
||||
/* We're staying on the same Guest (kernel) stack. */
|
||||
virtstack = cpu->regs->esp;
|
||||
ss = cpu->regs->ss;
|
||||
|
||||
origstack = gstack = guest_pa(cpu, virtstack);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remember that we never let the Guest actually disable interrupts, so
|
||||
* the "Interrupt Flag" bit is always set. We copy that bit from the
|
||||
* Guest's "irq_enabled" field into the eflags word: we saw the Guest
|
||||
* copy it back in "lguest_iret".
|
||||
*/
|
||||
eflags = cpu->regs->eflags;
|
||||
if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
|
||||
&& !(irq_enable & X86_EFLAGS_IF))
|
||||
eflags &= ~X86_EFLAGS_IF;
|
||||
|
||||
/*
|
||||
* An interrupt is expected to push three things on the stack: the old
|
||||
* "eflags" word, the old code segment, and the old instruction
|
||||
* pointer.
|
||||
*/
|
||||
push_guest_stack(cpu, &gstack, eflags);
|
||||
push_guest_stack(cpu, &gstack, cpu->regs->cs);
|
||||
push_guest_stack(cpu, &gstack, cpu->regs->eip);
|
||||
|
||||
/* For the six traps which supply an error code, we push that, too. */
|
||||
if (has_err)
|
||||
push_guest_stack(cpu, &gstack, cpu->regs->errcode);
|
||||
|
||||
/* Adjust the stack pointer and stack segment. */
|
||||
cpu->regs->ss = ss;
|
||||
cpu->regs->esp = virtstack + (gstack - origstack);
|
||||
}
|
||||
|
||||
/*
|
||||
* This actually makes the Guest start executing the given interrupt/trap
|
||||
* handler.
|
||||
*
|
||||
* "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this
|
||||
* interrupt or trap. It's split into two parts for traditional reasons: gcc
|
||||
* on i386 used to be frightened by 64 bit numbers.
|
||||
*/
|
||||
static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi)
|
||||
{
|
||||
/* If we're already in the kernel, we don't change stacks. */
|
||||
if ((cpu->regs->ss&0x3) != GUEST_PL)
|
||||
cpu->regs->ss = cpu->esp1;
|
||||
|
||||
/*
|
||||
* Set the code segment and the address to execute.
|
||||
*/
|
||||
cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
|
||||
cpu->regs->eip = idt_address(lo, hi);
|
||||
|
||||
/*
|
||||
* Trapping always clears these flags:
|
||||
* TF: Trap flag
|
||||
* VM: Virtual 8086 mode
|
||||
* RF: Resume
|
||||
* NT: Nested task.
|
||||
*/
|
||||
cpu->regs->eflags &=
|
||||
~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
|
||||
|
||||
/*
|
||||
* There are two kinds of interrupt handlers: 0xE is an "interrupt
|
||||
* gate" which expects interrupts to be disabled on entry.
|
||||
*/
|
||||
if (idt_type(lo, hi) == 0xE)
|
||||
if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
|
||||
kill_guest(cpu, "Disabling interrupts");
|
||||
}
|
||||
|
||||
/* This restores the eflags word which was pushed on the stack by a trap */
|
||||
static void restore_eflags(struct lg_cpu *cpu)
|
||||
{
|
||||
/* This is the physical address of the stack. */
|
||||
unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp);
|
||||
|
||||
/*
|
||||
* Stack looks like this:
|
||||
* Address Contents
|
||||
* esp EIP
|
||||
* esp + 4 CS
|
||||
* esp + 8 EFLAGS
|
||||
*/
|
||||
cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32);
|
||||
cpu->regs->eflags &=
|
||||
~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
|
||||
}
|
||||
|
||||
/*H:205
|
||||
* Virtual Interrupts.
|
||||
*
|
||||
* interrupt_pending() returns the first pending interrupt which isn't blocked
|
||||
* by the Guest. It is called before every entry to the Guest, and just before
|
||||
* we go to sleep when the Guest has halted itself.
|
||||
*/
|
||||
unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
|
||||
{
|
||||
unsigned int irq;
|
||||
DECLARE_BITMAP(blk, LGUEST_IRQS);
|
||||
|
||||
/* If the Guest hasn't even initialized yet, we can do nothing. */
|
||||
if (!cpu->lg->lguest_data)
|
||||
return LGUEST_IRQS;
|
||||
|
||||
/*
|
||||
* Take our "irqs_pending" array and remove any interrupts the Guest
|
||||
* wants blocked: the result ends up in "blk".
|
||||
*/
|
||||
if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
|
||||
sizeof(blk)))
|
||||
return LGUEST_IRQS;
|
||||
bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
|
||||
|
||||
/* Find the first interrupt. */
|
||||
irq = find_first_bit(blk, LGUEST_IRQS);
|
||||
*more = find_next_bit(blk, LGUEST_IRQS, irq+1);
|
||||
|
||||
return irq;
|
||||
}
|
||||
|
||||
/*
|
||||
* This actually diverts the Guest to running an interrupt handler, once an
|
||||
* interrupt has been identified by interrupt_pending().
|
||||
*/
|
||||
void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
|
||||
{
|
||||
struct desc_struct *idt;
|
||||
|
||||
BUG_ON(irq >= LGUEST_IRQS);
|
||||
|
||||
/* If they're halted, interrupts restart them. */
|
||||
if (cpu->halted) {
|
||||
/* Re-enable interrupts. */
|
||||
if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
|
||||
kill_guest(cpu, "Re-enabling interrupts");
|
||||
cpu->halted = 0;
|
||||
} else {
|
||||
/* Otherwise we check if they have interrupts disabled. */
|
||||
u32 irq_enabled;
|
||||
if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
|
||||
irq_enabled = 0;
|
||||
if (!irq_enabled) {
|
||||
/* Make sure they know an IRQ is pending. */
|
||||
put_user(X86_EFLAGS_IF,
|
||||
&cpu->lg->lguest_data->irq_pending);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Look at the IDT entry the Guest gave us for this interrupt. The
|
||||
* first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
|
||||
* over them.
|
||||
*/
|
||||
idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
|
||||
/* If they don't have a handler (yet?), we just ignore it */
|
||||
if (idt_present(idt->a, idt->b)) {
|
||||
/* OK, mark it no longer pending and deliver it. */
|
||||
clear_bit(irq, cpu->irqs_pending);
|
||||
|
||||
/*
|
||||
* They may be about to iret, where they asked us never to
|
||||
* deliver interrupts. In this case, we can emulate that iret
|
||||
* then immediately deliver the interrupt. This is basically
|
||||
* a noop: the iret would pop the interrupt frame and restore
|
||||
* eflags, and then we'd set it up again. So just restore the
|
||||
* eflags word and jump straight to the handler in this case.
|
||||
*
|
||||
* Denys Vlasenko points out that this isn't quite right: if
|
||||
* the iret was returning to userspace, then that interrupt
|
||||
* would reset the stack pointer (which the Guest told us
|
||||
* about via LHCALL_SET_STACK). But unless the Guest is being
|
||||
* *really* weird, that will be the same as the current stack
|
||||
* anyway.
|
||||
*/
|
||||
if (cpu->regs->eip == cpu->lg->noirq_iret) {
|
||||
restore_eflags(cpu);
|
||||
} else {
|
||||
/*
|
||||
* set_guest_interrupt() takes a flag to say whether
|
||||
* this interrupt pushes an error code onto the stack
|
||||
* as well: virtual interrupts never do.
|
||||
*/
|
||||
push_guest_interrupt_stack(cpu, false);
|
||||
}
|
||||
/* Actually make Guest cpu jump to handler. */
|
||||
guest_run_interrupt(cpu, idt->a, idt->b);
|
||||
}
|
||||
|
||||
/*
|
||||
* Every time we deliver an interrupt, we update the timestamp in the
|
||||
* Guest's lguest_data struct. It would be better for the Guest if we
|
||||
* did this more often, but it can actually be quite slow: doing it
|
||||
* here is a compromise which means at least it gets updated every
|
||||
* timer interrupt.
|
||||
*/
|
||||
write_timestamp(cpu);
|
||||
|
||||
/*
|
||||
* If there are no other interrupts we want to deliver, clear
|
||||
* the pending flag.
|
||||
*/
|
||||
if (!more)
|
||||
put_user(0, &cpu->lg->lguest_data->irq_pending);
|
||||
}
|
||||
|
||||
/* And this is the routine when we want to set an interrupt for the Guest. */
|
||||
void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
|
||||
{
|
||||
/*
|
||||
* Next time the Guest runs, the core code will see if it can deliver
|
||||
* this interrupt.
|
||||
*/
|
||||
set_bit(irq, cpu->irqs_pending);
|
||||
|
||||
/*
|
||||
* Make sure it sees it; it might be asleep (eg. halted), or running
|
||||
* the Guest right now, in which case kick_process() will knock it out.
|
||||
*/
|
||||
if (!wake_up_process(cpu->tsk))
|
||||
kick_process(cpu->tsk);
|
||||
}
|
||||
/*:*/
|
||||
|
||||
/*
|
||||
* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent
|
||||
* me a patch, so we support that too. It'd be a big step for lguest if half
|
||||
* the Plan 9 user base were to start using it.
|
||||
*
|
||||
* Actually now I think of it, it's possible that Ron *is* half the Plan 9
|
||||
* userbase. Oh well.
|
||||
*/
|
||||
bool could_be_syscall(unsigned int num)
|
||||
{
|
||||
/* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */
|
||||
return num == IA32_SYSCALL_VECTOR || num == syscall_vector;
|
||||
}
|
||||
|
||||
/* The syscall vector it wants must be unused by Host. */
|
||||
bool check_syscall_vector(struct lguest *lg)
|
||||
{
|
||||
u32 vector;
|
||||
|
||||
if (get_user(vector, &lg->lguest_data->syscall_vec))
|
||||
return false;
|
||||
|
||||
return could_be_syscall(vector);
|
||||
}
|
||||
|
||||
int init_interrupts(void)
|
||||
{
|
||||
/* If they want some strange system call vector, reserve it now */
|
||||
if (syscall_vector != IA32_SYSCALL_VECTOR) {
|
||||
if (test_bit(syscall_vector, used_vectors) ||
|
||||
vector_used_by_percpu_irq(syscall_vector)) {
|
||||
printk(KERN_ERR "lg: couldn't reserve syscall %u\n",
|
||||
syscall_vector);
|
||||
return -EBUSY;
|
||||
}
|
||||
set_bit(syscall_vector, used_vectors);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void free_interrupts(void)
|
||||
{
|
||||
if (syscall_vector != IA32_SYSCALL_VECTOR)
|
||||
clear_bit(syscall_vector, used_vectors);
|
||||
}
|
||||
|
||||
/*H:220
|
||||
* Now we've got the routines to deliver interrupts, delivering traps like
|
||||
* page fault is easy. The only trick is that Intel decided that some traps
|
||||
* should have error codes:
|
||||
*/
|
||||
static bool has_err(unsigned int trap)
|
||||
{
|
||||
return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
|
||||
}
|
||||
|
||||
/* deliver_trap() returns true if it could deliver the trap. */
|
||||
bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
|
||||
{
|
||||
/*
|
||||
* Trap numbers are always 8 bit, but we set an impossible trap number
|
||||
* for traps inside the Switcher, so check that here.
|
||||
*/
|
||||
if (num >= ARRAY_SIZE(cpu->arch.idt))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Early on the Guest hasn't set the IDT entries (or maybe it put a
|
||||
* bogus one in): if we fail here, the Guest will be killed.
|
||||
*/
|
||||
if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
|
||||
return false;
|
||||
push_guest_interrupt_stack(cpu, has_err(num));
|
||||
guest_run_interrupt(cpu, cpu->arch.idt[num].a,
|
||||
cpu->arch.idt[num].b);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*H:250
|
||||
* Here's the hard part: returning to the Host every time a trap happens
|
||||
* and then calling deliver_trap() and re-entering the Guest is slow.
|
||||
* Particularly because Guest userspace system calls are traps (usually trap
|
||||
* 128).
|
||||
*
|
||||
* So we'd like to set up the IDT to tell the CPU to deliver traps directly
|
||||
* into the Guest. This is possible, but the complexities cause the size of
|
||||
* this file to double! However, 150 lines of code is worth writing for taking
|
||||
* system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
|
||||
* the other hypervisors would beat it up at lunchtime.
|
||||
*
|
||||
* This routine indicates if a particular trap number could be delivered
|
||||
* directly.
|
||||
*
|
||||
* Unfortunately, Linux 4.6 started using an interrupt gate instead of a
|
||||
* trap gate for syscalls, so this trick is ineffective. See Mastery for
|
||||
* how we could do this anyway...
|
||||
*/
|
||||
static bool direct_trap(unsigned int num)
|
||||
{
|
||||
/*
|
||||
* Hardware interrupts don't go to the Guest at all (except system
|
||||
* call).
|
||||
*/
|
||||
if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* The Host needs to see page faults (for shadow paging and to save the
|
||||
* fault address), general protection faults (in/out emulation) and
|
||||
* device not available (TS handling) and of course, the hypercall trap.
|
||||
*/
|
||||
return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
|
||||
}
|
||||
/*:*/
|
||||
|
||||
/*M:005
|
||||
* The Guest has the ability to turn its interrupt gates into trap gates,
|
||||
* if it is careful. The Host will let trap gates can go directly to the
|
||||
* Guest, but the Guest needs the interrupts atomically disabled for an
|
||||
* interrupt gate. The Host could provide a mechanism to register more
|
||||
* "no-interrupt" regions, and the Guest could point the trap gate at
|
||||
* instructions within that region, where it can safely disable interrupts.
|
||||
*/
|
||||
|
||||
/*M:006
|
||||
* The Guests do not use the sysenter (fast system call) instruction,
|
||||
* because it's hardcoded to enter privilege level 0 and so can't go direct.
|
||||
* It's about twice as fast as the older "int 0x80" system call, so it might
|
||||
* still be worthwhile to handle it in the Switcher and lcall down to the
|
||||
* Guest. The sysenter semantics are hairy tho: search for that keyword in
|
||||
* entry.S
|
||||
:*/
|
||||
|
||||
/*H:260
|
||||
* When we make traps go directly into the Guest, we need to make sure
|
||||
* the kernel stack is valid (ie. mapped in the page tables). Otherwise, the
|
||||
* CPU trying to deliver the trap will fault while trying to push the interrupt
|
||||
* words on the stack: this is called a double fault, and it forces us to kill
|
||||
* the Guest.
|
||||
*
|
||||
* Which is deeply unfair, because (literally!) it wasn't the Guests' fault.
|
||||
*/
|
||||
void pin_stack_pages(struct lg_cpu *cpu)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
/*
|
||||
* Depending on the CONFIG_4KSTACKS option, the Guest can have one or
|
||||
* two pages of stack space.
|
||||
*/
|
||||
for (i = 0; i < cpu->lg->stack_pages; i++)
|
||||
/*
|
||||
* The stack grows *upwards*, so the address we're given is the
|
||||
* start of the page after the kernel stack. Subtract one to
|
||||
* get back onto the first stack page, and keep subtracting to
|
||||
* get to the rest of the stack pages.
|
||||
*/
|
||||
pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Direct traps also mean that we need to know whenever the Guest wants to use
|
||||
* a different kernel stack, so we can change the guest TSS to use that
|
||||
* stack. The TSS entries expect a virtual address, so unlike most addresses
|
||||
* the Guest gives us, the "esp" (stack pointer) value here is virtual, not
|
||||
* physical.
|
||||
*
|
||||
* In Linux each process has its own kernel stack, so this happens a lot: we
|
||||
* change stacks on each context switch.
|
||||
*/
|
||||
void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
|
||||
{
|
||||
/*
|
||||
* You're not allowed a stack segment with privilege level 0: bad Guest!
|
||||
*/
|
||||
if ((seg & 0x3) != GUEST_PL)
|
||||
kill_guest(cpu, "bad stack segment %i", seg);
|
||||
/* We only expect one or two stack pages. */
|
||||
if (pages > 2)
|
||||
kill_guest(cpu, "bad stack pages %u", pages);
|
||||
/* Save where the stack is, and how many pages */
|
||||
cpu->ss1 = seg;
|
||||
cpu->esp1 = esp;
|
||||
cpu->lg->stack_pages = pages;
|
||||
/* Make sure the new stack pages are mapped */
|
||||
pin_stack_pages(cpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* All this reference to mapping stacks leads us neatly into the other complex
|
||||
* part of the Host: page table handling.
|
||||
*/
|
||||
|
||||
/*H:235
|
||||
* This is the routine which actually checks the Guest's IDT entry and
|
||||
* transfers it into the entry in "struct lguest":
|
||||
*/
|
||||
static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
|
||||
unsigned int num, u32 lo, u32 hi)
|
||||
{
|
||||
u8 type = idt_type(lo, hi);
|
||||
|
||||
/* We zero-out a not-present entry */
|
||||
if (!idt_present(lo, hi)) {
|
||||
trap->a = trap->b = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
/* We only support interrupt and trap gates. */
|
||||
if (type != 0xE && type != 0xF)
|
||||
kill_guest(cpu, "bad IDT type %i", type);
|
||||
|
||||
/*
|
||||
* We only copy the handler address, present bit, privilege level and
|
||||
* type. The privilege level controls where the trap can be triggered
|
||||
* manually with an "int" instruction. This is usually GUEST_PL,
|
||||
* except for system calls which userspace can use.
|
||||
*/
|
||||
trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
|
||||
trap->b = (hi&0xFFFFEF00);
|
||||
}
|
||||
|
||||
/*H:230
|
||||
* While we're here, dealing with delivering traps and interrupts to the
|
||||
* Guest, we might as well complete the picture: how the Guest tells us where
|
||||
* it wants them to go. This would be simple, except making traps fast
|
||||
* requires some tricks.
|
||||
*
|
||||
* We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
|
||||
* LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here.
|
||||
*/
|
||||
void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
|
||||
{
|
||||
/*
|
||||
* Guest never handles: NMI, doublefault, spurious interrupt or
|
||||
* hypercall. We ignore when it tries to set them.
|
||||
*/
|
||||
if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Mark the IDT as changed: next time the Guest runs we'll know we have
|
||||
* to copy this again.
|
||||
*/
|
||||
cpu->changed |= CHANGED_IDT;
|
||||
|
||||
/* Check that the Guest doesn't try to step outside the bounds. */
|
||||
if (num >= ARRAY_SIZE(cpu->arch.idt))
|
||||
kill_guest(cpu, "Setting idt entry %u", num);
|
||||
else
|
||||
set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
|
||||
}
|
||||
|
||||
/*
|
||||
* The default entry for each interrupt points into the Switcher routines which
|
||||
* simply return to the Host. The run_guest() loop will then call
|
||||
* deliver_trap() to bounce it back into the Guest.
|
||||
*/
|
||||
static void default_idt_entry(struct desc_struct *idt,
|
||||
int trap,
|
||||
const unsigned long handler,
|
||||
const struct desc_struct *base)
|
||||
{
|
||||
/* A present interrupt gate. */
|
||||
u32 flags = 0x8e00;
|
||||
|
||||
/*
|
||||
* Set the privilege level on the entry for the hypercall: this allows
|
||||
* the Guest to use the "int" instruction to trigger it.
|
||||
*/
|
||||
if (trap == LGUEST_TRAP_ENTRY)
|
||||
flags |= (GUEST_PL << 13);
|
||||
else if (base)
|
||||
/*
|
||||
* Copy privilege level from what Guest asked for. This allows
|
||||
* debug (int 3) traps from Guest userspace, for example.
|
||||
*/
|
||||
flags |= (base->b & 0x6000);
|
||||
|
||||
/* Now pack it into the IDT entry in its weird format. */
|
||||
idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
|
||||
idt->b = (handler&0xFFFF0000) | flags;
|
||||
}
|
||||
|
||||
/* When the Guest first starts, we put default entries into the IDT. */
|
||||
void setup_default_idt_entries(struct lguest_ro_state *state,
|
||||
const unsigned long *def)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
|
||||
default_idt_entry(&state->guest_idt[i], i, def[i], NULL);
|
||||
}
|
||||
|
||||
/*H:240
|
||||
* We don't use the IDT entries in the "struct lguest" directly, instead
|
||||
* we copy them into the IDT which we've set up for Guests on this CPU, just
|
||||
* before we run the Guest. This routine does that copy.
|
||||
*/
|
||||
void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
|
||||
const unsigned long *def)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
/*
|
||||
* We can simply copy the direct traps, otherwise we use the default
|
||||
* ones in the Switcher: they will return to the Host.
|
||||
*/
|
||||
for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
|
||||
const struct desc_struct *gidt = &cpu->arch.idt[i];
|
||||
|
||||
/* If no Guest can ever override this trap, leave it alone. */
|
||||
if (!direct_trap(i))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Only trap gates (type 15) can go direct to the Guest.
|
||||
* Interrupt gates (type 14) disable interrupts as they are
|
||||
* entered, which we never let the Guest do. Not present
|
||||
* entries (type 0x0) also can't go direct, of course.
|
||||
*
|
||||
* If it can't go direct, we still need to copy the priv. level:
|
||||
* they might want to give userspace access to a software
|
||||
* interrupt.
|
||||
*/
|
||||
if (idt_type(gidt->a, gidt->b) == 0xF)
|
||||
idt[i] = *gidt;
|
||||
else
|
||||
default_idt_entry(&idt[i], i, def[i], gidt);
|
||||
}
|
||||
}
|
||||
|
||||
/*H:200
|
||||
* The Guest Clock.
|
||||
*
|
||||
* There are two sources of virtual interrupts. We saw one in lguest_user.c:
|
||||
* the Launcher sending interrupts for virtual devices. The other is the Guest
|
||||
* timer interrupt.
|
||||
*
|
||||
* The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to
|
||||
* the next timer interrupt (in nanoseconds). We use the high-resolution timer
|
||||
* infrastructure to set a callback at that time.
|
||||
*
|
||||
* 0 means "turn off the clock".
|
||||
*/
|
||||
void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
|
||||
{
|
||||
ktime_t expires;
|
||||
|
||||
if (unlikely(delta == 0)) {
|
||||
/* Clock event device is shutting down. */
|
||||
hrtimer_cancel(&cpu->hrt);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* We use wallclock time here, so the Guest might not be running for
|
||||
* all the time between now and the timer interrupt it asked for. This
|
||||
* is almost always the right thing to do.
|
||||
*/
|
||||
expires = ktime_add_ns(ktime_get_real(), delta);
|
||||
hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
|
||||
}
|
||||
|
||||
/* This is the function called when the Guest's timer expires. */
|
||||
static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
|
||||
{
|
||||
struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
|
||||
|
||||
/* Remember the first interrupt is the timer interrupt. */
|
||||
set_interrupt(cpu, 0);
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
/* This sets up the timer for this Guest. */
|
||||
void init_clockdev(struct lg_cpu *cpu)
|
||||
{
|
||||
hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
|
||||
cpu->hrt.function = clockdev_fn;
|
||||
}
|
@ -1,258 +0,0 @@
|
||||
#ifndef _LGUEST_H
|
||||
#define _LGUEST_H
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
#include <linux/types.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/stringify.h>
|
||||
#include <linux/lguest.h>
|
||||
#include <linux/lguest_launcher.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/hrtimer.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <asm/lguest.h>
|
||||
|
||||
struct pgdir {
|
||||
unsigned long gpgdir;
|
||||
bool switcher_mapped;
|
||||
int last_host_cpu;
|
||||
pgd_t *pgdir;
|
||||
};
|
||||
|
||||
/* We have two pages shared with guests, per cpu. */
|
||||
struct lguest_pages {
|
||||
/* This is the stack page mapped rw in guest */
|
||||
char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
|
||||
struct lguest_regs regs;
|
||||
|
||||
/* This is the host state & guest descriptor page, ro in guest */
|
||||
struct lguest_ro_state state;
|
||||
} __attribute__((aligned(PAGE_SIZE)));
|
||||
|
||||
#define CHANGED_IDT 1
|
||||
#define CHANGED_GDT 2
|
||||
#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */
|
||||
#define CHANGED_ALL 3
|
||||
|
||||
struct lg_cpu {
|
||||
unsigned int id;
|
||||
struct lguest *lg;
|
||||
struct task_struct *tsk;
|
||||
struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
|
||||
|
||||
u32 cr2;
|
||||
u32 esp1;
|
||||
u16 ss1;
|
||||
|
||||
/* Bitmap of what has changed: see CHANGED_* above. */
|
||||
int changed;
|
||||
|
||||
/* Pending operation. */
|
||||
struct lguest_pending pending;
|
||||
|
||||
unsigned long *reg_read; /* register from LHREQ_GETREG */
|
||||
|
||||
/* At end of a page shared mapped over lguest_pages in guest. */
|
||||
unsigned long regs_page;
|
||||
struct lguest_regs *regs;
|
||||
|
||||
struct lguest_pages *last_pages;
|
||||
|
||||
/* Initialization mode: linear map everything. */
|
||||
bool linear_pages;
|
||||
int cpu_pgd; /* Which pgd this cpu is currently using */
|
||||
|
||||
/* If a hypercall was asked for, this points to the arguments. */
|
||||
struct hcall_args *hcall;
|
||||
u32 next_hcall;
|
||||
|
||||
/* Virtual clock device */
|
||||
struct hrtimer hrt;
|
||||
|
||||
/* Did the Guest tell us to halt? */
|
||||
int halted;
|
||||
|
||||
/* Pending virtual interrupts */
|
||||
DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
|
||||
|
||||
struct lg_cpu_arch arch;
|
||||
};
|
||||
|
||||
/* The private info the thread maintains about the guest. */
|
||||
struct lguest {
|
||||
struct lguest_data __user *lguest_data;
|
||||
struct lg_cpu cpus[NR_CPUS];
|
||||
unsigned int nr_cpus;
|
||||
|
||||
/* Valid guest memory pages must be < this. */
|
||||
u32 pfn_limit;
|
||||
|
||||
/* Device memory is >= pfn_limit and < device_limit. */
|
||||
u32 device_limit;
|
||||
|
||||
/*
|
||||
* This provides the offset to the base of guest-physical memory in the
|
||||
* Launcher.
|
||||
*/
|
||||
void __user *mem_base;
|
||||
unsigned long kernel_address;
|
||||
|
||||
struct pgdir pgdirs[4];
|
||||
|
||||
unsigned long noirq_iret;
|
||||
|
||||
unsigned int stack_pages;
|
||||
u32 tsc_khz;
|
||||
|
||||
/* Dead? */
|
||||
const char *dead;
|
||||
};
|
||||
|
||||
extern struct mutex lguest_lock;
|
||||
|
||||
/* core.c: */
|
||||
bool lguest_address_ok(const struct lguest *lg,
|
||||
unsigned long addr, unsigned long len);
|
||||
void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
|
||||
void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
|
||||
extern struct page **lg_switcher_pages;
|
||||
|
||||
/*H:035
|
||||
* Using memory-copy operations like that is usually inconvient, so we
|
||||
* have the following helper macros which read and write a specific type (often
|
||||
* an unsigned long).
|
||||
*
|
||||
* This reads into a variable of the given type then returns that.
|
||||
*/
|
||||
#define lgread(cpu, addr, type) \
|
||||
({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
|
||||
|
||||
/* This checks that the variable is of the given type, then writes it out. */
|
||||
#define lgwrite(cpu, addr, type, val) \
|
||||
do { \
|
||||
typecheck(type, val); \
|
||||
__lgwrite((cpu), (addr), &(val), sizeof(val)); \
|
||||
} while(0)
|
||||
/* (end of memory access helper routines) :*/
|
||||
|
||||
int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
|
||||
|
||||
/*
|
||||
* Helper macros to obtain the first 12 or the last 20 bits, this is only the
|
||||
* first step in the migration to the kernel types. pte_pfn is already defined
|
||||
* in the kernel.
|
||||
*/
|
||||
#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK)
|
||||
#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
|
||||
#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK)
|
||||
#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT)
|
||||
|
||||
/* interrupts_and_traps.c: */
|
||||
unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
|
||||
void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
|
||||
void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
|
||||
bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
|
||||
void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
|
||||
u32 low, u32 hi);
|
||||
void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
|
||||
void pin_stack_pages(struct lg_cpu *cpu);
|
||||
void setup_default_idt_entries(struct lguest_ro_state *state,
|
||||
const unsigned long *def);
|
||||
void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
|
||||
const unsigned long *def);
|
||||
void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
|
||||
bool send_notify_to_eventfd(struct lg_cpu *cpu);
|
||||
void init_clockdev(struct lg_cpu *cpu);
|
||||
bool check_syscall_vector(struct lguest *lg);
|
||||
bool could_be_syscall(unsigned int num);
|
||||
int init_interrupts(void);
|
||||
void free_interrupts(void);
|
||||
|
||||
/* segments.c: */
|
||||
void setup_default_gdt_entries(struct lguest_ro_state *state);
|
||||
void setup_guest_gdt(struct lg_cpu *cpu);
|
||||
void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i,
|
||||
u32 low, u32 hi);
|
||||
void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
|
||||
void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
|
||||
void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
|
||||
|
||||
/* page_tables.c: */
|
||||
int init_guest_pagetable(struct lguest *lg);
|
||||
void free_guest_pagetable(struct lguest *lg);
|
||||
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
|
||||
void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
|
||||
#ifdef CONFIG_X86_PAE
|
||||
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
|
||||
#endif
|
||||
void guest_pagetable_clear_all(struct lg_cpu *cpu);
|
||||
void guest_pagetable_flush_user(struct lg_cpu *cpu);
|
||||
void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
|
||||
unsigned long vaddr, pte_t val);
|
||||
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
|
||||
bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
|
||||
unsigned long *iomem);
|
||||
void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
|
||||
bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
|
||||
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
|
||||
void page_table_guest_data_init(struct lg_cpu *cpu);
|
||||
|
||||
/* <arch>/core.c: */
|
||||
void lguest_arch_host_init(void);
|
||||
void lguest_arch_host_fini(void);
|
||||
void lguest_arch_run_guest(struct lg_cpu *cpu);
|
||||
void lguest_arch_handle_trap(struct lg_cpu *cpu);
|
||||
int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
|
||||
int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
|
||||
void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
|
||||
unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
|
||||
|
||||
/* <arch>/switcher.S: */
|
||||
extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
|
||||
|
||||
/* lguest_user.c: */
|
||||
int lguest_device_init(void);
|
||||
void lguest_device_remove(void);
|
||||
|
||||
/* hypercalls.c: */
|
||||
void do_hypercalls(struct lg_cpu *cpu);
|
||||
void write_timestamp(struct lg_cpu *cpu);
|
||||
|
||||
/*L:035
|
||||
* Let's step aside for the moment, to study one important routine that's used
|
||||
* widely in the Host code.
|
||||
*
|
||||
* There are many cases where the Guest can do something invalid, like pass crap
|
||||
* to a hypercall. Since only the Guest kernel can make hypercalls, it's quite
|
||||
* acceptable to simply terminate the Guest and give the Launcher a nicely
|
||||
* formatted reason. It's also simpler for the Guest itself, which doesn't
|
||||
* need to check most hypercalls for "success"; if you're still running, it
|
||||
* succeeded.
|
||||
*
|
||||
* Once this is called, the Guest will never run again, so most Host code can
|
||||
* call this then continue as if nothing had happened. This means many
|
||||
* functions don't have to explicitly return an error code, which keeps the
|
||||
* code simple.
|
||||
*
|
||||
* It also means that this can be called more than once: only the first one is
|
||||
* remembered. The only trick is that we still need to kill the Guest even if
|
||||
* we can't allocate memory to store the reason. Linux has a neat way of
|
||||
* packing error codes into invalid pointers, so we use that here.
|
||||
*
|
||||
* Like any macro which uses an "if", it is safely wrapped in a run-once "do {
|
||||
* } while(0)".
|
||||
*/
|
||||
#define kill_guest(cpu, fmt...) \
|
||||
do { \
|
||||
if (!(cpu)->lg->dead) { \
|
||||
(cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \
|
||||
if (!(cpu)->lg->dead) \
|
||||
(cpu)->lg->dead = ERR_PTR(-ENOMEM); \
|
||||
} \
|
||||
} while(0)
|
||||
/* (End of aside) :*/
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
#endif /* _LGUEST_H */
|
@ -1,446 +0,0 @@
|
||||
/*P:200 This contains all the /dev/lguest code, whereby the userspace
|
||||
* launcher controls and communicates with the Guest. For example,
|
||||
* the first write will tell us the Guest's memory layout and entry
|
||||
* point. A read will run the Guest until something happens, such as
|
||||
* a signal or the Guest accessing a device.
|
||||
:*/
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/miscdevice.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/export.h>
|
||||
#include "lg.h"
|
||||
|
||||
/*L:052
|
||||
The Launcher can get the registers, and also set some of them.
|
||||
*/
|
||||
static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
|
||||
{
|
||||
unsigned long which;
|
||||
|
||||
/* We re-use the ptrace structure to specify which register to read. */
|
||||
if (get_user(which, input) != 0)
|
||||
return -EFAULT;
|
||||
|
||||
/*
|
||||
* We set up the cpu register pointer, and their next read will
|
||||
* actually get the value (instead of running the guest).
|
||||
*
|
||||
* The last argument 'true' says we can access any register.
|
||||
*/
|
||||
cpu->reg_read = lguest_arch_regptr(cpu, which, true);
|
||||
if (!cpu->reg_read)
|
||||
return -ENOENT;
|
||||
|
||||
/* And because this is a write() call, we return the length used. */
|
||||
return sizeof(unsigned long) * 2;
|
||||
}
|
||||
|
||||
static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
|
||||
{
|
||||
unsigned long which, value, *reg;
|
||||
|
||||
/* We re-use the ptrace structure to specify which register to read. */
|
||||
if (get_user(which, input) != 0)
|
||||
return -EFAULT;
|
||||
input++;
|
||||
if (get_user(value, input) != 0)
|
||||
return -EFAULT;
|
||||
|
||||
/* The last argument 'false' means we can't access all registers. */
|
||||
reg = lguest_arch_regptr(cpu, which, false);
|
||||
if (!reg)
|
||||
return -ENOENT;
|
||||
|
||||
*reg = value;
|
||||
|
||||
/* And because this is a write() call, we return the length used. */
|
||||
return sizeof(unsigned long) * 3;
|
||||
}
|
||||
|
||||
/*L:050
|
||||
* Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
|
||||
* number to /dev/lguest.
|
||||
*/
|
||||
static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
|
||||
{
|
||||
unsigned long irq;
|
||||
|
||||
if (get_user(irq, input) != 0)
|
||||
return -EFAULT;
|
||||
if (irq >= LGUEST_IRQS)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Next time the Guest runs, the core code will see if it can deliver
|
||||
* this interrupt.
|
||||
*/
|
||||
set_interrupt(cpu, irq);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*L:053
|
||||
* Deliver a trap: this is used by the Launcher if it can't emulate
|
||||
* an instruction.
|
||||
*/
|
||||
static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
|
||||
{
|
||||
unsigned long trapnum;
|
||||
|
||||
if (get_user(trapnum, input) != 0)
|
||||
return -EFAULT;
|
||||
|
||||
if (!deliver_trap(cpu, trapnum))
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*L:040
|
||||
* Once our Guest is initialized, the Launcher makes it run by reading
|
||||
* from /dev/lguest.
|
||||
*/
|
||||
static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
|
||||
{
|
||||
struct lguest *lg = file->private_data;
|
||||
struct lg_cpu *cpu;
|
||||
unsigned int cpu_id = *o;
|
||||
|
||||
/* You must write LHREQ_INITIALIZE first! */
|
||||
if (!lg)
|
||||
return -EINVAL;
|
||||
|
||||
/* Watch out for arbitrary vcpu indexes! */
|
||||
if (cpu_id >= lg->nr_cpus)
|
||||
return -EINVAL;
|
||||
|
||||
cpu = &lg->cpus[cpu_id];
|
||||
|
||||
/* If you're not the task which owns the Guest, go away. */
|
||||
if (current != cpu->tsk)
|
||||
return -EPERM;
|
||||
|
||||
/* If the Guest is already dead, we indicate why */
|
||||
if (lg->dead) {
|
||||
size_t len;
|
||||
|
||||
/* lg->dead either contains an error code, or a string. */
|
||||
if (IS_ERR(lg->dead))
|
||||
return PTR_ERR(lg->dead);
|
||||
|
||||
/* We can only return as much as the buffer they read with. */
|
||||
len = min(size, strlen(lg->dead)+1);
|
||||
if (copy_to_user(user, lg->dead, len) != 0)
|
||||
return -EFAULT;
|
||||
return len;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we returned from read() last time because the Guest sent I/O,
|
||||
* clear the flag.
|
||||
*/
|
||||
if (cpu->pending.trap)
|
||||
cpu->pending.trap = 0;
|
||||
|
||||
/* Run the Guest until something interesting happens. */
|
||||
return run_guest(cpu, (unsigned long __user *)user);
|
||||
}
|
||||
|
||||
/*L:025
|
||||
* This actually initializes a CPU. For the moment, a Guest is only
|
||||
* uniprocessor, so "id" is always 0.
|
||||
*/
|
||||
static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
|
||||
{
|
||||
/* We have a limited number of CPUs in the lguest struct. */
|
||||
if (id >= ARRAY_SIZE(cpu->lg->cpus))
|
||||
return -EINVAL;
|
||||
|
||||
/* Set up this CPU's id, and pointer back to the lguest struct. */
|
||||
cpu->id = id;
|
||||
cpu->lg = container_of(cpu, struct lguest, cpus[id]);
|
||||
cpu->lg->nr_cpus++;
|
||||
|
||||
/* Each CPU has a timer it can set. */
|
||||
init_clockdev(cpu);
|
||||
|
||||
/*
|
||||
* We need a complete page for the Guest registers: they are accessible
|
||||
* to the Guest and we can only grant it access to whole pages.
|
||||
*/
|
||||
cpu->regs_page = get_zeroed_page(GFP_KERNEL);
|
||||
if (!cpu->regs_page)
|
||||
return -ENOMEM;
|
||||
|
||||
/* We actually put the registers at the end of the page. */
|
||||
cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
|
||||
|
||||
/*
|
||||
* Now we initialize the Guest's registers, handing it the start
|
||||
* address.
|
||||
*/
|
||||
lguest_arch_setup_regs(cpu, start_ip);
|
||||
|
||||
/*
|
||||
* We keep a pointer to the Launcher task (ie. current task) for when
|
||||
* other Guests want to wake this one (eg. console input).
|
||||
*/
|
||||
cpu->tsk = current;
|
||||
|
||||
/*
|
||||
* We need to keep a pointer to the Launcher's memory map, because if
|
||||
* the Launcher dies we need to clean it up. If we don't keep a
|
||||
* reference, it is destroyed before close() is called.
|
||||
*/
|
||||
cpu->mm = get_task_mm(cpu->tsk);
|
||||
|
||||
/*
|
||||
* We remember which CPU's pages this Guest used last, for optimization
|
||||
* when the same Guest runs on the same CPU twice.
|
||||
*/
|
||||
cpu->last_pages = NULL;
|
||||
|
||||
/* No error == success. */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*L:020
|
||||
* The initialization write supplies 3 pointer sized (32 or 64 bit) values (in
|
||||
* addition to the LHREQ_INITIALIZE value). These are:
|
||||
*
|
||||
* base: The start of the Guest-physical memory inside the Launcher memory.
|
||||
*
|
||||
* pfnlimit: The highest (Guest-physical) page number the Guest should be
|
||||
* allowed to access. The Guest memory lives inside the Launcher, so it sets
|
||||
* this to ensure the Guest can only reach its own memory.
|
||||
*
|
||||
* start: The first instruction to execute ("eip" in x86-speak).
|
||||
*/
|
||||
static int initialize(struct file *file, const unsigned long __user *input)
|
||||
{
|
||||
/* "struct lguest" contains all we (the Host) know about a Guest. */
|
||||
struct lguest *lg;
|
||||
int err;
|
||||
unsigned long args[4];
|
||||
|
||||
/*
|
||||
* We grab the Big Lguest lock, which protects against multiple
|
||||
* simultaneous initializations.
|
||||
*/
|
||||
mutex_lock(&lguest_lock);
|
||||
/* You can't initialize twice! Close the device and start again... */
|
||||
if (file->private_data) {
|
||||
err = -EBUSY;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
if (copy_from_user(args, input, sizeof(args)) != 0) {
|
||||
err = -EFAULT;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
lg = kzalloc(sizeof(*lg), GFP_KERNEL);
|
||||
if (!lg) {
|
||||
err = -ENOMEM;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/* Populate the easy fields of our "struct lguest" */
|
||||
lg->mem_base = (void __user *)args[0];
|
||||
lg->pfn_limit = args[1];
|
||||
lg->device_limit = args[3];
|
||||
|
||||
/* This is the first cpu (cpu 0) and it will start booting at args[2] */
|
||||
err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
|
||||
if (err)
|
||||
goto free_lg;
|
||||
|
||||
/*
|
||||
* Initialize the Guest's shadow page tables. This allocates
|
||||
* memory, so can fail.
|
||||
*/
|
||||
err = init_guest_pagetable(lg);
|
||||
if (err)
|
||||
goto free_regs;
|
||||
|
||||
/* We keep our "struct lguest" in the file's private_data. */
|
||||
file->private_data = lg;
|
||||
|
||||
mutex_unlock(&lguest_lock);
|
||||
|
||||
/* And because this is a write() call, we return the length used. */
|
||||
return sizeof(args);
|
||||
|
||||
free_regs:
|
||||
/* FIXME: This should be in free_vcpu */
|
||||
free_page(lg->cpus[0].regs_page);
|
||||
free_lg:
|
||||
kfree(lg);
|
||||
unlock:
|
||||
mutex_unlock(&lguest_lock);
|
||||
return err;
|
||||
}
|
||||
|
||||
/*L:010
|
||||
* The first operation the Launcher does must be a write. All writes
|
||||
* start with an unsigned long number: for the first write this must be
|
||||
* LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
|
||||
* writes of other values to send interrupts or set up receipt of notifications.
|
||||
*
|
||||
* Note that we overload the "offset" in the /dev/lguest file to indicate what
|
||||
* CPU number we're dealing with. Currently this is always 0 since we only
|
||||
* support uniprocessor Guests, but you can see the beginnings of SMP support
|
||||
* here.
|
||||
*/
|
||||
static ssize_t write(struct file *file, const char __user *in,
|
||||
size_t size, loff_t *off)
|
||||
{
|
||||
/*
|
||||
* Once the Guest is initialized, we hold the "struct lguest" in the
|
||||
* file private data.
|
||||
*/
|
||||
struct lguest *lg = file->private_data;
|
||||
const unsigned long __user *input = (const unsigned long __user *)in;
|
||||
unsigned long req;
|
||||
struct lg_cpu *uninitialized_var(cpu);
|
||||
unsigned int cpu_id = *off;
|
||||
|
||||
/* The first value tells us what this request is. */
|
||||
if (get_user(req, input) != 0)
|
||||
return -EFAULT;
|
||||
input++;
|
||||
|
||||
/* If you haven't initialized, you must do that first. */
|
||||
if (req != LHREQ_INITIALIZE) {
|
||||
if (!lg || (cpu_id >= lg->nr_cpus))
|
||||
return -EINVAL;
|
||||
cpu = &lg->cpus[cpu_id];
|
||||
|
||||
/* Once the Guest is dead, you can only read() why it died. */
|
||||
if (lg->dead)
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
switch (req) {
|
||||
case LHREQ_INITIALIZE:
|
||||
return initialize(file, input);
|
||||
case LHREQ_IRQ:
|
||||
return user_send_irq(cpu, input);
|
||||
case LHREQ_GETREG:
|
||||
return getreg_setup(cpu, input);
|
||||
case LHREQ_SETREG:
|
||||
return setreg(cpu, input);
|
||||
case LHREQ_TRAP:
|
||||
return trap(cpu, input);
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
static int open(struct inode *inode, struct file *file)
|
||||
{
|
||||
file->private_data = NULL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*L:060
|
||||
* The final piece of interface code is the close() routine. It reverses
|
||||
* everything done in initialize(). This is usually called because the
|
||||
* Launcher exited.
|
||||
*
|
||||
* Note that the close routine returns 0 or a negative error number: it can't
|
||||
* really fail, but it can whine. I blame Sun for this wart, and K&R C for
|
||||
* letting them do it.
|
||||
:*/
|
||||
static int close(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct lguest *lg = file->private_data;
|
||||
unsigned int i;
|
||||
|
||||
/* If we never successfully initialized, there's nothing to clean up */
|
||||
if (!lg)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* We need the big lock, to protect from inter-guest I/O and other
|
||||
* Launchers initializing guests.
|
||||
*/
|
||||
mutex_lock(&lguest_lock);
|
||||
|
||||
/* Free up the shadow page tables for the Guest. */
|
||||
free_guest_pagetable(lg);
|
||||
|
||||
for (i = 0; i < lg->nr_cpus; i++) {
|
||||
/* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
|
||||
hrtimer_cancel(&lg->cpus[i].hrt);
|
||||
/* We can free up the register page we allocated. */
|
||||
free_page(lg->cpus[i].regs_page);
|
||||
/*
|
||||
* Now all the memory cleanups are done, it's safe to release
|
||||
* the Launcher's memory management structure.
|
||||
*/
|
||||
mmput(lg->cpus[i].mm);
|
||||
}
|
||||
|
||||
/*
|
||||
* If lg->dead doesn't contain an error code it will be NULL or a
|
||||
* kmalloc()ed string, either of which is ok to hand to kfree().
|
||||
*/
|
||||
if (!IS_ERR(lg->dead))
|
||||
kfree(lg->dead);
|
||||
/* Free the memory allocated to the lguest_struct */
|
||||
kfree(lg);
|
||||
/* Release lock and exit. */
|
||||
mutex_unlock(&lguest_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*L:000
|
||||
* Welcome to our journey through the Launcher!
|
||||
*
|
||||
* The Launcher is the Host userspace program which sets up, runs and services
|
||||
* the Guest. In fact, many comments in the Drivers which refer to "the Host"
|
||||
* doing things are inaccurate: the Launcher does all the device handling for
|
||||
* the Guest, but the Guest can't know that.
|
||||
*
|
||||
* Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
|
||||
* shall see more of that later.
|
||||
*
|
||||
* We begin our understanding with the Host kernel interface which the Launcher
|
||||
* uses: reading and writing a character device called /dev/lguest. All the
|
||||
* work happens in the read(), write() and close() routines:
|
||||
*/
|
||||
static const struct file_operations lguest_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = open,
|
||||
.release = close,
|
||||
.write = write,
|
||||
.read = read,
|
||||
.llseek = default_llseek,
|
||||
};
|
||||
/*:*/
|
||||
|
||||
/*
|
||||
* This is a textbook example of a "misc" character device. Populate a "struct
|
||||
* miscdevice" and register it with misc_register().
|
||||
*/
|
||||
static struct miscdevice lguest_dev = {
|
||||
.minor = MISC_DYNAMIC_MINOR,
|
||||
.name = "lguest",
|
||||
.fops = &lguest_fops,
|
||||
};
|
||||
|
||||
int __init lguest_device_init(void)
|
||||
{
|
||||
return misc_register(&lguest_dev);
|
||||
}
|
||||
|
||||
void __exit lguest_device_remove(void)
|
||||
{
|
||||
misc_deregister(&lguest_dev);
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -1,228 +0,0 @@
|
||||
/*P:600
|
||||
* The x86 architecture has segments, which involve a table of descriptors
|
||||
* which can be used to do funky things with virtual address interpretation.
|
||||
* We originally used to use segments so the Guest couldn't alter the
|
||||
* Guest<->Host Switcher, and then we had to trim Guest segments, and restore
|
||||
* for userspace per-thread segments, but trim again for on userspace->kernel
|
||||
* transitions... This nightmarish creation was contained within this file,
|
||||
* where we knew not to tread without heavy armament and a change of underwear.
|
||||
*
|
||||
* In these modern times, the segment handling code consists of simple sanity
|
||||
* checks, and the worst you'll experience reading this code is butterfly-rash
|
||||
* from frolicking through its parklike serenity.
|
||||
:*/
|
||||
#include "lg.h"
|
||||
|
||||
/*H:600
|
||||
* Segments & The Global Descriptor Table
|
||||
*
|
||||
* (That title sounds like a bad Nerdcore group. Not to suggest that there are
|
||||
* any good Nerdcore groups, but in high school a friend of mine had a band
|
||||
* called Joe Fish and the Chips, so there are definitely worse band names).
|
||||
*
|
||||
* To refresh: the GDT is a table of 8-byte values describing segments. Once
|
||||
* set up, these segments can be loaded into one of the 6 "segment registers".
|
||||
*
|
||||
* GDT entries are passed around as "struct desc_struct"s, which like IDT
|
||||
* entries are split into two 32-bit members, "a" and "b". One day, someone
|
||||
* will clean that up, and be declared a Hero. (No pressure, I'm just saying).
|
||||
*
|
||||
* Anyway, the GDT entry contains a base (the start address of the segment), a
|
||||
* limit (the size of the segment - 1), and some flags. Sounds simple, and it
|
||||
* would be, except those zany Intel engineers decided that it was too boring
|
||||
* to put the base at one end, the limit at the other, and the flags in
|
||||
* between. They decided to shotgun the bits at random throughout the 8 bytes,
|
||||
* like so:
|
||||
*
|
||||
* 0 16 40 48 52 56 63
|
||||
* [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ]
|
||||
* mit ags part 2
|
||||
* part 2
|
||||
*
|
||||
* As a result, this file contains a certain amount of magic numeracy. Let's
|
||||
* begin.
|
||||
*/
|
||||
|
||||
/*
|
||||
* There are several entries we don't let the Guest set. The TSS entry is the
|
||||
* "Task State Segment" which controls all kinds of delicate things. The
|
||||
* LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
|
||||
* the Guest can't be trusted to deal with double faults.
|
||||
*/
|
||||
static bool ignored_gdt(unsigned int num)
|
||||
{
|
||||
return (num == GDT_ENTRY_TSS
|
||||
|| num == GDT_ENTRY_LGUEST_CS
|
||||
|| num == GDT_ENTRY_LGUEST_DS
|
||||
|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
|
||||
}
|
||||
|
||||
/*H:630
|
||||
* Once the Guest gave us new GDT entries, we fix them up a little. We
|
||||
* don't care if they're invalid: the worst that can happen is a General
|
||||
* Protection Fault in the Switcher when it restores a Guest segment register
|
||||
* which tries to use that entry. Then we kill the Guest for causing such a
|
||||
* mess: the message will be "unhandled trap 256".
|
||||
*/
|
||||
static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i = start; i < end; i++) {
|
||||
/*
|
||||
* We never copy these ones to real GDT, so we don't care what
|
||||
* they say
|
||||
*/
|
||||
if (ignored_gdt(i))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Segment descriptors contain a privilege level: the Guest is
|
||||
* sometimes careless and leaves this as 0, even though it's
|
||||
* running at privilege level 1. If so, we fix it here.
|
||||
*/
|
||||
if (cpu->arch.gdt[i].dpl == 0)
|
||||
cpu->arch.gdt[i].dpl |= GUEST_PL;
|
||||
|
||||
/*
|
||||
* Each descriptor has an "accessed" bit. If we don't set it
|
||||
* now, the CPU will try to set it when the Guest first loads
|
||||
* that entry into a segment register. But the GDT isn't
|
||||
* writable by the Guest, so bad things can happen.
|
||||
*/
|
||||
cpu->arch.gdt[i].type |= 0x1;
|
||||
}
|
||||
}
|
||||
|
||||
/*H:610
|
||||
* Like the IDT, we never simply use the GDT the Guest gives us. We keep
|
||||
* a GDT for each CPU, and copy across the Guest's entries each time we want to
|
||||
* run the Guest on that CPU.
|
||||
*
|
||||
* This routine is called at boot or modprobe time for each CPU to set up the
|
||||
* constant GDT entries: the ones which are the same no matter what Guest we're
|
||||
* running.
|
||||
*/
|
||||
void setup_default_gdt_entries(struct lguest_ro_state *state)
|
||||
{
|
||||
struct desc_struct *gdt = state->guest_gdt;
|
||||
unsigned long tss = (unsigned long)&state->guest_tss;
|
||||
|
||||
/* The Switcher segments are full 0-4G segments, privilege level 0 */
|
||||
gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
|
||||
gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
|
||||
|
||||
/*
|
||||
* The TSS segment refers to the TSS entry for this particular CPU.
|
||||
*/
|
||||
gdt[GDT_ENTRY_TSS].a = 0;
|
||||
gdt[GDT_ENTRY_TSS].b = 0;
|
||||
|
||||
gdt[GDT_ENTRY_TSS].limit0 = 0x67;
|
||||
gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF;
|
||||
gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF;
|
||||
gdt[GDT_ENTRY_TSS].base2 = tss >> 24;
|
||||
gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */
|
||||
gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */
|
||||
gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */
|
||||
gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* This routine sets up the initial Guest GDT for booting. All entries start
|
||||
* as 0 (unusable).
|
||||
*/
|
||||
void setup_guest_gdt(struct lg_cpu *cpu)
|
||||
{
|
||||
/*
|
||||
* Start with full 0-4G segments...except the Guest is allowed to use
|
||||
* them, so set the privilege level appropriately in the flags.
|
||||
*/
|
||||
cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
|
||||
cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
|
||||
cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL;
|
||||
cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL;
|
||||
}
|
||||
|
||||
/*H:650
|
||||
* An optimization of copy_gdt(), for just the three "thead-local storage"
|
||||
* entries.
|
||||
*/
|
||||
void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
|
||||
gdt[i] = cpu->arch.gdt[i];
|
||||
}
|
||||
|
||||
/*H:640
|
||||
* When the Guest is run on a different CPU, or the GDT entries have changed,
|
||||
* copy_gdt() is called to copy the Guest's GDT entries across to this CPU's
|
||||
* GDT.
|
||||
*/
|
||||
void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
/*
|
||||
* The default entries from setup_default_gdt_entries() are not
|
||||
* replaced. See ignored_gdt() above.
|
||||
*/
|
||||
for (i = 0; i < GDT_ENTRIES; i++)
|
||||
if (!ignored_gdt(i))
|
||||
gdt[i] = cpu->arch.gdt[i];
|
||||
}
|
||||
|
||||
/*H:620
|
||||
* This is where the Guest asks us to load a new GDT entry
|
||||
* (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in.
|
||||
*/
|
||||
void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
|
||||
{
|
||||
/*
|
||||
* We assume the Guest has the same number of GDT entries as the
|
||||
* Host, otherwise we'd have to dynamically allocate the Guest GDT.
|
||||
*/
|
||||
if (num >= ARRAY_SIZE(cpu->arch.gdt)) {
|
||||
kill_guest(cpu, "too many gdt entries %i", num);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set it up, then fix it. */
|
||||
cpu->arch.gdt[num].a = lo;
|
||||
cpu->arch.gdt[num].b = hi;
|
||||
fixup_gdt_table(cpu, num, num+1);
|
||||
/*
|
||||
* Mark that the GDT changed so the core knows it has to copy it again,
|
||||
* even if the Guest is run on the same CPU.
|
||||
*/
|
||||
cpu->changed |= CHANGED_GDT;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is the fast-track version for just changing the three TLS entries.
|
||||
* Remember that this happens on every context switch, so it's worth
|
||||
* optimizing. But wouldn't it be neater to have a single hypercall to cover
|
||||
* both cases?
|
||||
*/
|
||||
void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
|
||||
{
|
||||
struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
|
||||
|
||||
__lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
|
||||
fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
|
||||
/* Note that just the TLS entries have changed. */
|
||||
cpu->changed |= CHANGED_GDT_TLS;
|
||||
}
|
||||
|
||||
/*H:660
|
||||
* With this, we have finished the Host.
|
||||
*
|
||||
* Five of the seven parts of our task are complete. You have made it through
|
||||
* the Bit of Despair (I think that's somewhere in the page table code,
|
||||
* myself).
|
||||
*
|
||||
* Next, we examine "make Switcher". It's short, but intense.
|
||||
*/
|
@ -1,724 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
|
||||
* Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
/*P:450
|
||||
* This file contains the x86-specific lguest code. It used to be all
|
||||
* mixed in with drivers/lguest/core.c but several foolhardy code slashers
|
||||
* wrestled most of the dependencies out to here in preparation for porting
|
||||
* lguest to other architectures (see what I mean by foolhardy?).
|
||||
*
|
||||
* This also contains a couple of non-obvious setup and teardown pieces which
|
||||
* were implemented after days of debugging pain.
|
||||
:*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/start_kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/console.h>
|
||||
#include <linux/screen_info.h>
|
||||
#include <linux/irq.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/clocksource.h>
|
||||
#include <linux/clockchips.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/lguest.h>
|
||||
#include <linux/lguest_launcher.h>
|
||||
#include <asm/paravirt.h>
|
||||
#include <asm/param.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/setup.h>
|
||||
#include <asm/lguest.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <asm/fpu/internal.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include "../lg.h"
|
||||
|
||||
static int cpu_had_pge;
|
||||
|
||||
static struct {
|
||||
unsigned long offset;
|
||||
unsigned short segment;
|
||||
} lguest_entry;
|
||||
|
||||
/* Offset from where switcher.S was compiled to where we've copied it */
|
||||
static unsigned long switcher_offset(void)
|
||||
{
|
||||
return switcher_addr - (unsigned long)start_switcher_text;
|
||||
}
|
||||
|
||||
/* This cpu's struct lguest_pages (after the Switcher text page) */
|
||||
static struct lguest_pages *lguest_pages(unsigned int cpu)
|
||||
{
|
||||
return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
|
||||
|
||||
/*S:010
|
||||
* We approach the Switcher.
|
||||
*
|
||||
* Remember that each CPU has two pages which are visible to the Guest when it
|
||||
* runs on that CPU. This has to contain the state for that Guest: we copy the
|
||||
* state in just before we run the Guest.
|
||||
*
|
||||
* Each Guest has "changed" flags which indicate what has changed in the Guest
|
||||
* since it last ran. We saw this set in interrupts_and_traps.c and
|
||||
* segments.c.
|
||||
*/
|
||||
static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
|
||||
{
|
||||
/*
|
||||
* Copying all this data can be quite expensive. We usually run the
|
||||
* same Guest we ran last time (and that Guest hasn't run anywhere else
|
||||
* meanwhile). If that's not the case, we pretend everything in the
|
||||
* Guest has changed.
|
||||
*/
|
||||
if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) {
|
||||
__this_cpu_write(lg_last_cpu, cpu);
|
||||
cpu->last_pages = pages;
|
||||
cpu->changed = CHANGED_ALL;
|
||||
}
|
||||
|
||||
/*
|
||||
* These copies are pretty cheap, so we do them unconditionally: */
|
||||
/* Save the current Host top-level page directory.
|
||||
*/
|
||||
pages->state.host_cr3 = __pa(current->mm->pgd);
|
||||
/*
|
||||
* Set up the Guest's page tables to see this CPU's pages (and no
|
||||
* other CPU's pages).
|
||||
*/
|
||||
map_switcher_in_guest(cpu, pages);
|
||||
/*
|
||||
* Set up the two "TSS" members which tell the CPU what stack to use
|
||||
* for traps which do directly into the Guest (ie. traps at privilege
|
||||
* level 1).
|
||||
*/
|
||||
pages->state.guest_tss.sp1 = cpu->esp1;
|
||||
pages->state.guest_tss.ss1 = cpu->ss1;
|
||||
|
||||
/* Copy direct-to-Guest trap entries. */
|
||||
if (cpu->changed & CHANGED_IDT)
|
||||
copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
|
||||
|
||||
/* Copy all GDT entries which the Guest can change. */
|
||||
if (cpu->changed & CHANGED_GDT)
|
||||
copy_gdt(cpu, pages->state.guest_gdt);
|
||||
/* If only the TLS entries have changed, copy them. */
|
||||
else if (cpu->changed & CHANGED_GDT_TLS)
|
||||
copy_gdt_tls(cpu, pages->state.guest_gdt);
|
||||
|
||||
/* Mark the Guest as unchanged for next time. */
|
||||
cpu->changed = 0;
|
||||
}
|
||||
|
||||
/* Finally: the code to actually call into the Switcher to run the Guest. */
|
||||
static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
|
||||
{
|
||||
/* This is a dummy value we need for GCC's sake. */
|
||||
unsigned int clobber;
|
||||
|
||||
/*
|
||||
* Copy the guest-specific information into this CPU's "struct
|
||||
* lguest_pages".
|
||||
*/
|
||||
copy_in_guest_info(cpu, pages);
|
||||
|
||||
/*
|
||||
* Set the trap number to 256 (impossible value). If we fault while
|
||||
* switching to the Guest (bad segment registers or bug), this will
|
||||
* cause us to abort the Guest.
|
||||
*/
|
||||
cpu->regs->trapnum = 256;
|
||||
|
||||
/*
|
||||
* Now: we push the "eflags" register on the stack, then do an "lcall".
|
||||
* This is how we change from using the kernel code segment to using
|
||||
* the dedicated lguest code segment, as well as jumping into the
|
||||
* Switcher.
|
||||
*
|
||||
* The lcall also pushes the old code segment (KERNEL_CS) onto the
|
||||
* stack, then the address of this call. This stack layout happens to
|
||||
* exactly match the stack layout created by an interrupt...
|
||||
*/
|
||||
asm volatile("pushf; lcall *%4"
|
||||
/*
|
||||
* This is how we tell GCC that %eax ("a") and %ebx ("b")
|
||||
* are changed by this routine. The "=" means output.
|
||||
*/
|
||||
: "=a"(clobber), "=b"(clobber)
|
||||
/*
|
||||
* %eax contains the pages pointer. ("0" refers to the
|
||||
* 0-th argument above, ie "a"). %ebx contains the
|
||||
* physical address of the Guest's top-level page
|
||||
* directory.
|
||||
*/
|
||||
: "0"(pages),
|
||||
"1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)),
|
||||
"m"(lguest_entry)
|
||||
/*
|
||||
* We tell gcc that all these registers could change,
|
||||
* which means we don't have to save and restore them in
|
||||
* the Switcher.
|
||||
*/
|
||||
: "memory", "%edx", "%ecx", "%edi", "%esi");
|
||||
}
|
||||
/*:*/
|
||||
|
||||
unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
|
||||
{
|
||||
switch (reg_off) {
|
||||
case offsetof(struct pt_regs, bx):
|
||||
return &cpu->regs->ebx;
|
||||
case offsetof(struct pt_regs, cx):
|
||||
return &cpu->regs->ecx;
|
||||
case offsetof(struct pt_regs, dx):
|
||||
return &cpu->regs->edx;
|
||||
case offsetof(struct pt_regs, si):
|
||||
return &cpu->regs->esi;
|
||||
case offsetof(struct pt_regs, di):
|
||||
return &cpu->regs->edi;
|
||||
case offsetof(struct pt_regs, bp):
|
||||
return &cpu->regs->ebp;
|
||||
case offsetof(struct pt_regs, ax):
|
||||
return &cpu->regs->eax;
|
||||
case offsetof(struct pt_regs, ip):
|
||||
return &cpu->regs->eip;
|
||||
case offsetof(struct pt_regs, sp):
|
||||
return &cpu->regs->esp;
|
||||
}
|
||||
|
||||
/* Launcher can read these, but we don't allow any setting. */
|
||||
if (any) {
|
||||
switch (reg_off) {
|
||||
case offsetof(struct pt_regs, ds):
|
||||
return &cpu->regs->ds;
|
||||
case offsetof(struct pt_regs, es):
|
||||
return &cpu->regs->es;
|
||||
case offsetof(struct pt_regs, fs):
|
||||
return &cpu->regs->fs;
|
||||
case offsetof(struct pt_regs, gs):
|
||||
return &cpu->regs->gs;
|
||||
case offsetof(struct pt_regs, cs):
|
||||
return &cpu->regs->cs;
|
||||
case offsetof(struct pt_regs, flags):
|
||||
return &cpu->regs->eflags;
|
||||
case offsetof(struct pt_regs, ss):
|
||||
return &cpu->regs->ss;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*M:002
|
||||
* There are hooks in the scheduler which we can register to tell when we
|
||||
* get kicked off the CPU (preempt_notifier_register()). This would allow us
|
||||
* to lazily disable SYSENTER which would regain some performance, and should
|
||||
* also simplify copy_in_guest_info(). Note that we'd still need to restore
|
||||
* things when we exit to Launcher userspace, but that's fairly easy.
|
||||
*
|
||||
* We could also try using these hooks for PGE, but that might be too expensive.
|
||||
*
|
||||
* The hooks were designed for KVM, but we can also put them to good use.
|
||||
:*/
|
||||
|
||||
/*H:040
|
||||
* This is the i386-specific code to setup and run the Guest. Interrupts
|
||||
* are disabled: we own the CPU.
|
||||
*/
|
||||
void lguest_arch_run_guest(struct lg_cpu *cpu)
|
||||
{
|
||||
/*
|
||||
* SYSENTER is an optimized way of doing system calls. We can't allow
|
||||
* it because it always jumps to privilege level 0. A normal Guest
|
||||
* won't try it because we don't advertise it in CPUID, but a malicious
|
||||
* Guest (or malicious Guest userspace program) could, so we tell the
|
||||
* CPU to disable it before running the Guest.
|
||||
*/
|
||||
if (boot_cpu_has(X86_FEATURE_SEP))
|
||||
wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
|
||||
|
||||
/*
|
||||
* Now we actually run the Guest. It will return when something
|
||||
* interesting happens, and we can examine its registers to see what it
|
||||
* was doing.
|
||||
*/
|
||||
run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
|
||||
|
||||
/*
|
||||
* Note that the "regs" structure contains two extra entries which are
|
||||
* not really registers: a trap number which says what interrupt or
|
||||
* trap made the switcher code come back, and an error code which some
|
||||
* traps set.
|
||||
*/
|
||||
|
||||
/* Restore SYSENTER if it's supposed to be on. */
|
||||
if (boot_cpu_has(X86_FEATURE_SEP))
|
||||
wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
|
||||
|
||||
/*
|
||||
* If the Guest page faulted, then the cr2 register will tell us the
|
||||
* bad virtual address. We have to grab this now, because once we
|
||||
* re-enable interrupts an interrupt could fault and thus overwrite
|
||||
* cr2, or we could even move off to a different CPU.
|
||||
*/
|
||||
if (cpu->regs->trapnum == 14)
|
||||
cpu->arch.last_pagefault = read_cr2();
|
||||
/*
|
||||
* Similarly, if we took a trap because the Guest used the FPU,
|
||||
* we have to restore the FPU it expects to see.
|
||||
* fpu__restore() may sleep and we may even move off to
|
||||
* a different CPU. So all the critical stuff should be done
|
||||
* before this.
|
||||
*/
|
||||
else if (cpu->regs->trapnum == 7 && !fpregs_active())
|
||||
fpu__restore(¤t->thread.fpu);
|
||||
}
|
||||
|
||||
/*H:130
|
||||
* Now we've examined the hypercall code; our Guest can make requests.
|
||||
* Our Guest is usually so well behaved; it never tries to do things it isn't
|
||||
* allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual
|
||||
* infrastructure isn't quite complete, because it doesn't contain replacements
|
||||
* for the Intel I/O instructions. As a result, the Guest sometimes fumbles
|
||||
* across one during the boot process as it probes for various things which are
|
||||
* usually attached to a PC.
|
||||
*
|
||||
* When the Guest uses one of these instructions, we get a trap (General
|
||||
* Protection Fault) and come here. We queue this to be sent out to the
|
||||
* Launcher to handle.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The eip contains the *virtual* address of the Guest's instruction:
|
||||
* we copy the instruction here so the Launcher doesn't have to walk
|
||||
* the page tables to decode it. We handle the case (eg. in a kernel
|
||||
* module) where the instruction is over two pages, and the pages are
|
||||
* virtually but not physically contiguous.
|
||||
*
|
||||
* The longest possible x86 instruction is 15 bytes, but we don't handle
|
||||
* anything that strange.
|
||||
*/
|
||||
static void copy_from_guest(struct lg_cpu *cpu,
|
||||
void *dst, unsigned long vaddr, size_t len)
|
||||
{
|
||||
size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
|
||||
unsigned long paddr;
|
||||
|
||||
BUG_ON(len > PAGE_SIZE);
|
||||
|
||||
/* If it goes over a page, copy in two parts. */
|
||||
if (len > to_page_end) {
|
||||
/* But make sure the next page is mapped! */
|
||||
if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
|
||||
copy_from_guest(cpu, dst + to_page_end,
|
||||
vaddr + to_page_end,
|
||||
len - to_page_end);
|
||||
else
|
||||
/* Otherwise fill with zeroes. */
|
||||
memset(dst + to_page_end, 0, len - to_page_end);
|
||||
len = to_page_end;
|
||||
}
|
||||
|
||||
/* This will kill the guest if it isn't mapped, but that
|
||||
* shouldn't happen. */
|
||||
__lgread(cpu, dst, guest_pa(cpu, vaddr), len);
|
||||
}
|
||||
|
||||
|
||||
static void setup_emulate_insn(struct lg_cpu *cpu)
|
||||
{
|
||||
cpu->pending.trap = 13;
|
||||
copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
|
||||
sizeof(cpu->pending.insn));
|
||||
}
|
||||
|
||||
static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
|
||||
{
|
||||
cpu->pending.trap = 14;
|
||||
cpu->pending.addr = iomem_addr;
|
||||
copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
|
||||
sizeof(cpu->pending.insn));
|
||||
}
|
||||
|
||||
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
|
||||
void lguest_arch_handle_trap(struct lg_cpu *cpu)
|
||||
{
|
||||
unsigned long iomem_addr;
|
||||
|
||||
switch (cpu->regs->trapnum) {
|
||||
case 13: /* We've intercepted a General Protection Fault. */
|
||||
/* Hand to Launcher to emulate those pesky IN and OUT insns */
|
||||
if (cpu->regs->errcode == 0) {
|
||||
setup_emulate_insn(cpu);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
case 14: /* We've intercepted a Page Fault. */
|
||||
/*
|
||||
* The Guest accessed a virtual address that wasn't mapped.
|
||||
* This happens a lot: we don't actually set up most of the page
|
||||
* tables for the Guest at all when we start: as it runs it asks
|
||||
* for more and more, and we set them up as required. In this
|
||||
* case, we don't even tell the Guest that the fault happened.
|
||||
*
|
||||
* The errcode tells whether this was a read or a write, and
|
||||
* whether kernel or userspace code.
|
||||
*/
|
||||
if (demand_page(cpu, cpu->arch.last_pagefault,
|
||||
cpu->regs->errcode, &iomem_addr))
|
||||
return;
|
||||
|
||||
/* Was this an access to memory mapped IO? */
|
||||
if (iomem_addr) {
|
||||
/* Tell Launcher, let it handle it. */
|
||||
setup_iomem_insn(cpu, iomem_addr);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* OK, it's really not there (or not OK): the Guest needs to
|
||||
* know. We write out the cr2 value so it knows where the
|
||||
* fault occurred.
|
||||
*
|
||||
* Note that if the Guest were really messed up, this could
|
||||
* happen before it's done the LHCALL_LGUEST_INIT hypercall, so
|
||||
* lg->lguest_data could be NULL
|
||||
*/
|
||||
if (cpu->lg->lguest_data &&
|
||||
put_user(cpu->arch.last_pagefault,
|
||||
&cpu->lg->lguest_data->cr2))
|
||||
kill_guest(cpu, "Writing cr2");
|
||||
break;
|
||||
case 7: /* We've intercepted a Device Not Available fault. */
|
||||
/* No special handling is needed here. */
|
||||
break;
|
||||
case 32 ... 255:
|
||||
/* This might be a syscall. */
|
||||
if (could_be_syscall(cpu->regs->trapnum))
|
||||
break;
|
||||
|
||||
/*
|
||||
* Other values mean a real interrupt occurred, in which case
|
||||
* the Host handler has already been run. We just do a
|
||||
* friendly check if another process should now be run, then
|
||||
* return to run the Guest again.
|
||||
*/
|
||||
cond_resched();
|
||||
return;
|
||||
case LGUEST_TRAP_ENTRY:
|
||||
/*
|
||||
* Our 'struct hcall_args' maps directly over our regs: we set
|
||||
* up the pointer now to indicate a hypercall is pending.
|
||||
*/
|
||||
cpu->hcall = (struct hcall_args *)cpu->regs;
|
||||
return;
|
||||
}
|
||||
|
||||
/* We didn't handle the trap, so it needs to go to the Guest. */
|
||||
if (!deliver_trap(cpu, cpu->regs->trapnum))
|
||||
/*
|
||||
* If the Guest doesn't have a handler (either it hasn't
|
||||
* registered any yet, or it's one of the faults we don't let
|
||||
* it handle), it dies with this cryptic error message.
|
||||
*/
|
||||
kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
|
||||
cpu->regs->trapnum, cpu->regs->eip,
|
||||
cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
|
||||
: cpu->regs->errcode);
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we can look at each of the routines this calls, in increasing order of
|
||||
* complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
|
||||
* deliver_trap() and demand_page(). After all those, we'll be ready to
|
||||
* examine the Switcher, and our philosophical understanding of the Host/Guest
|
||||
* duality will be complete.
|
||||
:*/
|
||||
static void adjust_pge(void *on)
|
||||
{
|
||||
if (on)
|
||||
cr4_set_bits(X86_CR4_PGE);
|
||||
else
|
||||
cr4_clear_bits(X86_CR4_PGE);
|
||||
}
|
||||
|
||||
/*H:020
|
||||
* Now the Switcher is mapped and every thing else is ready, we need to do
|
||||
* some more i386-specific initialization.
|
||||
*/
|
||||
void __init lguest_arch_host_init(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
/*
|
||||
* Most of the x86/switcher_32.S doesn't care that it's been moved; on
|
||||
* Intel, jumps are relative, and it doesn't access any references to
|
||||
* external code or data.
|
||||
*
|
||||
* The only exception is the interrupt handlers in switcher.S: their
|
||||
* addresses are placed in a table (default_idt_entries), so we need to
|
||||
* update the table with the new addresses. switcher_offset() is a
|
||||
* convenience function which returns the distance between the
|
||||
* compiled-in switcher code and the high-mapped copy we just made.
|
||||
*/
|
||||
for (i = 0; i < IDT_ENTRIES; i++)
|
||||
default_idt_entries[i] += switcher_offset();
|
||||
|
||||
/*
|
||||
* Set up the Switcher's per-cpu areas.
|
||||
*
|
||||
* Each CPU gets two pages of its own within the high-mapped region
|
||||
* (aka. "struct lguest_pages"). Much of this can be initialized now,
|
||||
* but some depends on what Guest we are running (which is set up in
|
||||
* copy_in_guest_info()).
|
||||
*/
|
||||
for_each_possible_cpu(i) {
|
||||
/* lguest_pages() returns this CPU's two pages. */
|
||||
struct lguest_pages *pages = lguest_pages(i);
|
||||
/* This is a convenience pointer to make the code neater. */
|
||||
struct lguest_ro_state *state = &pages->state;
|
||||
|
||||
/*
|
||||
* The Global Descriptor Table: the Host has a different one
|
||||
* for each CPU. We keep a descriptor for the GDT which says
|
||||
* where it is and how big it is (the size is actually the last
|
||||
* byte, not the size, hence the "-1").
|
||||
*/
|
||||
state->host_gdt_desc.size = GDT_SIZE-1;
|
||||
state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);
|
||||
|
||||
/*
|
||||
* All CPUs on the Host use the same Interrupt Descriptor
|
||||
* Table, so we just use store_idt(), which gets this CPU's IDT
|
||||
* descriptor.
|
||||
*/
|
||||
store_idt(&state->host_idt_desc);
|
||||
|
||||
/*
|
||||
* The descriptors for the Guest's GDT and IDT can be filled
|
||||
* out now, too. We copy the GDT & IDT into ->guest_gdt and
|
||||
* ->guest_idt before actually running the Guest.
|
||||
*/
|
||||
state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
|
||||
state->guest_idt_desc.address = (long)&state->guest_idt;
|
||||
state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
|
||||
state->guest_gdt_desc.address = (long)&state->guest_gdt;
|
||||
|
||||
/*
|
||||
* We know where we want the stack to be when the Guest enters
|
||||
* the Switcher: in pages->regs. The stack grows upwards, so
|
||||
* we start it at the end of that structure.
|
||||
*/
|
||||
state->guest_tss.sp0 = (long)(&pages->regs + 1);
|
||||
/*
|
||||
* And this is the GDT entry to use for the stack: we keep a
|
||||
* couple of special LGUEST entries.
|
||||
*/
|
||||
state->guest_tss.ss0 = LGUEST_DS;
|
||||
|
||||
/*
|
||||
* x86 can have a finegrained bitmap which indicates what I/O
|
||||
* ports the process can use. We set it to the end of our
|
||||
* structure, meaning "none".
|
||||
*/
|
||||
state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
|
||||
|
||||
/*
|
||||
* Some GDT entries are the same across all Guests, so we can
|
||||
* set them up now.
|
||||
*/
|
||||
setup_default_gdt_entries(state);
|
||||
/* Most IDT entries are the same for all Guests, too.*/
|
||||
setup_default_idt_entries(state, default_idt_entries);
|
||||
|
||||
/*
|
||||
* The Host needs to be able to use the LGUEST segments on this
|
||||
* CPU, too, so put them in the Host GDT.
|
||||
*/
|
||||
get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
|
||||
get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
|
||||
}
|
||||
|
||||
/*
|
||||
* In the Switcher, we want the %cs segment register to use the
|
||||
* LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
|
||||
* it will be undisturbed when we switch. To change %cs and jump we
|
||||
* need this structure to feed to Intel's "lcall" instruction.
|
||||
*/
|
||||
lguest_entry.offset = (long)switch_to_guest + switcher_offset();
|
||||
lguest_entry.segment = LGUEST_CS;
|
||||
|
||||
/*
|
||||
* Finally, we need to turn off "Page Global Enable". PGE is an
|
||||
* optimization where page table entries are specially marked to show
|
||||
* they never change. The Host kernel marks all the kernel pages this
|
||||
* way because it's always present, even when userspace is running.
|
||||
*
|
||||
* Lguest breaks this: unbeknownst to the rest of the Host kernel, we
|
||||
* switch to the Guest kernel. If you don't disable this on all CPUs,
|
||||
* you'll get really weird bugs that you'll chase for two days.
|
||||
*
|
||||
* I used to turn PGE off every time we switched to the Guest and back
|
||||
* on when we return, but that slowed the Switcher down noticibly.
|
||||
*/
|
||||
|
||||
/*
|
||||
* We don't need the complexity of CPUs coming and going while we're
|
||||
* doing this.
|
||||
*/
|
||||
get_online_cpus();
|
||||
if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */
|
||||
/* Remember that this was originally set (for cleanup). */
|
||||
cpu_had_pge = 1;
|
||||
/*
|
||||
* adjust_pge is a helper function which sets or unsets the PGE
|
||||
* bit on its CPU, depending on the argument (0 == unset).
|
||||
*/
|
||||
on_each_cpu(adjust_pge, (void *)0, 1);
|
||||
/* Turn off the feature in the global feature set. */
|
||||
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
|
||||
}
|
||||
put_online_cpus();
|
||||
}
|
||||
/*:*/
|
||||
|
||||
void __exit lguest_arch_host_fini(void)
|
||||
{
|
||||
/* If we had PGE before we started, turn it back on now. */
|
||||
get_online_cpus();
|
||||
if (cpu_had_pge) {
|
||||
set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
|
||||
/* adjust_pge's argument "1" means set PGE. */
|
||||
on_each_cpu(adjust_pge, (void *)1, 1);
|
||||
}
|
||||
put_online_cpus();
|
||||
}
|
||||
|
||||
|
||||
/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
|
||||
int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
|
||||
{
|
||||
switch (args->arg0) {
|
||||
case LHCALL_LOAD_GDT_ENTRY:
|
||||
load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3);
|
||||
break;
|
||||
case LHCALL_LOAD_IDT_ENTRY:
|
||||
load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
|
||||
break;
|
||||
case LHCALL_LOAD_TLS:
|
||||
guest_load_tls(cpu, args->arg1);
|
||||
break;
|
||||
default:
|
||||
/* Bad Guest. Bad! */
|
||||
return -EIO;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*H:126 i386-specific hypercall initialization: */
|
||||
int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
|
||||
{
|
||||
u32 tsc_speed;
|
||||
|
||||
/*
|
||||
* The pointer to the Guest's "struct lguest_data" is the only argument.
|
||||
* We check that address now.
|
||||
*/
|
||||
if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
|
||||
sizeof(*cpu->lg->lguest_data)))
|
||||
return -EFAULT;
|
||||
|
||||
/*
|
||||
* Having checked it, we simply set lg->lguest_data to point straight
|
||||
* into the Launcher's memory at the right place and then use
|
||||
* copy_to_user/from_user from now on, instead of lgread/write. I put
|
||||
* this in to show that I'm not immune to writing stupid
|
||||
* optimizations.
|
||||
*/
|
||||
cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
|
||||
|
||||
/*
|
||||
* We insist that the Time Stamp Counter exist and doesn't change with
|
||||
* cpu frequency. Some devious chip manufacturers decided that TSC
|
||||
* changes could be handled in software. I decided that time going
|
||||
* backwards might be good for benchmarks, but it's bad for users.
|
||||
*
|
||||
* We also insist that the TSC be stable: the kernel detects unreliable
|
||||
* TSCs for its own purposes, and we use that here.
|
||||
*/
|
||||
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
|
||||
tsc_speed = tsc_khz;
|
||||
else
|
||||
tsc_speed = 0;
|
||||
if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
|
||||
return -EFAULT;
|
||||
|
||||
/* The interrupt code might not like the system call vector. */
|
||||
if (!check_syscall_vector(cpu->lg))
|
||||
kill_guest(cpu, "bad syscall vector");
|
||||
|
||||
return 0;
|
||||
}
|
||||
/*:*/
|
||||
|
||||
/*L:030
|
||||
* Most of the Guest's registers are left alone: we used get_zeroed_page() to
|
||||
* allocate the structure, so they will be 0.
|
||||
*/
|
||||
void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
|
||||
{
|
||||
struct lguest_regs *regs = cpu->regs;
|
||||
|
||||
/*
|
||||
* There are four "segment" registers which the Guest needs to boot:
|
||||
* The "code segment" register (cs) refers to the kernel code segment
|
||||
* __KERNEL_CS, and the "data", "extra" and "stack" segment registers
|
||||
* refer to the kernel data segment __KERNEL_DS.
|
||||
*
|
||||
* The privilege level is packed into the lower bits. The Guest runs
|
||||
* at privilege level 1 (GUEST_PL).
|
||||
*/
|
||||
regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
|
||||
regs->cs = __KERNEL_CS|GUEST_PL;
|
||||
|
||||
/*
|
||||
* The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
|
||||
* is supposed to always be "1". Bit 9 (0x200) controls whether
|
||||
* interrupts are enabled. We always leave interrupts enabled while
|
||||
* running the Guest.
|
||||
*/
|
||||
regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
|
||||
|
||||
/*
|
||||
* The "Extended Instruction Pointer" register says where the Guest is
|
||||
* running.
|
||||
*/
|
||||
regs->eip = start;
|
||||
|
||||
/*
|
||||
* %esi points to our boot information, at physical address 0, so don't
|
||||
* touch it.
|
||||
*/
|
||||
|
||||
/* There are a couple of GDT entries the Guest expects at boot. */
|
||||
setup_guest_gdt(cpu);
|
||||
}
|
@ -1,388 +0,0 @@
|
||||
/*P:900
|
||||
* This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
|
||||
* both the Host and Guest to do the low-level Guest<->Host switch. It is as
|
||||
* simple as it can be made, but it's naturally very specific to x86.
|
||||
*
|
||||
* You have now completed Preparation. If this has whet your appetite; if you
|
||||
* are feeling invigorated and refreshed then the next, more challenging stage
|
||||
* can be found in "make Guest".
|
||||
:*/
|
||||
|
||||
/*M:012
|
||||
* Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
|
||||
* gain at least 1% more performance. Since neither LOC nor performance can be
|
||||
* measured beforehand, it generally means implementing a feature then deciding
|
||||
* if it's worth it. And once it's implemented, who can say no?
|
||||
*
|
||||
* This is why I haven't implemented this idea myself. I want to, but I
|
||||
* haven't. You could, though.
|
||||
*
|
||||
* The main place where lguest performance sucks is Guest page faulting. When
|
||||
* a Guest userspace process hits an unmapped page we switch back to the Host,
|
||||
* walk the page tables, find it's not mapped, switch back to the Guest page
|
||||
* fault handler, which calls a hypercall to set the page table entry, then
|
||||
* finally returns to userspace. That's two round-trips.
|
||||
*
|
||||
* If we had a small walker in the Switcher, we could quickly check the Guest
|
||||
* page table and if the page isn't mapped, immediately reflect the fault back
|
||||
* into the Guest. This means the Switcher would have to know the top of the
|
||||
* Guest page table and the page fault handler address.
|
||||
*
|
||||
* For simplicity, the Guest should only handle the case where the privilege
|
||||
* level of the fault is 3 and probably only not present or write faults. It
|
||||
* should also detect recursive faults, and hand the original fault to the
|
||||
* Host (which is actually really easy).
|
||||
*
|
||||
* Two questions remain. Would the performance gain outweigh the complexity?
|
||||
* And who would write the verse documenting it?
|
||||
:*/
|
||||
|
||||
/*M:011
|
||||
* Lguest64 handles NMI. This gave me NMI envy (until I looked at their
|
||||
* code). It's worth doing though, since it would let us use oprofile in the
|
||||
* Host when a Guest is running.
|
||||
:*/
|
||||
|
||||
/*S:100
|
||||
* Welcome to the Switcher itself!
|
||||
*
|
||||
* This file contains the low-level code which changes the CPU to run the Guest
|
||||
* code, and returns to the Host when something happens. Understand this, and
|
||||
* you understand the heart of our journey.
|
||||
*
|
||||
* Because this is in assembler rather than C, our tale switches from prose to
|
||||
* verse. First I tried limericks:
|
||||
*
|
||||
* There once was an eax reg,
|
||||
* To which our pointer was fed,
|
||||
* It needed an add,
|
||||
* Which asm-offsets.h had
|
||||
* But this limerick is hurting my head.
|
||||
*
|
||||
* Next I tried haikus, but fitting the required reference to the seasons in
|
||||
* every stanza was quickly becoming tiresome:
|
||||
*
|
||||
* The %eax reg
|
||||
* Holds "struct lguest_pages" now:
|
||||
* Cherry blossoms fall.
|
||||
*
|
||||
* Then I started with Heroic Verse, but the rhyming requirement leeched away
|
||||
* the content density and led to some uniquely awful oblique rhymes:
|
||||
*
|
||||
* These constants are coming from struct offsets
|
||||
* For use within the asm switcher text.
|
||||
*
|
||||
* Finally, I settled for something between heroic hexameter, and normal prose
|
||||
* with inappropriate linebreaks. Anyway, it aint no Shakespeare.
|
||||
*/
|
||||
|
||||
// Not all kernel headers work from assembler
|
||||
// But these ones are needed: the ENTRY() define
|
||||
// And constants extracted from struct offsets
|
||||
// To avoid magic numbers and breakage:
|
||||
// Should they change the compiler can't save us
|
||||
// Down here in the depths of assembler code.
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/segment.h>
|
||||
#include <asm/lguest.h>
|
||||
|
||||
// We mark the start of the code to copy
|
||||
// It's placed in .text tho it's never run here
|
||||
// You'll see the trick macro at the end
|
||||
// Which interleaves data and text to effect.
|
||||
.text
|
||||
ENTRY(start_switcher_text)
|
||||
|
||||
// When we reach switch_to_guest we have just left
|
||||
// The safe and comforting shores of C code
|
||||
// %eax has the "struct lguest_pages" to use
|
||||
// Where we save state and still see it from the Guest
|
||||
// And %ebx holds the Guest shadow pagetable:
|
||||
// Once set we have truly left Host behind.
|
||||
ENTRY(switch_to_guest)
|
||||
// We told gcc all its regs could fade,
|
||||
// Clobbered by our journey into the Guest
|
||||
// We could have saved them, if we tried
|
||||
// But time is our master and cycles count.
|
||||
|
||||
// Segment registers must be saved for the Host
|
||||
// We push them on the Host stack for later
|
||||
pushl %es
|
||||
pushl %ds
|
||||
pushl %gs
|
||||
pushl %fs
|
||||
// But the compiler is fickle, and heeds
|
||||
// No warning of %ebp clobbers
|
||||
// When frame pointers are used. That register
|
||||
// Must be saved and restored or chaos strikes.
|
||||
pushl %ebp
|
||||
// The Host's stack is done, now save it away
|
||||
// In our "struct lguest_pages" at offset
|
||||
// Distilled into asm-offsets.h
|
||||
movl %esp, LGUEST_PAGES_host_sp(%eax)
|
||||
|
||||
// All saved and there's now five steps before us:
|
||||
// Stack, GDT, IDT, TSS
|
||||
// Then last of all the page tables are flipped.
|
||||
|
||||
// Yet beware that our stack pointer must be
|
||||
// Always valid lest an NMI hits
|
||||
// %edx does the duty here as we juggle
|
||||
// %eax is lguest_pages: our stack lies within.
|
||||
movl %eax, %edx
|
||||
addl $LGUEST_PAGES_regs, %edx
|
||||
movl %edx, %esp
|
||||
|
||||
// The Guest's GDT we so carefully
|
||||
// Placed in the "struct lguest_pages" before
|
||||
lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
|
||||
|
||||
// The Guest's IDT we did partially
|
||||
// Copy to "struct lguest_pages" as well.
|
||||
lidt LGUEST_PAGES_guest_idt_desc(%eax)
|
||||
|
||||
// The TSS entry which controls traps
|
||||
// Must be loaded up with "ltr" now:
|
||||
// The GDT entry that TSS uses
|
||||
// Changes type when we load it: damn Intel!
|
||||
// For after we switch over our page tables
|
||||
// That entry will be read-only: we'd crash.
|
||||
movl $(GDT_ENTRY_TSS*8), %edx
|
||||
ltr %dx
|
||||
|
||||
// Look back now, before we take this last step!
|
||||
// The Host's TSS entry was also marked used;
|
||||
// Let's clear it again for our return.
|
||||
// The GDT descriptor of the Host
|
||||
// Points to the table after two "size" bytes
|
||||
movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
|
||||
// Clear "used" from type field (byte 5, bit 2)
|
||||
andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
|
||||
|
||||
// Once our page table's switched, the Guest is live!
|
||||
// The Host fades as we run this final step.
|
||||
// Our "struct lguest_pages" is now read-only.
|
||||
movl %ebx, %cr3
|
||||
|
||||
// The page table change did one tricky thing:
|
||||
// The Guest's register page has been mapped
|
||||
// Writable under our %esp (stack) --
|
||||
// We can simply pop off all Guest regs.
|
||||
popl %eax
|
||||
popl %ebx
|
||||
popl %ecx
|
||||
popl %edx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
popl %gs
|
||||
popl %fs
|
||||
popl %ds
|
||||
popl %es
|
||||
|
||||
// Near the base of the stack lurk two strange fields
|
||||
// Which we fill as we exit the Guest
|
||||
// These are the trap number and its error
|
||||
// We can simply step past them on our way.
|
||||
addl $8, %esp
|
||||
|
||||
// The last five stack slots hold return address
|
||||
// And everything needed to switch privilege
|
||||
// From Switcher's level 0 to Guest's 1,
|
||||
// And the stack where the Guest had last left it.
|
||||
// Interrupts are turned back on: we are Guest.
|
||||
iret
|
||||
|
||||
// We tread two paths to switch back to the Host
|
||||
// Yet both must save Guest state and restore Host
|
||||
// So we put the routine in a macro.
|
||||
#define SWITCH_TO_HOST \
|
||||
/* We save the Guest state: all registers first \
|
||||
* Laid out just as "struct lguest_regs" defines */ \
|
||||
pushl %es; \
|
||||
pushl %ds; \
|
||||
pushl %fs; \
|
||||
pushl %gs; \
|
||||
pushl %ebp; \
|
||||
pushl %edi; \
|
||||
pushl %esi; \
|
||||
pushl %edx; \
|
||||
pushl %ecx; \
|
||||
pushl %ebx; \
|
||||
pushl %eax; \
|
||||
/* Our stack and our code are using segments \
|
||||
* Set in the TSS and IDT \
|
||||
* Yet if we were to touch data we'd use \
|
||||
* Whatever data segment the Guest had. \
|
||||
* Load the lguest ds segment for now. */ \
|
||||
movl $(LGUEST_DS), %eax; \
|
||||
movl %eax, %ds; \
|
||||
/* So where are we? Which CPU, which struct? \
|
||||
* The stack is our clue: our TSS starts \
|
||||
* It at the end of "struct lguest_pages". \
|
||||
* Or we may have stumbled while restoring \
|
||||
* Our Guest segment regs while in switch_to_guest, \
|
||||
* The fault pushed atop that part-unwound stack. \
|
||||
* If we round the stack down to the page start \
|
||||
* We're at the start of "struct lguest_pages". */ \
|
||||
movl %esp, %eax; \
|
||||
andl $(~(1 << PAGE_SHIFT - 1)), %eax; \
|
||||
/* Save our trap number: the switch will obscure it \
|
||||
* (In the Host the Guest regs are not mapped here) \
|
||||
* %ebx holds it safe for deliver_to_host */ \
|
||||
movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
|
||||
/* The Host GDT, IDT and stack! \
|
||||
* All these lie safely hidden from the Guest: \
|
||||
* We must return to the Host page tables \
|
||||
* (Hence that was saved in struct lguest_pages) */ \
|
||||
movl LGUEST_PAGES_host_cr3(%eax), %edx; \
|
||||
movl %edx, %cr3; \
|
||||
/* As before, when we looked back at the Host \
|
||||
* As we left and marked TSS unused \
|
||||
* So must we now for the Guest left behind. */ \
|
||||
andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
|
||||
/* Switch to Host's GDT, IDT. */ \
|
||||
lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
|
||||
lidt LGUEST_PAGES_host_idt_desc(%eax); \
|
||||
/* Restore the Host's stack where its saved regs lie */ \
|
||||
movl LGUEST_PAGES_host_sp(%eax), %esp; \
|
||||
/* Last the TSS: our Host is returned */ \
|
||||
movl $(GDT_ENTRY_TSS*8), %edx; \
|
||||
ltr %dx; \
|
||||
/* Restore now the regs saved right at the first. */ \
|
||||
popl %ebp; \
|
||||
popl %fs; \
|
||||
popl %gs; \
|
||||
popl %ds; \
|
||||
popl %es
|
||||
|
||||
// The first path is trod when the Guest has trapped:
|
||||
// (Which trap it was has been pushed on the stack).
|
||||
// We need only switch back, and the Host will decode
|
||||
// Why we came home, and what needs to be done.
|
||||
return_to_host:
|
||||
SWITCH_TO_HOST
|
||||
iret
|
||||
|
||||
// We are lead to the second path like so:
|
||||
// An interrupt, with some cause external
|
||||
// Has ajerked us rudely from the Guest's code
|
||||
// Again we must return home to the Host
|
||||
deliver_to_host:
|
||||
SWITCH_TO_HOST
|
||||
// But now we must go home via that place
|
||||
// Where that interrupt was supposed to go
|
||||
// Had we not been ensconced, running the Guest.
|
||||
// Here we see the trickness of run_guest_once():
|
||||
// The Host stack is formed like an interrupt
|
||||
// With EIP, CS and EFLAGS layered.
|
||||
// Interrupt handlers end with "iret"
|
||||
// And that will take us home at long long last.
|
||||
|
||||
// But first we must find the handler to call!
|
||||
// The IDT descriptor for the Host
|
||||
// Has two bytes for size, and four for address:
|
||||
// %edx will hold it for us for now.
|
||||
movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
|
||||
// We now know the table address we need,
|
||||
// And saved the trap's number inside %ebx.
|
||||
// Yet the pointer to the handler is smeared
|
||||
// Across the bits of the table entry.
|
||||
// What oracle can tell us how to extract
|
||||
// From such a convoluted encoding?
|
||||
// I consulted gcc, and it gave
|
||||
// These instructions, which I gladly credit:
|
||||
leal (%edx,%ebx,8), %eax
|
||||
movzwl (%eax),%edx
|
||||
movl 4(%eax), %eax
|
||||
xorw %ax, %ax
|
||||
orl %eax, %edx
|
||||
// Now the address of the handler's in %edx
|
||||
// We call it now: its "iret" drops us home.
|
||||
jmp *%edx
|
||||
|
||||
// Every interrupt can come to us here
|
||||
// But we must truly tell each apart.
|
||||
// They number two hundred and fifty six
|
||||
// And each must land in a different spot,
|
||||
// Push its number on stack, and join the stream.
|
||||
|
||||
// And worse, a mere six of the traps stand apart
|
||||
// And push on their stack an addition:
|
||||
// An error number, thirty two bits long
|
||||
// So we punish the other two fifty
|
||||
// And make them push a zero so they match.
|
||||
|
||||
// Yet two fifty six entries is long
|
||||
// And all will look most the same as the last
|
||||
// So we create a macro which can make
|
||||
// As many entries as we need to fill.
|
||||
|
||||
// Note the change to .data then .text:
|
||||
// We plant the address of each entry
|
||||
// Into a (data) table for the Host
|
||||
// To know where each Guest interrupt should go.
|
||||
.macro IRQ_STUB N TARGET
|
||||
.data; .long 1f; .text; 1:
|
||||
// Trap eight, ten through fourteen and seventeen
|
||||
// Supply an error number. Else zero.
|
||||
.if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
|
||||
pushl $0
|
||||
.endif
|
||||
pushl $\N
|
||||
jmp \TARGET
|
||||
ALIGN
|
||||
.endm
|
||||
|
||||
// This macro creates numerous entries
|
||||
// Using GAS macros which out-power C's.
|
||||
.macro IRQ_STUBS FIRST LAST TARGET
|
||||
irq=\FIRST
|
||||
.rept \LAST-\FIRST+1
|
||||
IRQ_STUB irq \TARGET
|
||||
irq=irq+1
|
||||
.endr
|
||||
.endm
|
||||
|
||||
// Here's the marker for our pointer table
|
||||
// Laid in the data section just before
|
||||
// Each macro places the address of code
|
||||
// Forming an array: each one points to text
|
||||
// Which handles interrupt in its turn.
|
||||
.data
|
||||
.global default_idt_entries
|
||||
default_idt_entries:
|
||||
.text
|
||||
// The first two traps go straight back to the Host
|
||||
IRQ_STUBS 0 1 return_to_host
|
||||
// We'll say nothing, yet, about NMI
|
||||
IRQ_STUB 2 handle_nmi
|
||||
// Other traps also return to the Host
|
||||
IRQ_STUBS 3 31 return_to_host
|
||||
// All interrupts go via their handlers
|
||||
IRQ_STUBS 32 127 deliver_to_host
|
||||
// 'Cept system calls coming from userspace
|
||||
// Are to go to the Guest, never the Host.
|
||||
IRQ_STUB 128 return_to_host
|
||||
IRQ_STUBS 129 255 deliver_to_host
|
||||
|
||||
// The NMI, what a fabulous beast
|
||||
// Which swoops in and stops us no matter that
|
||||
// We're suspended between heaven and hell,
|
||||
// (Or more likely between the Host and Guest)
|
||||
// When in it comes! We are dazed and confused
|
||||
// So we do the simplest thing which one can.
|
||||
// Though we've pushed the trap number and zero
|
||||
// We discard them, return, and hope we live.
|
||||
handle_nmi:
|
||||
addl $8, %esp
|
||||
iret
|
||||
|
||||
// We are done; all that's left is Mastery
|
||||
// And "make Mastery" is a journey long
|
||||
// Designed to make your fingers itch to code.
|
||||
|
||||
// Here ends the text, the file and poem.
|
||||
ENTRY(end_switcher_text)
|
@ -333,7 +333,7 @@ config VIRTIO_NET
|
||||
depends on VIRTIO
|
||||
---help---
|
||||
This is the virtual network driver for virtio. It can be used with
|
||||
lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
|
||||
QEMU based VMMs (like KVM or Xen). Say Y or M.
|
||||
|
||||
config NLMON
|
||||
tristate "Virtual netlink monitoring device"
|
||||
|
@ -4,7 +4,7 @@ config HVC_DRIVER
|
||||
bool
|
||||
help
|
||||
Generic "hypervisor virtual console" infrastructure for various
|
||||
hypervisors (pSeries, iSeries, Xen, lguest).
|
||||
hypervisors (pSeries, iSeries, Xen).
|
||||
It will automatically be selected if one of the back-end console drivers
|
||||
is selected.
|
||||
|
||||
|
@ -2,8 +2,8 @@ config VIRTIO
|
||||
tristate
|
||||
---help---
|
||||
This option is selected by any driver which implements the virtio
|
||||
bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
|
||||
CONFIG_RPMSG or CONFIG_S390_GUEST.
|
||||
bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
|
||||
or CONFIG_S390_GUEST.
|
||||
|
||||
menu "Virtio drivers"
|
||||
|
||||
|
@ -915,6 +915,9 @@ extern void ioport_unmap(void __iomem *p);
|
||||
#endif /* CONFIG_GENERIC_IOMAP */
|
||||
#endif /* CONFIG_HAS_IOPORT_MAP */
|
||||
|
||||
/*
|
||||
* Convert a virtual cached pointer to an uncached pointer
|
||||
*/
|
||||
#ifndef xlate_dev_kmem_ptr
|
||||
#define xlate_dev_kmem_ptr xlate_dev_kmem_ptr
|
||||
static inline void *xlate_dev_kmem_ptr(void *addr)
|
||||
@ -954,6 +957,14 @@ static inline void *bus_to_virt(unsigned long address)
|
||||
|
||||
#ifndef memset_io
|
||||
#define memset_io memset_io
|
||||
/**
|
||||
* memset_io Set a range of I/O memory to a constant value
|
||||
* @addr: The beginning of the I/O-memory range to set
|
||||
* @val: The value to set the memory to
|
||||
* @count: The number of bytes to set
|
||||
*
|
||||
* Set a range of I/O memory to a given value.
|
||||
*/
|
||||
static inline void memset_io(volatile void __iomem *addr, int value,
|
||||
size_t size)
|
||||
{
|
||||
@ -963,6 +974,14 @@ static inline void memset_io(volatile void __iomem *addr, int value,
|
||||
|
||||
#ifndef memcpy_fromio
|
||||
#define memcpy_fromio memcpy_fromio
|
||||
/**
|
||||
* memcpy_fromio Copy a block of data from I/O memory
|
||||
* @dst: The (RAM) destination for the copy
|
||||
* @src: The (I/O memory) source for the data
|
||||
* @count: The number of bytes to copy
|
||||
*
|
||||
* Copy a block of data from I/O memory.
|
||||
*/
|
||||
static inline void memcpy_fromio(void *buffer,
|
||||
const volatile void __iomem *addr,
|
||||
size_t size)
|
||||
@ -973,6 +992,14 @@ static inline void memcpy_fromio(void *buffer,
|
||||
|
||||
#ifndef memcpy_toio
|
||||
#define memcpy_toio memcpy_toio
|
||||
/**
|
||||
* memcpy_toio Copy a block of data into I/O memory
|
||||
* @dst: The (I/O memory) destination for the copy
|
||||
* @src: The (RAM) source for the data
|
||||
* @count: The number of bytes to copy
|
||||
*
|
||||
* Copy a block of data to I/O memory.
|
||||
*/
|
||||
static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer,
|
||||
size_t size)
|
||||
{
|
||||
|
@ -680,6 +680,31 @@
|
||||
#define BUG_TABLE
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_ORC_UNWINDER
|
||||
#define ORC_UNWIND_TABLE \
|
||||
. = ALIGN(4); \
|
||||
.orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \
|
||||
VMLINUX_SYMBOL(__start_orc_unwind_ip) = .; \
|
||||
KEEP(*(.orc_unwind_ip)) \
|
||||
VMLINUX_SYMBOL(__stop_orc_unwind_ip) = .; \
|
||||
} \
|
||||
. = ALIGN(6); \
|
||||
.orc_unwind : AT(ADDR(.orc_unwind) - LOAD_OFFSET) { \
|
||||
VMLINUX_SYMBOL(__start_orc_unwind) = .; \
|
||||
KEEP(*(.orc_unwind)) \
|
||||
VMLINUX_SYMBOL(__stop_orc_unwind) = .; \
|
||||
} \
|
||||
. = ALIGN(4); \
|
||||
.orc_lookup : AT(ADDR(.orc_lookup) - LOAD_OFFSET) { \
|
||||
VMLINUX_SYMBOL(orc_lookup) = .; \
|
||||
. += (((SIZEOF(.text) + LOOKUP_BLOCK_SIZE - 1) / \
|
||||
LOOKUP_BLOCK_SIZE) + 1) * 4; \
|
||||
VMLINUX_SYMBOL(orc_lookup_end) = .; \
|
||||
}
|
||||
#else
|
||||
#define ORC_UNWIND_TABLE
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PM_TRACE
|
||||
#define TRACEDATA \
|
||||
. = ALIGN(4); \
|
||||
@ -866,7 +891,7 @@
|
||||
DATA_DATA \
|
||||
CONSTRUCTORS \
|
||||
} \
|
||||
BUG_TABLE
|
||||
BUG_TABLE \
|
||||
|
||||
#define INIT_TEXT_SECTION(inittext_align) \
|
||||
. = ALIGN(inittext_align); \
|
||||
|
@ -201,17 +201,6 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_STACK_VALIDATION
|
||||
#define annotate_unreachable() ({ \
|
||||
asm("%c0:\t\n" \
|
||||
".pushsection .discard.unreachable\t\n" \
|
||||
".long %c0b - .\t\n" \
|
||||
".popsection\t\n" : : "i" (__LINE__)); \
|
||||
})
|
||||
#else
|
||||
#define annotate_unreachable()
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Mark a position in code as unreachable. This can be used to
|
||||
* suppress control flow warnings after asm blocks that transfer
|
||||
|
@ -185,8 +185,34 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
|
||||
#endif
|
||||
|
||||
/* Unreachable code */
|
||||
#ifdef CONFIG_STACK_VALIDATION
|
||||
#define annotate_reachable() ({ \
|
||||
asm("%c0:\n\t" \
|
||||
".pushsection .discard.reachable\n\t" \
|
||||
".long %c0b - .\n\t" \
|
||||
".popsection\n\t" : : "i" (__LINE__)); \
|
||||
})
|
||||
#define annotate_unreachable() ({ \
|
||||
asm("%c0:\n\t" \
|
||||
".pushsection .discard.unreachable\n\t" \
|
||||
".long %c0b - .\n\t" \
|
||||
".popsection\n\t" : : "i" (__LINE__)); \
|
||||
})
|
||||
#define ASM_UNREACHABLE \
|
||||
"999:\n\t" \
|
||||
".pushsection .discard.unreachable\n\t" \
|
||||
".long 999b - .\n\t" \
|
||||
".popsection\n\t"
|
||||
#else
|
||||
#define annotate_reachable()
|
||||
#define annotate_unreachable()
|
||||
#endif
|
||||
|
||||
#ifndef ASM_UNREACHABLE
|
||||
# define ASM_UNREACHABLE
|
||||
#endif
|
||||
#ifndef unreachable
|
||||
# define unreachable() do { } while (1)
|
||||
# define unreachable() do { annotate_reachable(); do { } while (1); } while (0)
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
@ -1,73 +0,0 @@
|
||||
/*
|
||||
* Things the lguest guest needs to know. Note: like all lguest interfaces,
|
||||
* this is subject to wild and random change between versions.
|
||||
*/
|
||||
#ifndef _LINUX_LGUEST_H
|
||||
#define _LINUX_LGUEST_H
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
#include <linux/time.h>
|
||||
#include <asm/irq.h>
|
||||
#include <asm/lguest_hcall.h>
|
||||
|
||||
#define LG_CLOCK_MIN_DELTA 100UL
|
||||
#define LG_CLOCK_MAX_DELTA ULONG_MAX
|
||||
|
||||
/*G:031
|
||||
* The second method of communicating with the Host is to via "struct
|
||||
* lguest_data". Once the Guest's initialization hypercall tells the Host where
|
||||
* this is, the Guest and Host both publish information in it.
|
||||
:*/
|
||||
struct lguest_data {
|
||||
/*
|
||||
* 512 == enabled (same as eflags in normal hardware). The Guest
|
||||
* changes interrupts so often that a hypercall is too slow.
|
||||
*/
|
||||
unsigned int irq_enabled;
|
||||
/* Fine-grained interrupt disabling by the Guest */
|
||||
DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS);
|
||||
|
||||
/*
|
||||
* The Host writes the virtual address of the last page fault here,
|
||||
* which saves the Guest a hypercall. CR2 is the native register where
|
||||
* this address would normally be found.
|
||||
*/
|
||||
unsigned long cr2;
|
||||
|
||||
/* Wallclock time set by the Host. */
|
||||
struct timespec time;
|
||||
|
||||
/*
|
||||
* Interrupt pending set by the Host. The Guest should do a hypercall
|
||||
* if it re-enables interrupts and sees this set (to X86_EFLAGS_IF).
|
||||
*/
|
||||
int irq_pending;
|
||||
|
||||
/*
|
||||
* Async hypercall ring. Instead of directly making hypercalls, we can
|
||||
* place them in here for processing the next time the Host wants.
|
||||
* This batching can be quite efficient.
|
||||
*/
|
||||
|
||||
/* 0xFF == done (set by Host), 0 == pending (set by Guest). */
|
||||
u8 hcall_status[LHCALL_RING_SIZE];
|
||||
/* The actual registers for the hypercalls. */
|
||||
struct hcall_args hcalls[LHCALL_RING_SIZE];
|
||||
|
||||
/* Fields initialized by the Host at boot: */
|
||||
/* Memory not to try to access */
|
||||
unsigned long reserve_mem;
|
||||
/* KHz for the TSC clock. */
|
||||
u32 tsc_khz;
|
||||
|
||||
/* Fields initialized by the Guest at boot: */
|
||||
/* Instruction to suppress interrupts even if enabled */
|
||||
unsigned long noirq_iret;
|
||||
/* Address above which page tables are all identical. */
|
||||
unsigned long kernel_address;
|
||||
/* The vector to try to use for system calls (0x40 or 0x80). */
|
||||
unsigned int syscall_vec;
|
||||
};
|
||||
extern struct lguest_data lguest_data;
|
||||
#endif /* __ASSEMBLY__ */
|
||||
#endif /* _LINUX_LGUEST_H */
|
@ -1,44 +0,0 @@
|
||||
#ifndef _LINUX_LGUEST_LAUNCHER
|
||||
#define _LINUX_LGUEST_LAUNCHER
|
||||
/* Everything the "lguest" userspace program needs to know. */
|
||||
#include <linux/types.h>
|
||||
|
||||
/*D:010
|
||||
* Drivers
|
||||
*
|
||||
* The Guest needs devices to do anything useful. Since we don't let it touch
|
||||
* real devices (think of the damage it could do!) we provide virtual devices.
|
||||
* We emulate a PCI bus with virtio devices on it; we used to have our own
|
||||
* lguest bus which was far simpler, but this tests the virtio 1.0 standard.
|
||||
*
|
||||
* Virtio devices are also used by kvm, so we can simply reuse their optimized
|
||||
* device drivers. And one day when everyone uses virtio, my plan will be
|
||||
* complete. Bwahahahah!
|
||||
*/
|
||||
|
||||
/* Write command first word is a request. */
|
||||
enum lguest_req
|
||||
{
|
||||
LHREQ_INITIALIZE, /* + base, pfnlimit, start */
|
||||
LHREQ_GETDMA, /* No longer used */
|
||||
LHREQ_IRQ, /* + irq */
|
||||
LHREQ_BREAK, /* No longer used */
|
||||
LHREQ_EVENTFD, /* No longer used. */
|
||||
LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */
|
||||
LHREQ_SETREG, /* + offset within struct pt_regs, value. */
|
||||
LHREQ_TRAP, /* + trap number to deliver to guest. */
|
||||
};
|
||||
|
||||
/*
|
||||
* This is what read() of the lguest fd populates. trap ==
|
||||
* LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the
|
||||
* argument), 14 for a page fault in the MMIO region (addr is
|
||||
* the trap address, insn is the instruction), or 13 for a GPF
|
||||
* (insn is the instruction).
|
||||
*/
|
||||
struct lguest_pending {
|
||||
__u8 trap;
|
||||
__u8 insn[7];
|
||||
__u32 addr;
|
||||
};
|
||||
#endif /* _LINUX_LGUEST_LAUNCHER */
|
@ -1,7 +1,7 @@
|
||||
#ifndef _UAPI_LINUX_VIRTIO_RING_H
|
||||
#define _UAPI_LINUX_VIRTIO_RING_H
|
||||
/* An interface for efficient virtio implementation, currently for use by KVM
|
||||
* and lguest, but hopefully others soon. Do NOT change this since it will
|
||||
/* An interface for efficient virtio implementation, currently for use by KVM,
|
||||
* but hopefully others soon. Do NOT change this since it will
|
||||
* break existing servers and clients.
|
||||
*
|
||||
* This header is BSD licensed so anyone can use the definitions to implement
|
||||
|
@ -374,6 +374,9 @@ config STACK_VALIDATION
|
||||
pointers (if CONFIG_FRAME_POINTER is enabled). This helps ensure
|
||||
that runtime stack traces are more reliable.
|
||||
|
||||
This is also a prerequisite for generation of ORC unwind data, which
|
||||
is needed for CONFIG_ORC_UNWINDER.
|
||||
|
||||
For more information, see
|
||||
tools/objtool/Documentation/stack-validation.txt.
|
||||
|
||||
@ -1128,7 +1131,7 @@ config LOCKDEP
|
||||
bool
|
||||
depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
|
||||
select STACKTRACE
|
||||
select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE
|
||||
select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE && !X86
|
||||
select KALLSYMS
|
||||
select KALLSYMS_ALL
|
||||
|
||||
@ -1547,7 +1550,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER
|
||||
depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
|
||||
depends on !X86_64
|
||||
select STACKTRACE
|
||||
select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE
|
||||
select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE && !X86
|
||||
help
|
||||
Provide stacktrace filter for fault-injection capabilities
|
||||
|
||||
@ -1556,7 +1559,7 @@ config LATENCYTOP
|
||||
depends on DEBUG_KERNEL
|
||||
depends on STACKTRACE_SUPPORT
|
||||
depends on PROC_FS
|
||||
select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC
|
||||
select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86
|
||||
select KALLSYMS
|
||||
select KALLSYMS_ALL
|
||||
select STACKTRACE
|
||||
|
@ -258,10 +258,14 @@ ifneq ($(SKIP_STACK_VALIDATION),1)
|
||||
|
||||
__objtool_obj := $(objtree)/tools/objtool/objtool
|
||||
|
||||
objtool_args = check
|
||||
objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check)
|
||||
|
||||
ifndef CONFIG_FRAME_POINTER
|
||||
objtool_args += --no-fp
|
||||
endif
|
||||
ifdef CONFIG_GCOV_KERNEL
|
||||
objtool_args += --no-unreachable
|
||||
endif
|
||||
|
||||
# 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory
|
||||
# 'OBJECT_FILES_NON_STANDARD_foo.o := 'y': skip objtool checking for a file
|
||||
@ -276,6 +280,11 @@ objtool_obj = $(if $(patsubst y%,, \
|
||||
endif # SKIP_STACK_VALIDATION
|
||||
endif # CONFIG_STACK_VALIDATION
|
||||
|
||||
# Rebuild all objects when objtool changes, or is enabled/disabled.
|
||||
objtool_dep = $(objtool_obj) \
|
||||
$(wildcard include/config/orc/unwinder.h \
|
||||
include/config/stack/validation.h)
|
||||
|
||||
define rule_cc_o_c
|
||||
$(call echo-cmd,checksrc) $(cmd_checksrc) \
|
||||
$(call cmd_and_fixdep,cc_o_c) \
|
||||
@ -298,13 +307,13 @@ cmd_undef_syms = echo
|
||||
endif
|
||||
|
||||
# Built-in and composite module parts
|
||||
$(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE
|
||||
$(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE
|
||||
$(call cmd,force_checksrc)
|
||||
$(call if_changed_rule,cc_o_c)
|
||||
|
||||
# Single-part modules are special since we need to mark them in $(MODVERDIR)
|
||||
|
||||
$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE
|
||||
$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE
|
||||
$(call cmd,force_checksrc)
|
||||
$(call if_changed_rule,cc_o_c)
|
||||
@{ echo $(@:.o=.ko); echo $@; \
|
||||
@ -399,7 +408,7 @@ cmd_modversions_S = \
|
||||
endif
|
||||
endif
|
||||
|
||||
$(obj)/%.o: $(src)/%.S $(objtool_obj) FORCE
|
||||
$(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE
|
||||
$(call if_changed_rule,as_o_S)
|
||||
|
||||
targets += $(real-objs-y) $(real-objs-m) $(lib-y)
|
||||
|
@ -18,7 +18,6 @@ help:
|
||||
@echo ' iio - IIO tools'
|
||||
@echo ' kvm_stat - top-like utility for displaying kvm statistics'
|
||||
@echo ' leds - LEDs tools'
|
||||
@echo ' lguest - a minimal 32-bit x86 hypervisor'
|
||||
@echo ' liblockdep - user-space wrapper for kernel locking-validator'
|
||||
@echo ' net - misc networking tools'
|
||||
@echo ' perf - Linux performance measurement and analysis tool'
|
||||
@ -90,7 +89,7 @@ freefall: FORCE
|
||||
kvm_stat: FORCE
|
||||
$(call descend,kvm/$@)
|
||||
|
||||
all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \
|
||||
all: acpi cgroup cpupower gpio hv firewire liblockdep \
|
||||
perf selftests turbostat usb \
|
||||
virtio vm net x86_energy_perf_policy \
|
||||
tmon freefall objtool kvm_stat
|
||||
@ -101,7 +100,7 @@ acpi_install:
|
||||
cpupower_install:
|
||||
$(call descend,power/$(@:_install=),install)
|
||||
|
||||
cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install:
|
||||
cgroup_install firewire_install gpio_install hv_install perf_install usb_install virtio_install vm_install net_install objtool_install:
|
||||
$(call descend,$(@:_install=),install)
|
||||
|
||||
liblockdep_install:
|
||||
@ -123,7 +122,7 @@ kvm_stat_install:
|
||||
$(call descend,kvm/$(@:_install=),install)
|
||||
|
||||
install: acpi_install cgroup_install cpupower_install gpio_install \
|
||||
hv_install firewire_install lguest_install liblockdep_install \
|
||||
hv_install firewire_install liblockdep_install \
|
||||
perf_install selftests_install turbostat_install usb_install \
|
||||
virtio_install vm_install net_install x86_energy_perf_policy_install \
|
||||
tmon_install freefall_install objtool_install kvm_stat_install
|
||||
@ -134,7 +133,7 @@ acpi_clean:
|
||||
cpupower_clean:
|
||||
$(call descend,power/cpupower,clean)
|
||||
|
||||
cgroup_clean hv_clean firewire_clean lguest_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
|
||||
cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
|
||||
$(call descend,$(@:_clean=),clean)
|
||||
|
||||
liblockdep_clean:
|
||||
@ -168,7 +167,7 @@ freefall_clean:
|
||||
build_clean:
|
||||
$(call descend,build,clean)
|
||||
|
||||
clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \
|
||||
clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \
|
||||
perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \
|
||||
vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
|
||||
freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \
|
||||
|
2
tools/lguest/.gitignore
vendored
2
tools/lguest/.gitignore
vendored
@ -1,2 +0,0 @@
|
||||
lguest
|
||||
include
|
@ -1,14 +0,0 @@
|
||||
# This creates the demonstration utility "lguest" which runs a Linux guest.
|
||||
CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude
|
||||
|
||||
all: lguest
|
||||
|
||||
include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h
|
||||
mkdir -p include/linux 2>&1 || true
|
||||
ln -sf ../../../../include/uapi/linux/virtio_types.h $@
|
||||
|
||||
lguest: include/linux/virtio_types.h
|
||||
|
||||
clean:
|
||||
rm -f lguest
|
||||
rm -rf include
|
@ -1,58 +0,0 @@
|
||||
#! /bin/sh
|
||||
|
||||
set -e
|
||||
|
||||
PREFIX=$1
|
||||
shift
|
||||
|
||||
trap 'rm -r $TMPDIR' 0
|
||||
TMPDIR=`mktemp -d`
|
||||
|
||||
exec 3>/dev/null
|
||||
for f; do
|
||||
while IFS="
|
||||
" read -r LINE; do
|
||||
case "$LINE" in
|
||||
*$PREFIX:[0-9]*:\**)
|
||||
NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
|
||||
if [ -f $TMPDIR/$NUM ]; then
|
||||
echo "$TMPDIR/$NUM already exits prior to $f"
|
||||
exit 1
|
||||
fi
|
||||
exec 3>>$TMPDIR/$NUM
|
||||
echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
|
||||
/bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
|
||||
;;
|
||||
*$PREFIX:[0-9]*)
|
||||
NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
|
||||
if [ -f $TMPDIR/$NUM ]; then
|
||||
echo "$TMPDIR/$NUM already exits prior to $f"
|
||||
exit 1
|
||||
fi
|
||||
exec 3>>$TMPDIR/$NUM
|
||||
echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
|
||||
/bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
|
||||
;;
|
||||
*:\**)
|
||||
/bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
|
||||
echo >&3
|
||||
exec 3>/dev/null
|
||||
;;
|
||||
*)
|
||||
/bin/echo "$LINE" >&3
|
||||
;;
|
||||
esac
|
||||
done < $f
|
||||
echo >&3
|
||||
exec 3>/dev/null
|
||||
done
|
||||
|
||||
LASTFILE=""
|
||||
for f in $TMPDIR/*; do
|
||||
if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then
|
||||
LASTFILE=$(cat $TMPDIR/.$(basename $f) )
|
||||
echo "[ $LASTFILE ]"
|
||||
fi
|
||||
cat $f
|
||||
done
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,125 +0,0 @@
|
||||
__
|
||||
(___()'`; Rusty's Remarkably Unreliable Guide to Lguest
|
||||
/, /` - or, A Young Coder's Illustrated Hypervisor
|
||||
\\"--\\ http://lguest.ozlabs.org
|
||||
|
||||
Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
|
||||
for Linux developers and users to experiment with virtualization with the
|
||||
minimum of complexity. Nonetheless, it should have sufficient features to
|
||||
make it useful for specific tasks, and, of course, you are encouraged to fork
|
||||
and enhance it (see drivers/lguest/README).
|
||||
|
||||
Features:
|
||||
|
||||
- Kernel module which runs in a normal kernel.
|
||||
- Simple I/O model for communication.
|
||||
- Simple program to create new guests.
|
||||
- Logo contains cute puppies: http://lguest.ozlabs.org
|
||||
|
||||
Developer features:
|
||||
|
||||
- Fun to hack on.
|
||||
- No ABI: being tied to a specific kernel anyway, you can change anything.
|
||||
- Many opportunities for improvement or feature implementation.
|
||||
|
||||
Running Lguest:
|
||||
|
||||
- The easiest way to run lguest is to use same kernel as guest and host.
|
||||
You can configure them differently, but usually it's easiest not to.
|
||||
|
||||
You will need to configure your kernel with the following options:
|
||||
|
||||
"Processor type and features":
|
||||
"Paravirtualized guest support" = Y
|
||||
"Lguest guest support" = Y
|
||||
"High Memory Support" = off/4GB
|
||||
"Alignment value to which kernel should be aligned" = 0x100000
|
||||
(CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
|
||||
CONFIG_PHYSICAL_ALIGN=0x100000)
|
||||
|
||||
"Device Drivers":
|
||||
"Block devices"
|
||||
"Virtio block driver" = M/Y
|
||||
"Network device support"
|
||||
"Universal TUN/TAP device driver support" = M/Y
|
||||
"Virtio network driver" = M/Y
|
||||
(CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
|
||||
|
||||
"Virtualization"
|
||||
"Linux hypervisor example code" = M/Y
|
||||
(CONFIG_LGUEST=m)
|
||||
|
||||
- A tool called "lguest" is available in this directory: type "make"
|
||||
to build it. If you didn't build your kernel in-tree, use "make
|
||||
O=<builddir>".
|
||||
|
||||
- Create or find a root disk image. There are several useful ones
|
||||
around, such as the xm-test tiny root image at
|
||||
http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
|
||||
|
||||
For more serious work, I usually use a distribution ISO image and
|
||||
install it under qemu, then make multiple copies:
|
||||
|
||||
dd if=/dev/zero of=rootfile bs=1M count=2048
|
||||
qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
|
||||
|
||||
Make sure that you install a getty on /dev/hvc0 if you want to log in on the
|
||||
console!
|
||||
|
||||
- "modprobe lg" if you built it as a module.
|
||||
|
||||
- Run an lguest as root:
|
||||
|
||||
tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
|
||||
--block=rootfile root=/dev/vda
|
||||
|
||||
Explanation:
|
||||
64: the amount of memory to use, in MB.
|
||||
|
||||
vmlinux: the kernel image found in the top of your build directory. You
|
||||
can also use a standard bzImage.
|
||||
|
||||
--tunnet=192.168.19.1: configures a "tap" device for networking with this
|
||||
IP address.
|
||||
|
||||
--block=rootfile: a file or block device which becomes /dev/vda
|
||||
inside the guest.
|
||||
|
||||
root=/dev/vda: this (and anything else on the command line) are
|
||||
kernel boot parameters.
|
||||
|
||||
- Configuring networking. I usually have the host masquerade, using
|
||||
"iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
|
||||
/proc/sys/net/ipv4/ip_forward". In this example, I would configure
|
||||
eth0 inside the guest at 192.168.19.2.
|
||||
|
||||
Another method is to bridge the tap device to an external interface
|
||||
using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
|
||||
to obtain an IP address. The bridge needs to be configured first:
|
||||
this option simply adds the tap interface to it.
|
||||
|
||||
A simple example on my system:
|
||||
|
||||
ifconfig eth0 0.0.0.0
|
||||
brctl addbr lg0
|
||||
ifconfig lg0 up
|
||||
brctl addif lg0 eth0
|
||||
dhclient lg0
|
||||
|
||||
Then use --tunnet=bridge:lg0 when launching the guest.
|
||||
|
||||
See:
|
||||
|
||||
http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge
|
||||
|
||||
for general information on how to get bridging to work.
|
||||
|
||||
- Random number generation. Using the --rng option will provide a
|
||||
/dev/hwrng in the guest that will read from the host's /dev/random.
|
||||
Use this option in conjunction with rng-tools (see ../hw_random.txt)
|
||||
to provide entropy to the guest kernel's /dev/random.
|
||||
|
||||
There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
|
||||
|
||||
Good luck!
|
||||
Rusty Russell rusty@rustcorp.com.au.
|
@ -1,6 +1,9 @@
|
||||
objtool-y += arch/$(SRCARCH)/
|
||||
objtool-y += builtin-check.o
|
||||
objtool-y += builtin-orc.o
|
||||
objtool-y += check.o
|
||||
objtool-y += orc_gen.o
|
||||
objtool-y += orc_dump.o
|
||||
objtool-y += elf.o
|
||||
objtool-y += special.o
|
||||
objtool-y += objtool.o
|
||||
|
@ -11,9 +11,6 @@ analyzes every .o file and ensures the validity of its stack metadata.
|
||||
It enforces a set of rules on asm code and C inline assembly code so
|
||||
that stack traces can be reliable.
|
||||
|
||||
Currently it only checks frame pointer usage, but there are plans to add
|
||||
CFI validation for C files and CFI generation for asm files.
|
||||
|
||||
For each function, it recursively follows all possible code paths and
|
||||
validates the correct frame pointer state at each instruction.
|
||||
|
||||
@ -23,6 +20,10 @@ alternative execution paths to a given instruction (or set of
|
||||
instructions). Similarly, it knows how to follow switch statements, for
|
||||
which gcc sometimes uses jump tables.
|
||||
|
||||
(Objtool also has an 'orc generate' subcommand which generates debuginfo
|
||||
for the ORC unwinder. See Documentation/x86/orc-unwinder.txt in the
|
||||
kernel tree for more details.)
|
||||
|
||||
|
||||
Why do we need stack metadata validation?
|
||||
-----------------------------------------
|
||||
@ -93,37 +94,14 @@ a) More reliable stack traces for frame pointer enabled kernels
|
||||
or at the very end of the function after the stack frame has been
|
||||
destroyed. This is an inherent limitation of frame pointers.
|
||||
|
||||
b) 100% reliable stack traces for DWARF enabled kernels
|
||||
b) ORC (Oops Rewind Capability) unwind table generation
|
||||
|
||||
(NOTE: This is not yet implemented)
|
||||
An alternative to frame pointers and DWARF, ORC unwind data can be
|
||||
used to walk the stack. Unlike frame pointers, ORC data is out of
|
||||
band. So it doesn't affect runtime performance and it can be
|
||||
reliable even when interrupts or exceptions are involved.
|
||||
|
||||
As an alternative to frame pointers, DWARF Call Frame Information
|
||||
(CFI) metadata can be used to walk the stack. Unlike frame pointers,
|
||||
CFI metadata is out of band. So it doesn't affect runtime
|
||||
performance and it can be reliable even when interrupts or exceptions
|
||||
are involved.
|
||||
|
||||
For C code, gcc automatically generates DWARF CFI metadata. But for
|
||||
asm code, generating CFI is a tedious manual approach which requires
|
||||
manually placed .cfi assembler macros to be scattered throughout the
|
||||
code. It's clumsy and very easy to get wrong, and it makes the real
|
||||
code harder to read.
|
||||
|
||||
Stacktool will improve this situation in several ways. For code
|
||||
which already has CFI annotations, it will validate them. For code
|
||||
which doesn't have CFI annotations, it will generate them. So an
|
||||
architecture can opt to strip out all the manual .cfi annotations
|
||||
from their asm code and have objtool generate them instead.
|
||||
|
||||
We might also add a runtime stack validation debug option where we
|
||||
periodically walk the stack from schedule() and/or an NMI to ensure
|
||||
that the stack metadata is sane and that we reach the bottom of the
|
||||
stack.
|
||||
|
||||
So the benefit of objtool here will be that external tooling should
|
||||
always show perfect stack traces. And the same will be true for
|
||||
kernel warning/oops traces if the architecture has a runtime DWARF
|
||||
unwinder.
|
||||
For more details, see Documentation/x86/orc-unwinder.txt.
|
||||
|
||||
c) Higher live patching compatibility rate
|
||||
|
||||
@ -211,7 +189,7 @@ they mean, and suggestions for how to fix them.
|
||||
function, add proper frame pointer logic using the FRAME_BEGIN and
|
||||
FRAME_END macros. Otherwise, if it's not a callable function, remove
|
||||
its ELF function annotation by changing ENDPROC to END, and instead
|
||||
use the manual CFI hint macros in asm/undwarf.h.
|
||||
use the manual unwind hint macros in asm/unwind_hints.h.
|
||||
|
||||
If it's a GCC-compiled .c file, the error may be because the function
|
||||
uses an inline asm() statement which has a "call" instruction. An
|
||||
@ -231,8 +209,8 @@ they mean, and suggestions for how to fix them.
|
||||
If the error is for an asm file, and the instruction is inside (or
|
||||
reachable from) a callable function, the function should be annotated
|
||||
with the ENTRY/ENDPROC macros (ENDPROC is the important one).
|
||||
Otherwise, the code should probably be annotated with the CFI hint
|
||||
macros in asm/undwarf.h so objtool and the unwinder can know the
|
||||
Otherwise, the code should probably be annotated with the unwind hint
|
||||
macros in asm/unwind_hints.h so objtool and the unwinder can know the
|
||||
stack state associated with the code.
|
||||
|
||||
If you're 100% sure the code won't affect stack traces, or if you're
|
||||
@ -258,7 +236,7 @@ they mean, and suggestions for how to fix them.
|
||||
instructions aren't allowed in a callable function, and are most
|
||||
likely part of the kernel entry code. They should usually not have
|
||||
the callable function annotation (ENDPROC) and should always be
|
||||
annotated with the CFI hint macros in asm/undwarf.h.
|
||||
annotated with the unwind hint macros in asm/unwind_hints.h.
|
||||
|
||||
|
||||
6. file.o: warning: objtool: func()+0x26: sibling call from callable instruction with modified stack frame
|
||||
@ -272,7 +250,7 @@ they mean, and suggestions for how to fix them.
|
||||
|
||||
If the instruction is not actually in a callable function (e.g.
|
||||
kernel entry code), change ENDPROC to END and annotate manually with
|
||||
the CFI hint macros in asm/undwarf.h.
|
||||
the unwind hint macros in asm/unwind_hints.h.
|
||||
|
||||
|
||||
7. file: warning: objtool: func()+0x5c: stack state mismatch
|
||||
@ -288,8 +266,8 @@ they mean, and suggestions for how to fix them.
|
||||
|
||||
Another possibility is that the code has some asm or inline asm which
|
||||
does some unusual things to the stack or the frame pointer. In such
|
||||
cases it's probably appropriate to use the CFI hint macros in
|
||||
asm/undwarf.h.
|
||||
cases it's probably appropriate to use the unwind hint macros in
|
||||
asm/unwind_hints.h.
|
||||
|
||||
|
||||
8. file.o: warning: objtool: funcA() falls through to next function funcB()
|
||||
|
@ -25,7 +25,8 @@ OBJTOOL_IN := $(OBJTOOL)-in.o
|
||||
all: $(OBJTOOL)
|
||||
|
||||
INCLUDES := -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi
|
||||
CFLAGS += -Wall -Werror $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -fomit-frame-pointer -O2 -g $(INCLUDES)
|
||||
WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed
|
||||
CFLAGS += -Wall -Werror $(WARNINGS) -fomit-frame-pointer -O2 -g $(INCLUDES)
|
||||
LDFLAGS += -lelf $(LIBSUBCMD)
|
||||
|
||||
# Allow old libelf to be used:
|
||||
@ -52,6 +53,9 @@ $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN)
|
||||
diff -I'^#include' arch/x86/insn/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \
|
||||
diff -I'^#include' arch/x86/insn/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \
|
||||
|| echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || true
|
||||
@(test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \
|
||||
diff ../../arch/x86/include/asm/orc_types.h orc_types.h >/dev/null) \
|
||||
|| echo "warning: objtool: orc_types.h differs from kernel" >&2 )) || true
|
||||
$(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@
|
||||
|
||||
|
||||
|
@ -31,8 +31,9 @@
|
||||
#define INSN_RETURN 6
|
||||
#define INSN_CONTEXT_SWITCH 7
|
||||
#define INSN_STACK 8
|
||||
#define INSN_NOP 9
|
||||
#define INSN_OTHER 10
|
||||
#define INSN_BUG 9
|
||||
#define INSN_NOP 10
|
||||
#define INSN_OTHER 11
|
||||
#define INSN_LAST INSN_OTHER
|
||||
|
||||
enum op_dest_type {
|
||||
|
@ -271,7 +271,7 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
|
||||
case 0x8d:
|
||||
if (rex == 0x48 && modrm == 0x65) {
|
||||
|
||||
/* lea -disp(%rbp), %rsp */
|
||||
/* lea disp(%rbp), %rsp */
|
||||
*type = INSN_STACK;
|
||||
op->src.type = OP_SRC_ADD;
|
||||
op->src.reg = CFI_BP;
|
||||
@ -281,6 +281,30 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
|
||||
break;
|
||||
}
|
||||
|
||||
if (rex == 0x48 && (modrm == 0xa4 || modrm == 0x64) &&
|
||||
sib == 0x24) {
|
||||
|
||||
/* lea disp(%rsp), %rsp */
|
||||
*type = INSN_STACK;
|
||||
op->src.type = OP_SRC_ADD;
|
||||
op->src.reg = CFI_SP;
|
||||
op->src.offset = insn.displacement.value;
|
||||
op->dest.type = OP_DEST_REG;
|
||||
op->dest.reg = CFI_SP;
|
||||
break;
|
||||
}
|
||||
|
||||
if (rex == 0x48 && modrm == 0x2c && sib == 0x24) {
|
||||
|
||||
/* lea (%rsp), %rbp */
|
||||
*type = INSN_STACK;
|
||||
op->src.type = OP_SRC_REG;
|
||||
op->src.reg = CFI_SP;
|
||||
op->dest.type = OP_DEST_REG;
|
||||
op->dest.reg = CFI_BP;
|
||||
break;
|
||||
}
|
||||
|
||||
if (rex == 0x4c && modrm == 0x54 && sib == 0x24 &&
|
||||
insn.displacement.value == 8) {
|
||||
|
||||
@ -382,20 +406,27 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
|
||||
|
||||
case 0x0f:
|
||||
|
||||
if (op2 >= 0x80 && op2 <= 0x8f)
|
||||
if (op2 >= 0x80 && op2 <= 0x8f) {
|
||||
|
||||
*type = INSN_JUMP_CONDITIONAL;
|
||||
else if (op2 == 0x05 || op2 == 0x07 || op2 == 0x34 ||
|
||||
op2 == 0x35)
|
||||
|
||||
} else if (op2 == 0x05 || op2 == 0x07 || op2 == 0x34 ||
|
||||
op2 == 0x35) {
|
||||
|
||||
/* sysenter, sysret */
|
||||
*type = INSN_CONTEXT_SWITCH;
|
||||
|
||||
else if (op2 == 0x0d || op2 == 0x1f)
|
||||
} else if (op2 == 0x0b || op2 == 0xb9) {
|
||||
|
||||
/* ud2 */
|
||||
*type = INSN_BUG;
|
||||
|
||||
} else if (op2 == 0x0d || op2 == 0x1f) {
|
||||
|
||||
/* nopl/nopw */
|
||||
*type = INSN_NOP;
|
||||
|
||||
else if (op2 == 0xa0 || op2 == 0xa8) {
|
||||
} else if (op2 == 0xa0 || op2 == 0xa8) {
|
||||
|
||||
/* push fs/gs */
|
||||
*type = INSN_STACK;
|
||||
|
@ -29,7 +29,7 @@
|
||||
#include "builtin.h"
|
||||
#include "check.h"
|
||||
|
||||
bool nofp;
|
||||
bool no_fp, no_unreachable;
|
||||
|
||||
static const char * const check_usage[] = {
|
||||
"objtool check [<options>] file.o",
|
||||
@ -37,7 +37,8 @@ static const char * const check_usage[] = {
|
||||
};
|
||||
|
||||
const struct option check_options[] = {
|
||||
OPT_BOOLEAN('f', "no-fp", &nofp, "Skip frame pointer validation"),
|
||||
OPT_BOOLEAN('f', "no-fp", &no_fp, "Skip frame pointer validation"),
|
||||
OPT_BOOLEAN('u', "no-unreachable", &no_unreachable, "Skip 'unreachable instruction' warnings"),
|
||||
OPT_END(),
|
||||
};
|
||||
|
||||
@ -52,5 +53,5 @@ int cmd_check(int argc, const char **argv)
|
||||
|
||||
objname = argv[0];
|
||||
|
||||
return check(objname, nofp);
|
||||
return check(objname, no_fp, no_unreachable, false);
|
||||
}
|
||||
|
70
tools/objtool/builtin-orc.c
Normal file
70
tools/objtool/builtin-orc.c
Normal file
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* objtool orc:
|
||||
*
|
||||
* This command analyzes a .o file and adds .orc_unwind and .orc_unwind_ip
|
||||
* sections to it, which is used by the in-kernel ORC unwinder.
|
||||
*
|
||||
* This command is a superset of "objtool check".
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <subcmd/parse-options.h>
|
||||
#include "builtin.h"
|
||||
#include "check.h"
|
||||
|
||||
|
||||
static const char *orc_usage[] = {
|
||||
"objtool orc generate [<options>] file.o",
|
||||
"objtool orc dump file.o",
|
||||
NULL,
|
||||
};
|
||||
|
||||
extern const struct option check_options[];
|
||||
extern bool no_fp, no_unreachable;
|
||||
|
||||
int cmd_orc(int argc, const char **argv)
|
||||
{
|
||||
const char *objname;
|
||||
|
||||
argc--; argv++;
|
||||
if (!strncmp(argv[0], "gen", 3)) {
|
||||
argc = parse_options(argc, argv, check_options, orc_usage, 0);
|
||||
if (argc != 1)
|
||||
usage_with_options(orc_usage, check_options);
|
||||
|
||||
objname = argv[0];
|
||||
|
||||
return check(objname, no_fp, no_unreachable, true);
|
||||
|
||||
}
|
||||
|
||||
if (!strcmp(argv[0], "dump")) {
|
||||
if (argc != 2)
|
||||
usage_with_options(orc_usage, check_options);
|
||||
|
||||
objname = argv[1];
|
||||
|
||||
return orc_dump(objname);
|
||||
}
|
||||
|
||||
usage_with_options(orc_usage, check_options);
|
||||
|
||||
return 0;
|
||||
}
|
@ -18,5 +18,6 @@
|
||||
#define _BUILTIN_H
|
||||
|
||||
extern int cmd_check(int argc, const char **argv);
|
||||
extern int cmd_orc(int argc, const char **argv);
|
||||
|
||||
#endif /* _BUILTIN_H */
|
||||
|
@ -33,11 +33,11 @@ struct alternative {
|
||||
};
|
||||
|
||||
const char *objname;
|
||||
static bool nofp;
|
||||
static bool no_fp;
|
||||
struct cfi_state initial_func_cfi;
|
||||
|
||||
static struct instruction *find_insn(struct objtool_file *file,
|
||||
struct section *sec, unsigned long offset)
|
||||
struct instruction *find_insn(struct objtool_file *file,
|
||||
struct section *sec, unsigned long offset)
|
||||
{
|
||||
struct instruction *insn;
|
||||
|
||||
@ -59,19 +59,6 @@ static struct instruction *next_insn_same_sec(struct objtool_file *file,
|
||||
return next;
|
||||
}
|
||||
|
||||
static bool gcov_enabled(struct objtool_file *file)
|
||||
{
|
||||
struct section *sec;
|
||||
struct symbol *sym;
|
||||
|
||||
for_each_sec(file, sec)
|
||||
list_for_each_entry(sym, &sec->symbol_list, list)
|
||||
if (!strncmp(sym->name, "__gcov_.", 8))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
#define func_for_each_insn(file, func, insn) \
|
||||
for (insn = find_insn(file, func->sec, func->offset); \
|
||||
insn && &insn->list != &file->insn_list && \
|
||||
@ -100,7 +87,6 @@ static bool gcov_enabled(struct objtool_file *file)
|
||||
static bool ignore_func(struct objtool_file *file, struct symbol *func)
|
||||
{
|
||||
struct rela *rela;
|
||||
struct instruction *insn;
|
||||
|
||||
/* check for STACK_FRAME_NON_STANDARD */
|
||||
if (file->whitelist && file->whitelist->rela)
|
||||
@ -113,11 +99,6 @@ static bool ignore_func(struct objtool_file *file, struct symbol *func)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* check if it has a context switching instruction */
|
||||
func_for_each_insn(file, func, insn)
|
||||
if (insn->type == INSN_CONTEXT_SWITCH)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -240,6 +221,7 @@ static void clear_insn_state(struct insn_state *state)
|
||||
for (i = 0; i < CFI_NUM_REGS; i++)
|
||||
state->regs[i].base = CFI_UNDEFINED;
|
||||
state->drap_reg = CFI_UNDEFINED;
|
||||
state->drap_offset = -1;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -259,6 +241,11 @@ static int decode_instructions(struct objtool_file *file)
|
||||
if (!(sec->sh.sh_flags & SHF_EXECINSTR))
|
||||
continue;
|
||||
|
||||
if (strcmp(sec->name, ".altinstr_replacement") &&
|
||||
strcmp(sec->name, ".altinstr_aux") &&
|
||||
strncmp(sec->name, ".discard.", 9))
|
||||
sec->text = true;
|
||||
|
||||
for (offset = 0; offset < sec->len; offset += insn->len) {
|
||||
insn = malloc(sizeof(*insn));
|
||||
if (!insn) {
|
||||
@ -310,7 +297,7 @@ static int decode_instructions(struct objtool_file *file)
|
||||
}
|
||||
|
||||
/*
|
||||
* Find all uses of the unreachable() macro, which are code path dead ends.
|
||||
* Mark "ud2" instructions and manually annotated dead ends.
|
||||
*/
|
||||
static int add_dead_ends(struct objtool_file *file)
|
||||
{
|
||||
@ -319,9 +306,20 @@ static int add_dead_ends(struct objtool_file *file)
|
||||
struct instruction *insn;
|
||||
bool found;
|
||||
|
||||
/*
|
||||
* By default, "ud2" is a dead end unless otherwise annotated, because
|
||||
* GCC 7 inserts it for certain divide-by-zero cases.
|
||||
*/
|
||||
for_each_insn(file, insn)
|
||||
if (insn->type == INSN_BUG)
|
||||
insn->dead_end = true;
|
||||
|
||||
/*
|
||||
* Check for manually annotated dead ends.
|
||||
*/
|
||||
sec = find_section_by_name(file->elf, ".rela.discard.unreachable");
|
||||
if (!sec)
|
||||
return 0;
|
||||
goto reachable;
|
||||
|
||||
list_for_each_entry(rela, &sec->rela_list, list) {
|
||||
if (rela->sym->type != STT_SECTION) {
|
||||
@ -354,6 +352,48 @@ static int add_dead_ends(struct objtool_file *file)
|
||||
insn->dead_end = true;
|
||||
}
|
||||
|
||||
reachable:
|
||||
/*
|
||||
* These manually annotated reachable checks are needed for GCC 4.4,
|
||||
* where the Linux unreachable() macro isn't supported. In that case
|
||||
* GCC doesn't know the "ud2" is fatal, so it generates code as if it's
|
||||
* not a dead end.
|
||||
*/
|
||||
sec = find_section_by_name(file->elf, ".rela.discard.reachable");
|
||||
if (!sec)
|
||||
return 0;
|
||||
|
||||
list_for_each_entry(rela, &sec->rela_list, list) {
|
||||
if (rela->sym->type != STT_SECTION) {
|
||||
WARN("unexpected relocation symbol type in %s", sec->name);
|
||||
return -1;
|
||||
}
|
||||
insn = find_insn(file, rela->sym->sec, rela->addend);
|
||||
if (insn)
|
||||
insn = list_prev_entry(insn, list);
|
||||
else if (rela->addend == rela->sym->sec->len) {
|
||||
found = false;
|
||||
list_for_each_entry_reverse(insn, &file->insn_list, list) {
|
||||
if (insn->sec == rela->sym->sec) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
WARN("can't find reachable insn at %s+0x%x",
|
||||
rela->sym->sec->name, rela->addend);
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
WARN("can't find reachable insn at %s+0x%x",
|
||||
rela->sym->sec->name, rela->addend);
|
||||
return -1;
|
||||
}
|
||||
|
||||
insn->dead_end = false;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -874,6 +914,99 @@ static int add_switch_table_alts(struct objtool_file *file)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int read_unwind_hints(struct objtool_file *file)
|
||||
{
|
||||
struct section *sec, *relasec;
|
||||
struct rela *rela;
|
||||
struct unwind_hint *hint;
|
||||
struct instruction *insn;
|
||||
struct cfi_reg *cfa;
|
||||
int i;
|
||||
|
||||
sec = find_section_by_name(file->elf, ".discard.unwind_hints");
|
||||
if (!sec)
|
||||
return 0;
|
||||
|
||||
relasec = sec->rela;
|
||||
if (!relasec) {
|
||||
WARN("missing .rela.discard.unwind_hints section");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (sec->len % sizeof(struct unwind_hint)) {
|
||||
WARN("struct unwind_hint size mismatch");
|
||||
return -1;
|
||||
}
|
||||
|
||||
file->hints = true;
|
||||
|
||||
for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) {
|
||||
hint = (struct unwind_hint *)sec->data->d_buf + i;
|
||||
|
||||
rela = find_rela_by_dest(sec, i * sizeof(*hint));
|
||||
if (!rela) {
|
||||
WARN("can't find rela for unwind_hints[%d]", i);
|
||||
return -1;
|
||||
}
|
||||
|
||||
insn = find_insn(file, rela->sym->sec, rela->addend);
|
||||
if (!insn) {
|
||||
WARN("can't find insn for unwind_hints[%d]", i);
|
||||
return -1;
|
||||
}
|
||||
|
||||
cfa = &insn->state.cfa;
|
||||
|
||||
if (hint->type == UNWIND_HINT_TYPE_SAVE) {
|
||||
insn->save = true;
|
||||
continue;
|
||||
|
||||
} else if (hint->type == UNWIND_HINT_TYPE_RESTORE) {
|
||||
insn->restore = true;
|
||||
insn->hint = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
insn->hint = true;
|
||||
|
||||
switch (hint->sp_reg) {
|
||||
case ORC_REG_UNDEFINED:
|
||||
cfa->base = CFI_UNDEFINED;
|
||||
break;
|
||||
case ORC_REG_SP:
|
||||
cfa->base = CFI_SP;
|
||||
break;
|
||||
case ORC_REG_BP:
|
||||
cfa->base = CFI_BP;
|
||||
break;
|
||||
case ORC_REG_SP_INDIRECT:
|
||||
cfa->base = CFI_SP_INDIRECT;
|
||||
break;
|
||||
case ORC_REG_R10:
|
||||
cfa->base = CFI_R10;
|
||||
break;
|
||||
case ORC_REG_R13:
|
||||
cfa->base = CFI_R13;
|
||||
break;
|
||||
case ORC_REG_DI:
|
||||
cfa->base = CFI_DI;
|
||||
break;
|
||||
case ORC_REG_DX:
|
||||
cfa->base = CFI_DX;
|
||||
break;
|
||||
default:
|
||||
WARN_FUNC("unsupported unwind_hint sp base reg %d",
|
||||
insn->sec, insn->offset, hint->sp_reg);
|
||||
return -1;
|
||||
}
|
||||
|
||||
cfa->offset = hint->sp_offset;
|
||||
insn->state.type = hint->type;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int decode_sections(struct objtool_file *file)
|
||||
{
|
||||
int ret;
|
||||
@ -904,6 +1037,10 @@ static int decode_sections(struct objtool_file *file)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = read_unwind_hints(file);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -947,11 +1084,34 @@ static bool has_valid_stack_frame(struct insn_state *state)
|
||||
return false;
|
||||
}
|
||||
|
||||
static int update_insn_state_regs(struct instruction *insn, struct insn_state *state)
|
||||
{
|
||||
struct cfi_reg *cfa = &state->cfa;
|
||||
struct stack_op *op = &insn->stack_op;
|
||||
|
||||
if (cfa->base != CFI_SP)
|
||||
return 0;
|
||||
|
||||
/* push */
|
||||
if (op->dest.type == OP_DEST_PUSH)
|
||||
cfa->offset += 8;
|
||||
|
||||
/* pop */
|
||||
if (op->src.type == OP_SRC_POP)
|
||||
cfa->offset -= 8;
|
||||
|
||||
/* add immediate to sp */
|
||||
if (op->dest.type == OP_DEST_REG && op->src.type == OP_SRC_ADD &&
|
||||
op->dest.reg == CFI_SP && op->src.reg == CFI_SP)
|
||||
cfa->offset -= op->src.offset;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void save_reg(struct insn_state *state, unsigned char reg, int base,
|
||||
int offset)
|
||||
{
|
||||
if ((arch_callee_saved_reg(reg) ||
|
||||
(state->drap && reg == state->drap_reg)) &&
|
||||
if (arch_callee_saved_reg(reg) &&
|
||||
state->regs[reg].base == CFI_UNDEFINED) {
|
||||
state->regs[reg].base = base;
|
||||
state->regs[reg].offset = offset;
|
||||
@ -1032,6 +1192,9 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (state->type == ORC_TYPE_REGS || state->type == ORC_TYPE_REGS_IRET)
|
||||
return update_insn_state_regs(insn, state);
|
||||
|
||||
switch (op->dest.type) {
|
||||
|
||||
case OP_DEST_REG:
|
||||
@ -1051,7 +1214,7 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
|
||||
regs[CFI_BP].base = CFI_BP;
|
||||
regs[CFI_BP].offset = -state->stack_size;
|
||||
state->bp_scratch = false;
|
||||
} else if (!nofp) {
|
||||
} else if (!no_fp) {
|
||||
|
||||
WARN_FUNC("unknown stack-related register move",
|
||||
insn->sec, insn->offset);
|
||||
@ -1118,7 +1281,6 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
|
||||
cfa->base = state->drap_reg;
|
||||
cfa->offset = state->stack_size = 0;
|
||||
state->drap = true;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1136,17 +1298,19 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
|
||||
cfa->base = CFI_SP;
|
||||
}
|
||||
|
||||
if (regs[op->dest.reg].offset == -state->stack_size) {
|
||||
if (state->drap && cfa->base == CFI_BP_INDIRECT &&
|
||||
op->dest.type == OP_DEST_REG &&
|
||||
op->dest.reg == state->drap_reg &&
|
||||
state->drap_offset == -state->stack_size) {
|
||||
|
||||
if (state->drap && cfa->base == CFI_BP_INDIRECT &&
|
||||
op->dest.type == OP_DEST_REG &&
|
||||
op->dest.reg == state->drap_reg) {
|
||||
/* drap: pop %drap */
|
||||
cfa->base = state->drap_reg;
|
||||
cfa->offset = 0;
|
||||
state->drap_offset = -1;
|
||||
|
||||
/* drap: pop %drap */
|
||||
cfa->base = state->drap_reg;
|
||||
cfa->offset = 0;
|
||||
}
|
||||
} else if (regs[op->dest.reg].offset == -state->stack_size) {
|
||||
|
||||
/* pop %reg */
|
||||
restore_reg(state, op->dest.reg);
|
||||
}
|
||||
|
||||
@ -1157,15 +1321,19 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
|
||||
break;
|
||||
|
||||
case OP_SRC_REG_INDIRECT:
|
||||
if (state->drap && op->src.reg == CFI_BP &&
|
||||
op->src.offset == state->drap_offset) {
|
||||
|
||||
/* drap: mov disp(%rbp), %drap */
|
||||
cfa->base = state->drap_reg;
|
||||
cfa->offset = 0;
|
||||
state->drap_offset = -1;
|
||||
}
|
||||
|
||||
if (state->drap && op->src.reg == CFI_BP &&
|
||||
op->src.offset == regs[op->dest.reg].offset) {
|
||||
|
||||
/* drap: mov disp(%rbp), %reg */
|
||||
if (op->dest.reg == state->drap_reg) {
|
||||
cfa->base = state->drap_reg;
|
||||
cfa->offset = 0;
|
||||
}
|
||||
|
||||
restore_reg(state, op->dest.reg);
|
||||
|
||||
} else if (op->src.reg == cfa->base &&
|
||||
@ -1201,8 +1369,8 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
|
||||
cfa->base = CFI_BP_INDIRECT;
|
||||
cfa->offset = -state->stack_size;
|
||||
|
||||
/* save drap so we know when to undefine it */
|
||||
save_reg(state, op->src.reg, CFI_CFA, -state->stack_size);
|
||||
/* save drap so we know when to restore it */
|
||||
state->drap_offset = -state->stack_size;
|
||||
|
||||
} else if (op->src.reg == CFI_BP && cfa->base == state->drap_reg) {
|
||||
|
||||
@ -1222,7 +1390,7 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
|
||||
}
|
||||
|
||||
/* detect when asm code uses rbp as a scratch register */
|
||||
if (!nofp && insn->func && op->src.reg == CFI_BP &&
|
||||
if (!no_fp && insn->func && op->src.reg == CFI_BP &&
|
||||
cfa->base != CFI_BP)
|
||||
state->bp_scratch = true;
|
||||
break;
|
||||
@ -1236,8 +1404,8 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
|
||||
cfa->base = CFI_BP_INDIRECT;
|
||||
cfa->offset = op->dest.offset;
|
||||
|
||||
/* save drap so we know when to undefine it */
|
||||
save_reg(state, op->src.reg, CFI_CFA, op->dest.offset);
|
||||
/* save drap offset so we know when to restore it */
|
||||
state->drap_offset = op->dest.offset;
|
||||
}
|
||||
|
||||
else if (regs[op->src.reg].base == CFI_UNDEFINED) {
|
||||
@ -1323,12 +1491,17 @@ static bool insn_state_match(struct instruction *insn, struct insn_state *state)
|
||||
break;
|
||||
}
|
||||
|
||||
} else if (state1->type != state2->type) {
|
||||
WARN_FUNC("stack state mismatch: type1=%d type2=%d",
|
||||
insn->sec, insn->offset, state1->type, state2->type);
|
||||
|
||||
} else if (state1->drap != state2->drap ||
|
||||
(state1->drap && state1->drap_reg != state2->drap_reg)) {
|
||||
WARN_FUNC("stack state mismatch: drap1=%d(%d) drap2=%d(%d)",
|
||||
(state1->drap && state1->drap_reg != state2->drap_reg) ||
|
||||
(state1->drap && state1->drap_offset != state2->drap_offset)) {
|
||||
WARN_FUNC("stack state mismatch: drap1=%d(%d,%d) drap2=%d(%d,%d)",
|
||||
insn->sec, insn->offset,
|
||||
state1->drap, state1->drap_reg,
|
||||
state2->drap, state2->drap_reg);
|
||||
state1->drap, state1->drap_reg, state1->drap_offset,
|
||||
state2->drap, state2->drap_reg, state2->drap_offset);
|
||||
|
||||
} else
|
||||
return true;
|
||||
@ -1346,7 +1519,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
|
||||
struct insn_state state)
|
||||
{
|
||||
struct alternative *alt;
|
||||
struct instruction *insn;
|
||||
struct instruction *insn, *next_insn;
|
||||
struct section *sec;
|
||||
struct symbol *func = NULL;
|
||||
int ret;
|
||||
@ -1357,34 +1530,77 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
|
||||
if (insn->alt_group && list_empty(&insn->alts)) {
|
||||
WARN_FUNC("don't know how to handle branch to middle of alternative instruction group",
|
||||
sec, insn->offset);
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (1) {
|
||||
if (file->c_file && insn->func) {
|
||||
if (func && func != insn->func) {
|
||||
WARN("%s() falls through to next function %s()",
|
||||
func->name, insn->func->name);
|
||||
return 1;
|
||||
}
|
||||
next_insn = next_insn_same_sec(file, insn);
|
||||
|
||||
|
||||
if (file->c_file && func && insn->func && func != insn->func) {
|
||||
WARN("%s() falls through to next function %s()",
|
||||
func->name, insn->func->name);
|
||||
return 1;
|
||||
}
|
||||
|
||||
func = insn->func;
|
||||
if (insn->func)
|
||||
func = insn->func;
|
||||
|
||||
if (func && insn->ignore) {
|
||||
WARN_FUNC("BUG: why am I validating an ignored function?",
|
||||
sec, insn->offset);
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (insn->visited) {
|
||||
if (!!insn_state_match(insn, &state))
|
||||
if (!insn->hint && !insn_state_match(insn, &state))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
insn->state = state;
|
||||
if (insn->hint) {
|
||||
if (insn->restore) {
|
||||
struct instruction *save_insn, *i;
|
||||
|
||||
i = insn;
|
||||
save_insn = NULL;
|
||||
func_for_each_insn_continue_reverse(file, func, i) {
|
||||
if (i->save) {
|
||||
save_insn = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!save_insn) {
|
||||
WARN_FUNC("no corresponding CFI save for CFI restore",
|
||||
sec, insn->offset);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!save_insn->visited) {
|
||||
/*
|
||||
* Oops, no state to copy yet.
|
||||
* Hopefully we can reach this
|
||||
* instruction from another branch
|
||||
* after the save insn has been
|
||||
* visited.
|
||||
*/
|
||||
if (insn == first)
|
||||
return 0;
|
||||
|
||||
WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo",
|
||||
sec, insn->offset);
|
||||
return 1;
|
||||
}
|
||||
|
||||
insn->state = save_insn->state;
|
||||
}
|
||||
|
||||
state = insn->state;
|
||||
|
||||
} else
|
||||
insn->state = state;
|
||||
|
||||
insn->visited = true;
|
||||
|
||||
@ -1423,7 +1639,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
|
||||
|
||||
/* fallthrough */
|
||||
case INSN_CALL_DYNAMIC:
|
||||
if (!nofp && func && !has_valid_stack_frame(&state)) {
|
||||
if (!no_fp && func && !has_valid_stack_frame(&state)) {
|
||||
WARN_FUNC("call without frame pointer save/setup",
|
||||
sec, insn->offset);
|
||||
return 1;
|
||||
@ -1461,9 +1677,17 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
|
||||
|
||||
return 0;
|
||||
|
||||
case INSN_CONTEXT_SWITCH:
|
||||
if (func && (!next_insn || !next_insn->hint)) {
|
||||
WARN_FUNC("unsupported instruction in callable function",
|
||||
sec, insn->offset);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
|
||||
case INSN_STACK:
|
||||
if (update_insn_state(insn, &state))
|
||||
return -1;
|
||||
return 1;
|
||||
|
||||
break;
|
||||
|
||||
@ -1474,7 +1698,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
|
||||
if (insn->dead_end)
|
||||
return 0;
|
||||
|
||||
insn = next_insn_same_sec(file, insn);
|
||||
insn = next_insn;
|
||||
if (!insn) {
|
||||
WARN("%s: unexpected end of section", sec->name);
|
||||
return 1;
|
||||
@ -1484,6 +1708,27 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int validate_unwind_hints(struct objtool_file *file)
|
||||
{
|
||||
struct instruction *insn;
|
||||
int ret, warnings = 0;
|
||||
struct insn_state state;
|
||||
|
||||
if (!file->hints)
|
||||
return 0;
|
||||
|
||||
clear_insn_state(&state);
|
||||
|
||||
for_each_insn(file, insn) {
|
||||
if (insn->hint && !insn->visited) {
|
||||
ret = validate_branch(file, insn, state);
|
||||
warnings += ret;
|
||||
}
|
||||
}
|
||||
|
||||
return warnings;
|
||||
}
|
||||
|
||||
static bool is_kasan_insn(struct instruction *insn)
|
||||
{
|
||||
return (insn->type == INSN_CALL &&
|
||||
@ -1507,8 +1752,13 @@ static bool ignore_unreachable_insn(struct instruction *insn)
|
||||
/*
|
||||
* Ignore any unused exceptions. This can happen when a whitelisted
|
||||
* function has an exception table entry.
|
||||
*
|
||||
* Also ignore alternative replacement instructions. This can happen
|
||||
* when a whitelisted function uses one of the ALTERNATIVE macros.
|
||||
*/
|
||||
if (!strcmp(insn->sec->name, ".fixup"))
|
||||
if (!strcmp(insn->sec->name, ".fixup") ||
|
||||
!strcmp(insn->sec->name, ".altinstr_replacement") ||
|
||||
!strcmp(insn->sec->name, ".altinstr_aux"))
|
||||
return true;
|
||||
|
||||
/*
|
||||
@ -1580,15 +1830,6 @@ static int validate_reachable_instructions(struct objtool_file *file)
|
||||
if (insn->visited || ignore_unreachable_insn(insn))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* gcov produces a lot of unreachable instructions. If we get
|
||||
* an unreachable warning and the file has gcov enabled, just
|
||||
* ignore it, and all other such warnings for the file. Do
|
||||
* this here because this is an expensive function.
|
||||
*/
|
||||
if (gcov_enabled(file))
|
||||
return 0;
|
||||
|
||||
WARN_FUNC("unreachable instruction", insn->sec, insn->offset);
|
||||
return 1;
|
||||
}
|
||||
@ -1613,15 +1854,15 @@ static void cleanup(struct objtool_file *file)
|
||||
elf_close(file->elf);
|
||||
}
|
||||
|
||||
int check(const char *_objname, bool _nofp)
|
||||
int check(const char *_objname, bool _no_fp, bool no_unreachable, bool orc)
|
||||
{
|
||||
struct objtool_file file;
|
||||
int ret, warnings = 0;
|
||||
|
||||
objname = _objname;
|
||||
nofp = _nofp;
|
||||
no_fp = _no_fp;
|
||||
|
||||
file.elf = elf_open(objname);
|
||||
file.elf = elf_open(objname, orc ? O_RDWR : O_RDONLY);
|
||||
if (!file.elf)
|
||||
return 1;
|
||||
|
||||
@ -1629,8 +1870,9 @@ int check(const char *_objname, bool _nofp)
|
||||
hash_init(file.insn_hash);
|
||||
file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
|
||||
file.rodata = find_section_by_name(file.elf, ".rodata");
|
||||
file.ignore_unreachables = false;
|
||||
file.c_file = find_section_by_name(file.elf, ".comment");
|
||||
file.ignore_unreachables = no_unreachable;
|
||||
file.hints = false;
|
||||
|
||||
arch_initial_func_cfi_state(&initial_func_cfi);
|
||||
|
||||
@ -1647,6 +1889,11 @@ int check(const char *_objname, bool _nofp)
|
||||
goto out;
|
||||
warnings += ret;
|
||||
|
||||
ret = validate_unwind_hints(&file);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
warnings += ret;
|
||||
|
||||
if (!warnings) {
|
||||
ret = validate_reachable_instructions(&file);
|
||||
if (ret < 0)
|
||||
@ -1654,6 +1901,20 @@ int check(const char *_objname, bool _nofp)
|
||||
warnings += ret;
|
||||
}
|
||||
|
||||
if (orc) {
|
||||
ret = create_orc(&file);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = create_orc_sections(&file);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = elf_write(file.elf);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
cleanup(&file);
|
||||
|
||||
|
@ -22,15 +22,17 @@
|
||||
#include "elf.h"
|
||||
#include "cfi.h"
|
||||
#include "arch.h"
|
||||
#include "orc.h"
|
||||
#include <linux/hashtable.h>
|
||||
|
||||
struct insn_state {
|
||||
struct cfi_reg cfa;
|
||||
struct cfi_reg regs[CFI_NUM_REGS];
|
||||
int stack_size;
|
||||
unsigned char type;
|
||||
bool bp_scratch;
|
||||
bool drap;
|
||||
int drap_reg;
|
||||
int drap_reg, drap_offset;
|
||||
};
|
||||
|
||||
struct instruction {
|
||||
@ -41,13 +43,14 @@ struct instruction {
|
||||
unsigned int len;
|
||||
unsigned char type;
|
||||
unsigned long immediate;
|
||||
bool alt_group, visited, dead_end, ignore;
|
||||
bool alt_group, visited, dead_end, ignore, hint, save, restore;
|
||||
struct symbol *call_dest;
|
||||
struct instruction *jump_dest;
|
||||
struct list_head alts;
|
||||
struct symbol *func;
|
||||
struct stack_op stack_op;
|
||||
struct insn_state state;
|
||||
struct orc_entry orc;
|
||||
};
|
||||
|
||||
struct objtool_file {
|
||||
@ -55,12 +58,22 @@ struct objtool_file {
|
||||
struct list_head insn_list;
|
||||
DECLARE_HASHTABLE(insn_hash, 16);
|
||||
struct section *rodata, *whitelist;
|
||||
bool ignore_unreachables, c_file;
|
||||
bool ignore_unreachables, c_file, hints;
|
||||
};
|
||||
|
||||
int check(const char *objname, bool nofp);
|
||||
int check(const char *objname, bool no_fp, bool no_unreachable, bool orc);
|
||||
|
||||
struct instruction *find_insn(struct objtool_file *file,
|
||||
struct section *sec, unsigned long offset);
|
||||
|
||||
#define for_each_insn(file, insn) \
|
||||
list_for_each_entry(insn, &file->insn_list, list)
|
||||
|
||||
#define sec_for_each_insn(file, sec, insn) \
|
||||
for (insn = find_insn(file, sec, 0); \
|
||||
insn && &insn->list != &file->insn_list && \
|
||||
insn->sec == sec; \
|
||||
insn = list_next_entry(insn, list))
|
||||
|
||||
|
||||
#endif /* _CHECK_H */
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user