From 4ccefbe597392d2914cf7ad904e33c734972681d Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 5 Nov 2015 15:15:07 +0000 Subject: [PATCH 01/18] xen: move xen_setup_runstate_info and get_runstate_snapshot to drivers/xen/time.c Signed-off-by: Stefano Stabellini Acked-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/time.c | 76 +----------------------------------- drivers/xen/Makefile | 2 +- drivers/xen/time.c | 91 +++++++++++++++++++++++++++++++++++++++++++ include/xen/xen-ops.h | 5 +++ 4 files changed, 98 insertions(+), 76 deletions(-) create mode 100644 drivers/xen/time.c diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index f1ba6a092854..041d4cda3939 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -32,86 +32,12 @@ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) -/* runstate info updated by Xen */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); - /* snapshots of runstate info */ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); /* unused ns of stolen time */ static DEFINE_PER_CPU(u64, xen_residual_stolen); -/* return an consistent snapshot of 64-bit time/counter value */ -static u64 get64(const u64 *p) -{ - u64 ret; - - if (BITS_PER_LONG < 64) { - u32 *p32 = (u32 *)p; - u32 h, l; - - /* - * Read high then low, and then make sure high is - * still the same; this will only loop if low wraps - * and carries into high. - * XXX some clean way to make this endian-proof? - */ - do { - h = p32[1]; - barrier(); - l = p32[0]; - barrier(); - } while (p32[1] != h); - - ret = (((u64)h) << 32) | l; - } else - ret = *p; - - return ret; -} - -/* - * Runstate accounting - */ -static void get_runstate_snapshot(struct vcpu_runstate_info *res) -{ - u64 state_time; - struct vcpu_runstate_info *state; - - BUG_ON(preemptible()); - - state = this_cpu_ptr(&xen_runstate); - - /* - * The runstate info is always updated by the hypervisor on - * the current CPU, so there's no need to use anything - * stronger than a compiler barrier when fetching it. - */ - do { - state_time = get64(&state->state_entry_time); - barrier(); - *res = *state; - barrier(); - } while (get64(&state->state_entry_time) != state_time); -} - -/* return true when a vcpu could run but has no real cpu to run on */ -bool xen_vcpu_stolen(int vcpu) -{ - return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; -} - -void xen_setup_runstate_info(int cpu) -{ - struct vcpu_register_runstate_memory_area area; - - area.addr.v = &per_cpu(xen_runstate, cpu); - - if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, - cpu, &area)) - BUG(); -} - static void do_stolen_accounting(void) { struct vcpu_runstate_info state; @@ -119,7 +45,7 @@ static void do_stolen_accounting(void) s64 runnable, offline, stolen; cputime_t ticks; - get_runstate_snapshot(&state); + xen_get_runstate_snapshot(&state); WARN_ON(state.state != RUNSTATE_running); diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index aa8a7f71f310..9b7a35c9e51d 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_X86) += fallback.o -obj-y += grant-table.o features.o balloon.o manage.o preempt.o +obj-y += grant-table.o features.o balloon.o manage.o preempt.o time.o obj-y += events/ obj-y += xenbus/ diff --git a/drivers/xen/time.c b/drivers/xen/time.c new file mode 100644 index 000000000000..433fe247c5ff --- /dev/null +++ b/drivers/xen/time.c @@ -0,0 +1,91 @@ +/* + * Xen stolen ticks accounting. + */ +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +/* runstate info updated by Xen */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); + +/* return an consistent snapshot of 64-bit time/counter value */ +static u64 get64(const u64 *p) +{ + u64 ret; + + if (BITS_PER_LONG < 64) { + u32 *p32 = (u32 *)p; + u32 h, l; + + /* + * Read high then low, and then make sure high is + * still the same; this will only loop if low wraps + * and carries into high. + * XXX some clean way to make this endian-proof? + */ + do { + h = p32[1]; + barrier(); + l = p32[0]; + barrier(); + } while (p32[1] != h); + + ret = (((u64)h) << 32) | l; + } else + ret = *p; + + return ret; +} + +/* + * Runstate accounting + */ +void xen_get_runstate_snapshot(struct vcpu_runstate_info *res) +{ + u64 state_time; + struct vcpu_runstate_info *state; + + BUG_ON(preemptible()); + + state = this_cpu_ptr(&xen_runstate); + + /* + * The runstate info is always updated by the hypervisor on + * the current CPU, so there's no need to use anything + * stronger than a compiler barrier when fetching it. + */ + do { + state_time = get64(&state->state_entry_time); + barrier(); + *res = *state; + barrier(); + } while (get64(&state->state_entry_time) != state_time); +} + +/* return true when a vcpu could run but has no real cpu to run on */ +bool xen_vcpu_stolen(int vcpu) +{ + return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; +} + +void xen_setup_runstate_info(int cpu) +{ + struct vcpu_register_runstate_memory_area area; + + area.addr.v = &per_cpu(xen_runstate, cpu); + + if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, + cpu, &area)) + BUG(); +} + diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index e4e214a5abd5..86abe07b20ec 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -5,6 +5,7 @@ #include #include #include +#include DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); @@ -18,6 +19,10 @@ void xen_arch_suspend(void); void xen_resume_notifier_register(struct notifier_block *nb); void xen_resume_notifier_unregister(struct notifier_block *nb); +bool xen_vcpu_stolen(int vcpu); +void xen_setup_runstate_info(int cpu); +void xen_get_runstate_snapshot(struct vcpu_runstate_info *res); + int xen_setup_shutdown_event(void); extern unsigned long *xen_contiguous_bitmap; From 1fe7c4ef88bd32e039f5f4126537c3f20c340414 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 10 Nov 2015 12:36:46 +0000 Subject: [PATCH 02/18] missing include asm/paravirt.h in cputime.c Add include asm/paravirt.h to cputime.c, as steal_account_process_tick calls paravirt_steal_clock, which is defined in asm/paravirt.h. The ifdef CONFIG_PARAVIRT is necessary because not all archs have an asm/paravirt.h to include. The reason why currently cputime.c compiles, even though include is missing, is that on x86 asm/paravirt.h is included by one of the other headers included in kernel/sched/cputime.c: On arm and arm64, where I am about to introduce asm/paravirt.h and stolen time support, without #include in cputime.c, I would get an error. Signed-off-by: Stefano Stabellini Acked-by: Peter Zijlstra (Intel) --- kernel/sched/cputime.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 05de80b48586..851b00f344ae 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,9 @@ #include #include #include "sched.h" +#ifdef CONFIG_PARAVIRT +#include +#endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING From 02c2433b3aa6b57313c261c9811bbbe49528101c Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 23 Nov 2015 10:32:57 +0000 Subject: [PATCH 03/18] arm: introduce CONFIG_PARAVIRT, PARAVIRT_TIME_ACCOUNTING and pv_time_ops Introduce CONFIG_PARAVIRT and PARAVIRT_TIME_ACCOUNTING on ARM. The only paravirt interface supported is pv_time_ops.steal_clock, so no runtime pvops patching needed. This allows us to make use of steal_account_process_tick for stolen ticks accounting. Signed-off-by: Stefano Stabellini Acked-by: Christopher Covington Acked-by: Ian Campbell Acked-by: Russell King --- arch/arm/Kconfig | 20 ++++++++++++++++++++ arch/arm/include/asm/paravirt.h | 20 ++++++++++++++++++++ arch/arm/kernel/Makefile | 1 + arch/arm/kernel/paravirt.c | 25 +++++++++++++++++++++++++ 4 files changed, 66 insertions(+) create mode 100644 arch/arm/include/asm/paravirt.h create mode 100644 arch/arm/kernel/paravirt.c diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 34e1569a11ee..1ab9b98d4c6c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1800,6 +1800,25 @@ config SWIOTLB config IOMMU_HELPER def_bool SWIOTLB +config PARAVIRT + bool "Enable paravirtualization code" + help + This changes the kernel so it can modify itself when it is run + under a hypervisor, potentially improving performance significantly + over full virtualization. + +config PARAVIRT_TIME_ACCOUNTING + bool "Paravirtual steal time accounting" + select PARAVIRT + default n + help + Select this option to enable fine granularity task steal time + accounting. Time spent executing other tasks in parallel with + the current vCPU is discounted from the vCPU power. To account for + that, there can be a small performance impact. + + If in doubt, say N here. + config XEN_DOM0 def_bool y depends on XEN @@ -1813,6 +1832,7 @@ config XEN select ARCH_DMA_ADDR_T_64BIT select ARM_PSCI select SWIOTLB_XEN + select PARAVIRT help Say Y if you want to run Linux in a Virtual Machine on Xen on ARM. diff --git a/arch/arm/include/asm/paravirt.h b/arch/arm/include/asm/paravirt.h new file mode 100644 index 000000000000..8435ff591386 --- /dev/null +++ b/arch/arm/include/asm/paravirt.h @@ -0,0 +1,20 @@ +#ifndef _ASM_ARM_PARAVIRT_H +#define _ASM_ARM_PARAVIRT_H + +#ifdef CONFIG_PARAVIRT +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; + +struct pv_time_ops { + unsigned long long (*steal_clock)(int cpu); +}; +extern struct pv_time_ops pv_time_ops; + +static inline u64 paravirt_steal_clock(int cpu) +{ + return pv_time_ops.steal_clock(cpu); +} +#endif + +#endif diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index af9e59bf3831..3e6e93725ca7 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_VDSO) += vdso.o ifneq ($(CONFIG_ARCH_EBSA110),y) obj-y += io.o endif +obj-$(CONFIG_PARAVIRT) += paravirt.o head-y := head$(MMUEXT).o obj-$(CONFIG_DEBUG_LL) += debug.o diff --git a/arch/arm/kernel/paravirt.c b/arch/arm/kernel/paravirt.c new file mode 100644 index 000000000000..53f371ed4568 --- /dev/null +++ b/arch/arm/kernel/paravirt.c @@ -0,0 +1,25 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) 2013 Citrix Systems + * + * Author: Stefano Stabellini + */ + +#include +#include +#include +#include + +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; + +struct pv_time_ops pv_time_ops; +EXPORT_SYMBOL_GPL(pv_time_ops); From dfd57bc3a5664b98eb7b477e8d7bb2591a5198cf Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 23 Nov 2015 10:33:49 +0000 Subject: [PATCH 04/18] arm64: introduce CONFIG_PARAVIRT, PARAVIRT_TIME_ACCOUNTING and pv_time_ops Introduce CONFIG_PARAVIRT and PARAVIRT_TIME_ACCOUNTING on ARM64. Necessary duplication of paravirt.h and paravirt.c with ARM. The only paravirt interface supported is pv_time_ops.steal_clock, so no runtime pvops patching needed. This allows us to make use of steal_account_process_tick for stolen ticks accounting. Signed-off-by: Stefano Stabellini Acked-by: Marc Zyngier --- arch/arm64/Kconfig | 20 ++++++++++++++++++++ arch/arm64/include/asm/paravirt.h | 20 ++++++++++++++++++++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/paravirt.c | 25 +++++++++++++++++++++++++ 4 files changed, 66 insertions(+) create mode 100644 arch/arm64/include/asm/paravirt.h create mode 100644 arch/arm64/kernel/paravirt.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 871f21783866..987ae39cc471 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -556,6 +556,25 @@ config SECCOMP and the task is only allowed to execute a few safe syscalls defined by each seccomp mode. +config PARAVIRT + bool "Enable paravirtualization code" + help + This changes the kernel so it can modify itself when it is run + under a hypervisor, potentially improving performance significantly + over full virtualization. + +config PARAVIRT_TIME_ACCOUNTING + bool "Paravirtual steal time accounting" + select PARAVIRT + default n + help + Select this option to enable fine granularity task steal time + accounting. Time spent executing other tasks in parallel with + the current vCPU is discounted from the vCPU power. To account for + that, there can be a small performance impact. + + If in doubt, say N here. + config XEN_DOM0 def_bool y depends on XEN @@ -564,6 +583,7 @@ config XEN bool "Xen guest support on ARM64" depends on ARM64 && OF select SWIOTLB_XEN + select PARAVIRT help Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64. diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h new file mode 100644 index 000000000000..fd5f42886251 --- /dev/null +++ b/arch/arm64/include/asm/paravirt.h @@ -0,0 +1,20 @@ +#ifndef _ASM_ARM64_PARAVIRT_H +#define _ASM_ARM64_PARAVIRT_H + +#ifdef CONFIG_PARAVIRT +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; + +struct pv_time_ops { + unsigned long long (*steal_clock)(int cpu); +}; +extern struct pv_time_ops pv_time_ops; + +static inline u64 paravirt_steal_clock(int cpu) +{ + return pv_time_ops.steal_clock(cpu); +} +#endif + +#endif diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 474691f8b13a..ca9fbe1d7bd9 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -41,6 +41,7 @@ arm64-obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o arm64-obj-$(CONFIG_ACPI) += acpi.o +arm64-obj-$(CONFIG_PARAVIRT) += paravirt.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c new file mode 100644 index 000000000000..53f371ed4568 --- /dev/null +++ b/arch/arm64/kernel/paravirt.c @@ -0,0 +1,25 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) 2013 Citrix Systems + * + * Author: Stefano Stabellini + */ + +#include +#include +#include +#include + +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; + +struct pv_time_ops pv_time_ops; +EXPORT_SYMBOL_GPL(pv_time_ops); From 34e38523d586ae1e838241d44c8a2e9a1c9e0b43 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 23 Nov 2015 10:35:12 +0000 Subject: [PATCH 05/18] xen/arm: account for stolen ticks Register the runstate_memory_area with the hypervisor. Use pv_time_ops.steal_clock to account for stolen ticks. Signed-off-by: Stefano Stabellini --- arch/arm/xen/enlighten.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index fc7ea529f462..ebbfa64e4add 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,19 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); +static unsigned long long xen_stolen_accounting(int cpu) +{ + struct vcpu_runstate_info state; + + BUG_ON(cpu != smp_processor_id()); + + xen_get_runstate_snapshot(&state); + + WARN_ON(state.state != RUNSTATE_running); + + return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline]; +} + static void xen_percpu_init(void) { struct vcpu_register_vcpu_info info; @@ -104,6 +118,8 @@ static void xen_percpu_init(void) BUG_ON(err); per_cpu(xen_vcpu, cpu) = vcpup; + xen_setup_runstate_info(cpu); + after_register_vcpu_info: enable_percpu_irq(xen_events_irq, 0); put_cpu(); @@ -271,6 +287,9 @@ static int __init xen_guest_init(void) register_cpu_notifier(&xen_cpu_notifier); + pv_time_ops.steal_clock = xen_stolen_accounting; + static_key_slow_inc(¶virt_steal_enabled); + return 0; } early_initcall(xen_guest_init); From cfafae940381207d48b11a73a211142dba5947d3 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 23 Nov 2015 10:36:12 +0000 Subject: [PATCH 06/18] xen: rename dom0_op to platform_op The dom0_op hypercall has been renamed to platform_op since Xen 3.2, which is ancient, and modern upstream Linux kernels cannot run as dom0 and it anymore anyway. Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky --- arch/x86/include/asm/xen/hypercall.h | 6 +++--- arch/x86/xen/apic.c | 2 +- arch/x86/xen/enlighten.c | 8 ++++---- arch/x86/xen/time.c | 2 +- drivers/xen/acpi.c | 2 +- drivers/xen/efi.c | 30 ++++++++++++++-------------- drivers/xen/pcpu.c | 8 ++++---- drivers/xen/xen-acpi-cpuhotplug.c | 2 +- drivers/xen/xen-acpi-pad.c | 4 ++-- drivers/xen/xen-acpi-processor.c | 8 ++++---- drivers/xen/xenfs/xensyms.c | 4 ++-- include/xen/interface/xen.h | 2 +- 12 files changed, 39 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 4c20dd333412..3bcdcc84259d 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -310,10 +310,10 @@ HYPERVISOR_mca(struct xen_mc *mc_op) } static inline int -HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) +HYPERVISOR_platform_op(struct xen_platform_op *op) { - platform_op->interface_version = XENPF_INTERFACE_VERSION; - return _hypercall1(int, dom0_op, platform_op); + op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, op); } static inline int diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index acda713ab5be..abf4901c917b 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -64,7 +64,7 @@ static u32 xen_apic_read(u32 reg) if (reg != APIC_ID) return 0; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) return 0; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5774800ff583..f963c40fae01 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -415,7 +415,7 @@ static bool __init xen_check_mwait(void) set_xen_guest_handle(op.u.set_pminfo.pdc, buf); - if ((HYPERVISOR_dom0_op(&op) == 0) && + if ((HYPERVISOR_platform_op(&op) == 0) && (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { cpuid_leaf5_ecx_val = cx; cpuid_leaf5_edx_val = dx; @@ -1374,7 +1374,7 @@ static void __init xen_boot_params_init_edd(void) info->params.length = sizeof(info->params); set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, &info->params); - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) break; @@ -1392,7 +1392,7 @@ static void __init xen_boot_params_init_edd(void) op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { op.u.firmware_info.index = nr; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) break; mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; @@ -1698,7 +1698,7 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; - if (HYPERVISOR_dom0_op(&op) == 0) + if (HYPERVISOR_platform_op(&op) == 0) boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; /* Make sure ACS will be enabled */ diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 041d4cda3939..663c2ea449c7 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -139,7 +139,7 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb, op.u.settime.nsecs = now.tv_nsec; op.u.settime.system_time = xen_clocksource_read(); - (void)HYPERVISOR_dom0_op(&op); + (void)HYPERVISOR_platform_op(&op); /* * Move the next drift compensation time 11 minutes diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c index 90307c0b630c..6893c79fd2a1 100644 --- a/drivers/xen/acpi.c +++ b/drivers/xen/acpi.c @@ -58,7 +58,7 @@ static int xen_acpi_notify_hypervisor_state(u8 sleep_state, bits, val_a, val_b)) return -1; - HYPERVISOR_dom0_op(&op); + HYPERVISOR_platform_op(&op); return 1; } diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index f745db270171..be7e56a338e8 100644 --- a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -42,7 +42,7 @@ static efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { struct xen_platform_op op = INIT_EFI_OP(get_time); - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; if (tm) { @@ -67,7 +67,7 @@ static efi_status_t xen_efi_set_time(efi_time_t *tm) BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.set_time)); memcpy(&efi_data(op).u.set_time, tm, sizeof(*tm)); - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; return efi_data(op).status; @@ -79,7 +79,7 @@ static efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, { struct xen_platform_op op = INIT_EFI_OP(get_wakeup_time); - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; if (tm) { @@ -108,7 +108,7 @@ static efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) else efi_data(op).misc |= XEN_EFI_SET_WAKEUP_TIME_ENABLE_ONLY; - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; return efi_data(op).status; @@ -129,7 +129,7 @@ static efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_data(op).u.get_variable.size = *data_size; set_xen_guest_handle(efi_data(op).u.get_variable.data, data); - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; *data_size = efi_data(op).u.get_variable.size; @@ -152,7 +152,7 @@ static efi_status_t xen_efi_get_next_variable(unsigned long *name_size, memcpy(&efi_data(op).u.get_next_variable_name.vendor_guid, vendor, sizeof(*vendor)); - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; *name_size = efi_data(op).u.get_next_variable_name.size; @@ -178,7 +178,7 @@ static efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_data(op).u.set_variable.size = data_size; set_xen_guest_handle(efi_data(op).u.set_variable.data, data); - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; return efi_data(op).status; @@ -196,7 +196,7 @@ static efi_status_t xen_efi_query_variable_info(u32 attr, efi_data(op).u.query_variable_info.attr = attr; - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; *storage_space = efi_data(op).u.query_variable_info.max_store_size; @@ -210,7 +210,7 @@ static efi_status_t xen_efi_get_next_high_mono_count(u32 *count) { struct xen_platform_op op = INIT_EFI_OP(get_next_high_monotonic_count); - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; *count = efi_data(op).misc; @@ -232,7 +232,7 @@ static efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, efi_data(op).u.update_capsule.capsule_count = count; efi_data(op).u.update_capsule.sg_list = sg_list; - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; return efi_data(op).status; @@ -252,7 +252,7 @@ static efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, capsules); efi_data(op).u.query_capsule_capabilities.capsule_count = count; - if (HYPERVISOR_dom0_op(&op) < 0) + if (HYPERVISOR_platform_op(&op) < 0) return EFI_UNSUPPORTED; *max_size = efi_data(op).u.query_capsule_capabilities.max_capsule_size; @@ -331,7 +331,7 @@ efi_system_table_t __init *xen_efi_probe(void) }; union xenpf_efi_info *info = &op.u.firmware_info.u.efi_info; - if (!xen_initial_domain() || HYPERVISOR_dom0_op(&op) < 0) + if (!xen_initial_domain() || HYPERVISOR_platform_op(&op) < 0) return NULL; /* Here we know that Xen runs on EFI platform. */ @@ -347,7 +347,7 @@ efi_system_table_t __init *xen_efi_probe(void) info->vendor.bufsz = sizeof(vendor); set_xen_guest_handle(info->vendor.name, vendor); - if (HYPERVISOR_dom0_op(&op) == 0) { + if (HYPERVISOR_platform_op(&op) == 0) { efi_systab_xen.fw_vendor = __pa_symbol(vendor); efi_systab_xen.fw_revision = info->vendor.revision; } else @@ -357,14 +357,14 @@ efi_system_table_t __init *xen_efi_probe(void) op.u.firmware_info.type = XEN_FW_EFI_INFO; op.u.firmware_info.index = XEN_FW_EFI_VERSION; - if (HYPERVISOR_dom0_op(&op) == 0) + if (HYPERVISOR_platform_op(&op) == 0) efi_systab_xen.hdr.revision = info->version; op.cmd = XENPF_firmware_info; op.u.firmware_info.type = XEN_FW_EFI_INFO; op.u.firmware_info.index = XEN_FW_EFI_RT_VERSION; - if (HYPERVISOR_dom0_op(&op) == 0) + if (HYPERVISOR_platform_op(&op) == 0) efi.runtime_version = info->version; return &efi_systab_xen; diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index 49e88f2ce7a1..cdc6daa7a9f6 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -78,7 +78,7 @@ static int xen_pcpu_down(uint32_t cpu_id) .u.cpu_ol.cpuid = cpu_id, }; - return HYPERVISOR_dom0_op(&op); + return HYPERVISOR_platform_op(&op); } static int xen_pcpu_up(uint32_t cpu_id) @@ -89,7 +89,7 @@ static int xen_pcpu_up(uint32_t cpu_id) .u.cpu_ol.cpuid = cpu_id, }; - return HYPERVISOR_dom0_op(&op); + return HYPERVISOR_platform_op(&op); } static ssize_t show_online(struct device *dev, @@ -277,7 +277,7 @@ static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu) .u.pcpu_info.xen_cpuid = cpu, }; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) return ret; @@ -364,7 +364,7 @@ int xen_pcpu_id(uint32_t acpi_id) op.cmd = XENPF_get_cpuinfo; while (cpu_id <= max_id) { op.u.pcpu_info.xen_cpuid = cpu_id; - if (HYPERVISOR_dom0_op(&op)) { + if (HYPERVISOR_platform_op(&op)) { cpu_id++; continue; } diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c index f4a369429553..fdc9e67b842d 100644 --- a/drivers/xen/xen-acpi-cpuhotplug.c +++ b/drivers/xen/xen-acpi-cpuhotplug.c @@ -206,7 +206,7 @@ static int xen_hotadd_cpu(struct acpi_processor *pr) op.u.cpu_add.acpi_id = pr->acpi_id; op.u.cpu_add.pxm = pxm; - cpu_id = HYPERVISOR_dom0_op(&op); + cpu_id = HYPERVISOR_platform_op(&op); if (cpu_id < 0) pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n", pr->acpi_id); diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c index f83b754505f8..23d1808fe027 100644 --- a/drivers/xen/xen-acpi-pad.c +++ b/drivers/xen/xen-acpi-pad.c @@ -36,7 +36,7 @@ static int xen_acpi_pad_idle_cpus(unsigned int idle_nums) op.u.core_parking.type = XEN_CORE_PARKING_SET; op.u.core_parking.idle_nums = idle_nums; - return HYPERVISOR_dom0_op(&op); + return HYPERVISOR_platform_op(&op); } static int xen_acpi_pad_idle_cpus_num(void) @@ -46,7 +46,7 @@ static int xen_acpi_pad_idle_cpus_num(void) op.cmd = XENPF_core_parking; op.u.core_parking.type = XEN_CORE_PARKING_GET; - return HYPERVISOR_dom0_op(&op) + return HYPERVISOR_platform_op(&op) ?: op.u.core_parking.idle_nums; } diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 70fa438000af..076970a54f89 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -116,7 +116,7 @@ static int push_cxx_to_hypervisor(struct acpi_processor *_pr) set_xen_guest_handle(op.u.set_pminfo.power.states, dst_cx_states); if (!no_hypercall) - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (!ret) { pr_debug("ACPI CPU%u - C-states uploaded.\n", _pr->acpi_id); @@ -244,7 +244,7 @@ static int push_pxx_to_hypervisor(struct acpi_processor *_pr) } if (!no_hypercall) - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (!ret) { struct acpi_processor_performance *perf; @@ -302,7 +302,7 @@ static unsigned int __init get_max_acpi_id(void) info = &op.u.pcpu_info; info->xen_cpuid = 0; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) return NR_CPUS; @@ -310,7 +310,7 @@ static unsigned int __init get_max_acpi_id(void) last_cpu = op.u.pcpu_info.max_present; for (i = 0; i <= last_cpu; i++) { info->xen_cpuid = i; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) continue; max_acpi_id = max(info->acpi_id, max_acpi_id); diff --git a/drivers/xen/xenfs/xensyms.c b/drivers/xen/xenfs/xensyms.c index f8b12856753f..a03f261b12d8 100644 --- a/drivers/xen/xenfs/xensyms.c +++ b/drivers/xen/xenfs/xensyms.c @@ -31,7 +31,7 @@ static int xensyms_next_sym(struct xensyms *xs) symnum = symdata->symnum; - ret = HYPERVISOR_dom0_op(&xs->op); + ret = HYPERVISOR_platform_op(&xs->op); if (ret < 0) return ret; @@ -50,7 +50,7 @@ static int xensyms_next_sym(struct xensyms *xs) set_xen_guest_handle(symdata->name, xs->name); symdata->symnum--; /* Rewind */ - ret = HYPERVISOR_dom0_op(&xs->op); + ret = HYPERVISOR_platform_op(&xs->op); if (ret < 0) return ret; } diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 167071c290b3..d1331121c0bd 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -48,7 +48,7 @@ #define __HYPERVISOR_set_callbacks 4 #define __HYPERVISOR_fpu_taskswitch 5 #define __HYPERVISOR_sched_op_compat 6 -#define __HYPERVISOR_dom0_op 7 +#define __HYPERVISOR_platform_op 7 #define __HYPERVISOR_set_debugreg 8 #define __HYPERVISOR_get_debugreg 9 #define __HYPERVISOR_update_descriptor 10 From 72d39c691b4269c95547245562bfde8504432407 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 23 Nov 2015 10:37:12 +0000 Subject: [PATCH 07/18] xen/arm: introduce HYPERVISOR_platform_op on arm and arm64 Signed-off-by: Stefano Stabellini --- arch/arm/include/asm/xen/hypercall.h | 7 +++++++ arch/arm/include/asm/xen/interface.h | 2 ++ arch/arm/xen/enlighten.c | 1 + arch/arm/xen/hypercall.S | 1 + arch/arm64/xen/hypercall.S | 1 + 5 files changed, 12 insertions(+) diff --git a/arch/arm/include/asm/xen/hypercall.h b/arch/arm/include/asm/xen/hypercall.h index 712b50e0a6dc..d769972db8cb 100644 --- a/arch/arm/include/asm/xen/hypercall.h +++ b/arch/arm/include/asm/xen/hypercall.h @@ -35,6 +35,7 @@ #include #include +#include long privcmd_call(unsigned call, unsigned long a1, unsigned long a2, unsigned long a3, @@ -49,6 +50,12 @@ int HYPERVISOR_memory_op(unsigned int cmd, void *arg); int HYPERVISOR_physdev_op(int cmd, void *arg); int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args); int HYPERVISOR_tmem_op(void *arg); +int HYPERVISOR_platform_op_raw(void *arg); +static inline int HYPERVISOR_platform_op(struct xen_platform_op *op) +{ + op->interface_version = XENPF_INTERFACE_VERSION; + return HYPERVISOR_platform_op_raw(op); +} int HYPERVISOR_multicall(struct multicall_entry *calls, uint32_t nr); static inline int diff --git a/arch/arm/include/asm/xen/interface.h b/arch/arm/include/asm/xen/interface.h index 50066006e6bd..4dfd6d43aab2 100644 --- a/arch/arm/include/asm/xen/interface.h +++ b/arch/arm/include/asm/xen/interface.h @@ -27,6 +27,8 @@ (hnd).p = val; \ } while (0) +#define __HYPERVISOR_platform_op_raw __HYPERVISOR_platform_op + #ifndef __ASSEMBLY__ /* Explicitly size integers that represent pfns in the interface with * Xen so that we can have one ABI that works for 32 and 64 bit guests. diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index ebbfa64e4add..64f17264f2f5 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -326,5 +326,6 @@ EXPORT_SYMBOL_GPL(HYPERVISOR_memory_op); EXPORT_SYMBOL_GPL(HYPERVISOR_physdev_op); EXPORT_SYMBOL_GPL(HYPERVISOR_vcpu_op); EXPORT_SYMBOL_GPL(HYPERVISOR_tmem_op); +EXPORT_SYMBOL_GPL(HYPERVISOR_platform_op); EXPORT_SYMBOL_GPL(HYPERVISOR_multicall); EXPORT_SYMBOL_GPL(privcmd_call); diff --git a/arch/arm/xen/hypercall.S b/arch/arm/xen/hypercall.S index 10fd99c568c6..9a36f4f49c10 100644 --- a/arch/arm/xen/hypercall.S +++ b/arch/arm/xen/hypercall.S @@ -89,6 +89,7 @@ HYPERCALL2(memory_op); HYPERCALL2(physdev_op); HYPERCALL3(vcpu_op); HYPERCALL1(tmem_op); +HYPERCALL1(platform_op_raw); HYPERCALL2(multicall); ENTRY(privcmd_call) diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 8bbe9401f4f0..70df80e8da2c 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -80,6 +80,7 @@ HYPERCALL2(memory_op); HYPERCALL2(physdev_op); HYPERCALL3(vcpu_op); HYPERCALL1(tmem_op); +HYPERCALL1(platform_op_raw); HYPERCALL2(multicall); ENTRY(privcmd_call) From f3d6027ee0568b5442077120beeb5d9d17c2d0da Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 23 Nov 2015 10:38:12 +0000 Subject: [PATCH 08/18] xen: introduce XENPF_settime64 Rename the current XENPF_settime hypercall and related struct to XENPF_settime32. Signed-off-by: Stefano Stabellini Acked-by: Arnd Bergmann Reviewed-by: Boris Ostrovsky --- arch/x86/xen/time.c | 8 ++++---- include/xen/interface/platform.h | 18 ++++++++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 663c2ea449c7..3bbd377e1657 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -134,10 +134,10 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb, if (!was_set && timespec_compare(&now, &next_sync) < 0) return NOTIFY_OK; - op.cmd = XENPF_settime; - op.u.settime.secs = now.tv_sec; - op.u.settime.nsecs = now.tv_nsec; - op.u.settime.system_time = xen_clocksource_read(); + op.cmd = XENPF_settime32; + op.u.settime32.secs = now.tv_sec; + op.u.settime32.nsecs = now.tv_nsec; + op.u.settime32.system_time = xen_clocksource_read(); (void)HYPERVISOR_platform_op(&op); diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h index 8e035871360e..732efb08c3e1 100644 --- a/include/xen/interface/platform.h +++ b/include/xen/interface/platform.h @@ -35,14 +35,23 @@ * Set clock such that it would read after 00:00:00 UTC, * 1 January, 1970 if the current system time was . */ -#define XENPF_settime 17 -struct xenpf_settime { +#define XENPF_settime32 17 +struct xenpf_settime32 { /* IN variables. */ uint32_t secs; uint32_t nsecs; uint64_t system_time; }; -DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime_t); +DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime32_t); +#define XENPF_settime64 62 +struct xenpf_settime64 { + /* IN variables. */ + uint64_t secs; + uint32_t nsecs; + uint32_t mbz; + uint64_t system_time; +}; +DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime64_t); /* * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type. @@ -495,7 +504,8 @@ struct xen_platform_op { uint32_t cmd; uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ union { - struct xenpf_settime settime; + struct xenpf_settime32 settime32; + struct xenpf_settime64 settime64; struct xenpf_add_memtype add_memtype; struct xenpf_del_memtype del_memtype; struct xenpf_read_memtype read_memtype; From ab76078a3d43226288fa43489b76b1416975705f Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 23 Nov 2015 10:39:12 +0000 Subject: [PATCH 09/18] arm: extend pvclock_wall_clock with sec_hi The hypervisor actually exposes an additional field to struct pvclock_wall_clock, with the high 32 bit seconds. Signed-off-by: Stefano Stabellini Reviewed-by: Julien Grall --- arch/arm/include/asm/xen/interface.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/xen/interface.h b/arch/arm/include/asm/xen/interface.h index 4dfd6d43aab2..75d596862892 100644 --- a/arch/arm/include/asm/xen/interface.h +++ b/arch/arm/include/asm/xen/interface.h @@ -78,6 +78,7 @@ struct pvclock_wall_clock { u32 version; u32 sec; u32 nsec; + u32 sec_hi; } __attribute__((__packed__)); #endif From e709fba132db696bbc21fca2e7f736198ec53eda Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 23 Nov 2015 10:40:12 +0000 Subject: [PATCH 10/18] xen/arm: introduce xen_read_wallclock Read the wallclock from the shared info page at boot time. Signed-off-by: Stefano Stabellini Acked-by: Arnd Bergmann --- arch/arm/xen/enlighten.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 64f17264f2f5..6370222b8053 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -93,6 +94,27 @@ static unsigned long long xen_stolen_accounting(int cpu) return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline]; } +static void xen_read_wallclock(struct timespec64 *ts) +{ + u32 version; + struct timespec64 now, ts_monotonic; + struct shared_info *s = HYPERVISOR_shared_info; + struct pvclock_wall_clock *wall_clock = &(s->wc); + + /* get wallclock at system boot */ + do { + version = wall_clock->version; + rmb(); /* fetch version before time */ + now.tv_sec = ((uint64_t)wall_clock->sec_hi << 32) | wall_clock->sec; + now.tv_nsec = wall_clock->nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->version & 1) || (version != wall_clock->version)); + + /* time since system boot */ + ktime_get_ts64(&ts_monotonic); + *ts = timespec64_add(now, ts_monotonic); +} + static void xen_percpu_init(void) { struct vcpu_register_vcpu_info info; @@ -301,6 +323,11 @@ static int __init xen_pm_init(void) pm_power_off = xen_power_off; arm_pm_restart = xen_restart; + if (!xen_initial_domain()) { + struct timespec64 ts; + xen_read_wallclock(&ts); + do_settimeofday64(&ts); + } return 0; } From 7d5f6f81ddbb5b532a832cbeb65472541b22a7c2 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 23 Nov 2015 10:41:12 +0000 Subject: [PATCH 11/18] xen/arm: set the system time in Xen via the XENPF_settime64 hypercall If Linux is running as dom0, call XENPF_settime64 to update the system time in Xen on pvclock_gtod notifications. Signed-off-by: Stefano Stabellini Acked-by: Arnd Bergmann --- arch/arm/xen/enlighten.c | 48 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 6370222b8053..75cd7345c654 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -26,7 +26,10 @@ #include #include #include +#include +#include #include +#include #include @@ -115,6 +118,49 @@ static void xen_read_wallclock(struct timespec64 *ts) *ts = timespec64_add(now, ts_monotonic); } +static int xen_pvclock_gtod_notify(struct notifier_block *nb, + unsigned long was_set, void *priv) +{ + /* Protected by the calling core code serialization */ + static struct timespec64 next_sync; + + struct xen_platform_op op; + struct timespec64 now, system_time; + struct timekeeper *tk = priv; + + now.tv_sec = tk->xtime_sec; + now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + system_time = timespec64_add(now, tk->wall_to_monotonic); + + /* + * We only take the expensive HV call when the clock was set + * or when the 11 minutes RTC synchronization time elapsed. + */ + if (!was_set && timespec64_compare(&now, &next_sync) < 0) + return NOTIFY_OK; + + op.cmd = XENPF_settime64; + op.u.settime64.mbz = 0; + op.u.settime64.secs = now.tv_sec; + op.u.settime64.nsecs = now.tv_nsec; + op.u.settime64.system_time = timespec64_to_ns(&system_time); + (void)HYPERVISOR_platform_op(&op); + + /* + * Move the next drift compensation time 11 minutes + * ahead. That's emulating the sync_cmos_clock() update for + * the hardware RTC. + */ + next_sync = now; + next_sync.tv_sec += 11 * 60; + + return NOTIFY_OK; +} + +static struct notifier_block xen_pvclock_gtod_notifier = { + .notifier_call = xen_pvclock_gtod_notify, +}; + static void xen_percpu_init(void) { struct vcpu_register_vcpu_info info; @@ -311,6 +357,8 @@ static int __init xen_guest_init(void) pv_time_ops.steal_clock = xen_stolen_accounting; static_key_slow_inc(¶virt_steal_enabled); + if (xen_initial_domain()) + pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); return 0; } From 760968631323f710ea0824369bbd65f812c82f08 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 23 Nov 2015 10:42:12 +0000 Subject: [PATCH 12/18] xen/x86: support XENPF_settime64 Try XENPF_settime64 first, if it is not available fall back to XENPF_settime32. No need to call __current_kernel_time() when all the info needed are already passed via the struct timekeeper * argument. Return NOTIFY_BAD in case of errors. Signed-off-by: Stefano Stabellini Acked-by: Arnd Bergmann Reviewed-by: Boris Ostrovsky --- arch/x86/xen/time.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 3bbd377e1657..4b8af45b211a 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -123,9 +124,13 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb, static struct timespec next_sync; struct xen_platform_op op; - struct timespec now; + struct timespec64 now; + struct timekeeper *tk = priv; + static bool settime64_supported = true; + int ret; - now = __current_kernel_time(); + now.tv_sec = tk->xtime_sec; + now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); /* * We only take the expensive HV call when the clock was set @@ -134,12 +139,28 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb, if (!was_set && timespec_compare(&now, &next_sync) < 0) return NOTIFY_OK; - op.cmd = XENPF_settime32; - op.u.settime32.secs = now.tv_sec; - op.u.settime32.nsecs = now.tv_nsec; - op.u.settime32.system_time = xen_clocksource_read(); +again: + if (settime64_supported) { + op.cmd = XENPF_settime64; + op.u.settime64.mbz = 0; + op.u.settime64.secs = now.tv_sec; + op.u.settime64.nsecs = now.tv_nsec; + op.u.settime64.system_time = xen_clocksource_read(); + } else { + op.cmd = XENPF_settime32; + op.u.settime32.secs = now.tv_sec; + op.u.settime32.nsecs = now.tv_nsec; + op.u.settime32.system_time = xen_clocksource_read(); + } - (void)HYPERVISOR_platform_op(&op); + ret = HYPERVISOR_platform_op(&op); + + if (ret == -ENOSYS && settime64_supported) { + settime64_supported = false; + goto again; + } + if (ret < 0) + return NOTIFY_BAD; /* * Move the next drift compensation time 11 minutes From 187b26a97244b1083d573175650f41b2267ac635 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 24 Nov 2015 14:53:02 +0000 Subject: [PATCH 13/18] xen/x86: convert remaining timespec to timespec64 in xen_pvclock_gtod_notify Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky --- arch/x86/xen/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 4b8af45b211a..a0a4e554c6f1 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -121,7 +121,7 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb, unsigned long was_set, void *priv) { /* Protected by the calling core code serialization */ - static struct timespec next_sync; + static struct timespec64 next_sync; struct xen_platform_op op; struct timespec64 now; @@ -136,7 +136,7 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb, * We only take the expensive HV call when the clock was set * or when the 11 minutes RTC synchronization time elapsed. */ - if (!was_set && timespec_compare(&now, &next_sync) < 0) + if (!was_set && timespec64_compare(&now, &next_sync) < 0) return NOTIFY_OK; again: From 2dd887e32175b624375570a0361083eb2cd64a07 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 20 Nov 2015 15:02:44 +0000 Subject: [PATCH 14/18] xen/time: use READ_ONCE Use READ_ONCE through the code, rather than explicit barriers. Suggested-by: Mark Rutland Signed-off-by: Stefano Stabellini Signed-off-by: David Vrabel --- drivers/xen/time.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/xen/time.c b/drivers/xen/time.c index 433fe247c5ff..71078425c9ea 100644 --- a/drivers/xen/time.c +++ b/drivers/xen/time.c @@ -25,7 +25,7 @@ static u64 get64(const u64 *p) if (BITS_PER_LONG < 64) { u32 *p32 = (u32 *)p; - u32 h, l; + u32 h, l, h2; /* * Read high then low, and then make sure high is @@ -34,15 +34,14 @@ static u64 get64(const u64 *p) * XXX some clean way to make this endian-proof? */ do { - h = p32[1]; - barrier(); - l = p32[0]; - barrier(); - } while (p32[1] != h); + h = READ_ONCE(p32[1]); + l = READ_ONCE(p32[0]); + h2 = READ_ONCE(p32[1]); + } while(h2 != h); ret = (((u64)h) << 32) | l; } else - ret = *p; + ret = READ_ONCE(*p); return ret; } @@ -66,9 +65,7 @@ void xen_get_runstate_snapshot(struct vcpu_runstate_info *res) */ do { state_time = get64(&state->state_entry_time); - barrier(); - *res = *state; - barrier(); + *res = READ_ONCE(*state); } while (get64(&state->state_entry_time) != state_time); } From 86fc2136736d2767bf797e6d2b1f80b49f52953c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 28 Nov 2015 15:28:40 +0100 Subject: [PATCH 15/18] xen/grant-table: constify gnttab_ops structure The gnttab_ops structure is never modified, so declare it as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: David Vrabel --- drivers/xen/grant-table.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index c49f79ed58c5..effbaf91791f 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -128,7 +128,7 @@ struct unmap_refs_callback_data { int result; }; -static struct gnttab_ops *gnttab_interface; +static const struct gnttab_ops *gnttab_interface; static int grant_table_version; static int grefs_per_grant_frame; @@ -1013,7 +1013,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) return rc; } -static struct gnttab_ops gnttab_v1_ops = { +static const struct gnttab_ops gnttab_v1_ops = { .map_frames = gnttab_map_frames_v1, .unmap_frames = gnttab_unmap_frames_v1, .update_entry = gnttab_update_entry_v1, From b9c0a92a9aa953e5a98f2af2098c747d4358c7bb Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 29 Nov 2015 23:02:49 +0100 Subject: [PATCH 16/18] xen/gntdev: constify mmu_notifier_ops structures This mmu_notifier_ops structure is never modified, so declare it as const, like the other mmu_notifier_ops structures. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: David Vrabel --- drivers/xen/gntdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 1be5dd048622..cbd3836ec8fa 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -518,7 +518,7 @@ static void mn_release(struct mmu_notifier *mn, mutex_unlock(&priv->lock); } -static struct mmu_notifier_ops gntdev_mmu_ops = { +static const struct mmu_notifier_ops gntdev_mmu_ops = { .release = mn_release, .invalidate_page = mn_invl_page, .invalidate_range_start = mn_invl_range_start, From 6a1f513776b78c994045287073e55bae44ed9f8c Mon Sep 17 00:00:00 2001 From: "Ouyang Zhaowei (Charles)" Date: Wed, 6 May 2015 09:47:04 +0800 Subject: [PATCH 17/18] x86/xen: don't reset vcpu_info on a cancelled suspend On a cancelled suspend the vcpu_info location does not change (it's still in the per-cpu area registered by xen_vcpu_setup()). So do not call xen_hvm_init_shared_info() which would make the kernel think its back in the shared info. With the wrong vcpu_info, events cannot be received and the domain will hang after a cancelled suspend. Signed-off-by: Charles Ouyang Reviewed-by: Boris Ostrovsky Cc: Signed-off-by: David Vrabel --- arch/x86/xen/suspend.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 3705eabd7e22..5df55833b529 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -33,7 +33,8 @@ static void xen_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_init_shared_info(); + if (!suspend_cancelled) + xen_hvm_init_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { From a4cdb556cae05cd3e7b602b3a44c01420c4e2258 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 2 Dec 2014 16:13:26 +0000 Subject: [PATCH 18/18] xen/gntdev: add ioctl for grant copy Add IOCTL_GNTDEV_GRANT_COPY to allow applications to copy between user space buffers and grant references. This interface is similar to the GNTTABOP_copy hypercall ABI except the local buffers are provided using a virtual address (instead of a GFN and offset). To avoid userspace from having to page align its buffers the driver will use two or more ops if required. If the ioctl returns 0, the application must check the status of each segment with the segments status field. If the ioctl returns a -ve error code (EINVAL or EFAULT), the status of individual ops is undefined. Signed-off-by: David Vrabel Reviewed-by: Boris Ostrovsky --- drivers/xen/gntdev.c | 203 ++++++++++++++++++++++++++++++++++++++ include/uapi/xen/gntdev.h | 50 ++++++++++ 2 files changed, 253 insertions(+) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index cbd3836ec8fa..dc495383ad73 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -748,6 +748,206 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) return rc; } +#define GNTDEV_COPY_BATCH 24 + +struct gntdev_copy_batch { + struct gnttab_copy ops[GNTDEV_COPY_BATCH]; + struct page *pages[GNTDEV_COPY_BATCH]; + s16 __user *status[GNTDEV_COPY_BATCH]; + unsigned int nr_ops; + unsigned int nr_pages; +}; + +static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, + bool writeable, unsigned long *gfn) +{ + unsigned long addr = (unsigned long)virt; + struct page *page; + unsigned long xen_pfn; + int ret; + + ret = get_user_pages_fast(addr, 1, writeable, &page); + if (ret < 0) + return ret; + + batch->pages[batch->nr_pages++] = page; + + xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(addr & ~PAGE_MASK); + *gfn = pfn_to_gfn(xen_pfn); + + return 0; +} + +static void gntdev_put_pages(struct gntdev_copy_batch *batch) +{ + unsigned int i; + + for (i = 0; i < batch->nr_pages; i++) + put_page(batch->pages[i]); + batch->nr_pages = 0; +} + +static int gntdev_copy(struct gntdev_copy_batch *batch) +{ + unsigned int i; + + gnttab_batch_copy(batch->ops, batch->nr_ops); + gntdev_put_pages(batch); + + /* + * For each completed op, update the status if the op failed + * and all previous ops for the segment were successful. + */ + for (i = 0; i < batch->nr_ops; i++) { + s16 status = batch->ops[i].status; + s16 old_status; + + if (status == GNTST_okay) + continue; + + if (__get_user(old_status, batch->status[i])) + return -EFAULT; + + if (old_status != GNTST_okay) + continue; + + if (__put_user(status, batch->status[i])) + return -EFAULT; + } + + batch->nr_ops = 0; + return 0; +} + +static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch, + struct gntdev_grant_copy_segment *seg, + s16 __user *status) +{ + uint16_t copied = 0; + + /* + * Disallow local -> local copies since there is only space in + * batch->pages for one page per-op and this would be a very + * expensive memcpy(). + */ + if (!(seg->flags & (GNTCOPY_source_gref | GNTCOPY_dest_gref))) + return -EINVAL; + + /* Can't cross page if source/dest is a grant ref. */ + if (seg->flags & GNTCOPY_source_gref) { + if (seg->source.foreign.offset + seg->len > XEN_PAGE_SIZE) + return -EINVAL; + } + if (seg->flags & GNTCOPY_dest_gref) { + if (seg->dest.foreign.offset + seg->len > XEN_PAGE_SIZE) + return -EINVAL; + } + + if (put_user(GNTST_okay, status)) + return -EFAULT; + + while (copied < seg->len) { + struct gnttab_copy *op; + void __user *virt; + size_t len, off; + unsigned long gfn; + int ret; + + if (batch->nr_ops >= GNTDEV_COPY_BATCH) { + ret = gntdev_copy(batch); + if (ret < 0) + return ret; + } + + len = seg->len - copied; + + op = &batch->ops[batch->nr_ops]; + op->flags = 0; + + if (seg->flags & GNTCOPY_source_gref) { + op->source.u.ref = seg->source.foreign.ref; + op->source.domid = seg->source.foreign.domid; + op->source.offset = seg->source.foreign.offset + copied; + op->flags |= GNTCOPY_source_gref; + } else { + virt = seg->source.virt + copied; + off = (unsigned long)virt & ~XEN_PAGE_MASK; + len = min(len, (size_t)XEN_PAGE_SIZE - off); + + ret = gntdev_get_page(batch, virt, false, &gfn); + if (ret < 0) + return ret; + + op->source.u.gmfn = gfn; + op->source.domid = DOMID_SELF; + op->source.offset = off; + } + + if (seg->flags & GNTCOPY_dest_gref) { + op->dest.u.ref = seg->dest.foreign.ref; + op->dest.domid = seg->dest.foreign.domid; + op->dest.offset = seg->dest.foreign.offset + copied; + op->flags |= GNTCOPY_dest_gref; + } else { + virt = seg->dest.virt + copied; + off = (unsigned long)virt & ~XEN_PAGE_MASK; + len = min(len, (size_t)XEN_PAGE_SIZE - off); + + ret = gntdev_get_page(batch, virt, true, &gfn); + if (ret < 0) + return ret; + + op->dest.u.gmfn = gfn; + op->dest.domid = DOMID_SELF; + op->dest.offset = off; + } + + op->len = len; + copied += len; + + batch->status[batch->nr_ops] = status; + batch->nr_ops++; + } + + return 0; +} + +static long gntdev_ioctl_grant_copy(struct gntdev_priv *priv, void __user *u) +{ + struct ioctl_gntdev_grant_copy copy; + struct gntdev_copy_batch batch; + unsigned int i; + int ret = 0; + + if (copy_from_user(©, u, sizeof(copy))) + return -EFAULT; + + batch.nr_ops = 0; + batch.nr_pages = 0; + + for (i = 0; i < copy.count; i++) { + struct gntdev_grant_copy_segment seg; + + if (copy_from_user(&seg, ©.segments[i], sizeof(seg))) { + ret = -EFAULT; + goto out; + } + + ret = gntdev_grant_copy_seg(&batch, &seg, ©.segments[i].status); + if (ret < 0) + goto out; + + cond_resched(); + } + if (batch.nr_ops) + ret = gntdev_copy(&batch); + return ret; + + out: + gntdev_put_pages(&batch); + return ret; +} + static long gntdev_ioctl(struct file *flip, unsigned int cmd, unsigned long arg) { @@ -767,6 +967,9 @@ static long gntdev_ioctl(struct file *flip, case IOCTL_GNTDEV_SET_UNMAP_NOTIFY: return gntdev_ioctl_notify(priv, ptr); + case IOCTL_GNTDEV_GRANT_COPY: + return gntdev_ioctl_grant_copy(priv, ptr); + default: pr_debug("priv %p, unknown cmd %x\n", priv, cmd); return -ENOIOCTLCMD; diff --git a/include/uapi/xen/gntdev.h b/include/uapi/xen/gntdev.h index aa7610a9b867..d0661977667e 100644 --- a/include/uapi/xen/gntdev.h +++ b/include/uapi/xen/gntdev.h @@ -144,6 +144,56 @@ struct ioctl_gntdev_unmap_notify { __u32 event_channel_port; }; +struct gntdev_grant_copy_segment { + union { + void __user *virt; + struct { + grant_ref_t ref; + __u16 offset; + domid_t domid; + } foreign; + } source, dest; + __u16 len; + + __u16 flags; /* GNTCOPY_* */ + __s16 status; /* GNTST_* */ +}; + +/* + * Copy between grant references and local buffers. + * + * The copy is split into @count @segments, each of which can copy + * to/from one grant reference. + * + * Each segment is similar to struct gnttab_copy in the hypervisor ABI + * except the local buffer is specified using a virtual address + * (instead of a GFN and offset). + * + * The local buffer may cross a Xen page boundary -- the driver will + * split segments into multiple ops if required. + * + * Returns 0 if all segments have been processed and @status in each + * segment is valid. Note that one or more segments may have failed + * (status != GNTST_okay). + * + * If the driver had to split a segment into two or more ops, @status + * includes the status of the first failed op for that segment (or + * GNTST_okay if all ops were successful). + * + * If -1 is returned, the status of all segments is undefined. + * + * EINVAL: A segment has local buffers for both source and + * destination. + * EINVAL: A segment crosses the boundary of a foreign page. + * EFAULT: A segment's local buffer is not accessible. + */ +#define IOCTL_GNTDEV_GRANT_COPY \ + _IOC(_IOC_NONE, 'G', 8, sizeof(struct ioctl_gntdev_grant_copy)) +struct ioctl_gntdev_grant_copy { + unsigned int count; + struct gntdev_grant_copy_segment __user *segments; +}; + /* Clear (set to zero) the byte specified by index */ #define UNMAP_NOTIFY_CLEAR_BYTE 0x1 /* Send an interrupt on the indicated event channel */