From 59ea746337c69f6a5f1bc4d5e8544b3cbf12f801 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Jun 2008 13:56:40 +0200 Subject: [PATCH 001/138] MM: virtual address debug Add some (configurable) expensive sanity checking to catch wrong address translations on x86. - create linux/mmdebug.h file to be able include this file in asm headers to not get unsolvable loops in header files - __phys_addr on x86_32 became a function in ioremap.c since PAGE_OFFSET, is_vmalloc_addr and VMALLOC_* non-constasts are undefined if declared in page_32.h - add __phys_addr_const for initializing doublefault_tss.__cr3 Tested on 386, 386pae, x86_64 and x86_64 numa=fake=2. Contains Andi's enable numa virtual address debug patch. Signed-off-by: Jiri Slaby Cc: Andi Kleen Signed-off-by: Ingo Molnar --- arch/x86/kernel/doublefault_32.c | 2 +- arch/x86/mm/ioremap.c | 33 ++++++++++++++++++++++++-------- include/asm-x86/mmzone_64.h | 2 +- include/asm-x86/page_32.h | 3 ++- include/linux/mm.h | 7 +------ include/linux/mmdebug.h | 18 +++++++++++++++++ lib/Kconfig.debug | 9 +++++++++ mm/vmalloc.c | 5 +++++ 8 files changed, 62 insertions(+), 17 deletions(-) create mode 100644 include/linux/mmdebug.h diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index a47798b59f07..395acb12b0d1 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = { .ds = __USER_DS, .fs = __KERNEL_PERCPU, - .__cr3 = __pa(swapper_pg_dir) + .__cr3 = __phys_addr_const((unsigned long)swapper_pg_dir) } }; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 2b2bb3f9b683..a78ffef62a2b 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -23,19 +23,27 @@ #ifdef CONFIG_X86_64 -unsigned long __phys_addr(unsigned long x) -{ - if (x >= __START_KERNEL_map) - return x - __START_KERNEL_map + phys_base; - return x - PAGE_OFFSET; -} -EXPORT_SYMBOL(__phys_addr); - static inline int phys_addr_valid(unsigned long addr) { return addr < (1UL << boot_cpu_data.x86_phys_bits); } +unsigned long __phys_addr(unsigned long x) +{ + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); + x += phys_base; + } else { + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + x -= PAGE_OFFSET; + VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : + !phys_addr_valid(x)); + } + return x; +} +EXPORT_SYMBOL(__phys_addr); + #else static inline int phys_addr_valid(unsigned long addr) @@ -43,6 +51,15 @@ static inline int phys_addr_valid(unsigned long addr) return 1; } +unsigned long __phys_addr(unsigned long x) +{ + /* VMALLOC_* aren't constants; not available at the boot time */ + VIRTUAL_BUG_ON(x < PAGE_OFFSET || (system_state != SYSTEM_BOOTING && + is_vmalloc_addr((void *)x))); + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); + #endif int page_is_ram(unsigned long pagenr) diff --git a/include/asm-x86/mmzone_64.h b/include/asm-x86/mmzone_64.h index 594bd0dc1d08..facde3e5314f 100644 --- a/include/asm-x86/mmzone_64.h +++ b/include/asm-x86/mmzone_64.h @@ -7,7 +7,7 @@ #ifdef CONFIG_NUMA -#define VIRTUAL_BUG_ON(x) +#include #include diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 424e82f8ae27..9159bfb9dcf9 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -64,7 +64,8 @@ typedef struct page *pgtable_t; #endif #ifndef __ASSEMBLY__ -#define __phys_addr(x) ((x) - PAGE_OFFSET) +#define __phys_addr_const(x) ((x) - PAGE_OFFSET) +extern unsigned long __phys_addr(unsigned long); #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) #ifdef CONFIG_FLATMEM diff --git a/include/linux/mm.h b/include/linux/mm.h index 586a943cab01..3414a8813e97 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -210,12 +211,6 @@ struct inode; */ #include -#ifdef CONFIG_DEBUG_VM -#define VM_BUG_ON(cond) BUG_ON(cond) -#else -#define VM_BUG_ON(condition) do { } while(0) -#endif - /* * Methods to modify the page usage count. * diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h new file mode 100644 index 000000000000..860ed1a71bbe --- /dev/null +++ b/include/linux/mmdebug.h @@ -0,0 +1,18 @@ +#ifndef LINUX_MM_DEBUG_H +#define LINUX_MM_DEBUG_H 1 + +#include + +#ifdef CONFIG_DEBUG_VM +#define VM_BUG_ON(cond) BUG_ON(cond) +#else +#define VM_BUG_ON(cond) do { } while(0) +#endif + +#ifdef CONFIG_DEBUG_VIRTUAL +#define VIRTUAL_BUG_ON(cond) BUG_ON(cond) +#else +#define VIRTUAL_BUG_ON(cond) do { } while(0) +#endif + +#endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d2099f41aa1e..9d9dc0ddf13a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -469,6 +469,15 @@ config DEBUG_VM If unsure, say N. +config DEBUG_VIRTUAL + bool "Debug VM translations" + depends on DEBUG_KERNEL && X86 + help + Enable some costly sanity checks in virtual to page code. This can + catch mistakes with virt_to_page() and friends. + + If unsure, say N. + config DEBUG_WRITECOUNT bool "Debug filesystem writers count" depends on DEBUG_KERNEL diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6e45b0f3d125..dc41e9c8ca6f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -180,6 +180,11 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) pmd_t *pmd; pte_t *ptep, pte; + /* XXX we might need to change this if we add VIRTUAL_BUG_ON for + * architectures that do not vmalloc module space */ + VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) && + !is_module_address(addr)); + if (!pgd_none(*pgd)) { pud = pud_offset(pgd, addr); if (!pud_none(*pud)) { From a1bf9631be7332ce0641e299ddafad2d8223100f Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Jun 2008 13:56:40 +0200 Subject: [PATCH 002/138] x86, MM: virtual address debug, v2 I've removed the test from phys_to_nid and made a function from __phys_addr only when the debugging is enabled (on x86_32). Signed-off-by: Jiri Slaby Cc: tglx@linutronix.de Cc: hpa@zytor.com Cc: Mike Travis Cc: Nick Piggin Cc: Cc: linux-mm@kvack.org Cc: Jiri Slaby Cc: Andi Kleen Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 ++ include/asm-x86/mmzone_64.h | 1 - include/asm-x86/page_32.h | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index a78ffef62a2b..9dd3cb905971 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -51,6 +51,7 @@ static inline int phys_addr_valid(unsigned long addr) return 1; } +#ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { /* VMALLOC_* aren't constants; not available at the boot time */ @@ -59,6 +60,7 @@ unsigned long __phys_addr(unsigned long x) return x - PAGE_OFFSET; } EXPORT_SYMBOL(__phys_addr); +#endif #endif diff --git a/include/asm-x86/mmzone_64.h b/include/asm-x86/mmzone_64.h index facde3e5314f..5e3a6cbddb49 100644 --- a/include/asm-x86/mmzone_64.h +++ b/include/asm-x86/mmzone_64.h @@ -29,7 +29,6 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) { unsigned nid; VIRTUAL_BUG_ON(!memnodemap); - VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize); nid = memnodemap[addr >> memnode_shift]; VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); return nid; diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 9159bfb9dcf9..71a2e424e584 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -65,7 +65,11 @@ typedef struct page *pgtable_t; #ifndef __ASSEMBLY__ #define __phys_addr_const(x) ((x) - PAGE_OFFSET) +#ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); +#else +#define __phys_addr(x) ((x) - PAGE_OFFSET) +#endif #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) #ifdef CONFIG_FLATMEM From 7aa413def76146f7b3784228556d9e4bc562eab3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Jun 2008 13:28:11 +0200 Subject: [PATCH 003/138] x86, MM: virtual address debug, cleanups Signed-off-by: Ingo Molnar --- include/linux/mmdebug.h | 4 ++-- mm/vmalloc.c | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 860ed1a71bbe..8a5509877192 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -6,13 +6,13 @@ #ifdef CONFIG_DEBUG_VM #define VM_BUG_ON(cond) BUG_ON(cond) #else -#define VM_BUG_ON(cond) do { } while(0) +#define VM_BUG_ON(cond) do { } while (0) #endif #ifdef CONFIG_DEBUG_VIRTUAL #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) #else -#define VIRTUAL_BUG_ON(cond) do { } while(0) +#define VIRTUAL_BUG_ON(cond) do { } while (0) #endif #endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dc41e9c8ca6f..830a5580c5d7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -180,8 +180,10 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) pmd_t *pmd; pte_t *ptep, pte; - /* XXX we might need to change this if we add VIRTUAL_BUG_ON for - * architectures that do not vmalloc module space */ + /* + * XXX we might need to change this if we add VIRTUAL_BUG_ON for + * architectures that do not vmalloc module space + */ VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) && !is_module_address(addr)); From 38ffbe66d59051fd9cfcfc8545f164700e2fa3bc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 23 Jul 2008 14:21:18 -0700 Subject: [PATCH 004/138] x86/paravirt/xen: properly fill out the ldt ops LTP testing showed that Xen does not properly implement sys_modify_ldt(). This patch does the final little bits needed to make the ldt work properly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/ldt.c | 9 ++++++++- arch/x86/kernel/paravirt.c | 4 ++++ arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++ include/asm-x86/desc.h | 10 +++++++++- include/asm-x86/paravirt.h | 13 +++++++++++++ 5 files changed, 57 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 3fee2aa50f3f..4895e0634d22 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -51,6 +51,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, (mincount - oldsize) * LDT_ENTRY_SIZE); + paravirt_alloc_ldt(newldt, mincount); + #ifdef CONFIG_X86_64 /* CHECKME: Do we really need this ? */ wmb(); @@ -75,6 +77,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #endif } if (oldsize) { + paravirt_free_ldt(oldldt, oldsize); if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else @@ -86,10 +89,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { int err = alloc_ldt(new, old->size, 0); + int i; if (err < 0) return err; - memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); + + for(i = 0; i < old->size; i++) + write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); return 0; } @@ -126,6 +132,7 @@ void destroy_context(struct mm_struct *mm) if (mm == current->active_mm) clear_LDT(); #endif + paravirt_free_ldt(mm->context.ldt, mm->context.size); if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 94da4d52d798..d8f2277be5a0 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -348,6 +348,10 @@ struct pv_cpu_ops pv_cpu_ops = { .write_ldt_entry = native_write_ldt_entry, .write_gdt_entry = native_write_gdt_entry, .write_idt_entry = native_write_idt_entry, + + .alloc_ldt = paravirt_nop, + .free_ldt = paravirt_nop, + .load_sp0 = native_load_sp0, #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08f..06219e60e9c8 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -325,6 +325,26 @@ static unsigned long xen_store_tr(void) return 0; } +static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ + unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); + void *v = ldt; + int i; + + for(i = 0; i < pages; i += PAGE_SIZE) + make_lowmem_page_readonly(v + i); +} + +static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) +{ + unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); + void *v = ldt; + int i; + + for(i = 0; i < pages; i += PAGE_SIZE) + make_lowmem_page_readwrite(v + i); +} + static void xen_set_ldt(const void *addr, unsigned entries) { struct mmuext_op *op; @@ -1220,6 +1240,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .load_gs_index = xen_load_gs_index, #endif + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, + .store_gdt = native_store_gdt, .store_idt = native_store_idt, .store_tr = xen_store_tr, diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h index a44c4dc70590..24a524f5e1a2 100644 --- a/include/asm-x86/desc.h +++ b/include/asm-x86/desc.h @@ -97,7 +97,15 @@ static inline int desc_empty(const void *ptr) native_write_gdt_entry(dt, entry, desc, type) #define write_idt_entry(dt, entry, g) \ native_write_idt_entry(dt, entry, g) -#endif + +static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) +{ +} +#endif /* CONFIG_PARAVIRT */ static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate) diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index fbbde93f12d6..db9b0647b346 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -124,6 +124,9 @@ struct pv_cpu_ops { int entrynum, const void *desc, int size); void (*write_idt_entry)(gate_desc *, int entrynum, const gate_desc *gate); + void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); + void (*free_ldt)(struct desc_struct *ldt, unsigned entries); + void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); void (*set_iopl_mask)(unsigned mask); @@ -824,6 +827,16 @@ do { \ (aux) = __aux; \ } while (0) +static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ + PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries); +} + +static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) +{ + PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries); +} + static inline void load_TR_desc(void) { PVOP_VCALL0(pv_cpu_ops.load_tr_desc); From d5de8841355a48f7f634a04507185eaf1f9755e3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 23 Jul 2008 13:28:58 -0700 Subject: [PATCH 005/138] x86: split spinlock implementations out into their own files ftrace requires certain low-level code, like spinlocks and timestamps, to be compiled without -pg in order to avoid infinite recursion. This patch splits out the core paravirt spinlocks and the Xen spinlocks into separate files which can be compiled without -pg. Also do xen/time.c while we're about it. As a result, we can now use ftrace within a Xen domain. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 4 +- arch/x86/kernel/paravirt-spinlocks.c | 31 +++++ arch/x86/kernel/paravirt.c | 23 ---- arch/x86/xen/Makefile | 8 +- arch/x86/xen/smp.c | 167 ------------------------ arch/x86/xen/spinlock.c | 183 +++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 3 + 7 files changed, 226 insertions(+), 193 deletions(-) create mode 100644 arch/x86/kernel/paravirt-spinlocks.c create mode 100644 arch/x86/xen/spinlock.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3db651fc8ec5..d679cb2c79b4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg -CFLAGS_REMOVE_paravirt.o = -pg +CFLAGS_REMOVE_paravirt-spinlocks.o = -pg endif # @@ -89,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c new file mode 100644 index 000000000000..38d7f7f1dbc9 --- /dev/null +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -0,0 +1,31 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include +#include + +#include + +struct pv_lock_ops pv_lock_ops = { +#ifdef CONFIG_SMP + .spin_is_locked = __ticket_spin_is_locked, + .spin_is_contended = __ticket_spin_is_contended, + + .spin_lock = __ticket_spin_lock, + .spin_trylock = __ticket_spin_trylock, + .spin_unlock = __ticket_spin_unlock, +#endif +}; +EXPORT_SYMBOL_GPL(pv_lock_ops); + +void __init paravirt_use_bytelocks(void) +{ +#ifdef CONFIG_SMP + pv_lock_ops.spin_is_locked = __byte_spin_is_locked; + pv_lock_ops.spin_is_contended = __byte_spin_is_contended; + pv_lock_ops.spin_lock = __byte_spin_lock; + pv_lock_ops.spin_trylock = __byte_spin_trylock; + pv_lock_ops.spin_unlock = __byte_spin_unlock; +#endif +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 94da4d52d798..0d71de9ff56d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return __get_cpu_var(paravirt_lazy_mode); } -void __init paravirt_use_bytelocks(void) -{ -#ifdef CONFIG_SMP - pv_lock_ops.spin_is_locked = __byte_spin_is_locked; - pv_lock_ops.spin_is_contended = __byte_spin_is_contended; - pv_lock_ops.spin_lock = __byte_spin_lock; - pv_lock_ops.spin_trylock = __byte_spin_trylock; - pv_lock_ops.spin_unlock = __byte_spin_unlock; -#endif -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, @@ -461,18 +450,6 @@ struct pv_mmu_ops pv_mmu_ops = { .set_fixmap = native_set_fixmap, }; -struct pv_lock_ops pv_lock_ops = { -#ifdef CONFIG_SMP - .spin_is_locked = __ticket_spin_is_locked, - .spin_is_contended = __ticket_spin_is_contended, - - .spin_lock = __ticket_spin_lock, - .spin_trylock = __ticket_spin_trylock, - .spin_unlock = __ticket_spin_unlock, -#endif -}; -EXPORT_SYMBOL_GPL(pv_lock_ops); - EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); EXPORT_SYMBOL (pv_mmu_ops); diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 59c1e539aed2..5bfee243cf9a 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,10 @@ +ifdef CONFIG_FTRACE +# Do not profile debug and lowlevel utilities +CFLAGS_REMOVE_spinlock.o = -pg +CFLAGS_REMOVE_time.o = -pg +endif + obj-y := enlighten.o setup.o multicalls.o mmu.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o -obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smp.o spinlock.o diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index d8faf79a0a1d..baca7f2fbd8a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -15,7 +15,6 @@ * This does not handle HOTPLUG_CPU yet. */ #include -#include #include #include @@ -36,8 +35,6 @@ #include "xen-ops.h" #include "mmu.h" -static void __cpuinit xen_init_lock_cpu(int cpu); - cpumask_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq); @@ -419,170 +416,6 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -struct xen_spinlock { - unsigned char lock; /* 0 -> free; 1 -> locked */ - unsigned short spinners; /* count of waiting cpus */ -}; - -static int xen_spin_is_locked(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - return xl->lock != 0; -} - -static int xen_spin_is_contended(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - /* Not strictly true; this is only the count of contended - lock-takers entering the slow path. */ - return xl->spinners != 0; -} - -static int xen_spin_trylock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - u8 old = 1; - - asm("xchgb %b0,%1" - : "+q" (old), "+m" (xl->lock) : : "memory"); - - return old == 0; -} - -static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); - -static inline void spinning_lock(struct xen_spinlock *xl) -{ - __get_cpu_var(lock_spinners) = xl; - wmb(); /* set lock of interest before count */ - asm(LOCK_PREFIX " incw %0" - : "+m" (xl->spinners) : : "memory"); -} - -static inline void unspinning_lock(struct xen_spinlock *xl) -{ - asm(LOCK_PREFIX " decw %0" - : "+m" (xl->spinners) : : "memory"); - wmb(); /* decrement count before clearing lock */ - __get_cpu_var(lock_spinners) = NULL; -} - -static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int irq = __get_cpu_var(lock_kicker_irq); - int ret; - - /* If kicker interrupts not initialized yet, just spin */ - if (irq == -1) - return 0; - - /* announce we're spinning */ - spinning_lock(xl); - - /* clear pending */ - xen_clear_irq_pending(irq); - - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) - goto out; - - /* block until irq becomes pending */ - xen_poll_irq(irq); - kstat_this_cpu.irqs[irq]++; - -out: - unspinning_lock(xl); - return ret; -} - -static void xen_spin_lock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int timeout; - u8 oldval; - - do { - timeout = 1 << 10; - - asm("1: xchgb %1,%0\n" - " testb %1,%1\n" - " jz 3f\n" - "2: rep;nop\n" - " cmpb $0,%0\n" - " je 1b\n" - " dec %2\n" - " jnz 2b\n" - "3:\n" - : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) - : "1" (1) - : "memory"); - - } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); -} - -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) -{ - int cpu; - - for_each_online_cpu(cpu) { - /* XXX should mix up next cpu selection */ - if (per_cpu(lock_spinners, cpu) == xl) { - xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); - break; - } - } -} - -static void xen_spin_unlock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - smp_wmb(); /* make sure no writes get moved after unlock */ - xl->lock = 0; /* release lock */ - - /* make sure unlock happens before kick */ - barrier(); - - if (unlikely(xl->spinners)) - xen_spin_unlock_slow(xl); -} - -static __cpuinit void xen_init_lock_cpu(int cpu) -{ - int irq; - const char *name; - - name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); - irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, - cpu, - xen_reschedule_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, - name, - NULL); - - if (irq >= 0) { - disable_irq(irq); /* make sure it's never delivered */ - per_cpu(lock_kicker_irq, cpu) = irq; - } - - printk("cpu %d spinlock event irq %d\n", cpu, irq); -} - -static void __init xen_init_spinlocks(void) -{ - pv_lock_ops.spin_is_locked = xen_spin_is_locked; - pv_lock_ops.spin_is_contended = xen_spin_is_contended; - pv_lock_ops.spin_lock = xen_spin_lock; - pv_lock_ops.spin_trylock = xen_spin_trylock; - pv_lock_ops.spin_unlock = xen_spin_unlock; -} - static const struct smp_ops xen_smp_ops __initdata = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c new file mode 100644 index 000000000000..8dc4d31da67f --- /dev/null +++ b/arch/x86/xen/spinlock.c @@ -0,0 +1,183 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include +#include + +#include + +#include +#include + +#include "xen-ops.h" + +struct xen_spinlock { + unsigned char lock; /* 0 -> free; 1 -> locked */ + unsigned short spinners; /* count of waiting cpus */ +}; + +static int xen_spin_is_locked(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + return xl->lock != 0; +} + +static int xen_spin_is_contended(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + /* Not strictly true; this is only the count of contended + lock-takers entering the slow path. */ + return xl->spinners != 0; +} + +static int xen_spin_trylock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + u8 old = 1; + + asm("xchgb %b0,%1" + : "+q" (old), "+m" (xl->lock) : : "memory"); + + return old == 0; +} + +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); + +static inline void spinning_lock(struct xen_spinlock *xl) +{ + __get_cpu_var(lock_spinners) = xl; + wmb(); /* set lock of interest before count */ + asm(LOCK_PREFIX " incw %0" + : "+m" (xl->spinners) : : "memory"); +} + +static inline void unspinning_lock(struct xen_spinlock *xl) +{ + asm(LOCK_PREFIX " decw %0" + : "+m" (xl->spinners) : : "memory"); + wmb(); /* decrement count before clearing lock */ + __get_cpu_var(lock_spinners) = NULL; +} + +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int irq = __get_cpu_var(lock_kicker_irq); + int ret; + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return 0; + + /* announce we're spinning */ + spinning_lock(xl); + + /* clear pending */ + xen_clear_irq_pending(irq); + + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) + goto out; + + /* block until irq becomes pending */ + xen_poll_irq(irq); + kstat_this_cpu.irqs[irq]++; + +out: + unspinning_lock(xl); + return ret; +} + +static void xen_spin_lock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int timeout; + u8 oldval; + + do { + timeout = 1 << 10; + + asm("1: xchgb %1,%0\n" + " testb %1,%1\n" + " jz 3f\n" + "2: rep;nop\n" + " cmpb $0,%0\n" + " je 1b\n" + " dec %2\n" + " jnz 2b\n" + "3:\n" + : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) + : "1" (1) + : "memory"); + + } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); +} + +static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +{ + int cpu; + + for_each_online_cpu(cpu) { + /* XXX should mix up next cpu selection */ + if (per_cpu(lock_spinners, cpu) == xl) { + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; + } + } +} + +static void xen_spin_unlock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + smp_wmb(); /* make sure no writes get moved after unlock */ + xl->lock = 0; /* release lock */ + + /* make sure unlock happens before kick */ + barrier(); + + if (unlikely(xl->spinners)) + xen_spin_unlock_slow(xl); +} + +static irqreturn_t dummy_handler(int irq, void *dev_id) +{ + BUG(); + return IRQ_HANDLED; +} + +void __cpuinit xen_init_lock_cpu(int cpu) +{ + int irq; + const char *name; + + name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, + cpu, + dummy_handler, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, + NULL); + + if (irq >= 0) { + disable_irq(irq); /* make sure it's never delivered */ + per_cpu(lock_kicker_irq, cpu) = irq; + } + + printk("cpu %d spinlock event irq %d\n", cpu, irq); +} + +void __init xen_init_spinlocks(void) +{ + pv_lock_ops.spin_is_locked = xen_spin_is_locked; + pv_lock_ops.spin_is_contended = xen_spin_is_contended; + pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_trylock = xen_spin_trylock; + pv_lock_ops.spin_unlock = xen_spin_unlock; +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index dd3c23152a2e..8847fb34f17e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -50,6 +50,9 @@ void __init xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP void xen_smp_init(void); +void __init xen_init_spinlocks(void); +__cpuinit void xen_init_lock_cpu(int cpu); + extern cpumask_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} From 0af36739af81f152cc24a0fdfa0754ef657afe3d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jul 2008 17:27:57 -0700 Subject: [PATCH 006/138] usb: move ehci reg def prepare x86: usb debug port early console move ehci struct def to linux/usrb/ehci_def.h from host/ehci.h Signed-off-by: Yinghai Lu Acked-by: David Brownell Cc: Andrew Morton Cc: Andi Kleen Cc: "Arjan van de Ven" Cc: "Eric W. Biederman" Cc: "Greg KH" Signed-off-by: Ingo Molnar --- drivers/usb/host/ehci.h | 138 +----------------------------- include/linux/usb/ehci_def.h | 160 +++++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 137 deletions(-) create mode 100644 include/linux/usb/ehci_def.h diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h index 5799298364fb..b697a13364ec 100644 --- a/drivers/usb/host/ehci.h +++ b/drivers/usb/host/ehci.h @@ -210,143 +210,7 @@ timer_action (struct ehci_hcd *ehci, enum ehci_timer_action action) /*-------------------------------------------------------------------------*/ -/* EHCI register interface, corresponds to EHCI Revision 0.95 specification */ - -/* Section 2.2 Host Controller Capability Registers */ -struct ehci_caps { - /* these fields are specified as 8 and 16 bit registers, - * but some hosts can't perform 8 or 16 bit PCI accesses. - */ - u32 hc_capbase; -#define HC_LENGTH(p) (((p)>>00)&0x00ff) /* bits 7:0 */ -#define HC_VERSION(p) (((p)>>16)&0xffff) /* bits 31:16 */ - u32 hcs_params; /* HCSPARAMS - offset 0x4 */ -#define HCS_DEBUG_PORT(p) (((p)>>20)&0xf) /* bits 23:20, debug port? */ -#define HCS_INDICATOR(p) ((p)&(1 << 16)) /* true: has port indicators */ -#define HCS_N_CC(p) (((p)>>12)&0xf) /* bits 15:12, #companion HCs */ -#define HCS_N_PCC(p) (((p)>>8)&0xf) /* bits 11:8, ports per CC */ -#define HCS_PORTROUTED(p) ((p)&(1 << 7)) /* true: port routing */ -#define HCS_PPC(p) ((p)&(1 << 4)) /* true: port power control */ -#define HCS_N_PORTS(p) (((p)>>0)&0xf) /* bits 3:0, ports on HC */ - - u32 hcc_params; /* HCCPARAMS - offset 0x8 */ -#define HCC_EXT_CAPS(p) (((p)>>8)&0xff) /* for pci extended caps */ -#define HCC_ISOC_CACHE(p) ((p)&(1 << 7)) /* true: can cache isoc frame */ -#define HCC_ISOC_THRES(p) (((p)>>4)&0x7) /* bits 6:4, uframes cached */ -#define HCC_CANPARK(p) ((p)&(1 << 2)) /* true: can park on async qh */ -#define HCC_PGM_FRAMELISTLEN(p) ((p)&(1 << 1)) /* true: periodic_size changes*/ -#define HCC_64BIT_ADDR(p) ((p)&(1)) /* true: can use 64-bit addr */ - u8 portroute [8]; /* nibbles for routing - offset 0xC */ -} __attribute__ ((packed)); - - -/* Section 2.3 Host Controller Operational Registers */ -struct ehci_regs { - - /* USBCMD: offset 0x00 */ - u32 command; -/* 23:16 is r/w intr rate, in microframes; default "8" == 1/msec */ -#define CMD_PARK (1<<11) /* enable "park" on async qh */ -#define CMD_PARK_CNT(c) (((c)>>8)&3) /* how many transfers to park for */ -#define CMD_LRESET (1<<7) /* partial reset (no ports, etc) */ -#define CMD_IAAD (1<<6) /* "doorbell" interrupt async advance */ -#define CMD_ASE (1<<5) /* async schedule enable */ -#define CMD_PSE (1<<4) /* periodic schedule enable */ -/* 3:2 is periodic frame list size */ -#define CMD_RESET (1<<1) /* reset HC not bus */ -#define CMD_RUN (1<<0) /* start/stop HC */ - - /* USBSTS: offset 0x04 */ - u32 status; -#define STS_ASS (1<<15) /* Async Schedule Status */ -#define STS_PSS (1<<14) /* Periodic Schedule Status */ -#define STS_RECL (1<<13) /* Reclamation */ -#define STS_HALT (1<<12) /* Not running (any reason) */ -/* some bits reserved */ - /* these STS_* flags are also intr_enable bits (USBINTR) */ -#define STS_IAA (1<<5) /* Interrupted on async advance */ -#define STS_FATAL (1<<4) /* such as some PCI access errors */ -#define STS_FLR (1<<3) /* frame list rolled over */ -#define STS_PCD (1<<2) /* port change detect */ -#define STS_ERR (1<<1) /* "error" completion (overflow, ...) */ -#define STS_INT (1<<0) /* "normal" completion (short, ...) */ - - /* USBINTR: offset 0x08 */ - u32 intr_enable; - - /* FRINDEX: offset 0x0C */ - u32 frame_index; /* current microframe number */ - /* CTRLDSSEGMENT: offset 0x10 */ - u32 segment; /* address bits 63:32 if needed */ - /* PERIODICLISTBASE: offset 0x14 */ - u32 frame_list; /* points to periodic list */ - /* ASYNCLISTADDR: offset 0x18 */ - u32 async_next; /* address of next async queue head */ - - u32 reserved [9]; - - /* CONFIGFLAG: offset 0x40 */ - u32 configured_flag; -#define FLAG_CF (1<<0) /* true: we'll support "high speed" */ - - /* PORTSC: offset 0x44 */ - u32 port_status [0]; /* up to N_PORTS */ -/* 31:23 reserved */ -#define PORT_WKOC_E (1<<22) /* wake on overcurrent (enable) */ -#define PORT_WKDISC_E (1<<21) /* wake on disconnect (enable) */ -#define PORT_WKCONN_E (1<<20) /* wake on connect (enable) */ -/* 19:16 for port testing */ -#define PORT_LED_OFF (0<<14) -#define PORT_LED_AMBER (1<<14) -#define PORT_LED_GREEN (2<<14) -#define PORT_LED_MASK (3<<14) -#define PORT_OWNER (1<<13) /* true: companion hc owns this port */ -#define PORT_POWER (1<<12) /* true: has power (see PPC) */ -#define PORT_USB11(x) (((x)&(3<<10))==(1<<10)) /* USB 1.1 device */ -/* 11:10 for detecting lowspeed devices (reset vs release ownership) */ -/* 9 reserved */ -#define PORT_RESET (1<<8) /* reset port */ -#define PORT_SUSPEND (1<<7) /* suspend port */ -#define PORT_RESUME (1<<6) /* resume it */ -#define PORT_OCC (1<<5) /* over current change */ -#define PORT_OC (1<<4) /* over current active */ -#define PORT_PEC (1<<3) /* port enable change */ -#define PORT_PE (1<<2) /* port enable */ -#define PORT_CSC (1<<1) /* connect status change */ -#define PORT_CONNECT (1<<0) /* device connected */ -#define PORT_RWC_BITS (PORT_CSC | PORT_PEC | PORT_OCC) -} __attribute__ ((packed)); - -#define USBMODE 0x68 /* USB Device mode */ -#define USBMODE_SDIS (1<<3) /* Stream disable */ -#define USBMODE_BE (1<<2) /* BE/LE endianness select */ -#define USBMODE_CM_HC (3<<0) /* host controller mode */ -#define USBMODE_CM_IDLE (0<<0) /* idle state */ - -/* Appendix C, Debug port ... intended for use with special "debug devices" - * that can help if there's no serial console. (nonstandard enumeration.) - */ -struct ehci_dbg_port { - u32 control; -#define DBGP_OWNER (1<<30) -#define DBGP_ENABLED (1<<28) -#define DBGP_DONE (1<<16) -#define DBGP_INUSE (1<<10) -#define DBGP_ERRCODE(x) (((x)>>7)&0x07) -# define DBGP_ERR_BAD 1 -# define DBGP_ERR_SIGNAL 2 -#define DBGP_ERROR (1<<6) -#define DBGP_GO (1<<5) -#define DBGP_OUT (1<<4) -#define DBGP_LEN(x) (((x)>>0)&0x0f) - u32 pids; -#define DBGP_PID_GET(x) (((x)>>16)&0xff) -#define DBGP_PID_SET(data,tok) (((data)<<8)|(tok)) - u32 data03; - u32 data47; - u32 address; -#define DBGP_EPADDR(dev,ep) (((dev)<<8)|(ep)) -} __attribute__ ((packed)); +#include /*-------------------------------------------------------------------------*/ diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h new file mode 100644 index 000000000000..5b88e36c9103 --- /dev/null +++ b/include/linux/usb/ehci_def.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2001-2002 by David Brownell + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __LINUX_USB_EHCI_DEF_H +#define __LINUX_USB_EHCI_DEF_H + +/* EHCI register interface, corresponds to EHCI Revision 0.95 specification */ + +/* Section 2.2 Host Controller Capability Registers */ +struct ehci_caps { + /* these fields are specified as 8 and 16 bit registers, + * but some hosts can't perform 8 or 16 bit PCI accesses. + */ + u32 hc_capbase; +#define HC_LENGTH(p) (((p)>>00)&0x00ff) /* bits 7:0 */ +#define HC_VERSION(p) (((p)>>16)&0xffff) /* bits 31:16 */ + u32 hcs_params; /* HCSPARAMS - offset 0x4 */ +#define HCS_DEBUG_PORT(p) (((p)>>20)&0xf) /* bits 23:20, debug port? */ +#define HCS_INDICATOR(p) ((p)&(1 << 16)) /* true: has port indicators */ +#define HCS_N_CC(p) (((p)>>12)&0xf) /* bits 15:12, #companion HCs */ +#define HCS_N_PCC(p) (((p)>>8)&0xf) /* bits 11:8, ports per CC */ +#define HCS_PORTROUTED(p) ((p)&(1 << 7)) /* true: port routing */ +#define HCS_PPC(p) ((p)&(1 << 4)) /* true: port power control */ +#define HCS_N_PORTS(p) (((p)>>0)&0xf) /* bits 3:0, ports on HC */ + + u32 hcc_params; /* HCCPARAMS - offset 0x8 */ +#define HCC_EXT_CAPS(p) (((p)>>8)&0xff) /* for pci extended caps */ +#define HCC_ISOC_CACHE(p) ((p)&(1 << 7)) /* true: can cache isoc frame */ +#define HCC_ISOC_THRES(p) (((p)>>4)&0x7) /* bits 6:4, uframes cached */ +#define HCC_CANPARK(p) ((p)&(1 << 2)) /* true: can park on async qh */ +#define HCC_PGM_FRAMELISTLEN(p) ((p)&(1 << 1)) /* true: periodic_size changes*/ +#define HCC_64BIT_ADDR(p) ((p)&(1)) /* true: can use 64-bit addr */ + u8 portroute [8]; /* nibbles for routing - offset 0xC */ +} __attribute__ ((packed)); + + +/* Section 2.3 Host Controller Operational Registers */ +struct ehci_regs { + + /* USBCMD: offset 0x00 */ + u32 command; +/* 23:16 is r/w intr rate, in microframes; default "8" == 1/msec */ +#define CMD_PARK (1<<11) /* enable "park" on async qh */ +#define CMD_PARK_CNT(c) (((c)>>8)&3) /* how many transfers to park for */ +#define CMD_LRESET (1<<7) /* partial reset (no ports, etc) */ +#define CMD_IAAD (1<<6) /* "doorbell" interrupt async advance */ +#define CMD_ASE (1<<5) /* async schedule enable */ +#define CMD_PSE (1<<4) /* periodic schedule enable */ +/* 3:2 is periodic frame list size */ +#define CMD_RESET (1<<1) /* reset HC not bus */ +#define CMD_RUN (1<<0) /* start/stop HC */ + + /* USBSTS: offset 0x04 */ + u32 status; +#define STS_ASS (1<<15) /* Async Schedule Status */ +#define STS_PSS (1<<14) /* Periodic Schedule Status */ +#define STS_RECL (1<<13) /* Reclamation */ +#define STS_HALT (1<<12) /* Not running (any reason) */ +/* some bits reserved */ + /* these STS_* flags are also intr_enable bits (USBINTR) */ +#define STS_IAA (1<<5) /* Interrupted on async advance */ +#define STS_FATAL (1<<4) /* such as some PCI access errors */ +#define STS_FLR (1<<3) /* frame list rolled over */ +#define STS_PCD (1<<2) /* port change detect */ +#define STS_ERR (1<<1) /* "error" completion (overflow, ...) */ +#define STS_INT (1<<0) /* "normal" completion (short, ...) */ + + /* USBINTR: offset 0x08 */ + u32 intr_enable; + + /* FRINDEX: offset 0x0C */ + u32 frame_index; /* current microframe number */ + /* CTRLDSSEGMENT: offset 0x10 */ + u32 segment; /* address bits 63:32 if needed */ + /* PERIODICLISTBASE: offset 0x14 */ + u32 frame_list; /* points to periodic list */ + /* ASYNCLISTADDR: offset 0x18 */ + u32 async_next; /* address of next async queue head */ + + u32 reserved [9]; + + /* CONFIGFLAG: offset 0x40 */ + u32 configured_flag; +#define FLAG_CF (1<<0) /* true: we'll support "high speed" */ + + /* PORTSC: offset 0x44 */ + u32 port_status [0]; /* up to N_PORTS */ +/* 31:23 reserved */ +#define PORT_WKOC_E (1<<22) /* wake on overcurrent (enable) */ +#define PORT_WKDISC_E (1<<21) /* wake on disconnect (enable) */ +#define PORT_WKCONN_E (1<<20) /* wake on connect (enable) */ +/* 19:16 for port testing */ +#define PORT_LED_OFF (0<<14) +#define PORT_LED_AMBER (1<<14) +#define PORT_LED_GREEN (2<<14) +#define PORT_LED_MASK (3<<14) +#define PORT_OWNER (1<<13) /* true: companion hc owns this port */ +#define PORT_POWER (1<<12) /* true: has power (see PPC) */ +#define PORT_USB11(x) (((x)&(3<<10)) == (1<<10)) /* USB 1.1 device */ +/* 11:10 for detecting lowspeed devices (reset vs release ownership) */ +/* 9 reserved */ +#define PORT_RESET (1<<8) /* reset port */ +#define PORT_SUSPEND (1<<7) /* suspend port */ +#define PORT_RESUME (1<<6) /* resume it */ +#define PORT_OCC (1<<5) /* over current change */ +#define PORT_OC (1<<4) /* over current active */ +#define PORT_PEC (1<<3) /* port enable change */ +#define PORT_PE (1<<2) /* port enable */ +#define PORT_CSC (1<<1) /* connect status change */ +#define PORT_CONNECT (1<<0) /* device connected */ +#define PORT_RWC_BITS (PORT_CSC | PORT_PEC | PORT_OCC) +} __attribute__ ((packed)); + +#define USBMODE 0x68 /* USB Device mode */ +#define USBMODE_SDIS (1<<3) /* Stream disable */ +#define USBMODE_BE (1<<2) /* BE/LE endianness select */ +#define USBMODE_CM_HC (3<<0) /* host controller mode */ +#define USBMODE_CM_IDLE (0<<0) /* idle state */ + +/* Appendix C, Debug port ... intended for use with special "debug devices" + * that can help if there's no serial console. (nonstandard enumeration.) + */ +struct ehci_dbg_port { + u32 control; +#define DBGP_OWNER (1<<30) +#define DBGP_ENABLED (1<<28) +#define DBGP_DONE (1<<16) +#define DBGP_INUSE (1<<10) +#define DBGP_ERRCODE(x) (((x)>>7)&0x07) +# define DBGP_ERR_BAD 1 +# define DBGP_ERR_SIGNAL 2 +#define DBGP_ERROR (1<<6) +#define DBGP_GO (1<<5) +#define DBGP_OUT (1<<4) +#define DBGP_LEN(x) (((x)>>0)&0x0f) + u32 pids; +#define DBGP_PID_GET(x) (((x)>>16)&0xff) +#define DBGP_PID_SET(data, tok) (((data)<<8)|(tok)) + u32 data03; + u32 data47; + u32 address; +#define DBGP_EPADDR(dev, ep) (((dev)<<8)|(ep)) +} __attribute__ ((packed)); + +#endif /* __LINUX_USB_EHCI_DEF_H */ From 5c05917e7fe313a187ad6ebb94c1c6cf42862a0b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jul 2008 17:29:40 -0700 Subject: [PATCH 007/138] x86: usb debug port early console, v4 based on work from Eric, and add some timeout so don't dead loop when debug device is not installed v2: fix checkpatch warning v3: move ehci struct def to linux/usrb/ehci_def.h from host/ehci.h also add CONFIG_EARLY_PRINTK_DBGP to disable it by default v4: address comments from Ingo, seperate ehci reg def moving to another patch also add auto detect port that connect to debug device for Nvidia southbridge Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Andi Kleen Cc: "Arjan van de Ven" Cc: "Eric W. Biederman" Cc: "Greg KH" Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 3 +- arch/x86/Kconfig.debug | 13 + arch/x86/kernel/early_printk.c | 762 +++++++++++++++++++++++++++- 3 files changed, 774 insertions(+), 4 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e7bea3e85304..92ddd4afe174 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -657,11 +657,12 @@ and is between 256 and 4096 characters. It is defined in the file earlyprintk= [X86-32,X86-64,SH,BLACKFIN] earlyprintk=vga earlyprintk=serial[,ttySn[,baudrate]] + earlyprintk=dbgp Append ",keep" to not disable it when the real console takes over. - Only vga or serial at a time, not both. + Only vga or serial or usb debug port at a time. Currently only ttyS0 and ttyS1 are supported. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 092f019e033a..93422d2ecf61 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -43,6 +43,19 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. +config EARLY_PRINTK_DBGP + bool "Early printk via EHCI debug port" + default n + depends on EARLY_PRINTK + help + Write kernel log output directly into the EHCI debug port. + + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server. You should normally N here, + unless you want to debug such a crash. You need usb debug device. + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index ff9e7350da54..28155acf706e 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -3,11 +3,19 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include #include +#include +#include +#include +#include /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */ static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); outb(ch, early_serial_base + TXR); @@ -151,6 +160,721 @@ static struct console early_serial_console = { .index = -1, }; +#ifdef CONFIG_EARLY_PRINTK_DBGP + +static struct ehci_caps __iomem *ehci_caps; +static struct ehci_regs __iomem *ehci_regs; +static struct ehci_dbg_port __iomem *ehci_debug; +static unsigned int dbgp_endpoint_out; + +struct ehci_dev { + u32 bus; + u32 slot; + u32 func; +}; + +static struct ehci_dev ehci_dev; + +#define USB_DEBUG_DEVNUM 127 + +#define DBGP_DATA_TOGGLE 0x8800 + +static inline u32 dbgp_pid_update(u32 x, u32 tok) +{ + return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); +} + +static inline u32 dbgp_len_update(u32 x, u32 len) +{ + return (x & ~0x0f) | (len & 0x0f); +} + +/* + * USB Packet IDs (PIDs) + */ + +/* token */ +#define USB_PID_OUT 0xe1 +#define USB_PID_IN 0x69 +#define USB_PID_SOF 0xa5 +#define USB_PID_SETUP 0x2d +/* handshake */ +#define USB_PID_ACK 0xd2 +#define USB_PID_NAK 0x5a +#define USB_PID_STALL 0x1e +#define USB_PID_NYET 0x96 +/* data */ +#define USB_PID_DATA0 0xc3 +#define USB_PID_DATA1 0x4b +#define USB_PID_DATA2 0x87 +#define USB_PID_MDATA 0x0f +/* Special */ +#define USB_PID_PREAMBLE 0x3c +#define USB_PID_ERR 0x3c +#define USB_PID_SPLIT 0x78 +#define USB_PID_PING 0xb4 +#define USB_PID_UNDEF_0 0xf0 + +#define USB_PID_DATA_TOGGLE 0x88 +#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) + +#define PCI_CAP_ID_EHCI_DEBUG 0xa + +#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ +#define HUB_SHORT_RESET_TIME 10 +#define HUB_LONG_RESET_TIME 200 +#define HUB_RESET_TIMEOUT 500 + +#define DBGP_MAX_PACKET 8 + +static int dbgp_wait_until_complete(void) +{ + u32 ctrl; + int loop = 0x100000; + + do { + ctrl = readl(&ehci_debug->control); + /* Stop when the transaction is finished */ + if (ctrl & DBGP_DONE) + break; + } while (--loop > 0); + + if (!loop) + return -1; + + /* + * Now that we have observed the completed transaction, + * clear the done bit. + */ + writel(ctrl | DBGP_DONE, &ehci_debug->control); + return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); +} + +static void dbgp_mdelay(int ms) +{ + int i; + + while (ms--) { + for (i = 0; i < 1000; i++) + outb(0x1, 0x80); + } +} + +static void dbgp_breath(void) +{ + /* Sleep to give the debug port a chance to breathe */ +} + +static int dbgp_wait_until_done(unsigned ctrl) +{ + u32 pids, lpid; + int ret; + int loop = 3; + +retry: + writel(ctrl | DBGP_GO, &ehci_debug->control); + ret = dbgp_wait_until_complete(); + pids = readl(&ehci_debug->pids); + lpid = DBGP_PID_GET(pids); + + if (ret < 0) + return ret; + + /* + * If the port is getting full or it has dropped data + * start pacing ourselves, not necessary but it's friendly. + */ + if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) + dbgp_breath(); + + /* If I get a NACK reissue the transmission */ + if (lpid == USB_PID_NAK) { + if (--loop > 0) + goto retry; + } + + return ret; +} + +static void dbgp_set_data(const void *buf, int size) +{ + const unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = hi = 0; + for (i = 0; i < 4 && i < size; i++) + lo |= bytes[i] << (8*i); + for (; i < 8 && i < size; i++) + hi |= bytes[i] << (8*(i - 4)); + writel(lo, &ehci_debug->data03); + writel(hi, &ehci_debug->data47); +} + +static void dbgp_get_data(void *buf, int size) +{ + unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = readl(&ehci_debug->data03); + hi = readl(&ehci_debug->data47); + for (i = 0; i < 4 && i < size; i++) + bytes[i] = (lo >> (8*i)) & 0xff; + for (; i < 8 && i < size; i++) + bytes[i] = (hi >> (8*(i - 4))) & 0xff; +} + +static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, + const char *bytes, int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_OUT); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + dbgp_set_data(bytes, size); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + return ret; +} + +static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, + int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_IN); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl &= ~DBGP_OUT; + ctrl |= DBGP_GO; + + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + if (size > ret) + size = ret; + dbgp_get_data(data, size); + return ret; +} + +static int dbgp_control_msg(unsigned devnum, int requesttype, int request, + int value, int index, void *data, int size) +{ + u32 pids, addr, ctrl; + struct usb_ctrlrequest req; + int read; + int ret; + + read = (requesttype & USB_DIR_IN) != 0; + if (size > (read ? DBGP_MAX_PACKET:0)) + return -1; + + /* Compute the control message */ + req.bRequestType = requesttype; + req.bRequest = request; + req.wValue = value; + req.wIndex = index; + req.wLength = size; + + pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); + addr = DBGP_EPADDR(devnum, 0); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, sizeof(req)); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + /* Send the setup message */ + dbgp_set_data(&req, sizeof(req)); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + /* Read the result */ + return dbgp_bulk_read(devnum, 0, data, size); +} + + +/* Find a PCI capability */ +static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) +{ + u8 pos; + int bytes; + + if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) + return 0; + + pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + + pos &= ~3; + id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + + pos = read_pci_config_byte(num, slot, func, + pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) +{ + u32 class; + + class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) + return 0; + + return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); +} + +static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) +{ + u32 bus, slot, func; + + for (bus = 0; bus < 256; bus++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + unsigned cap; + + cap = __find_dbgp(bus, slot, func); + + if (!cap) + continue; + if (ehci_num-- != 0) + continue; + *rbus = bus; + *rslot = slot; + *rfunc = func; + return cap; + } + } + } + return 0; +} + +static int ehci_reset_port(int port) +{ + u32 portsc; + u32 delay_time, delay; + int loop; + + /* Reset the usb debug port */ + portsc = readl(&ehci_regs->port_status[port - 1]); + portsc &= ~PORT_PE; + portsc |= PORT_RESET; + writel(portsc, &ehci_regs->port_status[port - 1]); + + delay = HUB_ROOT_RESET_TIME; + for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; + delay_time += delay) { + dbgp_mdelay(delay); + + portsc = readl(&ehci_regs->port_status[port - 1]); + if (portsc & PORT_RESET) { + /* force reset to complete */ + loop = 2; + writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), + &ehci_regs->port_status[port - 1]); + do { + portsc = readl(&ehci_regs->port_status[port-1]); + } while ((portsc & PORT_RESET) && (--loop > 0)); + } + + /* Device went away? */ + if (!(portsc & PORT_CONNECT)) + return -ENOTCONN; + + /* bomb out completely if something weird happend */ + if ((portsc & PORT_CSC)) + return -EINVAL; + + /* If we've finished resetting, then break out of the loop */ + if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) + return 0; + } + return -EBUSY; +} + +static int ehci_wait_for_port(int port) +{ + u32 status; + int ret, reps; + + for (reps = 0; reps < 3; reps++) { + dbgp_mdelay(100); + status = readl(&ehci_regs->status); + if (status & STS_PCD) { + ret = ehci_reset_port(port); + if (ret == 0) + return 0; + } + } + return -ENOTCONN; +} + +#ifdef DBGP_DEBUG +# define dbgp_printk early_printk +#else +static inline void dbgp_printk(const char *fmt, ...) { } +#endif + +typedef void (*set_debug_port_t)(int port); + +static void default_set_debug_port(int port) +{ +} + +static set_debug_port_t set_debug_port = default_set_debug_port; + +static void nvidia_set_debug_port(int port) +{ + u32 dword; + dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, + 0x74); + dword &= ~(0x0f<<12); + dword |= ((port & 0x0f)<<12); + write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, + dword); + dbgp_printk("set debug port to %d\n", port); +} + +static void __init detect_set_debug_port(void) +{ + u32 vendorid; + + vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, + 0x00); + + if ((vendorid & 0xffff) == 0x10de) { + dbgp_printk("using nvidia set_debug_port\n"); + set_debug_port = nvidia_set_debug_port; + } +} + +static int __init ehci_setup(void) +{ + struct usb_debug_descriptor dbgp_desc; + u32 cmd, ctrl, status, portsc, hcs_params; + u32 debug_port, new_debug_port = 0, n_ports; + u32 devnum; + int ret, i; + int loop; + int port_map_tried; + int playtimes = 3; + +try_next_time: + port_map_tried = 0; + +try_next_port: + + hcs_params = readl(&ehci_caps->hcs_params); + debug_port = HCS_DEBUG_PORT(hcs_params); + n_ports = HCS_N_PORTS(hcs_params); + + dbgp_printk("debug_port: %d\n", debug_port); + dbgp_printk("n_ports: %d\n", n_ports); + + for (i = 1; i <= n_ports; i++) { + portsc = readl(&ehci_regs->port_status[i-1]); + dbgp_printk("portstatus%d: %08x\n", i, portsc); + } + + if (port_map_tried && (new_debug_port != debug_port)) { + if (--playtimes) { + set_debug_port(new_debug_port); + goto try_next_time; + } + return -1; + } + + loop = 10; + /* Reset the EHCI controller */ + cmd = readl(&ehci_regs->command); + cmd |= CMD_RESET; + writel(cmd, &ehci_regs->command); + do { + cmd = readl(&ehci_regs->command); + } while ((cmd & CMD_RESET) && (--loop > 0)); + + if (!loop) { + dbgp_printk("can not reset ehci\n"); + return -1; + } + dbgp_printk("ehci reset done\n"); + + /* Claim ownership, but do not enable yet */ + ctrl = readl(&ehci_debug->control); + ctrl |= DBGP_OWNER; + ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); + writel(ctrl, &ehci_debug->control); + + /* Start the ehci running */ + cmd = readl(&ehci_regs->command); + cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); + cmd |= CMD_RUN; + writel(cmd, &ehci_regs->command); + + /* Ensure everything is routed to the EHCI */ + writel(FLAG_CF, &ehci_regs->configured_flag); + + /* Wait until the controller is no longer halted */ + loop = 10; + do { + status = readl(&ehci_regs->status); + } while ((status & STS_HALT) && (--loop > 0)); + + if (!loop) { + dbgp_printk("ehci can be started\n"); + return -1; + } + dbgp_printk("ehci started\n"); + + /* Wait for a device to show up in the debug port */ + ret = ehci_wait_for_port(debug_port); + if (ret < 0) { + dbgp_printk("No device found in debug port\n"); + goto next_debug_port; + } + dbgp_printk("ehci wait for port done\n"); + + /* Enable the debug port */ + ctrl = readl(&ehci_debug->control); + ctrl |= DBGP_CLAIM; + writel(ctrl, &ehci_debug->control); + ctrl = readl(&ehci_debug->control); + if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { + dbgp_printk("No device in debug port\n"); + writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); + goto err; + } + dbgp_printk("debug ported enabled\n"); + + /* Completely transfer the debug device to the debug controller */ + portsc = readl(&ehci_regs->port_status[debug_port - 1]); + portsc &= ~PORT_PE; + writel(portsc, &ehci_regs->port_status[debug_port - 1]); + + dbgp_mdelay(100); + + /* Find the debug device and make it device number 127 */ + for (devnum = 0; devnum <= 127; devnum++) { + ret = dbgp_control_msg(devnum, + USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, + &dbgp_desc, sizeof(dbgp_desc)); + if (ret > 0) + break; + } + if (devnum > 127) { + dbgp_printk("Could not find attached debug device\n"); + goto err; + } + if (ret < 0) { + dbgp_printk("Attached device is not a debug device\n"); + goto err; + } + dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; + + /* Move the device to 127 if it isn't already there */ + if (devnum != USB_DEBUG_DEVNUM) { + ret = dbgp_control_msg(devnum, + USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); + if (ret < 0) { + dbgp_printk("Could not move attached device to %d\n", + USB_DEBUG_DEVNUM); + goto err; + } + devnum = USB_DEBUG_DEVNUM; + dbgp_printk("debug device renamed to 127\n"); + } + + /* Enable the debug interface */ + ret = dbgp_control_msg(USB_DEBUG_DEVNUM, + USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); + if (ret < 0) { + dbgp_printk(" Could not enable the debug device\n"); + goto err; + } + dbgp_printk("debug interface enabled\n"); + + /* Perform a small write to get the even/odd data state in sync + */ + ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); + if (ret < 0) { + dbgp_printk("dbgp_bulk_write failed: %d\n", ret); + goto err; + } + dbgp_printk("small write doned\n"); + + return 0; +err: + /* Things didn't work so remove my claim */ + ctrl = readl(&ehci_debug->control); + ctrl &= ~(DBGP_CLAIM | DBGP_OUT); + writel(ctrl, &ehci_debug->control); + return -1; + +next_debug_port: + port_map_tried |= (1<<(debug_port - 1)); + new_debug_port = ((debug_port-1+1)%n_ports) + 1; + if (port_map_tried != ((1<> 29) & 0x7; + bar = (bar * 4) + 0xc; + offset = (debug_port >> 16) & 0xfff; + dbgp_printk("bar: %02x offset: %03x\n", bar, offset); + if (bar != PCI_BASE_ADDRESS_0) { + dbgp_printk("only debug ports on bar 1 handled.\n"); + + return -1; + } + + bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); + if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { + dbgp_printk("only simple 32bit mmio bars supported\n"); + + return -1; + } + + /* double check if the mem space is enabled */ + byte = read_pci_config_byte(bus, slot, func, 0x04); + if (!(byte & 0x2)) { + byte |= 0x02; + write_pci_config_byte(bus, slot, func, 0x04, byte); + dbgp_printk("mmio for ehci enabled\n"); + } + + /* + * FIXME I don't have the bar size so just guess PAGE_SIZE is more + * than enough. 1K is the biggest I have seen. + */ + set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); + ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); + ehci_bar += bar_val & ~PAGE_MASK; + dbgp_printk("ehci_bar: %p\n", ehci_bar); + + ehci_caps = ehci_bar; + ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); + ehci_debug = ehci_bar + offset; + ehci_dev.bus = bus; + ehci_dev.slot = slot; + ehci_dev.func = func; + + detect_set_debug_port(); + + ret = ehci_setup(); + if (ret < 0) { + dbgp_printk("ehci_setup failed\n"); + ehci_debug = 0; + + return -1; + } + + return 0; +} + +static void early_dbgp_write(struct console *con, const char *str, u32 n) +{ + int chunk, ret; + + if (!ehci_debug) + return; + while (n > 0) { + chunk = n; + if (chunk > DBGP_MAX_PACKET) + chunk = DBGP_MAX_PACKET; + ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, + dbgp_endpoint_out, str, chunk); + str += chunk; + n -= chunk; + } +} + +static struct console early_dbgp_console = { + .name = "earlydbg", + .write = early_dbgp_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; +#endif + /* Console interface to a host file on AMD's SimNow! */ static int simnow_fd; @@ -165,6 +889,7 @@ enum { static noinline long simnow(long cmd, long a, long b, long c) { long ret; + asm volatile("cpuid" : "=a" (ret) : "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); @@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c) static void __init simnow_init(char *str) { char *fn = "klog"; + if (*str == '=') fn = ++str; /* error ignored */ @@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...) va_end(ap); } -static int __initdata keep_early; static int __init setup_early_printk(char *buf) { + int keep_early; + if (!buf) return 0; @@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf) return 0; early_console_initialized = 1; - if (strstr(buf, "keep")) - keep_early = 1; + keep_early = (strstr(buf, "keep") != NULL); if (!strncmp(buf, "serial", 6)) { early_serial_init(buf + 6); @@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf) simnow_init(buf + 6); early_console = &simnow_console; keep_early = 1; +#ifdef CONFIG_EARLY_PRINTK_DBGP + } else if (!strncmp(buf, "dbgp", 4)) { + if (early_dbgp_init(buf+4) < 0) + return 0; + early_console = &early_dbgp_console; + /* + * usb subsys will reset ehci controller, so don't keep + * that early console + */ + keep_early = 0; +#endif #ifdef CONFIG_HVC_XEN } else if (!strncmp(buf, "xen", 3)) { early_console = &xenboot_console; @@ -251,4 +988,23 @@ static int __init setup_early_printk(char *buf) register_console(early_console); return 0; } + +void __init enable_debug_console(char *buf) +{ +#ifdef DBGP_DEBUG + struct console *old_early_console = NULL; + + if (early_console_initialized && early_console) { + old_early_console = early_console; + unregister_console(early_console); + early_console_initialized = 0; + } + + setup_early_printk(buf); + + if (early_console == old_early_console && old_early_console) + register_console(old_early_console); +#endif +} + early_param("earlyprintk", setup_early_printk); From 9749986a878e91182ff027ff0010ab8e3211031a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 17:28:11 +0200 Subject: [PATCH 008/138] x86: usb debug port early console, fix fix: arch/x86/kernel/built-in.o: In function `nvidia_set_debug_port': early_printk.c:(.text+0xf8b1): undefined reference to `read_pci_config' early_printk.c:(.text+0xf8dc): undefined reference to `write_pci_config' arch/x86/kernel/built-in.o: In function `setup_early_printk': early_printk.c:(.init.text+0x5487): undefined reference to `early_pci_allowed' early_printk.c:(.init.text+0x54cb): undefined reference to `read_pci_config' early_printk.c:(.init.text+0x54ec): undefined reference to `read_pci_config_16' [...] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 93422d2ecf61..2a3dfbd5e677 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -46,7 +46,7 @@ config EARLY_PRINTK config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" default n - depends on EARLY_PRINTK + depends on EARLY_PRINTK && PCI help Write kernel log output directly into the EHCI debug port. From b56afe1d41653fb07ab1b5af5ccc12001c4dd5a0 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Thu, 24 Jul 2008 12:15:45 -0300 Subject: [PATCH 009/138] x86, xen: Use native_pte_flags instead of native_pte_val for .pte_flags Using native_pte_val triggers the BUG_ON() in the paravirt_ops version of pte_flags(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 06219e60e9c8..e2767c28dac7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1347,7 +1347,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, .pte_val = xen_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, From 64f53a0492b4bc11868307990bb8f7c1e0764f89 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 27 Jul 2008 08:42:32 -0700 Subject: [PATCH 010/138] x86: fix initialization of 'l' bit in ldt descriptors Make sure that fill_ldt() initializes the 'l' bit in the descriptor. It always sets it to 0, ignoring 'lm' in user_desc, preserving original x86_64 behaviour. Previously it was leaving 'l' uninitialized. Signed-off-by: Jeremy Fitzhardinge Cc: Glauber de Oliveira Costa Signed-off-by: Ingo Molnar Cc: Glauber de Oliveira Costa --- include/asm-x86/desc.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h index 24a524f5e1a2..06f786f4b4fb 100644 --- a/include/asm-x86/desc.h +++ b/include/asm-x86/desc.h @@ -24,6 +24,11 @@ static inline void fill_ldt(struct desc_struct *desc, desc->d = info->seg_32bit; desc->g = info->limit_in_pages; desc->base2 = (info->base_addr & 0xff000000) >> 24; + /* + * Don't allow setting of the lm bit. It is useless anyway + * because 64bit system calls require __USER_CS: + */ + desc->l = 0; } extern struct desc_ptr idt_descr; From a05d2ebab28011c2f3f520833f4bfdd2fd1b9c02 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 27 Jul 2008 08:45:02 -0700 Subject: [PATCH 011/138] xen: fix allocation and use of large ldts When the ldt gets to more than 1 page in size, the kernel uses vmalloc to allocate it. This means that: - when making the ldt RO, we must update the pages in both the vmalloc mapping and the linear mapping to make sure there are no RW aliases. - we need to use arbitrary_virt_to_machine to compute the machine addr for each update Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 51 ++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e2767c28dac7..b011e4a5dbbe 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -325,24 +325,55 @@ static unsigned long xen_store_tr(void) return 0; } +/* + * If 'v' is a vmalloc mapping, then find the linear mapping of the + * page (if any) and also set its protections to match: + */ +static void set_aliased_prot(void *v, pgprot_t prot) +{ + int level; + pte_t *ptep; + pte_t pte; + unsigned long pfn; + struct page *page; + + ptep = lookup_address((unsigned long)v, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + page = pfn_to_page(pfn); + + pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) + BUG(); + + if (!PageHighMem(page)) { + void *av = __va(PFN_PHYS(pfn)); + + if (av != v) + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) + BUG(); + } else + kmap_flush_unused(); +} + static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) { - unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); - void *v = ldt; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; - for(i = 0; i < pages; i += PAGE_SIZE) - make_lowmem_page_readonly(v + i); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); } static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) { - unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); - void *v = ldt; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; - for(i = 0; i < pages; i += PAGE_SIZE) - make_lowmem_page_readwrite(v + i); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL); } static void xen_set_ldt(const void *addr, unsigned entries) @@ -446,7 +477,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(lp); u64 entry = *(u64 *)ptr; preempt_disable(); @@ -579,7 +610,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, } static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { struct multicall_space mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); From d974ae379a2fbe8948f01eabbc6b19c0a80f09bc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jul 2008 16:27:46 -0700 Subject: [PATCH 012/138] generic, memparse(): constify argument memparse()'s first argument can be const, so it should be. Signed-off-by: Jeremy Fitzhardinge Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 2 +- lib/cmdline.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fdbbf72ca2eb..7889c2f9b75d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -176,7 +176,7 @@ extern int vsscanf(const char *, const char *, va_list) extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); -extern unsigned long long memparse(char *ptr, char **retptr); +extern unsigned long long memparse(const char *ptr, char **retptr); extern int core_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); diff --git a/lib/cmdline.c b/lib/cmdline.c index 5ba8a942a478..f5f3ad8b62ff 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -126,7 +126,7 @@ char *get_options(const char *str, int nints, int *ints) * megabyte, or one gigabyte, respectively. */ -unsigned long long memparse(char *ptr, char **retptr) +unsigned long long memparse(const char *ptr, char **retptr) { char *endptr; /* local pointer to end of parsed string */ From 167e6cf62979df03513898966f9227847d192327 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jul 2008 16:27:52 -0700 Subject: [PATCH 013/138] xen-balloon: fix up sysfs issues 1. Set the class so it doesn't clash with the normal memory class 2. Fix up the sysfs show functions to match the new prototype 3. Clean up use of memparse Signed-off-by: Jeremy Fitzhardinge Cc: "viets@work.de" Cc: Andi Kleen Signed-off-by: Ingo Molnar --- drivers/xen/balloon.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index d4427cb86979..cdec2d843e4c 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -60,7 +60,7 @@ #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) -#define BALLOON_CLASS_NAME "memory" +#define BALLOON_CLASS_NAME "xen_memory" struct balloon_stats { /* We aim for 'current allocation' == 'target allocation'. */ @@ -588,12 +588,13 @@ static void balloon_release_driver_page(struct page *page) } -#define BALLOON_SHOW(name, format, args...) \ - static ssize_t show_##name(struct sys_device *dev, \ - char *buf) \ - { \ - return sprintf(buf, format, ##args); \ - } \ +#define BALLOON_SHOW(name, format, args...) \ + static ssize_t show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); @@ -604,7 +605,8 @@ BALLOON_SHOW(hard_limit_kb, (balloon_stats.hard_limit!=~0UL) ? PAGES2KB(balloon_stats.hard_limit) : 0); BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages)); -static ssize_t show_target_kb(struct sys_device *dev, char *buf) +static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) { return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); } @@ -614,19 +616,14 @@ static ssize_t store_target_kb(struct sys_device *dev, const char *buf, size_t count) { - char memstring[64], *endchar; + char *endchar; unsigned long long target_bytes; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (count <= 1) - return -EBADMSG; /* runt */ - if (count > sizeof(memstring)) - return -EFBIG; /* too long */ - strcpy(memstring, buf); + target_bytes = memparse(buf, &endchar); - target_bytes = memparse(memstring, &endchar); balloon_set_new_target(target_bytes >> PAGE_SHIFT); return count; From fde28e8f49ed86e771667a3fe81fcc25c93ede8e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jul 2008 16:28:00 -0700 Subject: [PATCH 014/138] xen-balloon: clean up unused functions Remove some unused functions: balloon_update_driver_allowance balloon_release_driver_page only used on the (obsolete, removed) flip path in netfront alloc_empty_pages_and_pagevec free_empty_pages_and_pagevec only used in backend drivers; can be reintroduced when needed Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- drivers/xen/balloon.c | 147 +----------------------------------------- 1 file changed, 3 insertions(+), 144 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index cdec2d843e4c..fff987b10e0f 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -226,9 +226,8 @@ static int increase_reservation(unsigned long nr_pages) } set_xen_guest_handle(reservation.extent_start, frame_list); - reservation.nr_extents = nr_pages; - rc = HYPERVISOR_memory_op( - XENMEM_populate_physmap, &reservation); + reservation.nr_extents = nr_pages; + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); if (rc < nr_pages) { if (rc > 0) { int ret; @@ -236,7 +235,7 @@ static int increase_reservation(unsigned long nr_pages) /* We hit the Xen hard limit: reprobe. */ reservation.nr_extents = rc; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); + &reservation); BUG_ON(ret != rc); } if (rc >= 0) @@ -464,130 +463,6 @@ static void balloon_exit(void) module_exit(balloon_exit); -static void balloon_update_driver_allowance(long delta) -{ - unsigned long flags; - - spin_lock_irqsave(&balloon_lock, flags); - balloon_stats.driver_pages += delta; - spin_unlock_irqrestore(&balloon_lock, flags); -} - -static int dealloc_pte_fn( - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) -{ - unsigned long mfn = pte_mfn(*pte); - int ret; - struct xen_memory_reservation reservation = { - .nr_extents = 1, - .extent_order = 0, - .domid = DOMID_SELF - }; - set_xen_guest_handle(reservation.extent_start, &mfn); - set_pte_at(&init_mm, addr, pte, __pte_ma(0ull)); - set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); - BUG_ON(ret != 1); - return 0; -} - -static struct page **alloc_empty_pages_and_pagevec(int nr_pages) -{ - unsigned long vaddr, flags; - struct page *page, **pagevec; - int i, ret; - - pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL); - if (pagevec == NULL) - return NULL; - - for (i = 0; i < nr_pages; i++) { - page = pagevec[i] = alloc_page(GFP_KERNEL); - if (page == NULL) - goto err; - - vaddr = (unsigned long)page_address(page); - - scrub_page(page); - - spin_lock_irqsave(&balloon_lock, flags); - - if (xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long gmfn = page_to_pfn(page); - struct xen_memory_reservation reservation = { - .nr_extents = 1, - .extent_order = 0, - .domid = DOMID_SELF - }; - set_xen_guest_handle(reservation.extent_start, &gmfn); - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); - if (ret == 1) - ret = 0; /* success */ - } else { - ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE, - dealloc_pte_fn, NULL); - } - - if (ret != 0) { - spin_unlock_irqrestore(&balloon_lock, flags); - __free_page(page); - goto err; - } - - totalram_pages = --balloon_stats.current_pages; - - spin_unlock_irqrestore(&balloon_lock, flags); - } - - out: - schedule_work(&balloon_worker); - flush_tlb_all(); - return pagevec; - - err: - spin_lock_irqsave(&balloon_lock, flags); - while (--i >= 0) - balloon_append(pagevec[i]); - spin_unlock_irqrestore(&balloon_lock, flags); - kfree(pagevec); - pagevec = NULL; - goto out; -} - -static void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages) -{ - unsigned long flags; - int i; - - if (pagevec == NULL) - return; - - spin_lock_irqsave(&balloon_lock, flags); - for (i = 0; i < nr_pages; i++) { - BUG_ON(page_count(pagevec[i]) != 1); - balloon_append(pagevec[i]); - } - spin_unlock_irqrestore(&balloon_lock, flags); - - kfree(pagevec); - - schedule_work(&balloon_worker); -} - -static void balloon_release_driver_page(struct page *page) -{ - unsigned long flags; - - spin_lock_irqsave(&balloon_lock, flags); - balloon_append(page); - balloon_stats.driver_pages--; - spin_unlock_irqrestore(&balloon_lock, flags); - - schedule_work(&balloon_worker); -} - - #define BALLOON_SHOW(name, format, args...) \ static ssize_t show_##name(struct sys_device *dev, \ struct sysdev_attribute *attr, \ @@ -691,20 +566,4 @@ static int register_balloon(struct sys_device *sysdev) return error; } -static void unregister_balloon(struct sys_device *sysdev) -{ - int i; - - sysfs_remove_group(&sysdev->kobj, &balloon_info_group); - for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) - sysdev_remove_file(sysdev, balloon_attrs[i]); - sysdev_unregister(sysdev); - sysdev_class_unregister(&balloon_sysdev_class); -} - -static void balloon_sysfs_exit(void) -{ - unregister_balloon(&balloon_sysdev); -} - MODULE_LICENSE("GPL"); From d89961e2dc87b6e30b8e3f60bd2af5cd92cf4643 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jul 2008 13:48:58 -0700 Subject: [PATCH 015/138] xen: suppress known wrmsrs In general, Xen doesn't support wrmsr from an unprivileged domain; it just ends up ignoring the instruction and printing a message on the console. Given that there are sets of MSRs we know the kernel will try to write to, but we don't care, just eat them in xen_write_msr to cut down on console noise. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b011e4a5dbbe..b795470ec069 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -854,6 +854,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) ret = -EFAULT; break; #endif + + case MSR_STAR: + case MSR_CSTAR: + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + /* Fast syscall setup is all done in hypercalls, so + these are all ignored. Stub them out here to stop + Xen console noise. */ + break; + default: ret = native_write_msr_safe(msr, low, high); } From e7f5b309c9bd6142f395c4a36123ebac4bcdc1b0 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:11 +0200 Subject: [PATCH 016/138] x86: AMD microcode patch loading support v2 Add an entry to the MAINTAINERS file for AMD CPU microcode patch loading support. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- MAINTAINERS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 03c5d6ccb9f8..28f867d99a8c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -384,6 +384,11 @@ M: joerg.roedel@amd.com L: iommu@lists.linux-foundation.org S: Supported +AMD MICROCODE UPDATE SUPPORT +P: Peter Oruba +M: peter.oruba@amd.com +S: Supported + AMS (Apple Motion Sensor) DRIVER P: Stelian Pop M: stelian@popies.net From 9a56a0f80b52cb41c5e0add47c7ce0bb2ef25eb0 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:13 +0200 Subject: [PATCH 017/138] x86: moved Intel microcode patch loader declarations to seperate header file Intel specific microcode declarations have been moved to a seperate header file. There are no code changes to the code itself and no side effects to other parts. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 1 + include/asm-x86/microcode.h | 34 ++++++++++++++++++++++++++++++++++ include/asm-x86/processor.h | 35 ----------------------------------- 3 files changed, 35 insertions(+), 35 deletions(-) create mode 100644 include/asm-x86/microcode.h diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 6994c751590e..0d654bd32928 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -93,6 +93,7 @@ #include #include #include +#include MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); diff --git a/include/asm-x86/microcode.h b/include/asm-x86/microcode.h new file mode 100644 index 000000000000..5a0556855154 --- /dev/null +++ b/include/asm-x86/microcode.h @@ -0,0 +1,34 @@ +struct microcode_header { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + +struct microcode { + struct microcode_header hdr; + unsigned int bits[0]; +}; + +typedef struct microcode microcode_t; +typedef struct microcode_header microcode_header_t; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index 5f58da401b43..58a76f69ee31 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -561,41 +561,6 @@ static inline void clear_in_cr4(unsigned long mask) write_cr4(cr4); } -struct microcode_header { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode { - struct microcode_header hdr; - unsigned int bits[0]; -}; - -typedef struct microcode microcode_t; -typedef struct microcode_header microcode_header_t; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[0]; -}; - typedef struct { unsigned long seg; } mm_segment_t; From 8e61028dfdc6b8ca996abfe8f9baef6792ea2904 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:14 +0200 Subject: [PATCH 018/138] x86: typedef removal Removed typedefs. No functional changes to the code. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 24 ++++++++++++------------ include/asm-x86/microcode.h | 3 --- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 0d654bd32928..74e6a77bf190 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -102,17 +102,17 @@ MODULE_LICENSE("GPL"); #define MICROCODE_VERSION "1.14a" #define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ +#define MC_HEADER_SIZE (sizeof (struct microcode_header)) /* 48 bytes */ #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ #define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ #define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ #define DWSIZE (sizeof (u32)) #define get_totalsize(mc) \ - (((microcode_t *)mc)->hdr.totalsize ? \ - ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) + (((struct microcode *)mc)->hdr.totalsize ? \ + ((struct microcode *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) #define get_datasize(mc) \ - (((microcode_t *)mc)->hdr.datasize ? \ - ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) + (((struct microcode *)mc)->hdr.datasize ? \ + ((struct microcode *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) #define sigmatch(s1, s2, p1, p2) \ (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) @@ -130,7 +130,7 @@ static struct ucode_cpu_info { unsigned int sig; unsigned int pf; unsigned int rev; - microcode_t *mc; + struct microcode *mc; } ucode_cpu_info[NR_CPUS]; static void collect_cpu_info(int cpu_num) @@ -171,7 +171,7 @@ static void collect_cpu_info(int cpu_num) } static inline int microcode_update_match(int cpu_num, - microcode_header_t *mc_header, int sig, int pf) + struct microcode_header *mc_header, int sig, int pf) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -183,7 +183,7 @@ static inline int microcode_update_match(int cpu_num, static int microcode_sanity_check(void *mc) { - microcode_header_t *mc_header = mc; + struct microcode_header *mc_header = mc; struct extended_sigtable *ext_header = NULL; struct extended_signature *ext_sig; unsigned long total_size, data_size, ext_table_size; @@ -268,7 +268,7 @@ static int microcode_sanity_check(void *mc) static int get_maching_microcode(void *mc, int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - microcode_header_t *mc_header = mc; + struct microcode_header *mc_header = mc; struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); int ext_sigcount, i; @@ -355,7 +355,7 @@ static unsigned int user_buffer_size; /* it's size */ static long get_next_ucode(void **mc, long offset) { - microcode_header_t mc_header; + struct microcode_header mc_header; unsigned long total_size; /* No more data */ @@ -497,13 +497,13 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); static long get_next_ucode_from_buffer(void **mc, const u8 *buf, unsigned long size, long offset) { - microcode_header_t *mc_header; + struct microcode_header *mc_header; unsigned long total_size; /* No more data */ if (offset >= size) return 0; - mc_header = (microcode_header_t *)(buf + offset); + mc_header = (struct microcode_header *)(buf + offset); total_size = get_totalsize(mc_header); if (offset + total_size > size) { diff --git a/include/asm-x86/microcode.h b/include/asm-x86/microcode.h index 5a0556855154..1519ef0674bb 100644 --- a/include/asm-x86/microcode.h +++ b/include/asm-x86/microcode.h @@ -16,9 +16,6 @@ struct microcode { unsigned int bits[0]; }; -typedef struct microcode microcode_t; -typedef struct microcode_header microcode_header_t; - /* microcode format is extended from prescott processors */ struct extended_signature { unsigned int sig; From c3b71bcec0380836caac9b524fa1585b469b7456 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:15 +0200 Subject: [PATCH 019/138] x86: move per CPU microcode structure declaration to header file This structure will be later used by other modules as well and needs therfore to be moved out to a header file. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 8 +------- include/asm-x86/microcode.h | 8 ++++++++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 74e6a77bf190..4e7b2f65fed6 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -125,13 +125,7 @@ static DEFINE_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); -static struct ucode_cpu_info { - int valid; - unsigned int sig; - unsigned int pf; - unsigned int rev; - struct microcode *mc; -} ucode_cpu_info[NR_CPUS]; +static struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; static void collect_cpu_info(int cpu_num) { diff --git a/include/asm-x86/microcode.h b/include/asm-x86/microcode.h index 1519ef0674bb..d34a1fc1b173 100644 --- a/include/asm-x86/microcode.h +++ b/include/asm-x86/microcode.h @@ -29,3 +29,11 @@ struct extended_sigtable { unsigned int reserved[3]; struct extended_signature sigs[0]; }; + +struct ucode_cpu_info { + int valid; + unsigned int sig; + unsigned int pf; + unsigned int rev; + struct microcode *mc; +}; From 1abae31096007cc993f67ae2576fe8f812270ad6 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:16 +0200 Subject: [PATCH 020/138] x86: move microcode.c to microcode_intel.c Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/{microcode.c => microcode_intel.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename arch/x86/kernel/{microcode.c => microcode_intel.c} (100%) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3db651fc8ec5..a9be2a0dde80 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -51,7 +51,7 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_MICROCODE) += microcode_intel.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode_intel.c similarity index 100% rename from arch/x86/kernel/microcode.c rename to arch/x86/kernel/microcode_intel.c From 3e135d887c973b525d43fbb67dfc5972694882f6 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:17 +0200 Subject: [PATCH 021/138] x86: code split to two parts Split off existing code into two seperate files. One file holds general code, the other file vendor specific parts. No functional changes, only refactoring. Temporarily Introduced a new module name 'ucode' for result, due to already taken name 'microcode'. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 +- arch/x86/kernel/microcode.c | 463 ++++++++++++++++++++++++++++++ arch/x86/kernel/microcode_intel.c | 387 ++----------------------- 3 files changed, 488 insertions(+), 365 deletions(-) create mode 100644 arch/x86/kernel/microcode.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a9be2a0dde80..abb32aed7f4b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -51,7 +51,8 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode_intel.o +obj-$(CONFIG_MICROCODE) += ucode.o +ucode-objs := microcode.o microcode_intel.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c new file mode 100644 index 000000000000..c1047d7f7ede --- /dev/null +++ b/arch/x86/kernel/microcode.c @@ -0,0 +1,463 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/253668.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman for the fix. + * 1.08 11 Dec 2000, Richard Schaal and + * Tigran Aivazian + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick and + * Tigran Aivazian , + * Serialize updates as required on HT processors due to speculative + * nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble , + * Jun Nakajima + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ + +//#define DEBUG /* pr_debug */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "1.14a" + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +DEFINE_MUTEX(microcode_mutex); + +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; + +extern long get_next_ucode(void **mc, long offset); +extern int microcode_sanity_check(void *mc); +extern int get_matching_microcode(void *mc, int cpu); +extern void collect_cpu_info(int cpu_num); +extern int cpu_request_microcode(int cpu); +extern void microcode_fini_cpu(int cpu); +extern void apply_microcode(int cpu); +extern int apply_microcode_check_cpu(int cpu); + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +void __user *user_buffer; /* user area microcode data buffer */ +unsigned int user_buffer_size; /* it's size */ + +static int do_microcode_update (void) +{ + long cursor = 0; + int error = 0; + void *new_mc = NULL; + int cpu; + cpumask_t old; + cpumask_of_cpu_ptr_declare(newmask); + + old = current->cpus_allowed; + + while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { + error = microcode_sanity_check(new_mc); + if (error) + goto out; + /* + * It's possible the data file has multiple matching ucode, + * lets keep searching till the latest version + */ + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!uci->valid) + continue; + cpumask_of_cpu_ptr_next(newmask, cpu); + set_cpus_allowed_ptr(current, newmask); + error = get_maching_microcode(new_mc, cpu); + if (error < 0) + goto out; + if (error == 1) + apply_microcode(cpu); + } + vfree(new_mc); + } +out: + if (cursor > 0) + vfree(new_mc); + if (cursor < 0) + error = cursor; + set_cpus_allowed_ptr(current, &old); + return error; +} + +static int microcode_open (struct inode *unused1, struct file *unused2) +{ + cycle_kernel_lock(); + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) +{ + ssize_t ret; + + if ((len >> PAGE_SHIFT) > num_physpages) { + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); + return -EINVAL; + } + + get_online_cpus(); + mutex_lock(µcode_mutex); + + user_buffer = (void __user *) buf; + user_buffer_size = (int) len; + + ret = do_microcode_update(); + if (!ret) + ret = (ssize_t)len; + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init (void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void microcode_dev_exit (void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while(0) +#endif + +/* fake device for request_firmware */ +struct platform_device *microcode_pdev; + +static void microcode_init_cpu(int cpu, int resume) +{ + cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + old = current->cpus_allowed; + + set_cpus_allowed_ptr(current, newmask); + mutex_lock(µcode_mutex); + collect_cpu_info(cpu); + if (uci->valid && system_state == SYSTEM_RUNNING && !resume) + cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + set_cpus_allowed_ptr(current, &old); +} + +static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + char *end; + unsigned long val = simple_strtoul(buf, &end, 0); + int err = 0; + int cpu = dev->id; + + if (end == buf) + return -EINVAL; + if (val == 1) { + cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); + + old = current->cpus_allowed; + + get_online_cpus(); + set_cpus_allowed_ptr(current, newmask); + + mutex_lock(µcode_mutex); + if (uci->valid) + err = cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + put_online_cpus(); + set_cpus_allowed_ptr(current, &old); + } + if (err) + return err; + return sz; +} + +static ssize_t version_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->rev); +} + +static ssize_t pf_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->pf); +} + +static SYSDEV_ATTR(reload, 0200, NULL, reload_store); +static SYSDEV_ATTR(version, 0400, version_show, NULL); +static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &attr_reload.attr, + &attr_version.attr, + &attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) +{ + int err, cpu = sys_dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d added\n", cpu); + memset(uci, 0, sizeof(*uci)); + + err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + if (err) + return err; + + microcode_init_cpu(cpu, resume); + + return 0; +} + +static int mc_sysdev_add(struct sys_device *sys_dev) +{ + return __mc_sysdev_add(sys_dev, 0); +} + +static int mc_sysdev_remove(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return 0; +} + +static int mc_sysdev_resume(struct sys_device *dev) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + pr_debug("microcode: CPU%d resumed\n", cpu); + /* only CPU 0 will apply ucode here */ + apply_microcode(0); + return 0; +} + +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, +}; + +static __cpuinit int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + switch (action) { + case CPU_UP_CANCELED_FROZEN: + /* The CPU refused to come up during a system resume */ + microcode_fini_cpu(cpu); + break; + case CPU_ONLINE: + case CPU_DOWN_FAILED: + mc_sysdev_add(sys_dev); + break; + case CPU_ONLINE_FROZEN: + /* System-wide resume is in progress, try to apply microcode */ + if (apply_microcode_check_cpu(cpu)) { + /* The application of microcode failed */ + microcode_fini_cpu(cpu); + __mc_sysdev_add(sys_dev, 1); + break; + } + case CPU_DOWN_FAILED_FROZEN: + if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + printk(KERN_ERR "microcode: Failed to create the sysfs " + "group for CPU%d\n", cpu); + break; + case CPU_DOWN_PREPARE: + mc_sysdev_remove(sys_dev); + break; + case CPU_DOWN_PREPARE_FROZEN: + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __refdata mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; + +static int __init microcode_init (void) +{ + int error; + + printk(KERN_INFO + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); + + error = microcode_dev_init(); + if (error) + return error; + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + microcode_dev_exit(); + return PTR_ERR(microcode_pdev); + } + + get_online_cpus(); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + if (error) { + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + return error; + } + + register_hotcpu_notifier(&mc_cpu_notifier); + return 0; +} + +static void __exit microcode_exit (void) +{ + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + + get_online_cpus(); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + + platform_device_unregister(microcode_pdev); +} + +module_init(microcode_init) +module_exit(microcode_exit) diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 4e7b2f65fed6..eded0a154ea8 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -95,18 +95,16 @@ #include #include -MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); +MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); -#define MICROCODE_VERSION "1.14a" - #define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof (struct microcode_header)) /* 48 bytes */ +#define MC_HEADER_SIZE (sizeof(struct microcode_header)) /* 48 bytes */ #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ -#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ -#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ -#define DWSIZE (sizeof (u32)) +#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) /* 20 bytes */ +#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) /* 12 bytes */ +#define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ (((struct microcode *)mc)->hdr.totalsize ? \ ((struct microcode *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) @@ -123,11 +121,11 @@ MODULE_LICENSE("GPL"); static DEFINE_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DEFINE_MUTEX(microcode_mutex); +extern struct mutex microcode_mutex; -static struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +extern struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -static void collect_cpu_info(int cpu_num) +void collect_cpu_info(int cpu_num) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -140,7 +138,7 @@ static void collect_cpu_info(int cpu_num) uci->valid = 1; if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { + cpu_has(c, X86_FEATURE_IA64)) { printk(KERN_ERR "microcode: CPU%d not a capable Intel " "processor\n", cpu_num); uci->valid = 0; @@ -175,7 +173,7 @@ static inline int microcode_update_match(int cpu_num, return 1; } -static int microcode_sanity_check(void *mc) +int microcode_sanity_check(void *mc) { struct microcode_header *mc_header = mc; struct extended_sigtable *ext_header = NULL; @@ -259,7 +257,7 @@ static int microcode_sanity_check(void *mc) * return 1 - found update * return < 0 - error */ -static int get_maching_microcode(void *mc, int cpu) +int get_matching_microcode(void *mc, int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header *mc_header = mc; @@ -288,7 +286,7 @@ static int get_maching_microcode(void *mc, int cpu) return 0; find: pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev); + " version 0x%x (current=0x%x)\n", cpu, mc_header->rev, uci->rev); new_mc = vmalloc(total_size); if (!new_mc) { printk(KERN_ERR "microcode: error! Can not allocate memory\n"); @@ -303,7 +301,7 @@ find: return 1; } -static void apply_microcode(int cpu) +void apply_microcode(int cpu) { unsigned long flags; unsigned int val[2]; @@ -344,10 +342,10 @@ static void apply_microcode(int cpu) } #ifdef CONFIG_MICROCODE_OLD_INTERFACE -static void __user *user_buffer; /* user area microcode data buffer */ -static unsigned int user_buffer_size; /* it's size */ +extern void __user *user_buffer; /* user area microcode data buffer */ +extern unsigned int user_buffer_size; /* it's size */ -static long get_next_ucode(void **mc, long offset) +long get_next_ucode(void **mc, long offset) { struct microcode_header mc_header; unsigned long total_size; @@ -375,117 +373,6 @@ static long get_next_ucode(void **mc, long offset) } return offset + total_size; } - -static int do_microcode_update (void) -{ - long cursor = 0; - int error = 0; - void *new_mc = NULL; - int cpu; - cpumask_t old; - cpumask_of_cpu_ptr_declare(newmask); - - old = current->cpus_allowed; - - while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { - error = microcode_sanity_check(new_mc); - if (error) - goto out; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!uci->valid) - continue; - cpumask_of_cpu_ptr_next(newmask, cpu); - set_cpus_allowed_ptr(current, newmask); - error = get_maching_microcode(new_mc, cpu); - if (error < 0) - goto out; - if (error == 1) - apply_microcode(cpu); - } - vfree(new_mc); - } -out: - if (cursor > 0) - vfree(new_mc); - if (cursor < 0) - error = cursor; - set_cpus_allowed_ptr(current, &old); - return error; -} - -static int microcode_open (struct inode *unused1, struct file *unused2) -{ - cycle_kernel_lock(); - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - -static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) -{ - ssize_t ret; - - if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); - return -EINVAL; - } - - get_online_cpus(); - mutex_lock(µcode_mutex); - - user_buffer = (void __user *) buf; - user_buffer_size = (int) len; - - ret = do_microcode_update(); - if (!ret) - ret = (ssize_t)len; - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - return ret; -} - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, -}; - -static int __init microcode_dev_init (void) -{ - int error; - - error = misc_register(µcode_dev); - if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void microcode_dev_exit (void) -{ - misc_deregister(µcode_dev); -} - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while(0) #endif static long get_next_ucode_from_buffer(void **mc, const u8 *buf, @@ -515,9 +402,9 @@ static long get_next_ucode_from_buffer(void **mc, const u8 *buf, } /* fake device for request_firmware */ -static struct platform_device *microcode_pdev; +extern struct platform_device *microcode_pdev; -static int cpu_request_microcode(int cpu) +int cpu_request_microcode(int cpu) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -530,7 +417,7 @@ static int cpu_request_microcode(int cpu) /* We should bind the task to the CPU */ BUG_ON(cpu != raw_smp_processor_id()); - sprintf(name,"intel-ucode/%02x-%02x-%02x", + sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); error = request_firmware(&firmware, name, µcode_pdev->dev); if (error) { @@ -544,7 +431,7 @@ static int cpu_request_microcode(int cpu) error = microcode_sanity_check(mc); if (error) break; - error = get_maching_microcode(mc, cpu); + error = get_matching_microcode(mc, cpu); if (error < 0) break; /* @@ -566,7 +453,7 @@ static int cpu_request_microcode(int cpu) return error; } -static int apply_microcode_check_cpu(int cpu) +int apply_microcode_check_cpu(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -615,241 +502,13 @@ static int apply_microcode_check_cpu(int cpu) return err; } -static void microcode_init_cpu(int cpu, int resume) -{ - cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - old = current->cpus_allowed; - - set_cpus_allowed_ptr(current, newmask); - mutex_lock(µcode_mutex); - collect_cpu_info(cpu); - if (uci->valid && system_state == SYSTEM_RUNNING && !resume) - cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); - set_cpus_allowed_ptr(current, &old); -} - -static void microcode_fini_cpu(int cpu) +void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; mutex_lock(µcode_mutex); uci->valid = 0; - vfree(uci->mc); + kfree(uci->mc); uci->mc = NULL; mutex_unlock(µcode_mutex); } - -static ssize_t reload_store(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, size_t sz) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; - int cpu = dev->id; - - if (end == buf) - return -EINVAL; - if (val == 1) { - cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); - - old = current->cpus_allowed; - - get_online_cpus(); - set_cpus_allowed_ptr(current, newmask); - - mutex_lock(µcode_mutex); - if (uci->valid) - err = cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); - put_online_cpus(); - set_cpus_allowed_ptr(current, &old); - } - if (err) - return err; - return sz; -} - -static ssize_t version_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->rev); -} - -static ssize_t pf_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->pf); -} - -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); - -static struct attribute *mc_default_attrs[] = { - &attr_reload.attr, - &attr_version.attr, - &attr_processor_flags.attr, - NULL -}; - -static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", -}; - -static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) -{ - int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); - - err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); - if (err) - return err; - - microcode_init_cpu(cpu, resume); - - return 0; -} - -static int mc_sysdev_add(struct sys_device *sys_dev) -{ - return __mc_sysdev_add(sys_dev, 0); -} - -static int mc_sysdev_remove(struct sys_device *sys_dev) -{ - int cpu = sys_dev->id; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - return 0; -} - -static int mc_sysdev_resume(struct sys_device *dev) -{ - int cpu = dev->id; - - if (!cpu_online(cpu)) - return 0; - pr_debug("microcode: CPU%d resumed\n", cpu); - /* only CPU 0 will apply ucode here */ - apply_microcode(0); - return 0; -} - -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, -}; - -static __cpuinit int -mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; - - sys_dev = get_cpu_sysdev(cpu); - switch (action) { - case CPU_UP_CANCELED_FROZEN: - /* The CPU refused to come up during a system resume */ - microcode_fini_cpu(cpu); - break; - case CPU_ONLINE: - case CPU_DOWN_FAILED: - mc_sysdev_add(sys_dev); - break; - case CPU_ONLINE_FROZEN: - /* System-wide resume is in progress, try to apply microcode */ - if (apply_microcode_check_cpu(cpu)) { - /* The application of microcode failed */ - microcode_fini_cpu(cpu); - __mc_sysdev_add(sys_dev, 1); - break; - } - case CPU_DOWN_FAILED_FROZEN: - if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); - break; - case CPU_DOWN_PREPARE: - mc_sysdev_remove(sys_dev); - break; - case CPU_DOWN_PREPARE_FROZEN: - /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __refdata mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, -}; - -static int __init microcode_init (void) -{ - int error; - - printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); - - error = microcode_dev_init(); - if (error) - return error; - microcode_pdev = platform_device_register_simple("microcode", -1, - NULL, 0); - if (IS_ERR(microcode_pdev)) { - microcode_dev_exit(); - return PTR_ERR(microcode_pdev); - } - - get_online_cpus(); - error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - if (error) { - microcode_dev_exit(); - platform_device_unregister(microcode_pdev); - return error; - } - - register_hotcpu_notifier(&mc_cpu_notifier); - return 0; -} - -static void __exit microcode_exit (void) -{ - microcode_dev_exit(); - - unregister_hotcpu_notifier(&mc_cpu_notifier); - - get_online_cpus(); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - - platform_device_unregister(microcode_pdev); -} - -module_init(microcode_init) -module_exit(microcode_exit) From d4ee36686853d5714437c4409f17ad42bfaf4211 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:18 +0200 Subject: [PATCH 022/138] x86: structure declaration renaming Renamed common structures to vendor specific naming scheme so other vendors will be able to use the same naming convention. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_intel.c | 46 ++++++++++++++++--------------- include/asm-x86/microcode.h | 10 ++++--- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index eded0a154ea8..ca9861bf067e 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -100,17 +100,19 @@ MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); #define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof(struct microcode_header)) /* 48 bytes */ +#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) /* 48 bytes */ #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) /* 20 bytes */ #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) /* 12 bytes */ #define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ - (((struct microcode *)mc)->hdr.totalsize ? \ - ((struct microcode *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) + (((struct microcode_intel *)mc)->hdr.totalsize ? \ + ((struct microcode_intel *)mc)->hdr.totalsize : \ + DEFAULT_UCODE_TOTALSIZE) + #define get_datasize(mc) \ - (((struct microcode *)mc)->hdr.datasize ? \ - ((struct microcode *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) + (((struct microcode_intel *)mc)->hdr.datasize ? \ + ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) #define sigmatch(s1, s2, p1, p2) \ (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) @@ -134,7 +136,7 @@ void collect_cpu_info(int cpu_num) /* We should bind the task to the CPU */ BUG_ON(raw_smp_processor_id() != cpu_num); uci->pf = uci->rev = 0; - uci->mc = NULL; + uci->mc.mc_intel = NULL; uci->valid = 1; if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || @@ -163,7 +165,7 @@ void collect_cpu_info(int cpu_num) } static inline int microcode_update_match(int cpu_num, - struct microcode_header *mc_header, int sig, int pf) + struct microcode_header_intel *mc_header, int sig, int pf) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -175,7 +177,7 @@ static inline int microcode_update_match(int cpu_num, int microcode_sanity_check(void *mc) { - struct microcode_header *mc_header = mc; + struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; struct extended_signature *ext_sig; unsigned long total_size, data_size, ext_table_size; @@ -260,7 +262,7 @@ int microcode_sanity_check(void *mc) int get_matching_microcode(void *mc, int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct microcode_header *mc_header = mc; + struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); int ext_sigcount, i; @@ -294,10 +296,10 @@ find: } /* free previous update file */ - vfree(uci->mc); + vfree(uci->mc.mc_intel); memcpy(new_mc, mc, total_size); - uci->mc = new_mc; + uci->mc.mc_intel = new_mc; return 1; } @@ -311,7 +313,7 @@ void apply_microcode(int cpu) /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); - if (uci->mc == NULL) + if (uci->mc.mc_intel == NULL) return; /* serialize access to the physical write to MSR 0x79 */ @@ -319,8 +321,8 @@ void apply_microcode(int cpu) /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) uci->mc->bits, - (unsigned long) uci->mc->bits >> 16 >> 16); + (unsigned long) uci->mc.mc_intel->bits, + (unsigned long) uci->mc.mc_intel->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ @@ -330,14 +332,14 @@ void apply_microcode(int cpu) rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); spin_unlock_irqrestore(µcode_update_lock, flags); - if (val[1] != uci->mc->hdr.rev) { + if (val[1] != uci->mc.mc_intel->hdr.rev) { printk(KERN_ERR "microcode: CPU%d update from revision " "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); return; } printk(KERN_INFO "microcode: CPU%d updated from revision " "0x%x to 0x%x, date = %08x \n", - cpu_num, uci->rev, val[1], uci->mc->hdr.date); + cpu_num, uci->rev, val[1], uci->mc.mc_intel->hdr.date); uci->rev = val[1]; } @@ -347,7 +349,7 @@ extern unsigned int user_buffer_size; /* it's size */ long get_next_ucode(void **mc, long offset) { - struct microcode_header mc_header; + struct microcode_header_intel mc_header; unsigned long total_size; /* No more data */ @@ -378,13 +380,13 @@ long get_next_ucode(void **mc, long offset) static long get_next_ucode_from_buffer(void **mc, const u8 *buf, unsigned long size, long offset) { - struct microcode_header *mc_header; + struct microcode_header_intel *mc_header; unsigned long total_size; /* No more data */ if (offset >= size) return 0; - mc_header = (struct microcode_header *)(buf + offset); + mc_header = (struct microcode_header_intel *)(buf + offset); total_size = get_totalsize(mc_header); if (offset + total_size > size) { @@ -463,7 +465,7 @@ int apply_microcode_check_cpu(int cpu) int err = 0; /* Check if the microcode is available */ - if (!uci->mc) + if (!uci->mc.mc_intel) return 0; old = current->cpus_allowed; @@ -508,7 +510,7 @@ void microcode_fini_cpu(int cpu) mutex_lock(µcode_mutex); uci->valid = 0; - kfree(uci->mc); - uci->mc = NULL; + kfree(uci->mc.mc_intel); + uci->mc.mc_intel = NULL; mutex_unlock(µcode_mutex); } diff --git a/include/asm-x86/microcode.h b/include/asm-x86/microcode.h index d34a1fc1b173..ef77c6f438bf 100644 --- a/include/asm-x86/microcode.h +++ b/include/asm-x86/microcode.h @@ -1,4 +1,4 @@ -struct microcode_header { +struct microcode_header_intel { unsigned int hdrver; unsigned int rev; unsigned int date; @@ -11,8 +11,8 @@ struct microcode_header { unsigned int reserved[3]; }; -struct microcode { - struct microcode_header hdr; +struct microcode_intel { + struct microcode_header_intel hdr; unsigned int bits[0]; }; @@ -35,5 +35,7 @@ struct ucode_cpu_info { unsigned int sig; unsigned int pf; unsigned int rev; - struct microcode *mc; + union { + struct microcode_intel *mc_intel; + } mc; }; From 9835fd4ad9ee5fc6b909df72aa3e3dba04415f4b Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:19 +0200 Subject: [PATCH 023/138] x86: add AMD specific declarations Added AMD specific declarations to header file. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- include/asm-x86/microcode.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/asm-x86/microcode.h b/include/asm-x86/microcode.h index ef77c6f438bf..4e941721c0d1 100644 --- a/include/asm-x86/microcode.h +++ b/include/asm-x86/microcode.h @@ -30,6 +30,35 @@ struct extended_sigtable { struct extended_signature sigs[0]; }; +struct equiv_cpu_entry { + unsigned int installed_cpu; + unsigned int fixed_errata_mask; + unsigned int fixed_errata_compare; + unsigned int equiv_cpu; +}; + +struct microcode_header_amd { + unsigned int data_code; + unsigned int patch_id; + unsigned char mc_patch_data_id[2]; + unsigned char mc_patch_data_len; + unsigned char init_flag; + unsigned int mc_patch_data_checksum; + unsigned int nb_dev_id; + unsigned int sb_dev_id; + unsigned char processor_rev_id[2]; + unsigned char nb_rev_id; + unsigned char sb_rev_id; + unsigned char bios_api_rev; + unsigned char reserved1[3]; + unsigned int match_reg[8]; +}; + +struct microcode_amd { + struct microcode_header_amd hdr; + unsigned int mpb[0]; +}; + struct ucode_cpu_info { int valid; unsigned int sig; @@ -37,5 +66,6 @@ struct ucode_cpu_info { unsigned int rev; union { struct microcode_intel *mc_intel; + struct microcode_amd *mc_amd; } mc; }; From 26bf7a48c33906cc3415a4492aa9ead7a75f1353 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:20 +0200 Subject: [PATCH 024/138] x86: first step of refactoring, introducing microcode_ops Refactoring with the goal of having one general module and separate vendor specific modules that hook into the general one. Microcode_ops is a function pointer structure in which vendor specific modules will enter all functions that differ between vendors and that need to be accessed from the general module. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- include/asm-x86/microcode.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/asm-x86/microcode.h b/include/asm-x86/microcode.h index 4e941721c0d1..9231c876e374 100644 --- a/include/asm-x86/microcode.h +++ b/include/asm-x86/microcode.h @@ -1,3 +1,16 @@ +struct microcode_ops { + long (*get_next_ucode)(void **mc, long offset); + long (*microcode_get_next_ucode)(void **mc, long offset); + int (*get_matching_microcode)(void *mc, int cpu); + int (*apply_microcode_check_cpu)(int cpu); + int (*microcode_sanity_check)(void *mc); + int (*cpu_request_microcode)(int cpu); + void (*collect_cpu_info)(int cpu_num); + void (*apply_microcode)(int cpu); + void (*microcode_fini_cpu)(int cpu); + void (*clear_patch)(void *data); +}; + struct microcode_header_intel { unsigned int hdrver; unsigned int rev; From 8d86f390d9bb5b39f0a315838d1616de6363e1b9 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:21 +0200 Subject: [PATCH 025/138] x86: major refactoring Refactored code by introducing a two-module solution. There is one general module in which vendor specific modules can hook into. However, that is exclusive, there is only one vendor specific module allowed at a time. A CPU vendor check makes sure only the correct module for the underlying system gets called. Functinally in terms of patch loading itself there are no changes. This refactoring provides a basis for future implementations of other vendors' patch loaders. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 25 ++++++++-- arch/x86/kernel/Makefile | 4 +- arch/x86/kernel/microcode.c | 80 ++++++++++++++++++------------- arch/x86/kernel/microcode_intel.c | 50 +++++++++++++++---- include/asm-x86/microcode.h | 3 ++ 5 files changed, 112 insertions(+), 50 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b6fa2877b173..6b0b885cf47a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -782,7 +782,7 @@ config X86_REBOOTFIXUPS Say N otherwise. config MICROCODE - tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" + tristate "/dev/cpu/microcode - microcode support" select FW_LOADER ---help--- If you say Y here, you will be able to update the microcode on @@ -791,14 +791,29 @@ config MICROCODE actual microcode binary data itself which is not shipped with the Linux kernel. - For latest news and information on obtaining all the required - ingredients for this driver, check: - . + This option selects the general module only, you need to select + at least one vendor specific module as well. To compile this driver as a module, choose M here: the module will be called microcode. -config MICROCODE_OLD_INTERFACE +config MICROCODE_INTEL + tristate "Intel microcode patch loading support" + depends on MICROCODE + default MICROCODE + select FW_LOADER + --help--- + This options enables microcode patch loading support for Intel + processors. + + For latest news and information on obtaining all the required + Intel ingredients for this driver, check: + . + + This driver is only available as a module: the module + will be called microcode_intel. + + config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index abb32aed7f4b..f2f9f6da8c18 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -51,8 +51,8 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += ucode.o -ucode-objs := microcode.o microcode_intel.o +obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index c1047d7f7ede..1e42e79ca694 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -99,25 +99,22 @@ MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); -#define MICROCODE_VERSION "1.14a" +#define MICROCODE_VERSION "2.00" + +struct microcode_ops *microcode_ops; /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -DEFINE_MUTEX(microcode_mutex); +static DEFINE_MUTEX(microcode_mutex); +EXPORT_SYMBOL_GPL(microcode_mutex); -struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; - -extern long get_next_ucode(void **mc, long offset); -extern int microcode_sanity_check(void *mc); -extern int get_matching_microcode(void *mc, int cpu); -extern void collect_cpu_info(int cpu_num); -extern int cpu_request_microcode(int cpu); -extern void microcode_fini_cpu(int cpu); -extern void apply_microcode(int cpu); -extern int apply_microcode_check_cpu(int cpu); +static struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +EXPORT_SYMBOL_GPL(ucode_cpu_info); #ifdef CONFIG_MICROCODE_OLD_INTERFACE -void __user *user_buffer; /* user area microcode data buffer */ -unsigned int user_buffer_size; /* it's size */ +static void __user *user_buffer; /* user area microcode data buffer */ +EXPORT_SYMBOL_GPL(user_buffer); +static unsigned int user_buffer_size; /* it's size */ +EXPORT_SYMBOL_GPL(user_buffer_size); static int do_microcode_update (void) { @@ -130,8 +127,8 @@ static int do_microcode_update (void) old = current->cpus_allowed; - while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { - error = microcode_sanity_check(new_mc); + while ((cursor = microcode_ops->get_next_ucode(&new_mc, cursor)) > 0) { + error = microcode_ops->microcode_sanity_check(new_mc); if (error) goto out; /* @@ -145,11 +142,12 @@ static int do_microcode_update (void) continue; cpumask_of_cpu_ptr_next(newmask, cpu); set_cpus_allowed_ptr(current, newmask); - error = get_maching_microcode(new_mc, cpu); + error = microcode_ops->get_matching_microcode(new_mc, + cpu); if (error < 0) goto out; if (error == 1) - apply_microcode(cpu); + microcode_ops->apply_microcode(cpu); } vfree(new_mc); } @@ -232,7 +230,8 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #endif /* fake device for request_firmware */ -struct platform_device *microcode_pdev; +static struct platform_device *microcode_pdev; +EXPORT_SYMBOL_GPL(microcode_pdev); static void microcode_init_cpu(int cpu, int resume) { @@ -244,9 +243,9 @@ static void microcode_init_cpu(int cpu, int resume) set_cpus_allowed_ptr(current, newmask); mutex_lock(µcode_mutex); - collect_cpu_info(cpu); + microcode_ops->collect_cpu_info(cpu); if (uci->valid && system_state == SYSTEM_RUNNING && !resume) - cpu_request_microcode(cpu); + microcode_ops->cpu_request_microcode(cpu); mutex_unlock(µcode_mutex); set_cpus_allowed_ptr(current, &old); } @@ -274,7 +273,7 @@ static ssize_t reload_store(struct sys_device *dev, mutex_lock(µcode_mutex); if (uci->valid) - err = cpu_request_microcode(cpu); + err = microcode_ops->cpu_request_microcode(cpu); mutex_unlock(µcode_mutex); put_online_cpus(); set_cpus_allowed_ptr(current, &old); @@ -349,7 +348,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) return 0; pr_debug("microcode: CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); + microcode_ops->microcode_fini_cpu(cpu); sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); return 0; } @@ -362,7 +361,7 @@ static int mc_sysdev_resume(struct sys_device *dev) return 0; pr_debug("microcode: CPU%d resumed\n", cpu); /* only CPU 0 will apply ucode here */ - apply_microcode(0); + microcode_ops->apply_microcode(0); return 0; } @@ -382,7 +381,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_CANCELED_FROZEN: /* The CPU refused to come up during a system resume */ - microcode_fini_cpu(cpu); + microcode_ops->microcode_fini_cpu(cpu); break; case CPU_ONLINE: case CPU_DOWN_FAILED: @@ -390,9 +389,9 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) break; case CPU_ONLINE_FROZEN: /* System-wide resume is in progress, try to apply microcode */ - if (apply_microcode_check_cpu(cpu)) { + if (microcode_ops->apply_microcode_check_cpu(cpu)) { /* The application of microcode failed */ - microcode_fini_cpu(cpu); + microcode_ops->microcode_fini_cpu(cpu); __mc_sysdev_add(sys_dev, 1); break; } @@ -416,12 +415,17 @@ static struct notifier_block __refdata mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; -static int __init microcode_init (void) +static int microcode_init(void *opaque, struct module *module) { + struct microcode_ops *ops = (struct microcode_ops *)opaque; int error; - printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); + if (microcode_ops) { + printk(KERN_ERR "microcode: already loaded the other module\n"); + return -EEXIST; + } + + microcode_ops = ops; error = microcode_dev_init(); if (error) @@ -443,8 +447,15 @@ static int __init microcode_init (void) } register_hotcpu_notifier(&mc_cpu_notifier); + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION + " " + " \n"); + return 0; } +EXPORT_SYMBOL_GPL(microcode_init); static void __exit microcode_exit (void) { @@ -457,7 +468,10 @@ static void __exit microcode_exit (void) put_online_cpus(); platform_device_unregister(microcode_pdev); -} -module_init(microcode_init) -module_exit(microcode_exit) + microcode_ops = NULL; + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} +EXPORT_SYMBOL_GPL(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index ca9861bf067e..831db5363dba 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -127,7 +127,7 @@ extern struct mutex microcode_mutex; extern struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -void collect_cpu_info(int cpu_num) +static void collect_cpu_info(int cpu_num) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -175,7 +175,7 @@ static inline int microcode_update_match(int cpu_num, return 1; } -int microcode_sanity_check(void *mc) +static int microcode_sanity_check(void *mc) { struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; @@ -259,7 +259,7 @@ int microcode_sanity_check(void *mc) * return 1 - found update * return < 0 - error */ -int get_matching_microcode(void *mc, int cpu) +static int get_matching_microcode(void *mc, int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_intel *mc_header = mc; @@ -288,7 +288,8 @@ int get_matching_microcode(void *mc, int cpu) return 0; find: pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", cpu, mc_header->rev, uci->rev); + " version 0x%x (current=0x%x)\n", + cpu, mc_header->rev, uci->rev); new_mc = vmalloc(total_size); if (!new_mc) { printk(KERN_ERR "microcode: error! Can not allocate memory\n"); @@ -303,7 +304,7 @@ find: return 1; } -void apply_microcode(int cpu) +static void apply_microcode(int cpu) { unsigned long flags; unsigned int val[2]; @@ -347,7 +348,7 @@ void apply_microcode(int cpu) extern void __user *user_buffer; /* user area microcode data buffer */ extern unsigned int user_buffer_size; /* it's size */ -long get_next_ucode(void **mc, long offset) +static long get_next_ucode(void **mc, long offset) { struct microcode_header_intel mc_header; unsigned long total_size; @@ -406,7 +407,7 @@ static long get_next_ucode_from_buffer(void **mc, const u8 *buf, /* fake device for request_firmware */ extern struct platform_device *microcode_pdev; -int cpu_request_microcode(int cpu) +static int cpu_request_microcode(int cpu) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -455,7 +456,7 @@ int cpu_request_microcode(int cpu) return error; } -int apply_microcode_check_cpu(int cpu) +static int apply_microcode_check_cpu(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -504,13 +505,42 @@ int apply_microcode_check_cpu(int cpu) return err; } -void microcode_fini_cpu(int cpu) +static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; mutex_lock(µcode_mutex); uci->valid = 0; - kfree(uci->mc.mc_intel); + vfree(uci->mc.mc_intel); uci->mc.mc_intel = NULL; mutex_unlock(µcode_mutex); } + +static struct microcode_ops microcode_intel_ops = { + .get_next_ucode = get_next_ucode, + .get_matching_microcode = get_matching_microcode, + .microcode_sanity_check = microcode_sanity_check, + .apply_microcode_check_cpu = apply_microcode_check_cpu, + .cpu_request_microcode = cpu_request_microcode, + .collect_cpu_info = collect_cpu_info, + .apply_microcode = apply_microcode, + .microcode_fini_cpu = microcode_fini_cpu, +}; + +static int __init microcode_intel_module_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(get_cpu()); + + if (c->x86_vendor == X86_VENDOR_INTEL) + return microcode_init(µcode_intel_ops, THIS_MODULE); + else + return -ENODEV; +} + +static void __exit microcode_intel_module_exit(void) +{ + microcode_exit(); +} + +module_init(microcode_intel_module_init) +module_exit(microcode_intel_module_exit) diff --git a/include/asm-x86/microcode.h b/include/asm-x86/microcode.h index 9231c876e374..18b2aeec2adf 100644 --- a/include/asm-x86/microcode.h +++ b/include/asm-x86/microcode.h @@ -1,3 +1,6 @@ +extern int microcode_init(void *opaque, struct module *module); +extern void microcode_exit(void); + struct microcode_ops { long (*get_next_ucode)(void **mc, long offset); long (*microcode_get_next_ucode)(void **mc, long offset); From 80cc9f1020f49c9e5b50898c102fd444de70a0a3 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:22 +0200 Subject: [PATCH 026/138] x86: AMD microcode patch loading support This patch introduces microcode patch loading for AMD processors. It is based on previous corresponding work for Intel processors. It hooks into the general patch loading module. Main difference is that a container file format is used to hold all patch data for multiple processors as well as an equivalent CPU table, which comes seperately, as opposed to Intel's microcode patching solution. Kconfig and Makefile have been changed provice config and build option for new source file. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 21 +- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/microcode_amd.c | 521 ++++++++++++++++++++++++++++++++ 3 files changed, 539 insertions(+), 4 deletions(-) create mode 100644 arch/x86/kernel/microcode_amd.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6b0b885cf47a..7bb461758d31 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -786,10 +786,12 @@ config MICROCODE select FW_LOADER ---help--- If you say Y here, you will be able to update the microcode on - Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, - Pentium III, Pentium 4, Xeon etc. You will obviously need the - actual microcode binary data itself which is not shipped with the - Linux kernel. + certain Intel and AMD processors. The Intel support is for the + IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, + Pentium 4, Xeon etc. The AMD support is for family 0x10 and + 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra. + You will obviously need the actual microcode binary data itself + which is not shipped with the Linux kernel. This option selects the general module only, you need to select at least one vendor specific module as well. @@ -813,6 +815,17 @@ config MICROCODE_INTEL This driver is only available as a module: the module will be called microcode_intel. +config MICROCODE_AMD + tristate "AMD microcode patch loading support" + depends on MICROCODE + select FW_LOADER + --help--- + If you select this option, microcode patch loading support for AMD + processors will be enabled. + + This driver is only available as a module: the module + will be called microcode_amd. + config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f2f9f6da8c18..be454f348c3b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o +obj-$(CONFIG_MICROCODE_AMD) += microcode_amd.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c new file mode 100644 index 000000000000..db199e3fdd1f --- /dev/null +++ b/arch/x86/kernel/microcode_amd.c @@ -0,0 +1,521 @@ +/* + * AMD CPU Microcode Update Driver for Linux + * Copyright (C) 2008 Advanced Micro Devices Inc. + * + * Author: Peter Oruba + * + * Based on work by: + * Tigran Aivazian + * + * This driver allows to upgrade microcode on AMD + * family 0x10 and 0x11 processors. + * + * Licensed unter the terms of the GNU General Public + * License version 2. See file COPYING for details. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_DESCRIPTION("AMD Microcode Update Driver"); +MODULE_AUTHOR("Peter Oruba "); +MODULE_LICENSE("GPLv2"); + +#define UCODE_MAGIC 0x00414d44 +#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 +#define UCODE_UCODE_TYPE 0x00000001 + +#define UCODE_MAX_SIZE (2048) +#define DEFAULT_UCODE_DATASIZE (896) /* 896 bytes */ +#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) /* 64 bytes */ +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 960 bytes */ +#define DWSIZE (sizeof(u32)) +/* For now we support a fixed ucode total size only */ +#define get_totalsize(mc) \ + ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ + + MC_HEADER_SIZE) + +extern int microcode_init(void *opaque, struct module *module); +extern void microcode_exit(void); + +/* serialize access to the physical write */ +static DEFINE_SPINLOCK(microcode_update_lock); + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +extern struct mutex (microcode_mutex); + +struct equiv_cpu_entry *equiv_cpu_table; + +extern struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; + +static void collect_cpu_info_amd(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu); + uci->rev = 0; + uci->pf = 0; + uci->mc.mc_amd = NULL; + uci->valid = 1; + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", + cpu); + uci->valid = 0; + return; + } + + asm volatile("movl %1, %%ecx; rdmsr" + : "=a" (uci->rev) + : "i" (0x0000008B) : "ecx"); + + printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", + uci->rev); +} + +static int get_matching_microcode_amd(void *mc, int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct microcode_header_amd *mc_header = mc; + unsigned long total_size = get_totalsize(mc_header); + void *new_mc; + struct pci_dev *nb_pci_dev, *sb_pci_dev; + unsigned int current_cpu_id; + unsigned int equiv_cpu_id = 0x00; + unsigned int i = 0; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + /* This is a tricky part. We might be called from a write operation */ + /* to the device file instead of the usual process of firmware */ + /* loading. This routine needs to be able to distinguish both */ +/* cases. This is done by checking if there alread is a equivalent */ + /* CPU table installed. If not, we're written through */ + /* /dev/cpu/microcode. */ +/* Since we ignore all checks. The error case in which going through */ +/* firmware loading and that table is not loaded has already been */ + /* checked earlier. */ + if (equiv_cpu_table == NULL) { + printk(KERN_INFO "microcode: CPU%d microcode update with " + "version 0x%x (current=0x%x)\n", + cpu, mc_header->patch_id, uci->rev); + goto out; + } + + current_cpu_id = cpuid_eax(0x00000001); + + while (equiv_cpu_table[i].installed_cpu != 0) { + if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { + equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; + break; + } + i++; + } + + if (!equiv_cpu_id) { + printk(KERN_ERR "microcode: CPU%d cpu_id " + "not found in equivalent cpu table \n", cpu); + return 0; + } + + if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { + printk(KERN_ERR + "microcode: CPU%d patch does not match " + "(patch is %x, cpu extended is %x) \n", + cpu, mc_header->processor_rev_id[0], + (equiv_cpu_id & 0xff)); + return 0; + } + + if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { + printk(KERN_ERR "microcode: CPU%d patch does not match " + "(patch is %x, cpu base id is %x) \n", + cpu, mc_header->processor_rev_id[1], + ((equiv_cpu_id >> 16) & 0xff)); + + return 0; + } + + /* ucode may be northbridge specific */ + if (mc_header->nb_dev_id) { + nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, + (mc_header->nb_dev_id & 0xff), + NULL); + if ((!nb_pci_dev) || + (mc_header->nb_rev_id != nb_pci_dev->revision)) { + printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu); + pci_dev_put(nb_pci_dev); + return 0; + } + pci_dev_put(nb_pci_dev); + } + + /* ucode may be southbridge specific */ + if (mc_header->sb_dev_id) { + sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, + (mc_header->sb_dev_id & 0xff), + NULL); + if ((!sb_pci_dev) || + (mc_header->sb_rev_id != sb_pci_dev->revision)) { + printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu); + pci_dev_put(sb_pci_dev); + return 0; + } + pci_dev_put(sb_pci_dev); + } + + if (mc_header->patch_id <= uci->rev) + return 0; + + printk(KERN_INFO "microcode: CPU%d found a matching microcode " + "update with version 0x%x (current=0x%x)\n", + cpu, mc_header->patch_id, uci->rev); + +out: + new_mc = vmalloc(UCODE_MAX_SIZE); + if (!new_mc) { + printk(KERN_ERR "microcode: error, can't allocate memory\n"); + return -ENOMEM; + } + memset(new_mc, 0, UCODE_MAX_SIZE); + + /* free previous update file */ + vfree(uci->mc.mc_amd); + + memcpy(new_mc, mc, total_size); + + uci->mc.mc_amd = new_mc; + return 1; +} + +static void apply_microcode_amd(int cpu) +{ + unsigned long flags; + unsigned int eax, edx; + unsigned int rev; + int cpu_num = raw_smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (uci->mc.mc_amd == NULL) + return; + + spin_lock_irqsave(µcode_update_lock, flags); + + edx = (unsigned int)(((unsigned long) + &(uci->mc.mc_amd->hdr.data_code)) >> 32); + eax = (unsigned int)(((unsigned long) + &(uci->mc.mc_amd->hdr.data_code)) & 0xffffffffL); + + asm volatile("movl %0, %%ecx; wrmsr" : + : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); + + /* get patch id after patching */ + asm volatile("movl %1, %%ecx; rdmsr" + : "=a" (rev) + : "i" (0x0000008B) : "ecx"); + + spin_unlock_irqrestore(µcode_update_lock, flags); + + /* check current patch id and patch's id for match */ + if (rev != uci->mc.mc_amd->hdr.patch_id) { + printk(KERN_ERR "microcode: CPU%d update from revision " + "0x%x to 0x%x failed\n", cpu_num, + uci->mc.mc_amd->hdr.patch_id, rev); + return; + } + + printk(KERN_INFO "microcode: CPU%d updated from revision " + "0x%x to 0x%x \n", + cpu_num, uci->rev, uci->mc.mc_amd->hdr.patch_id); + + uci->rev = rev; +} + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +extern void __user *user_buffer; /* user area microcode data buffer */ +extern unsigned int user_buffer_size; /* it's size */ + +static long get_next_ucode_amd(void **mc, long offset) +{ + struct microcode_header_amd mc_header; + unsigned long total_size; + + /* No more data */ + if (offset >= user_buffer_size) + return 0; + if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + return -EFAULT; + } + total_size = get_totalsize(&mc_header); + if (offset + total_size > user_buffer_size) { + printk(KERN_ERR "microcode: error! Bad total size in microcode " + "data file\n"); + return -EINVAL; + } + *mc = vmalloc(UCODE_MAX_SIZE); + if (!*mc) + return -ENOMEM; + memset(*mc, 0, UCODE_MAX_SIZE); + + if (copy_from_user(*mc, user_buffer + offset, total_size)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + vfree(*mc); + return -EFAULT; + } + return offset + total_size; +} +#else +#define get_next_ucode_amd() NULL +#endif + +static long get_next_ucode_from_buffer_amd(void **mc, void *buf, + unsigned long size, long offset) +{ + struct microcode_header_amd *mc_header; + unsigned long total_size; + unsigned char *buf_pos = buf; + + /* No more data */ + if (offset >= size) + return 0; + + if (buf_pos[offset] != UCODE_UCODE_TYPE) { + printk(KERN_ERR "microcode: error! " + "Wrong microcode payload type field\n"); + return -EINVAL; + } + + mc_header = (struct microcode_header_amd *)(&buf_pos[offset+8]); + + total_size = (unsigned long) (buf_pos[offset+4] + + (buf_pos[offset+5] << 8)); + + printk(KERN_INFO "microcode: size %lu, total_size %lu, offset %ld\n", + size, total_size, offset); + + if (offset + total_size > size) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + return -EINVAL; + } + + *mc = vmalloc(UCODE_MAX_SIZE); + if (!*mc) { + printk(KERN_ERR "microcode: error! " + "Can not allocate memory for microcode patch\n"); + return -ENOMEM; + } + + memset(*mc, 0, UCODE_MAX_SIZE); + memcpy(*mc, buf + offset + 8, total_size); + + return offset + total_size + 8; +} + +static long install_equiv_cpu_table(void *buf, unsigned long size, long offset) +{ + unsigned int *buf_pos = buf; + + /* No more data */ + if (offset >= size) + return 0; + + if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE) { + printk(KERN_ERR "microcode: error! " + "Wrong microcode equivalnet cpu table type field\n"); + return 0; + } + + if (size == 0) { + printk(KERN_ERR "microcode: error! " + "Wrong microcode equivalnet cpu table length\n"); + return 0; + } + + equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); + if (!equiv_cpu_table) { + printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); + return 0; + } + + memset(equiv_cpu_table, 0, size); + memcpy(equiv_cpu_table, &buf_pos[3], size); + + return size + 12; /* add header length */ +} + +/* fake device for request_firmware */ +extern struct platform_device *microcode_pdev; + +static int cpu_request_microcode_amd(int cpu) +{ + char name[30]; + const struct firmware *firmware; + void *buf; + unsigned int *buf_pos; + unsigned long size; + long offset = 0; + int error; + void *mc; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + sprintf(name, "amd-ucode/microcode_amd.bin"); + error = request_firmware(&firmware, "amd-ucode/microcode_amd.bin", + µcode_pdev->dev); + if (error) { + printk(KERN_ERR "microcode: ucode data file %s load failed\n", + name); + return error; + } + + buf_pos = buf = firmware->data; + size = firmware->size; + + if (buf_pos[0] != UCODE_MAGIC) { + printk(KERN_ERR "microcode: error! Wrong microcode patch file magic\n"); + return -EINVAL; + } + + offset = install_equiv_cpu_table(buf, buf_pos[2], offset); + + if (!offset) { + printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); + return -EINVAL; + } + + while ((offset = + get_next_ucode_from_buffer_amd(&mc, buf, size, offset)) > 0) { + error = get_matching_microcode_amd(mc, cpu); + if (error < 0) + break; + /* + * It's possible the data file has multiple matching ucode, + * lets keep searching till the latest version + */ + if (error == 1) { + apply_microcode_amd(cpu); + error = 0; + } + vfree(mc); + } + if (offset > 0) { + vfree(mc); + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; + } + if (offset < 0) + error = offset; + release_firmware(firmware); + + return error; +} + +static int apply_microcode_check_cpu_amd(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + unsigned int rev; + cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); + int err = 0; + + /* Check if the microcode is available */ + if (!uci->mc.mc_amd) + return 0; + + old = current->cpus_allowed; + set_cpus_allowed(current, newmask); + + /* Check if the microcode we have in memory matches the CPU */ + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 16) + err = -EINVAL; + + if (!err) { + asm volatile("movl %1, %%ecx; rdmsr" + : "=a" (rev) + : "i" (0x0000008B) : "ecx"); + + if (uci->rev != rev) + err = -EINVAL; + } + + if (!err) + apply_microcode_amd(cpu); + else + printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" + " rev=0x%x\n", + cpu, uci->rev); + + set_cpus_allowed(current, old); + return err; +} + +static void microcode_fini_cpu_amd(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + mutex_lock(µcode_mutex); + uci->valid = 0; + vfree(uci->mc.mc_amd); + uci->mc.mc_amd = NULL; + mutex_unlock(µcode_mutex); +} + +static struct microcode_ops microcode_amd_ops = { + .get_next_ucode = get_next_ucode_amd, + .get_matching_microcode = get_matching_microcode_amd, + .microcode_sanity_check = NULL, + .apply_microcode_check_cpu = apply_microcode_check_cpu_amd, + .cpu_request_microcode = cpu_request_microcode_amd, + .collect_cpu_info = collect_cpu_info_amd, + .apply_microcode = apply_microcode_amd, + .microcode_fini_cpu = microcode_fini_cpu_amd, +}; + +static int __init microcode_amd_module_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(get_cpu()); + + equiv_cpu_table = NULL; + if (c->x86_vendor == X86_VENDOR_AMD) + return microcode_init(µcode_amd_ops, THIS_MODULE); + else + return -ENODEV; +} + +static void __exit microcode_amd_module_exit(void) +{ + microcode_exit(); +} + +module_init(microcode_amd_module_init) +module_exit(microcode_amd_module_exit) From 45b1e23eca1c53fa79a611a2bc8c93697ede6c97 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Jul 2008 09:42:17 +0200 Subject: [PATCH 027/138] x86, microcode support: fix build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kernel/microcode.c:412: error: static declaration of ‘microcode_init’ follows non-static declaration include/asm/microcode.h:1: error: previous declaration of ‘microcode_init’ was here arch/x86/kernel/microcode.c:454: error: static declaration of ‘microcode_exit’ follows non-static declaration include/asm/microcode.h:2: error: previous declaration of ‘microcode_exit’ was here Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 1e42e79ca694..9a881845b35b 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -415,7 +415,7 @@ static struct notifier_block __refdata mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; -static int microcode_init(void *opaque, struct module *module) +int microcode_init(void *opaque, struct module *module) { struct microcode_ops *ops = (struct microcode_ops *)opaque; int error; @@ -457,7 +457,7 @@ static int microcode_init(void *opaque, struct module *module) } EXPORT_SYMBOL_GPL(microcode_init); -static void __exit microcode_exit (void) +void __exit microcode_exit (void) { microcode_dev_exit(); From 224e946b81166d732f3e9b414cb69612072fb500 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Jul 2008 09:52:55 +0200 Subject: [PATCH 028/138] x86, microcode: fix symbol exports fix tons of build errors: arch/x86/kernel/built-in.o: In function `microcode_fini_cpu': microcode_intel.c:(.text+0x11598): undefined reference to `microcode_mutex' microcode_intel.c:(.text+0x115a4): undefined reference to `ucode_cpu_info' microcode_intel.c:(.text+0x115ae): undefined reference to `ucode_cpu_info' microcode_intel.c:(.text+0x115bc): undefined reference to `microcode_mutex' [...] Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 9a881845b35b..758b958a6637 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -104,16 +104,16 @@ MODULE_LICENSE("GPL"); struct microcode_ops *microcode_ops; /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DEFINE_MUTEX(microcode_mutex); +DEFINE_MUTEX(microcode_mutex); EXPORT_SYMBOL_GPL(microcode_mutex); -static struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); #ifdef CONFIG_MICROCODE_OLD_INTERFACE -static void __user *user_buffer; /* user area microcode data buffer */ +void __user *user_buffer; /* user area microcode data buffer */ EXPORT_SYMBOL_GPL(user_buffer); -static unsigned int user_buffer_size; /* it's size */ +unsigned int user_buffer_size; /* it's size */ EXPORT_SYMBOL_GPL(user_buffer_size); static int do_microcode_update (void) @@ -230,7 +230,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #endif /* fake device for request_firmware */ -static struct platform_device *microcode_pdev; +struct platform_device *microcode_pdev; EXPORT_SYMBOL_GPL(microcode_pdev); static void microcode_init_cpu(int cpu, int resume) From 5d7b605245b1aa1a9cd6549b1f57d69273eb0c37 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Jul 2008 10:07:36 +0200 Subject: [PATCH 029/138] x86, microcode: fix module license string fix: FATAL: modpost: GPL-incompatible module microcode_amd.ko uses GPL-only symbol 'set_cpus_allowed_ptr' Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index db199e3fdd1f..fd9e68e88899 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -40,7 +40,7 @@ MODULE_DESCRIPTION("AMD Microcode Update Driver"); MODULE_AUTHOR("Peter Oruba "); -MODULE_LICENSE("GPLv2"); +MODULE_LICENSE("GPL v2"); #define UCODE_MAGIC 0x00414d44 #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 From 0d1edf46ba229b46efacf75c0544b88c05a7b266 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Jul 2008 11:53:57 -0700 Subject: [PATCH 030/138] xen: compile irq functions without -pg for ftrace For some reason I managed to miss a bunch of irq-related functions which also need to be compiled without -pg when using ftrace. This patch moves them into their own file, and starts a cleanup process I've been meaning to do anyway. Signed-off-by: Jeremy Fitzhardinge Cc: Sam Ravnborg Cc: "Alex Nixon (Intern)" Cc: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/xen/Makefile | 3 +- arch/x86/xen/enlighten.c | 122 +------------------------------- arch/x86/xen/irq.c | 143 ++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-asm_32.S | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- arch/x86/xen/xen-ops.h | 1 + drivers/xen/events.c | 11 --- 7 files changed, 150 insertions(+), 134 deletions(-) create mode 100644 arch/x86/xen/irq.c diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 5bfee243cf9a..9ee745fa5527 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -2,9 +2,10 @@ ifdef CONFIG_FTRACE # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_spinlock.o = -pg CFLAGS_REMOVE_time.o = -pg +CFLAGS_REMOVE_irq.o = -pg endif -obj-y := enlighten.o setup.o multicalls.o mmu.o \ +obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o spinlock.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b795470ec069..cf8b3a93122b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -226,94 +225,6 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static unsigned long xen_save_fl(void) -{ - struct vcpu_info *vcpu; - unsigned long flags; - - vcpu = x86_read_percpu(xen_vcpu); - - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; -} - -static void xen_restore_fl(unsigned long flags) -{ - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - if (flags == 0) { - preempt_check_resched(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); - } -} - -static void xen_irq_disable(void) -{ - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} - -static void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; - - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ - - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); -} - -static void xen_safe_halt(void) -{ - /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) - BUG(); -} - -static void xen_halt(void) -{ - if (irqs_disabled()) - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); - else - xen_safe_halt(); -} - static void xen_leave_lazy(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); @@ -1308,36 +1219,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { }, }; -static void __init __xen_init_IRQ(void) -{ -#ifdef CONFIG_X86_64 - int i; - - /* Create identity vector->irq map */ - for(i = 0; i < NR_VECTORS; i++) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[i] = i; - } -#endif /* CONFIG_X86_64 */ - - xen_init_IRQ(); -} - -static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = __xen_init_IRQ, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif -}; - static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = xen_apic_write, @@ -1740,10 +1621,11 @@ asmlinkage void __init xen_start_kernel(void) pv_init_ops = xen_init_ops; pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; - pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + xen_init_irq_ops(); + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c new file mode 100644 index 000000000000..28b85ab8422e --- /dev/null +++ b/arch/x86/xen/irq.c @@ -0,0 +1,143 @@ +#include + +#include +#include +#include + +#include +#include + +#include "xen-ops.h" + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void xen_force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} + +static void __init __xen_init_IRQ(void) +{ +#ifdef CONFIG_X86_64 + int i; + + /* Create identity vector->irq map */ + for(i = 0; i < NR_VECTORS; i++) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[i] = i; + } +#endif /* CONFIG_X86_64 */ + + xen_init_IRQ(); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + vcpu = x86_read_percpu(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + if (flags == 0) { + preempt_check_resched(); + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); + } +} + +static void xen_irq_disable(void) +{ + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + /* We don't need to worry about being preempted here, since + either a) interrupts are disabled, so no preemption, or b) + the caller is confused and is trying to re-enable interrupts + on an indeterminate processor. */ + + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); +} + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). */ + if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static const struct pv_irq_ops xen_irq_ops __initdata = { + .init_IRQ = __xen_init_IRQ, + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = xen_adjust_exception_frame, +#endif +}; + +void __init xen_init_irq_ops() +{ + pv_irq_ops = xen_irq_ops; +} diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 2497a30f41de..42786f59d9c0 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -298,7 +298,7 @@ check_events: push %eax push %ecx push %edx - call force_evtchn_callback + call xen_force_evtchn_callback pop %edx pop %ecx pop %eax diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 7f58304fafb3..3b9bda46487a 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -122,7 +122,7 @@ check_events: push %r9 push %r10 push %r11 - call force_evtchn_callback + call xen_force_evtchn_callback pop %r11 pop %r10 pop %r9 diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8847fb34f17e..3c70ebc50b1b 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -31,6 +31,7 @@ void xen_vcpu_restore(void); void __init xen_build_dynamic_phys_to_machine(void); +void xen_init_irq_ops(void); void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 0e0c28574af8..a0837036d898 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -84,17 +84,6 @@ static int irq_bindcount[NR_IRQS]; /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0) -/* - * Force a proper event-channel callback from Xen after clearing the - * callback mask. We do this in a very simple manner, by making a call - * down into Xen. The pending flag will be checked by Xen on return. - */ -void force_evtchn_callback(void) -{ - (void)HYPERVISOR_xen_version(0, NULL); -} -EXPORT_SYMBOL_GPL(force_evtchn_callback); - static struct irq_chip xen_dynamic_chip; /* Constructor for packed IRQ information. */ From cef43bf6b3afd819f7cdcba356af0e8220fb3789 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Jul 2008 13:33:44 -0700 Subject: [PATCH 031/138] xen: fix allocation and use of large ldts, cleanup Add a proper comment for set_aliased_prot() and fix an unsigned long/void * warning. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index cf8b3a93122b..04ec69e4d02e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -237,8 +237,10 @@ static unsigned long xen_store_tr(void) } /* - * If 'v' is a vmalloc mapping, then find the linear mapping of the - * page (if any) and also set its protections to match: + * Set the page permissions for a particular virtual address. If the + * address is a vmalloc mapping (or other non-linear mapping), then + * find the linear mapping of the page and also set its protections to + * match. */ static void set_aliased_prot(void *v, pgprot_t prot) { @@ -387,8 +389,7 @@ static void xen_load_gs_index(unsigned int idx) static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { - unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = arbitrary_virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); u64 entry = *(u64 *)ptr; preempt_disable(); From 169ad16bb87c10a3f7c108bb7008ebc0270f617a Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Mon, 28 Jul 2008 18:32:09 -0300 Subject: [PATCH 032/138] xen_alloc_ptpage: cast PFN_PHYS() argument to unsigned long Currently paravirt_ops alloc_p*() uses u32 for the pfn args. We should change that later, but while the pfn parameter is still u32, we need to cast the PFN_PHYS() argument at xen_alloc_ptpage() to unsigned long, otherwise it will lose bits on the shift. I think PFN_PHYS() should behave better when fed with smaller integers, but a cast to unsigned long won't be enough for all cases on 32-bit PAE, and a cast to u64 would be overkill for most users of PFN_PHYS(). We could have two different flavors of PFN_PHYS: one for low pages only (unsigned long) and another that works for any page (u64)), but while we don't have it, we will need the cast to unsigned long on xen_alloc_ptpage(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 04ec69e4d02e..53afa14eb314 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -822,7 +822,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) SetPagePinned(page); if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); if (level == PT_PTE) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else From a0ac87d61ba20932e54f08464d3f3439d78fa878 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Tue, 29 Jul 2008 17:41:04 +0200 Subject: [PATCH 033/138] x86: AMD microcode patch loader style corrections Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 07e52beacb44..6789530ef33b 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -47,9 +47,9 @@ MODULE_LICENSE("GPL v2"); #define UCODE_UCODE_TYPE 0x00000001 #define UCODE_MAX_SIZE (2048) -#define DEFAULT_UCODE_DATASIZE (896) /* 896 bytes */ -#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) /* 64 bytes */ -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 960 bytes */ +#define DEFAULT_UCODE_DATASIZE (896) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define DWSIZE (sizeof(u32)) /* For now we support a fixed ucode total size only */ #define get_totalsize(mc) \ From f516526febb75500f280def2108688d388df4e1e Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Tue, 29 Jul 2008 17:41:05 +0200 Subject: [PATCH 034/138] x86: Intel microcode patch loader style corrections Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_intel.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 6da4a85ff465..a61986e8b691 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -55,8 +55,8 @@ * in a single CPU package. * 1.10 28 Feb 2002 Asit K Mallick and * Tigran Aivazian , - * Serialize updates as required on HT processors due to speculative - * nature of implementation. + * Serialize updates as required on HT processors due to + * speculative nature of implementation. * 1.11 22 Mar 2002 Tigran Aivazian * Fix the panic when writing zero-length microcode chunk. * 1.12 29 Sep 2003 Nitin Kamble , @@ -71,7 +71,7 @@ * Thanks to Stuart Swales for pointing out this bug. */ -//#define DEBUG /* pr_debug */ +/* #define DEBUG */ /* pr_debug */ #include #include #include @@ -99,11 +99,11 @@ MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); -#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) /* 48 bytes */ -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ -#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) /* 20 bytes */ -#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) /* 12 bytes */ +#define DEFAULT_UCODE_DATASIZE (2000) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) +#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) +#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) #define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.totalsize ? \ From 831f9bd3151ebd22959a0cb2a20b44a06b26d4ed Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Tue, 29 Jul 2008 17:41:06 +0200 Subject: [PATCH 035/138] x86: moved function declarations out from AMD microcode patch loader to heade file Signed-off-by: Peter Oruba Cc: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 6789530ef33b..2b98e6ab1bf9 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -56,9 +56,6 @@ MODULE_LICENSE("GPL v2"); ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ + MC_HEADER_SIZE) -extern int microcode_init(void *opaque, struct module *module); -extern void microcode_exit(void); - /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); From daa9c0fee1cbe1d49fbe29af3d4bb16312043b1e Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Tue, 29 Jul 2008 17:41:07 +0200 Subject: [PATCH 036/138] x86: minor pointer type cast in AMD microcode patch loader Signed-off-by: Peter Oruba Cc: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 2b98e6ab1bf9..33b2a217a8c5 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -394,7 +394,8 @@ static int cpu_request_microcode_amd(int cpu) return error; } - buf_pos = buf = firmware->data; + buf_pos = (unsigned int *)firmware->data; + buf = (void *)firmware->data; size = firmware->size; if (buf_pos[0] != UCODE_MAGIC) { From 34a1b9fc4995102ecbb3a980b348aebf168d8196 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 12 Aug 2008 13:25:44 +0100 Subject: [PATCH 037/138] Fix date output in x86 microcode driver. The microcode stores its date in a uint32_t in some weird order approximating pdp-endian. Rather than printing it like that, print it properly in ISO standard form. Signed-off-by: David Woodhouse Cc: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_intel.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index a61986e8b691..d2d9d74f4cbb 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -339,8 +339,11 @@ static void apply_microcode(int cpu) return; } printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %08x \n", - cpu_num, uci->rev, val[1], uci->mc.mc_intel->hdr.date); + "0x%x to 0x%x, date = %04x-%02x-%02x \n", + cpu_num, uci->rev, val[1], + uci->mc.mc_intel->hdr.date & 0xffff, + uci->mc.mc_intel->hdr.date >> 24, + (uci->mc.mc_intel->hdr.date >> 16) & 0xff); uci->rev = val[1]; } From d33dcb9e7d272cc3171dcc3a21049902185ab03d Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Fri, 1 Aug 2008 12:46:45 +0200 Subject: [PATCH 038/138] x86: Microcode patch loader style corrections Style corrections to main microcode module. Signed-off-by: Peter Oruba Cc: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 28dba1a6e4aa..39961bb83293 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -55,8 +55,8 @@ * in a single CPU package. * 1.10 28 Feb 2002 Asit K Mallick and * Tigran Aivazian , - * Serialize updates as required on HT processors due to speculative - * nature of implementation. + * Serialize updates as required on HT processors due to + * speculative nature of implementation. * 1.11 22 Mar 2002 Tigran Aivazian * Fix the panic when writing zero-length microcode chunk. * 1.12 29 Sep 2003 Nitin Kamble , @@ -71,7 +71,7 @@ * Thanks to Stuart Swales for pointing out this bug. */ -//#define DEBUG /* pr_debug */ +/*#define DEBUG pr_debug */ #include #include #include @@ -116,7 +116,7 @@ EXPORT_SYMBOL_GPL(user_buffer); unsigned int user_buffer_size; /* it's size */ EXPORT_SYMBOL_GPL(user_buffer_size); -static int do_microcode_update (void) +static int do_microcode_update(void) { long cursor = 0; int error = 0; @@ -158,18 +158,20 @@ out: return error; } -static int microcode_open (struct inode *unused1, struct file *unused2) +static int microcode_open(struct inode *unused1, struct file *unused2) { cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; } -static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) +static ssize_t microcode_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) { ssize_t ret; if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", + num_physpages); return -EINVAL; } @@ -201,7 +203,7 @@ static struct miscdevice microcode_dev = { .fops = µcode_fops, }; -static int __init microcode_dev_init (void) +static int __init microcode_dev_init(void) { int error; @@ -216,7 +218,7 @@ static int __init microcode_dev_init (void) return 0; } -static void microcode_dev_exit (void) +static void microcode_dev_exit(void) { misc_deregister(µcode_dev); } @@ -224,7 +226,7 @@ static void microcode_dev_exit (void) MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #else #define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while(0) +#define microcode_dev_exit() do { } while (0) #endif /* fake device for request_firmware */ @@ -451,7 +453,7 @@ int microcode_init(void *opaque, struct module *module) } EXPORT_SYMBOL_GPL(microcode_init); -void __exit microcode_exit (void) +void __exit microcode_exit(void) { microcode_dev_exit(); From 636a31781684d0f49208aa163f1a5a3a74210eb4 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Fri, 1 Aug 2008 12:46:46 +0200 Subject: [PATCH 039/138] x86: Fixed NULL function pointer dereference in AMD microcode patch loader. Dereference took place in code part responsible for manual installation of microcode patches through /dev/cpu/microcode. Signed-off-by: Peter Oruba Cc: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 39961bb83293..ad136ad99cb3 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -127,7 +127,8 @@ static int do_microcode_update(void) old = current->cpus_allowed; while ((cursor = microcode_ops->get_next_ucode(&new_mc, cursor)) > 0) { - error = microcode_ops->microcode_sanity_check(new_mc); + if (microcode_ops->microcode_sanity_check != NULL) + error = microcode_ops->microcode_sanity_check(new_mc); if (error) goto out; /* From 467cd0529a480c9c25f06154b788d1d1ba77828a Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 18 Aug 2008 16:56:29 -0700 Subject: [PATCH 040/138] x86: early_printk.c trivial sparse fixes arch/x86/kernel/early_printk.c:404:13: warning: incorrect type in assignment (different base types) arch/x86/kernel/early_printk.c:404:13: expected restricted __le16 [assigned] [usertype] wValue arch/x86/kernel/early_printk.c:404:13: got int [signed] value arch/x86/kernel/early_printk.c:405:13: warning: incorrect type in assignment (different base types) arch/x86/kernel/early_printk.c:405:13: expected restricted __le16 [assigned] [usertype] wIndex arch/x86/kernel/early_printk.c:405:13: got int [signed] index arch/x86/kernel/early_printk.c:406:14: warning: incorrect type in assignment (different base types) arch/x86/kernel/early_printk.c:406:14: expected restricted __le16 [assigned] [usertype] wLength arch/x86/kernel/early_printk.c:406:14: got int [signed] size arch/x86/kernel/early_printk.c:845:16: warning: Using plain integer as NULL pointer arch/x86/kernel/early_printk.c:992:13: warning: symbol 'enable_debug_console' was not declared. Should it be static? Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 28155acf706e..825e9136b026 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -401,9 +401,9 @@ static int dbgp_control_msg(unsigned devnum, int requesttype, int request, /* Compute the control message */ req.bRequestType = requesttype; req.bRequest = request; - req.wValue = value; - req.wIndex = index; - req.wLength = size; + req.wValue = cpu_to_le16(value); + req.wIndex = cpu_to_le16(index); + req.wLength = cpu_to_le16(size); pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); addr = DBGP_EPADDR(devnum, 0); @@ -842,7 +842,7 @@ static int __init early_dbgp_init(char *s) ret = ehci_setup(); if (ret < 0) { dbgp_printk("ehci_setup failed\n"); - ehci_debug = 0; + ehci_debug = NULL; return -1; } @@ -989,7 +989,7 @@ static int __init setup_early_printk(char *buf) return 0; } -void __init enable_debug_console(char *buf) +static void __init enable_debug_console(char *buf) { #ifdef DBGP_DEBUG struct console *old_early_console = NULL; From 8343ef2437c599d30568e6b5a257a40bf2f4902b Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Wed, 20 Aug 2008 00:16:13 +0200 Subject: [PATCH 041/138] x86-microcode: fix unbalanced use of get_cpu() Don't use get_cpu() at all. Resort to checking a boot-up CPU (#0) in microcode_{intel,amd}_module_init(). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 10 ++++++---- arch/x86/kernel/microcode_intel.c | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 33b2a217a8c5..a6e76ccf8158 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -500,13 +500,15 @@ static struct microcode_ops microcode_amd_ops = { static int __init microcode_amd_module_init(void) { - struct cpuinfo_x86 *c = &cpu_data(get_cpu()); + struct cpuinfo_x86 *c = &cpu_data(0); equiv_cpu_table = NULL; - if (c->x86_vendor == X86_VENDOR_AMD) - return microcode_init(µcode_amd_ops, THIS_MODULE); - else + if (c->x86_vendor != X86_VENDOR_AMD) { + printk(KERN_ERR "microcode: CPU platform is not AMD-capable\n"); return -ENODEV; + } + + return microcode_init(µcode_amd_ops, THIS_MODULE); } static void __exit microcode_amd_module_exit(void) diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index d2d9d74f4cbb..6dd8907ff22e 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -531,12 +531,14 @@ static struct microcode_ops microcode_intel_ops = { static int __init microcode_intel_module_init(void) { - struct cpuinfo_x86 *c = &cpu_data(get_cpu()); + struct cpuinfo_x86 *c = &cpu_data(0); - if (c->x86_vendor == X86_VENDOR_INTEL) - return microcode_init(µcode_intel_ops, THIS_MODULE); - else + if (c->x86_vendor != X86_VENDOR_INTEL) { + printk(KERN_ERR "microcode: CPU platform is not Intel-capable\n"); return -ENODEV; + } + + return microcode_init(µcode_intel_ops, THIS_MODULE); } static void __exit microcode_intel_module_exit(void) From d45de40934897c6ee5b05141f7895bbb28512395 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Wed, 20 Aug 2008 00:22:26 +0200 Subject: [PATCH 042/138] x86-microcode: generic interface refactoring This is the 1st patch in the series. Here the aim was to avoid any significant changes, logically-wise. So it's mainly about generic interface refactoring: e.g. make microcode_{intel,amd}.c more about arch-specific details and less about policies like make-sure-we-run-on-a-target-cpu (no more set_cpus_allowed_ptr() here) and generic synchronization (no more microcode_mutex here). All in all, more line have been deleted than added. 4 files changed, 145 insertions(+), 198 deletions(-) Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 155 ++++++++++++++++++++---------- arch/x86/kernel/microcode_amd.c | 77 +++------------ arch/x86/kernel/microcode_intel.c | 91 +++--------------- include/asm-x86/microcode.h | 20 +++- 4 files changed, 145 insertions(+), 198 deletions(-) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index ad136ad99cb3..b2f84ce5eed3 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -71,7 +71,7 @@ * Thanks to Stuart Swales for pointing out this bug. */ -/*#define DEBUG pr_debug */ +/* #define DEBUG pr_debug */ #include #include #include @@ -104,8 +104,7 @@ MODULE_LICENSE("GPL"); struct microcode_ops *microcode_ops; /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -DEFINE_MUTEX(microcode_mutex); -EXPORT_SYMBOL_GPL(microcode_mutex); +static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); @@ -234,22 +233,6 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); struct platform_device *microcode_pdev; EXPORT_SYMBOL_GPL(microcode_pdev); -static void microcode_init_cpu(int cpu, int resume) -{ - cpumask_t old; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - old = current->cpus_allowed; - - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - mutex_lock(µcode_mutex); - microcode_ops->collect_cpu_info(cpu); - if (uci->valid && system_state == SYSTEM_RUNNING && !resume) - microcode_ops->cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); - set_cpus_allowed_ptr(current, &old); -} - static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t sz) @@ -266,14 +249,15 @@ static ssize_t reload_store(struct sys_device *dev, cpumask_t old = current->cpus_allowed; get_online_cpus(); - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - mutex_lock(µcode_mutex); - if (uci->valid) - err = microcode_ops->cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); + if (cpu_online(cpu)) { + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + mutex_lock(µcode_mutex); + if (uci->valid) + err = microcode_ops->cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + set_cpus_allowed_ptr(current, &old); + } put_online_cpus(); - set_cpus_allowed_ptr(current, &old); } if (err) return err; @@ -285,7 +269,7 @@ static ssize_t version_show(struct sys_device *dev, { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - return sprintf(buf, "0x%x\n", uci->rev); + return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); } static ssize_t pf_show(struct sys_device *dev, @@ -293,7 +277,7 @@ static ssize_t pf_show(struct sys_device *dev, { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - return sprintf(buf, "0x%x\n", uci->pf); + return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } static SYSDEV_ATTR(reload, 0200, NULL, reload_store); @@ -312,7 +296,85 @@ static struct attribute_group mc_attr_group = { .name = "microcode", }; -static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + mutex_lock(µcode_mutex); + microcode_ops->microcode_fini_cpu(cpu); + uci->valid = 0; + mutex_unlock(µcode_mutex); +} + +static void collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + memset(uci, 0, sizeof(*uci)); + if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) + uci->valid = 1; +} + +static void microcode_resume_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct cpu_signature nsig; + + pr_debug("microcode: CPU%d resumed\n", cpu); + + if (!uci->mc.valid_mc) + return; + + /* + * Let's verify that the 'cached' ucode does belong + * to this cpu (a bit of paranoia): + */ + if (microcode_ops->collect_cpu_info(cpu, &nsig)) { + microcode_fini_cpu(cpu); + return; + } + + if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { + microcode_fini_cpu(cpu); + /* Should we look for a new ucode here? */ + return; + } + + microcode_ops->apply_microcode(cpu); +} + +void microcode_update_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu); + + mutex_lock(µcode_mutex); + /* + * Check if the system resume is in progress (uci->valid != NULL), + * otherwise just request a firmware: + */ + if (uci->valid) { + microcode_resume_cpu(cpu); + } else { + collect_cpu_info(cpu); + if (uci->valid && system_state == SYSTEM_RUNNING) + microcode_ops->cpu_request_microcode(cpu); + } + mutex_unlock(µcode_mutex); +} + +static void microcode_init_cpu(int cpu) +{ + cpumask_t old = current->cpus_allowed; + + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + microcode_update_cpu(cpu); + set_cpus_allowed_ptr(current, &old); +} + +static int mc_sysdev_add(struct sys_device *sys_dev) { int err, cpu = sys_dev->id; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -327,16 +389,10 @@ static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) if (err) return err; - microcode_init_cpu(cpu, resume); - + microcode_init_cpu(cpu); return 0; } -static int mc_sysdev_add(struct sys_device *sys_dev) -{ - return __mc_sysdev_add(sys_dev, 0); -} - static int mc_sysdev_remove(struct sys_device *sys_dev) { int cpu = sys_dev->id; @@ -345,7 +401,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) return 0; pr_debug("microcode: CPU%d removed\n", cpu); - microcode_ops->microcode_fini_cpu(cpu); + microcode_fini_cpu(cpu); sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); return 0; } @@ -376,33 +432,26 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) sys_dev = get_cpu_sysdev(cpu); switch (action) { - case CPU_UP_CANCELED_FROZEN: - /* The CPU refused to come up during a system resume */ - microcode_ops->microcode_fini_cpu(cpu); - break; case CPU_ONLINE: - case CPU_DOWN_FAILED: - mc_sysdev_add(sys_dev); - break; case CPU_ONLINE_FROZEN: - /* System-wide resume is in progress, try to apply microcode */ - if (microcode_ops->apply_microcode_check_cpu(cpu)) { - /* The application of microcode failed */ - microcode_ops->microcode_fini_cpu(cpu); - __mc_sysdev_add(sys_dev, 1); - break; - } + microcode_init_cpu(cpu); + case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: + pr_debug("microcode: CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) printk(KERN_ERR "microcode: Failed to create the sysfs " "group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: - mc_sysdev_remove(sys_dev); - break; case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + pr_debug("microcode: CPU%d removed\n", cpu); + break; + case CPU_DEAD: + case CPU_UP_CANCELED_FROZEN: + /* The CPU refused to come up during a system resume */ + microcode_fini_cpu(cpu); break; } return NOTIFY_OK; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index a6e76ccf8158..4006e5e3adf0 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -59,38 +59,28 @@ MODULE_LICENSE("GPL v2"); /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -extern struct mutex (microcode_mutex); - struct equiv_cpu_entry *equiv_cpu_table; -extern struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; - -static void collect_cpu_info_amd(int cpu) +static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu); - uci->rev = 0; - uci->pf = 0; - uci->mc.mc_amd = NULL; - uci->valid = 1; + memset(csig, 0, sizeof(*csig)); if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", cpu); - uci->valid = 0; - return; + return -1; } asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (uci->rev) + : "=a" (csig->rev) : "i" (0x0000008B) : "ecx"); printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", - uci->rev); + csig->rev); + + return 0; } static int get_matching_microcode_amd(void *mc, int cpu) @@ -119,7 +109,7 @@ static int get_matching_microcode_amd(void *mc, int cpu) if (equiv_cpu_table == NULL) { printk(KERN_INFO "microcode: CPU%d microcode update with " "version 0x%x (current=0x%x)\n", - cpu, mc_header->patch_id, uci->rev); + cpu, mc_header->patch_id, uci->cpu_sig.rev); goto out; } @@ -185,12 +175,12 @@ static int get_matching_microcode_amd(void *mc, int cpu) pci_dev_put(sb_pci_dev); } - if (mc_header->patch_id <= uci->rev) + if (mc_header->patch_id <= uci->cpu_sig.rev) return 0; printk(KERN_INFO "microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", - cpu, mc_header->patch_id, uci->rev); + cpu, mc_header->patch_id, uci->cpu_sig.rev); out: new_mc = vmalloc(UCODE_MAX_SIZE); @@ -250,9 +240,9 @@ static void apply_microcode_amd(int cpu) printk(KERN_INFO "microcode: CPU%d updated from revision " "0x%x to 0x%x \n", - cpu_num, uci->rev, uci->mc.mc_amd->hdr.patch_id); + cpu_num, uci->cpu_sig.rev, uci->mc.mc_amd->hdr.patch_id); - uci->rev = rev; + uci->cpu_sig.rev = rev; } #ifdef CONFIG_MICROCODE_OLD_INTERFACE @@ -437,61 +427,18 @@ static int cpu_request_microcode_amd(int cpu) return error; } -static int apply_microcode_check_cpu_amd(int cpu) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - unsigned int rev; - cpumask_t old; - int err = 0; - - /* Check if the microcode is available */ - if (!uci->mc.mc_amd) - return 0; - - old = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - /* Check if the microcode we have in memory matches the CPU */ - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 16) - err = -EINVAL; - - if (!err) { - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (rev) - : "i" (0x0000008B) : "ecx"); - - if (uci->rev != rev) - err = -EINVAL; - } - - if (!err) - apply_microcode_amd(cpu); - else - printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" - " rev=0x%x\n", - cpu, uci->rev); - - set_cpus_allowed(current, old); - return err; -} - static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - mutex_lock(µcode_mutex); - uci->valid = 0; vfree(uci->mc.mc_amd); uci->mc.mc_amd = NULL; - mutex_unlock(µcode_mutex); } static struct microcode_ops microcode_amd_ops = { .get_next_ucode = get_next_ucode_amd, .get_matching_microcode = get_matching_microcode_amd, .microcode_sanity_check = NULL, - .apply_microcode_check_cpu = apply_microcode_check_cpu_amd, .cpu_request_microcode = cpu_request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 6dd8907ff22e..c9b53202ba3d 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -122,46 +122,37 @@ MODULE_LICENSE("GPL"); /* serialize access to the physical write to MSR 0x79 */ static DEFINE_SPINLOCK(microcode_update_lock); -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -extern struct mutex microcode_mutex; - -extern struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; - -static void collect_cpu_info(int cpu_num) +static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; unsigned int val[2]; - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu_num); - uci->pf = uci->rev = 0; - uci->mc.mc_intel = NULL; - uci->valid = 1; + memset(csig, 0, sizeof(*csig)); if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { printk(KERN_ERR "microcode: CPU%d not a capable Intel " "processor\n", cpu_num); - uci->valid = 0; - return; + return -1; } - uci->sig = cpuid_eax(0x00000001); + csig->sig = cpuid_eax(0x00000001); if ((c->x86_model >= 5) || (c->x86 > 6)) { /* get processor flags from MSR 0x17 */ rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - uci->pf = 1 << ((val[1] >> 18) & 7); + csig->pf = 1 << ((val[1] >> 18) & 7); } wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); + rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - uci->sig, uci->pf, uci->rev); + csig->sig, csig->pf, csig->rev); + + return 0; } static inline int microcode_update_match(int cpu_num, @@ -169,8 +160,8 @@ static inline int microcode_update_match(int cpu_num, { struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - if (!sigmatch(sig, uci->sig, pf, uci->pf) - || mc_header->rev <= uci->rev) + if (!sigmatch(sig, uci->cpu_sig.sig, pf, uci->cpu_sig.pf) + || mc_header->rev <= uci->cpu_sig.rev) return 0; return 1; } @@ -289,7 +280,7 @@ static int get_matching_microcode(void *mc, int cpu) find: pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", - cpu, mc_header->rev, uci->rev); + cpu, mc_header->rev, uci->cpu_sig.rev); new_mc = vmalloc(total_size); if (!new_mc) { printk(KERN_ERR "microcode: error! Can not allocate memory\n"); @@ -335,16 +326,16 @@ static void apply_microcode(int cpu) spin_unlock_irqrestore(µcode_update_lock, flags); if (val[1] != uci->mc.mc_intel->hdr.rev) { printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); + "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); return; } printk(KERN_INFO "microcode: CPU%d updated from revision " "0x%x to 0x%x, date = %04x-%02x-%02x \n", - cpu_num, uci->rev, val[1], + cpu_num, uci->cpu_sig.rev, val[1], uci->mc.mc_intel->hdr.date & 0xffff, uci->mc.mc_intel->hdr.date >> 24, (uci->mc.mc_intel->hdr.date >> 16) & 0xff); - uci->rev = val[1]; + uci->cpu_sig.rev = val[1]; } #ifdef CONFIG_MICROCODE_OLD_INTERFACE @@ -459,70 +450,18 @@ static int cpu_request_microcode(int cpu) return error; } -static int apply_microcode_check_cpu(int cpu) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - cpumask_t old; - unsigned int val[2]; - int err = 0; - - /* Check if the microcode is available */ - if (!uci->mc.mc_intel) - return 0; - - old = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - /* Check if the microcode we have in memory matches the CPU */ - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001)) - err = -EINVAL; - - if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - if (uci->pf != (1 << ((val[1] >> 18) & 7))) - err = -EINVAL; - } - - if (!err) { - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (uci->rev != val[1]) - err = -EINVAL; - } - - if (!err) - apply_microcode(cpu); - else - printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" - " sig=0x%x, pf=0x%x, rev=0x%x\n", - cpu, uci->sig, uci->pf, uci->rev); - - set_cpus_allowed_ptr(current, &old); - return err; -} - static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - mutex_lock(µcode_mutex); - uci->valid = 0; vfree(uci->mc.mc_intel); uci->mc.mc_intel = NULL; - mutex_unlock(µcode_mutex); } static struct microcode_ops microcode_intel_ops = { .get_next_ucode = get_next_ucode, .get_matching_microcode = get_matching_microcode, .microcode_sanity_check = microcode_sanity_check, - .apply_microcode_check_cpu = apply_microcode_check_cpu, .cpu_request_microcode = cpu_request_microcode, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode, diff --git a/include/asm-x86/microcode.h b/include/asm-x86/microcode.h index 18b2aeec2adf..7ceff48fa657 100644 --- a/include/asm-x86/microcode.h +++ b/include/asm-x86/microcode.h @@ -1,14 +1,18 @@ +#ifndef ASM_X86__MICROCODE_H +#define ASM_X86__MICROCODE_H + extern int microcode_init(void *opaque, struct module *module); extern void microcode_exit(void); +struct cpu_signature; + struct microcode_ops { long (*get_next_ucode)(void **mc, long offset); long (*microcode_get_next_ucode)(void **mc, long offset); int (*get_matching_microcode)(void *mc, int cpu); - int (*apply_microcode_check_cpu)(int cpu); int (*microcode_sanity_check)(void *mc); int (*cpu_request_microcode)(int cpu); - void (*collect_cpu_info)(int cpu_num); + int (*collect_cpu_info)(int cpu_num, struct cpu_signature *csig); void (*apply_microcode)(int cpu); void (*microcode_fini_cpu)(int cpu); void (*clear_patch)(void *data); @@ -75,13 +79,21 @@ struct microcode_amd { unsigned int mpb[0]; }; -struct ucode_cpu_info { - int valid; +struct cpu_signature { unsigned int sig; unsigned int pf; unsigned int rev; +}; + +struct ucode_cpu_info { + struct cpu_signature cpu_sig; + int valid; union { struct microcode_intel *mc_intel; struct microcode_amd *mc_amd; + void *valid_mc; } mc; }; +extern struct ucode_cpu_info ucode_cpu_info[]; + +#endif /* ASM_X86__MICROCODE_H */ From 6e833587e11ed0dbf12e647127f2650e2f80b26d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:16:17 -0700 Subject: [PATCH 043/138] xen: clean up domain mode predicates There are four operating modes Xen code may find itself running in: - native - hvm domain - pv dom0 - pv domU Clean up predicates for testing for these states to make them more consistent. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 9 +++++++-- drivers/block/xen-blkfront.c | 2 +- drivers/char/hvc_xen.c | 6 +++--- drivers/input/xen-kbdfront.c | 4 ++-- drivers/net/xen-netfront.c | 6 +++--- drivers/video/xen-fbfront.c | 4 ++-- drivers/xen/balloon.c | 2 +- drivers/xen/grant-table.c | 2 +- drivers/xen/xenbus/xenbus_probe.c | 8 ++++---- include/asm-x86/xen/hypervisor.h | 14 ++++++++++++-- 10 files changed, 36 insertions(+), 21 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 53afa14eb314..b106e825d266 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +enum xen_domain_type xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); + /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -1613,6 +1616,8 @@ asmlinkage void __init xen_start_kernel(void) if (!xen_start_info) return; + xen_domain_type = XEN_PV_DOMAIN; + BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); xen_setup_features(); @@ -1650,7 +1655,7 @@ asmlinkage void __init xen_start_kernel(void) /* Prevent unwanted bits from being set in PTEs. */ __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!is_initial_xendomain()) + if (!xen_initial_domain()) __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); /* Don't do the full vcpu_info placement stuff until we have a @@ -1685,7 +1690,7 @@ asmlinkage void __init xen_start_kernel(void) boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); - if (!is_initial_xendomain()) { + if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 3ca643cafccd..d5e753255153 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1032,7 +1032,7 @@ static struct xenbus_driver blkfront = { static int __init xlblk_init(void) { - if (!is_running_on_xen()) + if (!xen_domain()) return -ENODEV; if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c index 6b70aa66a587..538ceea5e7df 100644 --- a/drivers/char/hvc_xen.c +++ b/drivers/char/hvc_xen.c @@ -108,8 +108,8 @@ static int __init xen_init(void) { struct hvc_struct *hp; - if (!is_running_on_xen() || - is_initial_xendomain() || + if (!xen_pv_domain() || + xen_initial_domain() || !xen_start_info->console.domU.evtchn) return -ENODEV; @@ -142,7 +142,7 @@ static void __exit xen_fini(void) static int xen_cons_init(void) { - if (!is_running_on_xen()) + if (!xen_pv_domain()) return 0; hvc_instantiate(HVC_COOKIE, 0, &hvc_ops); diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c index 9ce3b3baf3a2..3ab6362f043c 100644 --- a/drivers/input/xen-kbdfront.c +++ b/drivers/input/xen-kbdfront.c @@ -335,11 +335,11 @@ static struct xenbus_driver xenkbd = { static int __init xenkbd_init(void) { - if (!is_running_on_xen()) + if (!xen_domain()) return -ENODEV; /* Nothing to do if running in dom0. */ - if (is_initial_xendomain()) + if (xen_initial_domain()) return -ENODEV; return xenbus_register_frontend(&xenkbd); diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index c749bdba214c..3c3dd403f5dd 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1794,10 +1794,10 @@ static struct xenbus_driver netfront = { static int __init netif_init(void) { - if (!is_running_on_xen()) + if (!xen_domain()) return -ENODEV; - if (is_initial_xendomain()) + if (xen_initial_domain()) return 0; printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n"); @@ -1809,7 +1809,7 @@ module_init(netif_init); static void __exit netif_exit(void) { - if (is_initial_xendomain()) + if (xen_initial_domain()) return; xenbus_unregister_driver(&netfront); diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c index 47ed39b52f9c..a463b3dd837b 100644 --- a/drivers/video/xen-fbfront.c +++ b/drivers/video/xen-fbfront.c @@ -680,11 +680,11 @@ static struct xenbus_driver xenfb = { static int __init xenfb_init(void) { - if (!is_running_on_xen()) + if (!xen_domain()) return -ENODEV; /* Nothing to do if running in dom0. */ - if (is_initial_xendomain()) + if (xen_initial_domain()) return -ENODEV; return xenbus_register_frontend(&xenfb); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index fff987b10e0f..a51f3e17a5fd 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -419,7 +419,7 @@ static int __init balloon_init(void) unsigned long pfn; struct page *page; - if (!is_running_on_xen()) + if (!xen_pv_domain()) return -ENODEV; pr_info("xen_balloon: Initialising balloon driver.\n"); diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index e9e11168616a..06592b9da83c 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -508,7 +508,7 @@ static int __devinit gnttab_init(void) unsigned int max_nr_glist_frames, nr_glist_frames; unsigned int nr_init_grefs; - if (!is_running_on_xen()) + if (!xen_domain()) return -ENODEV; nr_grant_frames = 1; diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 57ceb5346b74..7f24a98a446f 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -814,7 +814,7 @@ static int __init xenbus_probe_init(void) DPRINTK(""); err = -ENODEV; - if (!is_running_on_xen()) + if (!xen_domain()) goto out_error; /* Register ourselves with the kernel bus subsystem */ @@ -829,7 +829,7 @@ static int __init xenbus_probe_init(void) /* * Domain0 doesn't have a store_evtchn or store_mfn yet. */ - if (is_initial_xendomain()) { + if (xen_initial_domain()) { /* dom0 not yet supported */ } else { xenstored_ready = 1; @@ -846,7 +846,7 @@ static int __init xenbus_probe_init(void) goto out_unreg_back; } - if (!is_initial_xendomain()) + if (!xen_initial_domain()) xenbus_probe(NULL); return 0; @@ -937,7 +937,7 @@ static void wait_for_devices(struct xenbus_driver *xendrv) unsigned long timeout = jiffies + 10*HZ; struct device_driver *drv = xendrv ? &xendrv->driver : NULL; - if (!ready_to_wait_for_devices || !is_running_on_xen()) + if (!ready_to_wait_for_devices || !xen_domain()) return; while (exists_disconnected_device(drv)) { diff --git a/include/asm-x86/xen/hypervisor.h b/include/asm-x86/xen/hypervisor.h index 8e15dd28c91f..d9dd28caa508 100644 --- a/include/asm-x86/xen/hypervisor.h +++ b/include/asm-x86/xen/hypervisor.h @@ -55,7 +55,6 @@ /* arch/i386/kernel/setup.c */ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; -#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN) /* arch/i386/mach-xen/evtchn.c */ /* Force a proper event-channel callback from Xen. */ @@ -68,6 +67,17 @@ u64 jiffies_to_st(unsigned long jiffies); #define MULTI_UVMFLAGS_INDEX 3 #define MULTI_UVMDOMID_INDEX 4 -#define is_running_on_xen() (xen_start_info ? 1 : 0) +enum xen_domain_type { + XEN_NATIVE, + XEN_PV_DOMAIN, + XEN_HVM_DOMAIN, +}; + +extern enum xen_domain_type xen_domain_type; + +#define xen_domain() (xen_domain_type != XEN_NATIVE) +#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) +#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN) +#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) #endif /* __HYPERVISOR_H__ */ From 63d3a75d6f1fcf2f33e6abbe84e1f428c3586152 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:19:36 -0700 Subject: [PATCH 044/138] x86/paravirt: add spin_lock_flags lock op It is useful for a pv_lock_ops backend to know whether interrupts are enabled or not in the context a spin_lock is being called. This allows it to enable interrupts while spinning, which could be particularly helpful when spinning becomes blocking. The default implementation just calls the normal spin_lock op, ignoring the flags. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt-spinlocks.c | 6 ++++++ include/asm-x86/paravirt.h | 7 +++++++ include/asm-x86/spinlock.h | 9 +++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 38d7f7f1dbc9..206359a8e43f 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -7,12 +7,18 @@ #include +static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +{ + __raw_spin_lock(lock); +} + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP .spin_is_locked = __ticket_spin_is_locked, .spin_is_contended = __ticket_spin_is_contended, .spin_lock = __ticket_spin_lock, + .spin_lock_flags = default_spin_lock_flags, .spin_trylock = __ticket_spin_trylock, .spin_unlock = __ticket_spin_unlock, #endif diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index db9b0647b346..8e9b1266898c 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -333,6 +333,7 @@ struct pv_lock_ops { int (*spin_is_locked)(struct raw_spinlock *lock); int (*spin_is_contended)(struct raw_spinlock *lock); void (*spin_lock)(struct raw_spinlock *lock); + void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); int (*spin_trylock)(struct raw_spinlock *lock); void (*spin_unlock)(struct raw_spinlock *lock); }; @@ -1414,6 +1415,12 @@ static __always_inline void __raw_spin_lock(struct raw_spinlock *lock) PVOP_VCALL1(pv_lock_ops.spin_lock, lock); } +static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock, + unsigned long flags) +{ + PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); +} + static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h index e39c790dbfd2..b755ea86367e 100644 --- a/include/asm-x86/spinlock.h +++ b/include/asm-x86/spinlock.h @@ -182,8 +182,6 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) } #endif -#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) - #ifdef CONFIG_PARAVIRT /* * Define virtualization-friendly old-style lock byte lock, for use in @@ -272,6 +270,13 @@ static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) { __ticket_spin_unlock(lock); } + +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, + unsigned long flags) +{ + __raw_spin_lock(lock); +} + #endif /* CONFIG_PARAVIRT */ static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) From 11ad93e59d114f4b218873f1c93261be725d2e22 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:32:51 -0700 Subject: [PATCH 045/138] xen: clarify locking used when pinning a pagetable. Add some comments explaining the locking and pinning algorithm when using split pte locks. Also implement a minor optimisation of not pinning the PTE when not using split pte locks. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469da696..d3752b6ce6e6 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -590,8 +590,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pmdidx_limit = 0; #endif - flush |= (*func)(virt_to_page(pgd), PT_PGD); - for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { pud_t *pud; @@ -637,7 +635,11 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), } } } + out: + /* Do the top level last, so that the callbacks can use it as + a cue to do final things like tlb flushes. */ + flush |= (*func)(virt_to_page(pgd), PT_PGD); return flush; } @@ -691,6 +693,26 @@ static int pin_page(struct page *page, enum pt_level level) flush = 0; + /* + * We need to hold the pagetable lock between the time + * we make the pagetable RO and when we actually pin + * it. If we don't, then other users may come in and + * attempt to update the pagetable by writing it, + * which will fail because the memory is RO but not + * pinned, so Xen won't do the trap'n'emulate. + * + * If we're using split pte locks, we can't hold the + * entire pagetable's worth of locks during the + * traverse, because we may wrap the preempt count (8 + * bits). The solution is to mark RO and pin each PTE + * page while holding the lock. This means the number + * of locks we end up holding is never more than a + * batch size (~32 entries, at present). + * + * If we're not using split pte locks, we needn't pin + * the PTE pages independently, because we're + * protected by the overall pagetable lock. + */ ptl = NULL; if (level == PT_PTE) ptl = lock_pte(page); @@ -699,10 +721,9 @@ static int pin_page(struct page *page, enum pt_level level) pfn_pte(pfn, PAGE_KERNEL_RO), level == PT_PGD ? UVMF_TLB_FLUSH : 0); - if (level == PT_PTE) + if (ptl) { xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); - if (ptl) { /* Queue a deferred unlock for when this batch is completed. */ xen_mc_callback(do_unlock, ptl); @@ -796,10 +817,18 @@ static int unpin_page(struct page *page, enum pt_level level) spinlock_t *ptl = NULL; struct multicall_space mcs; + /* + * Do the converse to pin_page. If we're using split + * pte locks, we must be holding the lock for while + * the pte page is unpinned but still RO to prevent + * concurrent updates from seeing it in this + * partially-pinned state. + */ if (level == PT_PTE) { ptl = lock_pte(page); - xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + if (ptl) + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); } mcs = __xen_mc_entry(0); @@ -837,7 +866,7 @@ static void xen_pgd_unpin(pgd_t *pgd) #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif pgd_walk(pgd, unpin_page, USER_LIMIT); From 7708ad64a24a674f7905aa7a5099a50f055debec Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:34:22 -0700 Subject: [PATCH 046/138] xen: add xen_ prefixes to make tracing with ftrace easier It's easier to pattern match on Xen function if they all start with xen_. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 66 ++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d3752b6ce6e6..d9a35a363095 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -229,14 +229,14 @@ void make_lowmem_page_readwrite(void *vaddr) } -static bool page_pinned(void *ptr) +static bool xen_page_pinned(void *ptr) { struct page *page = virt_to_page(ptr); return PagePinned(page); } -static void extend_mmu_update(const struct mmu_update *update) +static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; struct mmu_update *u; @@ -265,7 +265,7 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) /* ptr may be ioremapped for 64-bit pagetable setup */ u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pmd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -276,7 +276,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) { /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } @@ -334,7 +334,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; u.val = pte_val_ma(pte); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -400,7 +400,7 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) /* ptr may be ioremapped for 64-bit pagetable setup */ u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pud_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -411,7 +411,7 @@ void xen_set_pud(pud_t *ptr, pud_t val) { /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } @@ -490,7 +490,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) u.ptr = virt_to_machine(ptr).maddr; u.val = pgd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); } /* @@ -519,10 +519,10 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; if (user_ptr) { - WARN_ON(page_pinned(user_ptr)); + WARN_ON(xen_page_pinned(user_ptr)); *user_ptr = val; } return; @@ -555,8 +555,8 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) * For 64-bit, we must skip the Xen hole in the middle of the address * space, just after the big x86-64 virtual hole. */ -static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), - unsigned long limit) +static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), + unsigned long limit) { int flush = 0; unsigned hole_low, hole_high; @@ -644,7 +644,9 @@ out: return flush; } -static spinlock_t *lock_pte(struct page *page) +/* If we're using split pte locks, then take the page's lock and + return a pointer to it. Otherwise return NULL. */ +static spinlock_t *xen_pte_lock(struct page *page) { spinlock_t *ptl = NULL; @@ -656,7 +658,7 @@ static spinlock_t *lock_pte(struct page *page) return ptl; } -static void do_unlock(void *v) +static void xen_pte_unlock(void *v) { spinlock_t *ptl = v; spin_unlock(ptl); @@ -674,7 +676,7 @@ static void xen_do_pin(unsigned level, unsigned long pfn) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); } -static int pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct page *page, enum pt_level level) { unsigned pgfl = TestSetPagePinned(page); int flush; @@ -715,7 +717,7 @@ static int pin_page(struct page *page, enum pt_level level) */ ptl = NULL; if (level == PT_PTE) - ptl = lock_pte(page); + ptl = xen_pte_lock(page); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), @@ -726,7 +728,7 @@ static int pin_page(struct page *page, enum pt_level level) /* Queue a deferred unlock for when this batch is completed. */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -740,7 +742,7 @@ void xen_pgd_pin(pgd_t *pgd) { xen_mc_batch(); - if (pgd_walk(pgd, pin_page, USER_LIMIT)) { + if (xen_pgd_walk(pgd, xen_pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); @@ -754,14 +756,14 @@ void xen_pgd_pin(pgd_t *pgd) xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); if (user_pgd) { - pin_page(virt_to_page(user_pgd), PT_PGD); + xen_pin_page(virt_to_page(user_pgd), PT_PGD); xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ - pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); #endif /* CONFIG_X86_64 */ @@ -796,7 +798,7 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int mark_pinned(struct page *page, enum pt_level level) +static __init int xen_mark_pinned(struct page *page, enum pt_level level) { SetPagePinned(page); return 0; @@ -804,10 +806,10 @@ static __init int mark_pinned(struct page *page, enum pt_level level) void __init xen_mark_init_mm_pinned(void) { - pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); + xen_pgd_walk(init_mm.pgd, xen_mark_pinned, FIXADDR_TOP); } -static int unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct page *page, enum pt_level level) { unsigned pgfl = TestClearPagePinned(page); @@ -825,7 +827,7 @@ static int unpin_page(struct page *page, enum pt_level level) * partially-pinned state. */ if (level == PT_PTE) { - ptl = lock_pte(page); + ptl = xen_pte_lock(page); if (ptl) xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); @@ -839,7 +841,7 @@ static int unpin_page(struct page *page, enum pt_level level) if (ptl) { /* unlock when batch completed */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -859,17 +861,17 @@ static void xen_pgd_unpin(pgd_t *pgd) if (user_pgd) { xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); - unpin_page(virt_to_page(user_pgd), PT_PGD); + xen_unpin_page(virt_to_page(user_pgd), PT_PGD); } } #endif #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif - pgd_walk(pgd, unpin_page, USER_LIMIT); + xen_pgd_walk(pgd, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } @@ -936,7 +938,7 @@ static void drop_other_mm_ref(void *info) } } -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { cpumask_t mask; unsigned cpu; @@ -966,7 +968,7 @@ static void drop_mm_ref(struct mm_struct *mm) smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { if (current->active_mm == mm) load_cr3(swapper_pg_dir); @@ -990,13 +992,13 @@ static void drop_mm_ref(struct mm_struct *mm) void xen_exit_mmap(struct mm_struct *mm) { get_cpu(); /* make sure we don't move around */ - drop_mm_ref(mm); + xen_drop_mm_ref(mm); put_cpu(); spin_lock(&mm->page_table_lock); /* pgd may not be pinned in the error exit path of execve */ - if (page_pinned(mm->pgd)) + if (xen_page_pinned(mm->pgd)) xen_pgd_unpin(mm->pgd); spin_unlock(&mm->page_table_lock); From 168d2f464ab9860f0d1e66cf1f9684973222f1c6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:18 -0700 Subject: [PATCH 047/138] xen: save previous spinlock when blocking A spinlock can be interrupted while spinning, so make sure we preserve the previous lock of interest if we're taking a lock from within an interrupt handler. We also need to deal with the case where the blocking path gets interrupted between testing to see if the lock is free and actually blocking. If we get interrupted there and end up in the state where the lock is free but the irq isn't pending, then we'll block indefinitely in the hypervisor. This fix is to make sure that any nested lock-takers will always leave the irq pending if there's any chance the outer lock became free. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 65 +++++++++++++++++++++++++++++++---------- drivers/xen/events.c | 25 ++++++++++++++++ include/xen/events.h | 2 ++ 3 files changed, 77 insertions(+), 15 deletions(-) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 8dc4d31da67f..4884bc603aa7 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -47,25 +47,41 @@ static int xen_spin_trylock(struct raw_spinlock *lock) static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); -static inline void spinning_lock(struct xen_spinlock *xl) +/* + * Mark a cpu as interested in a lock. Returns the CPU's previous + * lock of interest, in case we got preempted by an interrupt. + */ +static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) { + struct xen_spinlock *prev; + + prev = __get_cpu_var(lock_spinners); __get_cpu_var(lock_spinners) = xl; + wmb(); /* set lock of interest before count */ + asm(LOCK_PREFIX " incw %0" : "+m" (xl->spinners) : : "memory"); + + return prev; } -static inline void unspinning_lock(struct xen_spinlock *xl) +/* + * Mark a cpu as no longer interested in a lock. Restores previous + * lock of interest (NULL for none). + */ +static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) { asm(LOCK_PREFIX " decw %0" : "+m" (xl->spinners) : : "memory"); - wmb(); /* decrement count before clearing lock */ - __get_cpu_var(lock_spinners) = NULL; + wmb(); /* decrement count before restoring lock */ + __get_cpu_var(lock_spinners) = prev; } static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; + struct xen_spinlock *prev; int irq = __get_cpu_var(lock_kicker_irq); int ret; @@ -74,23 +90,42 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) return 0; /* announce we're spinning */ - spinning_lock(xl); + prev = spinning_lock(xl); - /* clear pending */ - xen_clear_irq_pending(irq); + do { + /* clear pending */ + xen_clear_irq_pending(irq); - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) - goto out; + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) { + /* + * If we interrupted another spinlock while it + * was blocking, make sure it doesn't block + * without rechecking the lock. + */ + if (prev != NULL) + xen_set_irq_pending(irq); + goto out; + } + + /* + * Block until irq becomes pending. If we're + * interrupted at this point (after the trylock but + * before entering the block), then the nested lock + * handler guarantees that the irq will be left + * pending if there's any chance the lock became free; + * xen_poll_irq() returns immediately if the irq is + * pending. + */ + xen_poll_irq(irq); + } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - /* block until irq becomes pending */ - xen_poll_irq(irq); kstat_this_cpu.irqs[irq]++; out: - unspinning_lock(xl); + unspinning_lock(xl, prev); return ret; } diff --git a/drivers/xen/events.c b/drivers/xen/events.c index a0837036d898..b6c2b8f16bee 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -164,6 +164,12 @@ static inline void set_evtchn(int port) sync_set_bit(port, &s->evtchn_pending[0]); } +static inline int test_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_bit(port, &s->evtchn_pending[0]); +} + /** * notify_remote_via_irq - send event to remote end of event channel via irq @@ -732,6 +738,25 @@ void xen_clear_irq_pending(int irq) clear_evtchn(evtchn); } +void xen_set_irq_pending(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + set_evtchn(evtchn); +} + +bool xen_test_irq_pending(int irq) +{ + int evtchn = evtchn_from_irq(irq); + bool ret = false; + + if (VALID_EVTCHN(evtchn)) + ret = test_evtchn(evtchn); + + return ret; +} + /* Poll waiting for an irq to become pending. In the usual case, the irq will be disabled so it won't deliver an interrupt. */ void xen_poll_irq(int irq) diff --git a/include/xen/events.h b/include/xen/events.h index 4680ff3fbc91..0d5f1adc0363 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -46,6 +46,8 @@ extern void xen_irq_resume(void); /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); +void xen_set_irq_pending(int irq); +bool xen_test_irq_pending(int irq); /* Poll waiting for an irq to become pending. In the usual case, the irq will be disabled so it won't deliver an interrupt. */ From 994025caba3e6beade9bde84dd1b70d9d250f27b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:19 -0700 Subject: [PATCH 048/138] xen: add debugfs support Add support for exporting statistics on mmu updates, multicall batching and pv spinlocks into debugfs. The base path is xen/ and each subsystem adds its own directory: mmu, multicalls, spinlocks. In each directory, writing 1 to "zero_stats" will cause the corresponding stats to be zeroed the next time they're updated. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 10 ++- arch/x86/xen/Makefile | 3 +- arch/x86/xen/debugfs.c | 123 ++++++++++++++++++++++++++++ arch/x86/xen/debugfs.h | 10 +++ arch/x86/xen/mmu.c | 163 ++++++++++++++++++++++++++++++++++++- arch/x86/xen/multicalls.c | 115 +++++++++++++++++++++++++- arch/x86/xen/spinlock.c | 165 +++++++++++++++++++++++++++++++++++++- 7 files changed, 580 insertions(+), 9 deletions(-) create mode 100644 arch/x86/xen/debugfs.c create mode 100644 arch/x86/xen/debugfs.h diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 3815e425f470..d3e68465ace9 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -27,4 +27,12 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool depends on PM - default y \ No newline at end of file + default y + +config XEN_DEBUG_FS + bool "Enable Xen debug and tuning parameters in debugfs" + depends on XEN && DEBUG_FS + default n + help + Enable statistics output and various tuning options in debugfs. + Enabling this option may incur a significant performance overhead. \ No newline at end of file diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 9ee745fa5527..313947940a1a 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -8,4 +8,5 @@ endif obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o -obj-$(CONFIG_SMP) += smp.o spinlock.o +obj-$(CONFIG_SMP) += smp.o spinlock.o +obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c new file mode 100644 index 000000000000..b53225d2cac3 --- /dev/null +++ b/arch/x86/xen/debugfs.c @@ -0,0 +1,123 @@ +#include +#include +#include + +#include "debugfs.h" + +static struct dentry *d_xen_debug; + +struct dentry * __init xen_init_debugfs(void) +{ + if (!d_xen_debug) { + d_xen_debug = debugfs_create_dir("xen", NULL); + + if (!d_xen_debug) + pr_warning("Could not create 'xen' debugfs directory\n"); + } + + return d_xen_debug; +} + +struct array_data +{ + void *array; + unsigned elements; +}; + +static int u32_array_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return nonseekable_open(inode, file); +} + +static size_t format_array(char *buf, size_t bufsize, const char *fmt, + u32 *array, unsigned array_size) +{ + size_t ret = 0; + unsigned i; + + for(i = 0; i < array_size; i++) { + size_t len; + + len = snprintf(buf, bufsize, fmt, array[i]); + len++; /* ' ' or '\n' */ + ret += len; + + if (buf) { + buf += len; + bufsize -= len; + buf[-1] = (i == array_size-1) ? '\n' : ' '; + } + } + + ret++; /* \0 */ + if (buf) + *buf = '\0'; + + return ret; +} + +static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) +{ + size_t len = format_array(NULL, 0, fmt, array, array_size); + char *ret; + + ret = kmalloc(len, GFP_KERNEL); + if (ret == NULL) + return NULL; + + format_array(ret, len, fmt, array, array_size); + return ret; +} + +static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, + loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct array_data *data = inode->i_private; + size_t size; + + if (*ppos == 0) { + if (file->private_data) { + kfree(file->private_data); + file->private_data = NULL; + } + + file->private_data = format_array_alloc("%u", data->array, data->elements); + } + + size = 0; + if (file->private_data) + size = strlen(file->private_data); + + return simple_read_from_buffer(buf, len, ppos, file->private_data, size); +} + +static int xen_array_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + + return 0; +} + +static struct file_operations u32_array_fops = { + .owner = THIS_MODULE, + .open = u32_array_open, + .release= xen_array_release, + .read = u32_array_read, +}; + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements) +{ + struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data == NULL) + return NULL; + + data->array = array; + data->elements = elements; + + return debugfs_create_file(name, mode, parent, data, &u32_array_fops); +} diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h new file mode 100644 index 000000000000..e28132084832 --- /dev/null +++ b/arch/x86/xen/debugfs.h @@ -0,0 +1,10 @@ +#ifndef _XEN_DEBUGFS_H +#define _XEN_DEBUGFS_H + +struct dentry * __init xen_init_debugfs(void); + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements); + +#endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d9a35a363095..f5af913fd7b0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -40,6 +40,7 @@ */ #include #include +#include #include #include @@ -57,6 +58,61 @@ #include "multicalls.h" #include "mmu.h" +#include "debugfs.h" + +#define MMU_UPDATE_HISTO 30 + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct { + u32 pgd_update; + u32 pgd_update_pinned; + u32 pgd_update_batched; + + u32 pud_update; + u32 pud_update_pinned; + u32 pud_update_batched; + + u32 pmd_update; + u32 pmd_update_pinned; + u32 pmd_update_batched; + + u32 pte_update; + u32 pte_update_pinned; + u32 pte_update_batched; + + u32 mmu_update; + u32 mmu_update_extended; + u32 mmu_update_histo[MMU_UPDATE_HISTO]; + + u32 prot_commit; + u32 prot_commit_batched; + + u32 set_pte_at; + u32 set_pte_at_batched; + u32 set_pte_at_pinned; + u32 set_pte_at_current; + u32 set_pte_at_kernel; +} mmu_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&mmu_stats, 0, sizeof(mmu_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); mmu_stats.elem += (val); } while(0) + +#else /* !CONFIG_XEN_DEBUG_FS */ + +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +#endif /* CONFIG_XEN_DEBUG_FS */ /* * Just beyond the highest usermode address. STACK_TOP_MAX has a @@ -243,11 +299,21 @@ static void xen_extend_mmu_update(const struct mmu_update *update) mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); - if (mcs.mc != NULL) + if (mcs.mc != NULL) { + ADD_STATS(mmu_update_extended, 1); + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); + mcs.mc->args[1]++; - else { + + if (mcs.mc->args[1] < MMU_UPDATE_HISTO) + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); + else + ADD_STATS(mmu_update_histo[0], 1); + } else { + ADD_STATS(mmu_update, 1); mcs = __xen_mc_entry(sizeof(*u)); MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + ADD_STATS(mmu_update_histo[1], 1); } u = mcs.args; @@ -267,6 +333,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) u.val = pmd_val_ma(val); xen_extend_mmu_update(&u); + ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); @@ -274,6 +342,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) void xen_set_pmd(pmd_t *ptr, pmd_t val) { + ADD_STATS(pmd_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -281,6 +351,8 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) return; } + ADD_STATS(pmd_update_pinned, 1); + xen_set_pmd_hyper(ptr, val); } @@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, if (mm == &init_mm) preempt_disable(); + ADD_STATS(set_pte_at, 1); +// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); + ADD_STATS(set_pte_at_current, mm == current->mm); + ADD_STATS(set_pte_at_kernel, mm == &init_mm); + if (mm == current->mm || mm == &init_mm) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; mcs = xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); + ADD_STATS(set_pte_at_batched, 1); xen_mc_issue(PARAVIRT_LAZY_MMU); goto out; } else @@ -336,6 +414,9 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, u.val = pte_val_ma(pte); xen_extend_mmu_update(&u); + ADD_STATS(prot_commit, 1); + ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -402,6 +483,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) u.val = pud_val_ma(val); xen_extend_mmu_update(&u); + ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); @@ -409,6 +492,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) void xen_set_pud(pud_t *ptr, pud_t val) { + ADD_STATS(pud_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -416,11 +501,17 @@ void xen_set_pud(pud_t *ptr, pud_t val) return; } + ADD_STATS(pud_update_pinned, 1); + xen_set_pud_hyper(ptr, val); } void xen_set_pte(pte_t *ptep, pte_t pte) { + ADD_STATS(pte_update, 1); +// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); + ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + #ifdef CONFIG_X86_PAE ptep->pte_high = pte.pte_high; smp_wmb(); @@ -517,6 +608,8 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) { pgd_t *user_ptr = xen_get_user_pgd(ptr); + ADD_STATS(pgd_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -528,6 +621,9 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) return; } + ADD_STATS(pgd_update_pinned, 1); + ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + /* If it's pinned, then we can at least batch the kernel and user updates together. */ xen_mc_batch(); @@ -1003,3 +1099,66 @@ void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mmu_debug; + +static int __init xen_mmu_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mmu_debug = debugfs_create_dir("mmu", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); + + debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); + debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + + debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); + debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + + debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); + debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + + debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); +// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, +// &mmu_stats.pte_update_pinned); + debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, + &mmu_stats.pte_update_pinned); + + debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); + debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, + &mmu_stats.mmu_update_extended); + xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, + mmu_stats.mmu_update_histo, 20); + + debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); + debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_batched); + debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_current); + debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_kernel); + + debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); + debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, + &mmu_stats.prot_commit_batched); + + return 0; +} +fs_initcall(xen_mmu_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 9efd1c6c9776..8ea8a0d0b0de 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -21,16 +21,20 @@ */ #include #include +#include #include #include "multicalls.h" +#include "debugfs.h" + +#define MC_BATCH 32 #define MC_DEBUG 1 -#define MC_BATCH 32 #define MC_ARGS (MC_BATCH * 16) + struct mc_buffer { struct multicall_entry entries[MC_BATCH]; #if MC_DEBUG @@ -47,6 +51,76 @@ struct mc_buffer { static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); +/* flush reasons 0- slots, 1- args, 2- callbacks */ +enum flush_reasons +{ + FL_SLOTS, + FL_ARGS, + FL_CALLBACKS, + + FL_N_REASONS +}; + +#ifdef CONFIG_XEN_DEBUG_FS +#define NHYPERCALLS 40 /* not really */ + +static struct { + unsigned histo[MC_BATCH+1]; + + unsigned issued; + unsigned arg_total; + unsigned hypercalls; + unsigned histo_hypercalls[NHYPERCALLS]; + + unsigned flush[FL_N_REASONS]; +} mc_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&mc_stats, 0, sizeof(mc_stats)); + zero_stats = 0; + } +} + +static void mc_add_stats(const struct mc_buffer *mc) +{ + int i; + + check_zero(); + + mc_stats.issued++; + mc_stats.hypercalls += mc->mcidx; + mc_stats.arg_total += mc->argidx; + + mc_stats.histo[mc->mcidx]++; + for(i = 0; i < mc->mcidx; i++) { + unsigned op = mc->entries[i].op; + if (op < NHYPERCALLS) + mc_stats.histo_hypercalls[op]++; + } +} + +static void mc_stats_flush(enum flush_reasons idx) +{ + check_zero(); + + mc_stats.flush[idx]++; +} + +#else /* !CONFIG_XEN_DEBUG_FS */ + +static inline void mc_add_stats(const struct mc_buffer *mc) +{ +} + +static inline void mc_stats_flush(enum flush_reasons idx) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ + void xen_mc_flush(void) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); @@ -60,6 +134,8 @@ void xen_mc_flush(void) something in the middle */ local_irq_save(flags); + mc_add_stats(b); + if (b->mcidx) { #if MC_DEBUG memcpy(b->debug, b->entries, @@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args) if (b->mcidx == MC_BATCH || (argidx + args) > MC_ARGS) { + mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); xen_mc_flush(); argidx = roundup(b->argidx, sizeof(u64)); } @@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data) struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct callback *cb; - if (b->cbidx == MC_BATCH) + if (b->cbidx == MC_BATCH) { + mc_stats_flush(FL_CALLBACKS); xen_mc_flush(); + } cb = &b->callbacks[b->cbidx++]; cb->fn = fn; cb->data = data; } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mc_debug; + +static int __init xen_mc_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mc_debug = debugfs_create_dir("multicalls", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats); + + debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued); + debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls); + debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total); + + xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug, + mc_stats.histo, MC_BATCH); + xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug, + mc_stats.histo_hypercalls, NHYPERCALLS); + xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug, + mc_stats.flush, FL_N_REASONS); + + return 0; +} +fs_initcall(xen_mc_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 4884bc603aa7..0d8f3b2d9bec 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -4,6 +4,8 @@ */ #include #include +#include +#include #include @@ -11,6 +13,93 @@ #include #include "xen-ops.h" +#include "debugfs.h" + +#ifdef CONFIG_XEN_DEBUG_FS +static struct xen_spinlock_stats +{ + u64 taken; + u32 taken_slow; + u32 taken_slow_nested; + u32 taken_slow_pickup; + u32 taken_slow_spurious; + + u64 released; + u32 released_slow; + u32 released_slow_kicked; + +#define HISTO_BUCKETS 20 + u32 histo_spin_fast[HISTO_BUCKETS+1]; + u32 histo_spin[HISTO_BUCKETS+1]; + + u64 spinning_time; + u64 total_time; +} spinlock_stats; + +static u8 zero_stats; + +static unsigned lock_timeout = 1 << 10; +#define TIMEOUT lock_timeout + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); spinlock_stats.elem += (val); } while(0) + +static inline u64 spin_time_start(void) +{ + return xen_clocksource_read(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index = ilog2(delta); + + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_fast(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_fast); + spinlock_stats.spinning_time += delta; +} + +static inline void spin_time_accum(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin); + spinlock_stats.total_time += delta; +} +#else /* !CONFIG_XEN_DEBUG_FS */ +#define TIMEOUT (1 << 10) +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_fast(u64 start) +{ +} +static inline void spin_time_accum(u64 start) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ struct xen_spinlock { unsigned char lock; /* 0 -> free; 1 -> locked */ @@ -92,6 +181,9 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) /* announce we're spinning */ prev = spinning_lock(xl); + ADD_STATS(taken_slow, 1); + ADD_STATS(taken_slow_nested, prev != NULL); + do { /* clear pending */ xen_clear_irq_pending(irq); @@ -100,6 +192,8 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) we weren't looking */ ret = xen_spin_trylock(lock); if (ret) { + ADD_STATS(taken_slow_pickup, 1); + /* * If we interrupted another spinlock while it * was blocking, make sure it doesn't block @@ -120,6 +214,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) * pending. */ xen_poll_irq(irq); + ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ kstat_this_cpu.irqs[irq]++; @@ -132,11 +227,18 @@ out: static void xen_spin_lock(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int timeout; + unsigned timeout; u8 oldval; + u64 start_spin; + + ADD_STATS(taken, 1); + + start_spin = spin_time_start(); do { - timeout = 1 << 10; + u64 start_spin_fast = spin_time_start(); + + timeout = TIMEOUT; asm("1: xchgb %1,%0\n" " testb %1,%1\n" @@ -151,16 +253,22 @@ static void xen_spin_lock(struct raw_spinlock *lock) : "1" (1) : "memory"); - } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); + spin_time_accum_fast(start_spin_fast); + } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock)))); + + spin_time_accum(start_spin); } static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) { int cpu; + ADD_STATS(released_slow, 1); + for_each_online_cpu(cpu) { /* XXX should mix up next cpu selection */ if (per_cpu(lock_spinners, cpu) == xl) { + ADD_STATS(released_slow_kicked, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); break; } @@ -171,6 +279,8 @@ static void xen_spin_unlock(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; + ADD_STATS(released, 1); + smp_wmb(); /* make sure no writes get moved after unlock */ xl->lock = 0; /* release lock */ @@ -216,3 +326,52 @@ void __init xen_init_spinlocks(void) pv_lock_ops.spin_trylock = xen_spin_trylock; pv_lock_ops.spin_unlock = xen_spin_unlock; } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_spin_debug; + +static int __init xen_spinlock_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); + + debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.taken_slow); + debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, + &spinlock_stats.taken_slow_nested); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.taken_slow_pickup); + debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, + &spinlock_stats.taken_slow_spurious); + + debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.released_slow); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.released_slow_kicked); + + debugfs_create_u64("time_spinning", 0444, d_spin_debug, + &spinlock_stats.spinning_time); + debugfs_create_u64("time_total", 0444, d_spin_debug, + &spinlock_stats.total_time); + + xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, + spinlock_stats.histo_spin, HISTO_BUCKETS + 1); + xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, + spinlock_stats.histo_spin_fast, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(xen_spinlock_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ From 1e696f638b523958ee3d95e655becdb4c4b6fcde Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:20 -0700 Subject: [PATCH 049/138] xen: allow interrupts to be enabled while doing a blocking spin If spin_lock is called in an interrupts-enabled context, we can safely enable interrupts while spinning. We don't bother for the actual spin loop, but if we timeout and fall back to blocking, it's definitely worthwhile enabling interrupts if possible. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 0d8f3b2d9bec..e6061f2e64f2 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -23,6 +23,7 @@ static struct xen_spinlock_stats u32 taken_slow_nested; u32 taken_slow_pickup; u32 taken_slow_spurious; + u32 taken_slow_irqenable; u64 released; u32 released_slow; @@ -167,12 +168,13 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock __get_cpu_var(lock_spinners) = prev; } -static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; struct xen_spinlock *prev; int irq = __get_cpu_var(lock_kicker_irq); int ret; + unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) @@ -181,6 +183,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) /* announce we're spinning */ prev = spinning_lock(xl); + flags = __raw_local_save_flags(); + if (irq_enable) { + ADD_STATS(taken_slow_irqenable, 1); + raw_local_irq_enable(); + } + ADD_STATS(taken_slow, 1); ADD_STATS(taken_slow_nested, prev != NULL); @@ -220,11 +228,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) kstat_this_cpu.irqs[irq]++; out: + raw_local_irq_restore(flags); unspinning_lock(xl, prev); return ret; } -static void xen_spin_lock(struct raw_spinlock *lock) +static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; unsigned timeout; @@ -254,11 +263,23 @@ static void xen_spin_lock(struct raw_spinlock *lock) : "memory"); spin_time_accum_fast(start_spin_fast); - } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock)))); + + } while (unlikely(oldval != 0 && + (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); spin_time_accum(start_spin); } +static void xen_spin_lock(struct raw_spinlock *lock) +{ + __xen_spin_lock(lock, false); +} + +static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +{ + __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); +} + static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) { int cpu; @@ -323,6 +344,7 @@ void __init xen_init_spinlocks(void) pv_lock_ops.spin_is_locked = xen_spin_is_locked; pv_lock_ops.spin_is_contended = xen_spin_is_contended; pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; pv_lock_ops.spin_trylock = xen_spin_trylock; pv_lock_ops.spin_unlock = xen_spin_unlock; } @@ -353,6 +375,8 @@ static int __init xen_spinlock_debugfs(void) &spinlock_stats.taken_slow_pickup); debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, &spinlock_stats.taken_slow_spurious); + debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, + &spinlock_stats.taken_slow_irqenable); debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); debugfs_create_u32("released_slow", 0444, d_spin_debug, From f8eca41f0afcc7042dc3398c8ba5878c59b1cf1b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:21 -0700 Subject: [PATCH 050/138] xen: measure how long spinlocks spend blocking Measure how long spinlocks spend blocked. Also rename some fields to be more consistent. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 60 ++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e6061f2e64f2..d072823bc06d 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -29,12 +29,14 @@ static struct xen_spinlock_stats u32 released_slow; u32 released_slow_kicked; -#define HISTO_BUCKETS 20 - u32 histo_spin_fast[HISTO_BUCKETS+1]; - u32 histo_spin[HISTO_BUCKETS+1]; +#define HISTO_BUCKETS 30 + u32 histo_spin_total[HISTO_BUCKETS+1]; + u32 histo_spin_spinning[HISTO_BUCKETS+1]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; - u64 spinning_time; - u64 total_time; + u64 time_total; + u64 time_spinning; + u64 time_blocked; } spinlock_stats; static u8 zero_stats; @@ -70,20 +72,28 @@ static void __spin_time_accum(u64 delta, u32 *array) array[HISTO_BUCKETS]++; } -static inline void spin_time_accum_fast(u64 start) +static inline void spin_time_accum_spinning(u64 start) { u32 delta = xen_clocksource_read() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin_fast); - spinlock_stats.spinning_time += delta; + __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); + spinlock_stats.time_spinning += delta; } -static inline void spin_time_accum(u64 start) +static inline void spin_time_accum_total(u64 start) { u32 delta = xen_clocksource_read() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin); - spinlock_stats.total_time += delta; + __spin_time_accum(delta, spinlock_stats.histo_spin_total); + spinlock_stats.time_total += delta; +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; } #else /* !CONFIG_XEN_DEBUG_FS */ #define TIMEOUT (1 << 10) @@ -94,10 +104,13 @@ static inline u64 spin_time_start(void) return 0; } -static inline void spin_time_accum_fast(u64 start) +static inline void spin_time_accum_total(u64 start) { } -static inline void spin_time_accum(u64 start) +static inline void spin_time_accum_spinning(u64 start) +{ +} +static inline void spin_time_accum_blocked(u64 start) { } #endif /* CONFIG_XEN_DEBUG_FS */ @@ -175,11 +188,14 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl int irq = __get_cpu_var(lock_kicker_irq); int ret; unsigned long flags; + u64 start; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) return 0; + start = spin_time_start(); + /* announce we're spinning */ prev = spinning_lock(xl); @@ -230,6 +246,8 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl out: raw_local_irq_restore(flags); unspinning_lock(xl, prev); + spin_time_accum_blocked(start); + return ret; } @@ -262,12 +280,12 @@ static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) : "1" (1) : "memory"); - spin_time_accum_fast(start_spin_fast); + spin_time_accum_spinning(start_spin_fast); } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); - spin_time_accum(start_spin); + spin_time_accum_total(start_spin); } static void xen_spin_lock(struct raw_spinlock *lock) @@ -385,14 +403,18 @@ static int __init xen_spinlock_debugfs(void) &spinlock_stats.released_slow_kicked); debugfs_create_u64("time_spinning", 0444, d_spin_debug, - &spinlock_stats.spinning_time); + &spinlock_stats.time_spinning); + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); debugfs_create_u64("time_total", 0444, d_spin_debug, - &spinlock_stats.total_time); + &spinlock_stats.time_total); xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin, HISTO_BUCKETS + 1); + spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_fast, HISTO_BUCKETS + 1); + spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); + xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); return 0; } From 5b792d320f28ff83dd4c13f984807e26235f7703 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 21 Aug 2008 13:43:51 -0700 Subject: [PATCH 051/138] x86, microcode_amd: fix shift warning microcode_amd.c uses ">> 32" on a 32-bit value, so gcc warns about that. The code could use something like this *untested* patch. linux-next-20080821/arch/x86/kernel/microcode_amd.c:229: warning: right shift count >= width of type Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 4006e5e3adf0..d606a05545ca 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -206,6 +206,7 @@ static void apply_microcode_amd(int cpu) unsigned int rev; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + unsigned long addr; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -215,10 +216,9 @@ static void apply_microcode_amd(int cpu) spin_lock_irqsave(µcode_update_lock, flags); - edx = (unsigned int)(((unsigned long) - &(uci->mc.mc_amd->hdr.data_code)) >> 32); - eax = (unsigned int)(((unsigned long) - &(uci->mc.mc_amd->hdr.data_code)) & 0xffffffffL); + addr = (unsigned long)&uci->mc.mc_amd->hdr.data_code; + edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); + eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); asm volatile("movl %0, %%ecx; wrmsr" : : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); From ee7686bc043921a488d9bf89fe93241c5457a74e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 21 Aug 2008 13:17:56 -0700 Subject: [PATCH 052/138] x86: build fix in "xen spinlock updates and performance measurements" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ingo Molnar wrote: > -tip testing found this build failure: > > arch/x86/xen/spinlock.c: In function ‘spin_time_start’: > arch/x86/xen/spinlock.c:60: error: implicit declaration of function ‘xen_clocksource_read’ > > i've excluded these new commits for now from tip/master - could you > please send a delta fix against tip/x86/xen? Make xen_clocksource_read non-static. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/time.c | 4 +--- arch/x86/xen/xen-ops.h | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 685b77470fc3..20182d9072c4 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -30,8 +30,6 @@ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) -static cycle_t xen_clocksource_read(void); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -213,7 +211,7 @@ unsigned long xen_tsc_khz(void) return xen_khz; } -static cycle_t xen_clocksource_read(void) +cycle_t xen_clocksource_read(void) { struct pvclock_vcpu_time_info *src; cycle_t ret; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3c70ebc50b1b..1e8bfdaa20d3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -2,6 +2,7 @@ #define XEN_OPS_H #include +#include #include #include @@ -33,6 +34,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); void __init xen_time_init(void); From 25258ef762bc4a05fa9c4523f7dae56e3fd01864 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 11:31:07 -0700 Subject: [PATCH 053/138] x86: export pv_lock_ops non-GPL None of the spinlock API is exported GPL, so there's no reason for pv_lock_ops to be. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Cc: drago01 --- arch/x86/kernel/paravirt-spinlocks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 206359a8e43f..0e9f1982b1dd 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -23,7 +23,7 @@ struct pv_lock_ops pv_lock_ops = { .spin_unlock = __ticket_spin_unlock, #endif }; -EXPORT_SYMBOL_GPL(pv_lock_ops); +EXPORT_SYMBOL(pv_lock_ops); void __init paravirt_use_bytelocks(void) { From 93be71b672f167b1e8c23725114f86305354f0ac Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:11 +0100 Subject: [PATCH 054/138] x86: add cpu hotplug hooks into smp_ops Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 4 ++-- arch/x86/kernel/process_64.c | 4 ++-- arch/x86/kernel/smp.c | 6 +++++- arch/x86/kernel/smpboot.c | 8 ++++---- include/asm-x86/smp.h | 28 ++++++++++++++++++++++++---- 5 files changed, 37 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 3b7a1ddcc0bc..e382fe0ccd66 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -91,7 +91,7 @@ static void cpu_exit_clear(void) } /* We don't actually take CPU down, just spin without interrupts. */ -static inline void play_dead(void) +void native_play_dead(void) { /* This must be done before dead CPU ack */ cpu_exit_clear(); @@ -107,7 +107,7 @@ static inline void play_dead(void) wbinvd_halt(); } #else -static inline void play_dead(void) +void native_play_dead(void) { BUG(); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 71553b664e2a..dfd3f5752085 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -90,7 +90,7 @@ DECLARE_PER_CPU(int, cpu_state); #include /* We halt the CPU with physical CPU hotplug */ -static inline void play_dead(void) +void native_play_dead(void) { idle_task_exit(); mb(); @@ -102,7 +102,7 @@ static inline void play_dead(void) wbinvd_halt(); } #else -static inline void play_dead(void) +void native_play_dead(void) { BUG(); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 361b7a4c640c..18f9b19f5f8f 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, - .cpu_up = native_cpu_up, .smp_cpus_done = native_smp_cpus_done, .smp_send_stop = native_smp_send_stop, .smp_send_reschedule = native_smp_send_reschedule, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7985c5b3f916..c414cee296ba 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1346,7 +1346,7 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } -int __cpu_disable(void) +int native_cpu_disable(void) { int cpu = smp_processor_id(); @@ -1385,7 +1385,7 @@ int __cpu_disable(void) return 0; } -void __cpu_die(unsigned int cpu) +void native_cpu_die(unsigned int cpu) { /* We don't do anything here: idle task is faking death itself. */ unsigned int i; @@ -1403,12 +1403,12 @@ void __cpu_die(unsigned int cpu) printk(KERN_ERR "CPU %u didn't die...\n", cpu); } #else /* ... !CONFIG_HOTPLUG_CPU */ -int __cpu_disable(void) +int native_cpu_disable(void) { return -ENOSYS; } -void __cpu_die(unsigned int cpu) +void native_cpu_die(unsigned int cpu) { /* We said "no" in __cpu_disable */ BUG(); diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 3c877f74f279..dbf4249e2a6d 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -47,12 +47,16 @@ extern struct { struct smp_ops { void (*smp_prepare_boot_cpu)(void); void (*smp_prepare_cpus)(unsigned max_cpus); - int (*cpu_up)(unsigned cpu); void (*smp_cpus_done)(unsigned max_cpus); void (*smp_send_stop)(void); void (*smp_send_reschedule)(int cpu); + int (*cpu_up)(unsigned cpu); + int (*cpu_disable)(void); + void (*cpu_die)(unsigned int cpu); + void (*play_dead)(void); + void (*send_call_func_ipi)(cpumask_t mask); void (*send_call_func_single_ipi)(int cpu); }; @@ -91,6 +95,21 @@ static inline int __cpu_up(unsigned int cpu) return smp_ops.cpu_up(cpu); } +static inline int __cpu_disable(void) +{ + return smp_ops.cpu_disable(); +} + +static inline void __cpu_die(unsigned int cpu) +{ + smp_ops.cpu_die(cpu); +} + +static inline void play_dead(void) +{ + smp_ops.play_dead(); +} + static inline void smp_send_reschedule(int cpu) { smp_ops.smp_send_reschedule(cpu); @@ -110,12 +129,13 @@ void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); int native_cpu_up(unsigned int cpunum); +int native_cpu_disable(void); +void native_cpu_die(unsigned int cpu); +void native_play_dead(void); + void native_send_call_func_ipi(cpumask_t mask); void native_send_call_func_single_ipi(int cpu); -extern int __cpu_disable(void); -extern void __cpu_die(unsigned int cpu); - void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) From 379002586368ae22916f668011c9118c8ce8189c Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:12 +0100 Subject: [PATCH 055/138] x86_32: clean up play_dead The removal of the CPU from the various maps was redundant as it already happened in cpu_disable. After cleaning this up, cpu_uninit only resets the tlb state, so rename it and create a noop version for the X86_64 case (so the two play_deads can be unified later). Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 6 +----- arch/x86/kernel/process_32.c | 17 ++++------------- include/asm-x86/smp.h | 7 ++++++- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa39..18f5551b32dd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -714,14 +714,10 @@ void __cpuinit cpu_init(void) mxcsr_feature_mask_init(); } -#ifdef CONFIG_HOTPLUG_CPU -void __cpuinit cpu_uninit(void) +void reset_lazy_tlbstate(void) { int cpu = raw_smp_processor_id(); - cpu_clear(cpu, cpu_initialized); - /* lazy TLB state */ per_cpu(cpu_tlbstate, cpu).state = 0; per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; } -#endif diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index e382fe0ccd66..d55eb49584b1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -75,26 +75,17 @@ unsigned long thread_saved_pc(struct task_struct *tsk) #ifdef CONFIG_HOTPLUG_CPU #include -static void cpu_exit_clear(void) +/* We don't actually take CPU down, just spin without interrupts. */ +void native_play_dead(void) { int cpu = raw_smp_processor_id(); idle_task_exit(); - cpu_uninit(); + reset_lazy_tlbstate(); + irq_ctx_exit(cpu); - cpu_clear(cpu, cpu_callout_map); - cpu_clear(cpu, cpu_callin_map); - - numa_remove_cpu(cpu); -} - -/* We don't actually take CPU down, just spin without interrupts. */ -void native_play_dead(void) -{ - /* This must be done before dead CPU ack */ - cpu_exit_clear(); mb(); /* Ack it */ __get_cpu_var(cpu_state) = CPU_DEAD; diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index dbf4249e2a6d..3bec6b478213 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -221,7 +221,12 @@ static inline int hard_smp_processor_id(void) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_HOTPLUG_CPU -extern void cpu_uninit(void); +#ifdef CONFIG_X86_32 +extern void reset_lazy_tlbstate(void); +#else +static inline void reset_lazy_tlbstate(void) +{ } +#endif /* CONFIG_X86_32 */ #endif #endif /* __ASSEMBLY__ */ From a21f5d88c17a40941f6239d1959d89e8493e8e01 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:13 +0100 Subject: [PATCH 056/138] x86: unify x86_32 and x86_64 play_dead into one function Add the new play_dead into smpboot.c, as it fits more cleanly in there alongside other CONFIG_HOTPLUG functions. Separate out the common code into its own function. Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 32 -------------------------------- arch/x86/kernel/process_64.c | 23 ----------------------- arch/x86/kernel/smpboot.c | 29 +++++++++++++++++++++++++++++ include/asm-x86/smp.h | 1 + 4 files changed, 30 insertions(+), 55 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d55eb49584b1..633cf06578fa 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -72,38 +72,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } -#ifdef CONFIG_HOTPLUG_CPU -#include - -/* We don't actually take CPU down, just spin without interrupts. */ -void native_play_dead(void) -{ - int cpu = raw_smp_processor_id(); - - idle_task_exit(); - - reset_lazy_tlbstate(); - - irq_ctx_exit(cpu); - - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - /* - * With physical CPU hotplug, we should halt the cpu - */ - local_irq_disable(); - /* mask all interrupts, flush any and all caches, and halt */ - wbinvd_halt(); -} -#else -void native_play_dead(void) -{ - BUG(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index dfd3f5752085..aa28ed01cdf3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -85,29 +85,6 @@ void exit_idle(void) __exit_idle(); } -#ifdef CONFIG_HOTPLUG_CPU -DECLARE_PER_CPU(int, cpu_state); - -#include -/* We halt the CPU with physical CPU hotplug */ -void native_play_dead(void) -{ - idle_task_exit(); - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - local_irq_disable(); - /* mask all interrupts, flush any and all caches, and halt */ - wbinvd_halt(); -} -#else -void native_play_dead(void) -{ - BUG(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c414cee296ba..28b4287296aa 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1402,6 +1402,29 @@ void native_cpu_die(unsigned int cpu) } printk(KERN_ERR "CPU %u didn't die...\n", cpu); } + +void play_dead_common(void) +{ + idle_task_exit(); + reset_lazy_tlbstate(); + irq_ctx_exit(raw_smp_processor_id()); + + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* + * With physical CPU hotplug, we should halt the cpu + */ + local_irq_disable(); +} + +void native_play_dead(void) +{ + play_dead_common(); + wbinvd_halt(); +} + #else /* ... !CONFIG_HOTPLUG_CPU */ int native_cpu_disable(void) { @@ -1413,4 +1436,10 @@ void native_cpu_die(unsigned int cpu) /* We said "no" in __cpu_disable */ BUG(); } + +void native_play_dead(void) +{ + BUG(); +} + #endif diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 3bec6b478213..17305e72f348 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -132,6 +132,7 @@ int native_cpu_up(unsigned int cpunum); int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); void native_play_dead(void); +void play_dead_common(void); void native_send_call_func_ipi(cpumask_t mask); void native_send_call_func_single_ipi(int cpu); From 8227dce7dc2cfdcc28ee0eadfb482a7ee77fba03 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:14 +0100 Subject: [PATCH 057/138] x86: separate generic cpu disabling code from APIC writes in cpu_disable It allows paravirt implementations of cpu_disable to share the cpu_disable_common code, without having to take on board APIC writes, which may not be appropriate. Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 40 ++++++++++++++++++++++----------------- include/asm-x86/smp.h | 1 + 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 28b4287296aa..66b04e598817 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1346,6 +1346,28 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } +void cpu_disable_common(void) +{ + int cpu = smp_processor_id(); + /* + * HACK: + * Allow any queued timer interrupts to get serviced + * This is only a temporary solution until we cleanup + * fixup_irqs as we do for IA64. + */ + local_irq_enable(); + mdelay(1); + + local_irq_disable(); + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + lock_vector_lock(); + remove_cpu_from_maps(cpu); + unlock_vector_lock(); + fixup_irqs(cpu_online_map); +} + int native_cpu_disable(void) { int cpu = smp_processor_id(); @@ -1365,23 +1387,7 @@ int native_cpu_disable(void) stop_apic_nmi_watchdog(NULL); clear_local_APIC(); - /* - * HACK: - * Allow any queued timer interrupts to get serviced - * This is only a temporary solution until we cleanup - * fixup_irqs as we do for IA64. - */ - local_irq_enable(); - mdelay(1); - - local_irq_disable(); - remove_siblinginfo(cpu); - - /* It's now safe to remove this processor from the online map */ - lock_vector_lock(); - remove_cpu_from_maps(cpu); - unlock_vector_lock(); - fixup_irqs(cpu_online_map); + cpu_disable_common(); return 0; } diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 17305e72f348..8bdaa4a25f05 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -125,6 +125,7 @@ static inline void arch_send_call_function_ipi(cpumask_t mask) smp_ops.send_call_func_ipi(mask); } +void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); From d68d82afd4c88e25763b23cd9cd4974573a3706f Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:15 +0100 Subject: [PATCH 058/138] xen: implement CPU hotplugging Note the changes from 2.6.18-xen CPU hotplugging: A vcpu_down request from the remote admin via Xenbus both hotunplugs the CPU, and disables it by removing it from the cpu_present map, and removing its entry in /sys. A vcpu_up request from the remote admin only re-enables the CPU, and does not immediately bring the CPU up. A udev event is emitted, which can be caught by the user if he wishes to automatically re-up CPUs when available, or implement a more complex policy. Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 60 ++++++++++++++++++++------ arch/x86/xen/spinlock.c | 5 +++ arch/x86/xen/time.c | 8 ++++ arch/x86/xen/xen-ops.h | 6 +++ drivers/xen/Makefile | 2 +- drivers/xen/cpu_hotplug.c | 90 +++++++++++++++++++++++++++++++++++++++ drivers/xen/events.c | 4 ++ 7 files changed, 162 insertions(+), 13 deletions(-) create mode 100644 drivers/xen/cpu_hotplug.c diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index baca7f2fbd8a..be5cbb2b7c60 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -11,8 +11,6 @@ * useful topology information for the kernel to make use of. As a * result, all CPUs are treated as if they're single-core and * single-threaded. - * - * This does not handle HOTPLUG_CPU yet. */ #include #include @@ -61,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static __cpuinit void cpu_bringup_and_idle(void) +static __cpuinit void cpu_bringup(void) { int cpu = smp_processor_id(); cpu_init(); + touch_softlockup_watchdog(); preempt_disable(); xen_enable_sysenter(); @@ -86,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void) local_irq_enable(); wmb(); /* make sure everything is out */ +} + +static __cpuinit void cpu_bringup_and_idle(void) +{ + cpu_bringup(); cpu_idle(); } @@ -209,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) cpu_set(cpu, cpu_present_map); } - - //init_xenbus_allowed_cpumask(); } static __cpuinit int @@ -278,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) struct task_struct *idle = idle_task(cpu); int rc; -#if 0 - rc = cpu_up_check(cpu); - if (rc) - return rc; -#endif - #ifdef CONFIG_X86_64 /* Allocate node local memory for AP pdas */ WARN_ON(cpu == 0); @@ -336,6 +332,42 @@ static void xen_smp_cpus_done(unsigned int max_cpus) { } +int xen_cpu_disable(void) +{ + unsigned int cpu = smp_processor_id(); + if (cpu == 0) + return -EBUSY; + + cpu_disable_common(); + + load_cr3(swapper_pg_dir); + return 0; +} + +void xen_cpu_die(unsigned int cpu) +{ + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + + if (num_online_cpus() == 1) + alternatives_smp_switch(0); +} + +void xen_play_dead(void) +{ + play_dead_common(); + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + cpu_bringup(); +} + static void stop_self(void *v) { int cpu = smp_processor_id(); @@ -419,9 +451,13 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) static const struct smp_ops xen_smp_ops __initdata = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, - .cpu_up = xen_cpu_up, .smp_cpus_done = xen_smp_cpus_done, + .cpu_up = xen_cpu_up, + .cpu_die = xen_cpu_die, + .cpu_disable = xen_cpu_disable, + .play_dead = xen_play_dead, + .smp_send_stop = xen_smp_send_stop, .smp_send_reschedule = xen_smp_send_reschedule, diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index d072823bc06d..dd71e3a021cd 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -357,6 +357,11 @@ void __cpuinit xen_init_lock_cpu(int cpu) printk("cpu %d spinlock event irq %d\n", cpu, irq); } +void xen_uninit_lock_cpu(int cpu) +{ + unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); +} + void __init xen_init_spinlocks(void) { pv_lock_ops.spin_is_locked = xen_spin_is_locked; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 20182d9072c4..004ba86326ae 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -450,6 +450,14 @@ void xen_setup_timer(int cpu) setup_runstate_info(cpu); } +void xen_teardown_timer(int cpu) +{ + struct clock_event_device *evt; + BUG_ON(cpu == 0); + evt = &per_cpu(xen_clock_events, cpu); + unbind_from_irqhandler(evt->irq, NULL); +} + void xen_setup_cpu_clockevents(void) { BUG_ON(preemptible()); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1e8bfdaa20d3..8dbd97fd7f18 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -34,6 +34,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); @@ -50,11 +51,16 @@ void xen_mark_init_mm_pinned(void); void __init xen_setup_vcpu_info_placement(void); +void xen_play_dead(void); +void xen_cpu_die(unsigned int cpu); +int xen_cpu_disable(void); + #ifdef CONFIG_SMP void xen_smp_init(void); void __init xen_init_spinlocks(void); __cpuinit void xen_init_lock_cpu(int cpu); +void xen_uninit_lock_cpu(int cpu); extern cpumask_t xen_cpu_initialized_map; #else diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 363286c54290..f62d8df27696 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,4 +1,4 @@ -obj-y += grant-table.o features.o events.o manage.o +obj-y += grant-table.o features.o events.o manage.o cpu_hotplug.o obj-y += xenbus/ obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c new file mode 100644 index 000000000000..1bc003536cdb --- /dev/null +++ b/drivers/xen/cpu_hotplug.c @@ -0,0 +1,90 @@ +#include + +#include + +#include +#include + +static void enable_hotplug_cpu(int cpu) +{ + if (!cpu_present(cpu)) + arch_register_cpu(cpu); + + cpu_set(cpu, cpu_present_map); +} + +static void disable_hotplug_cpu(int cpu) +{ + if (cpu_present(cpu)) + arch_unregister_cpu(cpu); + + cpu_clear(cpu, cpu_present_map); +} + +static void vcpu_hotplug(unsigned int cpu) +{ + int err; + char dir[32], state[32]; + + if (!cpu_possible(cpu)) + return; + + sprintf(dir, "cpu/%u", cpu); + err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); + if (err != 1) { + printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); + return; + } + + if (strcmp(state, "online") == 0) { + enable_hotplug_cpu(cpu); + } else if (strcmp(state, "offline") == 0) { + (void)cpu_down(cpu); + disable_hotplug_cpu(cpu); + } else { + printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", + state, cpu); + } +} + +static void handle_vcpu_hotplug_event(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + unsigned int cpu; + char *cpustr; + const char *node = vec[XS_WATCH_PATH]; + + cpustr = strstr(node, "cpu/"); + if (cpustr != NULL) { + sscanf(cpustr, "cpu/%u", &cpu); + vcpu_hotplug(cpu); + } +} + +static int setup_cpu_watcher(struct notifier_block *notifier, + unsigned long event, void *data) +{ + static struct xenbus_watch cpu_watch = { + .node = "cpu", + .callback = handle_vcpu_hotplug_event}; + + (void)register_xenbus_watch(&cpu_watch); + + return NOTIFY_DONE; +} + +static int __init setup_vcpu_hotplug_event(void) +{ + static struct notifier_block xsn_cpu = { + .notifier_call = setup_cpu_watcher }; + + if (!is_running_on_xen()) + return -ENODEV; + + register_xenstore_notifier(&xsn_cpu); + + return 0; +} + +arch_initcall(setup_vcpu_hotplug_event); + diff --git a/drivers/xen/events.c b/drivers/xen/events.c index b6c2b8f16bee..c3290bc186a0 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -360,6 +360,10 @@ static void unbind_from_irq(unsigned int irq) per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) [index_from_irq(irq)] = -1; break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; default: break; } From a57a5c2e8db8d80f460dcad77877895718c9f209 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 4 Sep 2008 01:03:02 +0400 Subject: [PATCH 059/138] x86 setup: remove remnants of CONFIG_VIDEO_SELECT (read: vga=) Impact: cleanup Video mode selection became always possible in 2.6.23-rc1 after i386 setup code rewrite in C. Regardless, VIDEO_SELECT is stupid config option because it affects only kernel setup code, not code which always stays in memory. vga= always possible now which is good. Signed-off-by: Alexey Dobriyan Signed-off-by: H. Peter Anvin --- arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - drivers/video/Kconfig | 2 -- drivers/video/console/Kconfig | 16 ---------------- 4 files changed, 20 deletions(-) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 9bc34e2033ec..a7ae4385670e 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1482,7 +1482,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 -CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y # CONFIG_FRAMEBUFFER_CONSOLE is not set CONFIG_LOGO=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index ae5124e064d4..743e80392731 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1449,7 +1449,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 -CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y # CONFIG_FRAMEBUFFER_CONSOLE is not set CONFIG_LOGO=y diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index 9b887ef64ff1..d58509a67068 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -678,7 +678,6 @@ config FB_VESA select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT - select VIDEO_SELECT help This is the frame buffer device driver for generic VESA 2.0 compliant graphic cards. The older VESA 1.2 cards are not supported. @@ -1583,7 +1582,6 @@ config FB_CYBLA tristate "Cyberblade/i1 support" depends on FB && PCI && X86_32 && !64BIT select FB_CFB_IMAGEBLIT - select VIDEO_SELECT ---help--- This driver is supposed to support the Trident Cyberblade/i1 graphics core integrated in the VIA VT8601A North Bridge, diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index 06f87b04f207..2f50a80b413e 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -43,22 +43,6 @@ config VGACON_SOFT_SCROLLBACK_SIZE buffer. Each 64KB will give you approximately 16 80x25 screenfuls of scrollback buffer -config VIDEO_SELECT - bool "Video mode selection support" - depends on X86 && VGA_CONSOLE - ---help--- - This enables support for text mode selection on kernel startup. If - you want to take advantage of some high-resolution text mode your - card's BIOS offers, but the traditional Linux utilities like - SVGATextMode don't, you can say Y here and set the mode using the - "vga=" option from your boot loader (lilo or loadlin) or set - "vga=ask" which brings up a video mode menu on kernel startup. (Try - "man bootparam" or see the documentation of your boot loader about - how to pass options to the kernel.) - - Read the file for more information - about the Video mode selection support. If unsure, say N. - config MDA_CONSOLE depends on !M68K && !PARISC && ISA tristate "MDA text console (dual-headed) (EXPERIMENTAL)" From 7f16a339787d45f997d67c1a4dea3c357f48e121 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 4 Sep 2008 06:19:45 -0700 Subject: [PATCH 060/138] x86: boot/compressed/Makefile: fix "make clean" The Kbuild variable "targets" is supposed to be configuration-independent and reflect "all possible targets". This is required to make "make clean" work properly. Therefore, move all manipulation of "targets" as well as custom rules out of the x86-32 ifdef statement. Only leave inside the ifdefs the things that are genuinely configuration-dependent. Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 92fdd35bd93e..1771c804e02f 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) -ifeq ($(CONFIG_X86_32),y) -targets += vmlinux.bin.all vmlinux.relocs -hostprogs-y := relocs +targets += vmlinux.bin.all vmlinux.relocs relocs +hostprogs-$(CONFIG_X86_32) += relocs quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< @@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@ $(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE $(call if_changed,relocbin) +ifeq ($(CONFIG_X86_32),y) + ifdef CONFIG_RELOCATABLE $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE $(call if_changed,gzip) @@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T endif - $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE $(call if_changed,ld) From ef1f3413284b9270266cb04a944647e59735f0f1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 5 Sep 2008 13:26:39 +0100 Subject: [PATCH 061/138] x86: ticket spin locks: fix asm constraints In addition to these changes I doubt the 'volatile' on all the ticket lock asm()-s are really necessary. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- include/asm-x86/spinlock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h index 93adae338ac6..acd9bdda55cf 100644 --- a/include/asm-x86/spinlock.h +++ b/include/asm-x86/spinlock.h @@ -101,7 +101,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) "1:" "sete %b1\n\t" "movzbl %b1,%0\n\t" - : "=&a" (tmp), "=Q" (new), "+m" (lock->slock) + : "=&a" (tmp), "=&Q" (new), "+m" (lock->slock) : : "memory", "cc"); @@ -146,7 +146,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) /* don't need lfence here, because loads are in-order */ "jmp 1b\n" "2:" - : "+Q" (inc), "+m" (lock->slock), "=r" (tmp) + : "+r" (inc), "+m" (lock->slock), "=&r" (tmp) : : "memory", "cc"); } @@ -166,7 +166,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) "1:" "sete %b1\n\t" "movzbl %b1,%0\n\t" - : "=&a" (tmp), "=r" (new), "+m" (lock->slock) + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) : : "memory", "cc"); From 08f5fcbe6e0ea029c7e9b1b1c338700ab7809daf Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 5 Sep 2008 13:26:39 +0100 Subject: [PATCH 062/138] x86: ticket spin locks: factor out more common code Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- include/asm-x86/spinlock.h | 42 +++++++++++++++----------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h index acd9bdda55cf..63d3b610a5ef 100644 --- a/include/asm-x86/spinlock.h +++ b/include/asm-x86/spinlock.h @@ -54,19 +54,7 @@ * much between them in performance though, especially as locks are out of line. */ #if (NR_CPUS < 256) -static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) -{ - int tmp = ACCESS_ONCE(lock->slock); - - return (((tmp >> 8) & 0xff) != (tmp & 0xff)); -} - -static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) -{ - int tmp = ACCESS_ONCE(lock->slock); - - return (((tmp >> 8) - tmp) & 0xff) > 1; -} +#define TICKET_SHIFT 8 static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) { @@ -116,19 +104,7 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) : "memory", "cc"); } #else -static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) -{ - int tmp = ACCESS_ONCE(lock->slock); - - return (((tmp >> 16) & 0xffff) != (tmp & 0xffff)); -} - -static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) -{ - int tmp = ACCESS_ONCE(lock->slock); - - return (((tmp >> 16) - tmp) & 0xffff) > 1; -} +#define TICKET_SHIFT 16 static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) { @@ -182,6 +158,20 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) } #endif +static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) +{ + int tmp = ACCESS_ONCE(lock->slock); + + return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); +} + +static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) +{ + int tmp = ACCESS_ONCE(lock->slock); + + return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; +} + #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) #ifdef CONFIG_PARAVIRT From 74e91604b2452c15bbe72d77b37cf47ed0310d13 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 5 Sep 2008 13:27:45 +0100 Subject: [PATCH 063/138] x86: ticket spin locks: reduce instruction dependencies Reduce the amount of partial register accesses in the NR_CPUS < 256 case, and slightly weaken resource dependencies in the other case. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- include/asm-x86/spinlock.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h index 63d3b610a5ef..b5a4551fd565 100644 --- a/include/asm-x86/spinlock.h +++ b/include/asm-x86/spinlock.h @@ -21,8 +21,10 @@ #ifdef CONFIG_X86_32 # define LOCK_PTR_REG "a" +# define REG_PTR_MODE "k" #else # define LOCK_PTR_REG "D" +# define REG_PTR_MODE "q" #endif #if defined(CONFIG_X86_32) && \ @@ -77,19 +79,17 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) { - int tmp; - short new; + int tmp, new; - asm volatile("movw %2,%w0\n\t" + asm volatile("movzwl %2, %0\n\t" "cmpb %h0,%b0\n\t" + "leal 0x100(%" REG_PTR_MODE "0), %1\n\t" "jne 1f\n\t" - "movw %w0,%w1\n\t" - "incb %h1\n\t" LOCK_PREFIX "cmpxchgw %w1,%2\n\t" "1:" "sete %b1\n\t" "movzbl %b1,%0\n\t" - : "=&a" (tmp), "=&Q" (new), "+m" (lock->slock) + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) : : "memory", "cc"); @@ -136,8 +136,8 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) "movl %0,%1\n\t" "roll $16, %0\n\t" "cmpl %0,%1\n\t" + "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t" "jne 1f\n\t" - "addl $0x00010000, %1\n\t" LOCK_PREFIX "cmpxchgl %1,%2\n\t" "1:" "sete %b1\n\t" From 5c51c637e489f3f4d58d51365b0c558549918858 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Wed, 3 Sep 2008 14:30:21 +0100 Subject: [PATCH 064/138] Xen: fix cpu_hotplug.c build by replacing is_running_on_xen() with xen_pv_domain() Signed-off-by: Alex Nixon Signed-off-by: Ingo Molnar --- drivers/xen/cpu_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 1bc003536cdb..565280ec1c6a 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -78,7 +78,7 @@ static int __init setup_vcpu_hotplug_event(void) static struct notifier_block xsn_cpu = { .notifier_call = setup_cpu_watcher }; - if (!is_running_on_xen()) + if (!xen_pv_domain()) return -ENODEV; register_xenstore_notifier(&xsn_cpu); From 5ab6d815dc23117cd9c5895cb9592824de3d4a68 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Wed, 3 Sep 2008 14:30:22 +0100 Subject: [PATCH 065/138] Xen: fix cpu_hotplug build when !CONFIG_SMP Compile cpu_hotplug.c conditionally on CONFIG_HOTPLUG_CPU Signed-off-by: Alex Nixon Signed-off-by: Ingo Molnar --- drivers/xen/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index f62d8df27696..d2a8fdf0e191 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,4 +1,5 @@ -obj-y += grant-table.o features.o events.o manage.o cpu_hotplug.o +obj-y += grant-table.o features.o events.o manage.o obj-y += xenbus/ +obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o From 913da64b54b2b3bb212a59aba2e6f2b8294ca1fa Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Wed, 3 Sep 2008 14:30:23 +0100 Subject: [PATCH 066/138] x86: build fix for !CONFIG_SMP Move reset_lazy_tlbstate into tlb_32.c, and define noop versions of play_dead() in process_{32,64}.c when !CONFIG_SMP. Signed-off-by: Alex Nixon Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 7 ------- arch/x86/kernel/process_32.c | 7 +++++++ arch/x86/kernel/process_64.c | 7 +++++++ arch/x86/kernel/tlb_32.c | 8 ++++++++ include/asm-x86/smp.h | 9 --------- include/asm-x86/tlbflush.h | 10 ++++++++++ 6 files changed, 32 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 18f5551b32dd..76d10d5c2fa7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -714,10 +714,3 @@ void __cpuinit cpu_init(void) mxcsr_feature_mask_init(); } -void reset_lazy_tlbstate(void) -{ - int cpu = raw_smp_processor_id(); - - per_cpu(cpu_tlbstate, cpu).state = 0; - per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; -} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 633cf06578fa..b76b38ff962b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -72,6 +72,13 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index aa28ed01cdf3..ec27afa43d7e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -85,6 +85,13 @@ void exit_idle(void) __exit_idle(); } +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index fec1ecedc9b7..e00534b33534 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -241,3 +241,11 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1); } +void reset_lazy_tlbstate(void) +{ + int cpu = raw_smp_processor_id(); + + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} + diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 8bdaa4a25f05..30b5146cc436 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -222,14 +222,5 @@ static inline int hard_smp_processor_id(void) #endif /* CONFIG_X86_LOCAL_APIC */ -#ifdef CONFIG_HOTPLUG_CPU -#ifdef CONFIG_X86_32 -extern void reset_lazy_tlbstate(void); -#else -static inline void reset_lazy_tlbstate(void) -{ } -#endif /* CONFIG_X86_32 */ -#endif - #endif /* __ASSEMBLY__ */ #endif diff --git a/include/asm-x86/tlbflush.h b/include/asm-x86/tlbflush.h index 35c76ceb9f40..0e7bbb549116 100644 --- a/include/asm-x86/tlbflush.h +++ b/include/asm-x86/tlbflush.h @@ -119,6 +119,10 @@ static inline void native_flush_tlb_others(const cpumask_t *cpumask, { } +static inline void reset_lazy_tlbstate(void) +{ +} + #else /* SMP */ #include @@ -151,6 +155,12 @@ struct tlb_state { char __cacheline_padding[L1_CACHE_BYTES-8]; }; DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); + +void reset_lazy_tlbstate(void); +#else +static inline void reset_lazy_tlbstate(void) +{ +} #endif #endif /* SMP */ From d2f37384fc9957ad0162d5285a5660f0a86ef243 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 5 Sep 2008 21:28:27 -0700 Subject: [PATCH 067/138] x86: when building image.iso, use isohybrid if it exists When building image.iso (make isoimage), use the isohybrid tool if it exists. isohybrid is a script included with Syslinux 3.72 and higher, which creates an image that can be booted either as a hard disk (including removable, e.g. USB disk) or as a CD-ROM. If isohybrid doesn't exist, then this has no effect. Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 7ee102f9c4f8..cceba1f46365 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -181,6 +181,7 @@ isoimage: $(BOOTIMAGE) mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ -no-emul-boot -boot-load-size 4 -boot-info-table \ $(obj)/isoimage + isohybrid $(obj)/image.iso 2>/dev/null || true rm -rf $(obj)/isoimage zlilo: $(BOOTIMAGE) From bb57925f5057837c248b57a51181fd6b138a32f3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Sep 2008 16:26:55 -0700 Subject: [PATCH 068/138] x86_32: signal: use syscall_get_nr and error Use asm/syscall.h interfaces that do the same things. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index b21070ea33a4..3e4a688bb84f 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "sigframe.h" @@ -507,9 +508,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ - switch (regs->ax) { + switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; @@ -623,9 +624,9 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ - switch (regs->ax) { + switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: From 72fa50f4ef9014f4212945b766af84ea94308903 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Sep 2008 16:27:11 -0700 Subject: [PATCH 069/138] x86_32: signal: introduce signal_fault() implement signal_fault() for 32bit. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 18 +++++++++++++++++- include/asm-x86/ptrace.h | 4 ++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 3e4a688bb84f..76d05d703845 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -243,7 +243,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) return ax; badframe: - force_sig(SIGSEGV, current); + signal_fault(regs, frame, "rt sigreturn"); return 0; } @@ -669,3 +669,19 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) clear_thread_flag(TIF_IRET); } + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +{ + struct task_struct *me = current; + + if (show_unhandled_signals && printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk(KERN_CONT "\n"); + } + + force_sig(SIGSEGV, me); +} diff --git a/include/asm-x86/ptrace.h b/include/asm-x86/ptrace.h index a33f027dbbea..fad807769910 100644 --- a/include/asm-x86/ptrace.h +++ b/include/asm-x86/ptrace.h @@ -144,10 +144,10 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); #ifdef CONFIG_X86_32 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); -#else -void signal_fault(struct pt_regs *regs, void __user *frame, char *where); #endif +void signal_fault(struct pt_regs *regs, void __user *frame, char *where); + extern long syscall_trace_enter(struct pt_regs *); extern void syscall_trace_leave(struct pt_regs *); From 8fcd8e20f388d787f6abf701b11037b122029d5b Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Sep 2008 16:27:39 -0700 Subject: [PATCH 070/138] x86: signal: make NR_restart_syscall make NR_restart_syscall macro for cosmetic unification of handle_signal(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 3 ++- arch/x86/kernel/signal_64.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 76d05d703845..bd9b65031a9a 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -572,6 +572,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, return 0; } +#define NR_restart_syscall __NR_restart_syscall /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -635,7 +636,7 @@ static void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = __NR_restart_syscall; + regs->ax = NR_restart_syscall; regs->ip -= 2; break; } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 823a55bf8c39..19e2b9143205 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -362,6 +362,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, return ret; } +#define NR_restart_syscall \ + test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -423,9 +425,7 @@ static void do_signal(struct pt_regs *regs) regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: - regs->ax = test_thread_flag(TIF_IA32) ? - __NR_ia32_restart_syscall : - __NR_restart_syscall; + regs->ax = NR_restart_syscall; regs->ip -= 2; break; } From 1d13024e624d0e2e47f117b383917249a41ef5ce Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Sep 2008 16:28:06 -0700 Subject: [PATCH 071/138] x86: signal: split out frame setups Make setup_rt_frame() and split out frame setups from handle_signal(). This is for cosmetic unification of handle_signal(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 29 ++++++++++++++++++++--------- arch/x86/kernel/signal_64.c | 30 ++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index bd9b65031a9a..48982f7ce886 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -338,8 +338,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } static int -setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, - struct pt_regs *regs) +__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, + struct pt_regs *regs) { struct sigframe __user *frame; void __user *restorer; @@ -416,8 +416,8 @@ give_sigsegv: return -EFAULT; } -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *restorer; @@ -501,6 +501,21 @@ give_sigsegv: /* * OK, we're invoking a handler: */ +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int ret; + + /* Set up the stack frame */ + if (ka->sa.sa_flags & SA_SIGINFO) + ret = __setup_rt_frame(sig, ka, info, set, regs); + else + ret = __setup_frame(sig, ka, set, regs); + + return ret; +} + static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) @@ -537,11 +552,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = setup_frame(sig, ka, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); if (ret) return ret; diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 19e2b9143205..8fbdd23d5cc6 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -195,8 +195,8 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) return (void __user *)round_down(sp - size, 64); } -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *fp = NULL; @@ -280,6 +280,24 @@ give_sigsegv: /* * OK, we're invoking a handler */ +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int ret; + +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = ia32_setup_rt_frame(sig, ka, info, set, regs); + else + ret = ia32_setup_frame(sig, ka, set, regs); + } else +#endif + ret = __setup_rt_frame(sig, ka, info, set, regs); + + return ret; +} static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, @@ -317,14 +335,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { - if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = ia32_setup_frame(sig, ka, oldset, regs); - } else -#endif ret = setup_rt_frame(sig, ka, info, oldset, regs); if (ret == 0) { From 13ad7725e9955ac68fff670a039da99ab33e86a0 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Sep 2008 16:28:38 -0700 Subject: [PATCH 072/138] x86_32: signal: move signal number conversion to upper layer Bring signal number conversion in __setup_frame() and __setup_rt_frame() up into the common part setup_frame(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 48982f7ce886..b668efc18eae 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -344,7 +344,6 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct sigframe __user *frame; void __user *restorer; int err = 0; - int usig; void __user *fpstate = NULL; frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); @@ -352,13 +351,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; - - err = __put_user(usig, &frame->sig); + err = __put_user(sig, &frame->sig); if (err) goto give_sigsegv; @@ -422,7 +415,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct rt_sigframe __user *frame; void __user *restorer; int err = 0; - int usig; void __user *fpstate = NULL; frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); @@ -430,13 +422,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; - - err |= __put_user(usig, &frame->sig); + err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= copy_siginfo_to_user(&frame->info, info); @@ -482,7 +468,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; regs->ip = (unsigned long)ka->sa.sa_handler; - regs->ax = (unsigned long)usig; + regs->ax = (unsigned long)sig; regs->dx = (unsigned long)&frame->info; regs->cx = (unsigned long)&frame->uc; @@ -506,12 +492,19 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { int ret; + int usig; + + usig = current_thread_info()->exec_domain + && current_thread_info()->exec_domain->signal_invmap + && sig < 32 + ? current_thread_info()->exec_domain->signal_invmap[sig] + : sig; /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) - ret = __setup_rt_frame(sig, ka, info, set, regs); + ret = __setup_rt_frame(usig, ka, info, set, regs); else - ret = __setup_frame(sig, ka, set, regs); + ret = __setup_frame(usig, ka, set, regs); return ret; } From 464f04c9e9b3b1c4f5ffb89c51d8ba2a2034c846 Mon Sep 17 00:00:00 2001 From: Andrey Borzenkov Date: Sat, 6 Sep 2008 12:40:21 +0400 Subject: [PATCH 073/138] x86: fix ghost EDD devices in /sys again > This is regression but old enough. Apparently I had for whatever reasons > EDD turned off till recently. This is 2.6.27-rc5 just in case. > > In 2006 I fixed ghost devices due to buggy BIOS: > > http://marc.info/?l=linux-kernel&m=114087765422490&w=2 > > Later edd.S has been rewritten in C, and apparently this patch has been > lost: > > {pts/1}% ls /sys/firmware/edd > int13_dev80/ int13_dev84/ int13_dev88/ int13_dev8c/ > int13_dev81/ int13_dev85/ int13_dev89/ int13_dev8d/ > int13_dev82/ int13_dev86/ int13_dev8a/ int13_dev8e/ > int13_dev83/ int13_dev87/ int13_dev8b/ int13_dev8f/ > > But I have just a single disk. This is the same system BTW. Some BIOSes do not always set CF on error before return from int13. The patch adds additional check for status being zero (AH == 0). This was fixed for edd.S in http://marc.info/?l=linux-kernel&m=114087765422490&w=2, but lost again when edd.S was rewritten in C. Signed-off-by: Andrey Borzenkov Signed-off-by: Ingo Molnar --- arch/x86/boot/edd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index d93cbc6464d0..bf4ae6ff518e 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -32,7 +32,9 @@ static int read_mbr(u8 devno, void *buf) : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx) : : "esi", "edi", "memory"); - return -(u8)ax; /* 0 or -1 */ + /* Some BIOSes do not set carry flag on error but still return + * error in AH. The condition below is expected to catch both */ + return -!!ax; /* 0 or -1 */ } static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) From a19aac8548d85cf15d744335b8e82201da760d80 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 30 Aug 2008 06:03:34 +0400 Subject: [PATCH 074/138] x86: make setup_xstate_init() __init WARNING: vmlinux.o(.text+0x22453): Section mismatch in reference from the function setup_xstate_init() to the function .init.text:__alloc_bootmem() The function setup_xstate_init() references the function __init __alloc_bootmem(). This is often because setup_xstate_init lacks a __init annotation or the annotation of __alloc_bootmem is wrong. Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar --- arch/x86/kernel/xsave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 07713d64debe..ed5274a5bb0f 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -272,7 +272,7 @@ void __cpuinit xsave_init(void) /* * setup the xstate image representing the init state */ -void setup_xstate_init(void) +static void __init setup_xstate_init(void) { init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; From 30cec97967beb5aa08e893e8ff69623d5fecd9b7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 12:49:55 +0100 Subject: [PATCH 075/138] x86: init annotations in early_printk() setup Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 825e9136b026..02fc5b40c14b 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -120,7 +120,7 @@ static __init void early_serial_init(char *s) if (!strncmp(s, "0x", 2)) { early_serial_base = simple_strtoul(s, &e, 16); } else { - static int bases[] = { 0x3f8, 0x2f8 }; + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; if (!strncmp(s, "ttyS", 4)) s += 4; @@ -920,7 +920,7 @@ static struct console simnow_console = { /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; -static int early_console_initialized; +static int __initdata early_console_initialized; asmlinkage void early_printk(const char *fmt, ...) { From cc643d4687533345fd8ebcba836f9ee25df7c458 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 12:53:45 +0100 Subject: [PATCH 076/138] x86: adjust vmalloc_sync_all() for Xen (2nd try) Since the fourth PDPT entry cannot be shared under Xen, vmalloc_sync_all() must iterate over pmd-s rather than pgd-s here. Luckily, the code isn't used for native PAE (SHARED_KERNEL_PMD is 1) and the change is benign to non-PAE. Also do a little more cleanup in that function. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Cc: Jeremy Fitzhardinge --- arch/x86/mm/fault.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 455f3fe67b42..356ed2dec3a6 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -915,15 +915,15 @@ LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { -#ifdef CONFIG_X86_32 - unsigned long start = VMALLOC_START & PGDIR_MASK; unsigned long address; +#ifdef CONFIG_X86_32 if (SHARED_KERNEL_PMD) return; - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); - for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { unsigned long flags; struct page *page; @@ -936,10 +936,8 @@ void vmalloc_sync_all(void) spin_unlock_irqrestore(&pgd_lock, flags); } #else /* CONFIG_X86_64 */ - unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { const pgd_t *pgd_ref = pgd_offset_k(address); unsigned long flags; struct page *page; From 5394f80f92642c61fc2a95385be85f2fdcfb5adb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 01:51:32 -0700 Subject: [PATCH 077/138] x86: check for and defend against BIOS memory corruption Some BIOSes have been observed to corrupt memory in the low 64k. This change: - Reserves all memory which does not have to be in that area, to prevent it from being used as general memory by the kernel. Things like the SMP trampoline are still in the memory, however. - Clears the reserved memory so we can observe changes to it. - Adds a function check_for_bios_corruption() which checks and reports on memory becoming unexpectedly non-zero. Currently it's called in the x86 fault handler, and the powermanagement debug output. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 5 ++ arch/x86/Kconfig | 3 + arch/x86/kernel/setup.c | 87 +++++++++++++++++++++++++++++ arch/x86/mm/fault.c | 2 + drivers/base/power/main.c | 1 + include/linux/kernel.h | 12 ++++ 6 files changed, 110 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1150444a21ab..df48af505d15 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -360,6 +360,11 @@ and is between 256 and 4096 characters. It is defined in the file Format: ,, See header of drivers/net/hamradio/baycom_ser_hdx.c. + bios_corruption_check=0/1 [X86] + Some BIOSes seem to corrupt the first 64k of memory + when doing things like suspend/resume. Setting this + option will scan the memory looking for corruption. + boot_delay= Milliseconds to delay each printk during boot. Values larger than 10 seconds (10000) are changed to no delay (0). diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d1325..1bb52e2ca02e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -201,6 +201,9 @@ config X86_TRAMPOLINE depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP) default y +config X86_CHECK_BIOS_CORRUPTION + def_bool y + config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 362d4e7f2d38..ee89ebc5aabc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -578,6 +578,89 @@ static struct x86_quirks default_x86_quirks __initdata; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +#define MAX_SCAN_AREAS 8 +static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + +static void __init setup_bios_corruption_check(void) +{ + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + + while(addr < 0x10000 && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + + if (addr == 0) + break; + + if ((addr + size) > 0x10000) + size = 0x10000 - addr; + + if (size == 0) + break; + + e820_update_range(addr, size, E820_RAM, E820_RESERVED); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; + + /* Assume we've already mapped this early memory */ + memset(__va(addr), 0, size); + + addr += size; + } + + printk(KERN_INFO "scanning %d areas for BIOS corruption\n", + num_scan_areas); + update_e820(); +} + +static int __read_mostly bios_corruption_check = 1; + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!bios_corruption_check) + return; + + for(i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for(; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + if (corruption) + dump_stack(); +} + +static int set_bios_corruption_check(char *arg) +{ + char *end; + + bios_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("bios_corruption_check", set_bios_corruption_check); +#endif + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -750,6 +833,10 @@ void __init setup_arch(char **cmdline_p) high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION + setup_bios_corruption_check(); +#endif + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< Date: Sun, 7 Sep 2008 01:51:33 -0700 Subject: [PATCH 078/138] x86: add periodic corruption check Perodically check for corruption in low phusical memory. Don't bother checking at fault time, since it won't show anything useful. Signed-off-by: Hugh Dickins Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 17 +++++++++++++++++ arch/x86/mm/fault.c | 2 -- arch/x86/mm/init_32.c | 2 ++ arch/x86/mm/init_64.c | 2 ++ include/linux/kernel.h | 1 + 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ee89ebc5aabc..c239b3780973 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -623,6 +623,7 @@ static void __init setup_bios_corruption_check(void) } static int __read_mostly bios_corruption_check = 1; +static struct timer_list periodic_check_timer; void check_for_bios_corruption(void) { @@ -650,6 +651,22 @@ void check_for_bios_corruption(void) dump_stack(); } +static void periodic_check_for_corruption(unsigned long data) +{ + check_for_bios_corruption(); + mod_timer(&periodic_check_timer, jiffies + 60*HZ); +} + +void start_periodic_check_for_corruption(void) +{ + if (!bios_corruption_check) + return; + + init_timer(&periodic_check_timer); + periodic_check_timer.function = &periodic_check_for_corruption; + periodic_check_for_corruption(0); +} + static int set_bios_corruption_check(char *arg) { char *end; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 5140bdf03020..455f3fe67b42 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -848,8 +848,6 @@ no_context: * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ - check_for_bios_corruption(); - #ifdef CONFIG_X86_32 bust_spinlocks(1); #else diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d37f29376b0c..657a16ad61ba 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -907,6 +907,8 @@ void __init mem_init(void) int codesize, reservedpages, datasize, initsize; int tmp; + start_periodic_check_for_corruption(); + #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d3746efb060d..f4db5276fa21 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -769,6 +769,8 @@ void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; + start_periodic_check_for_corruption(); + pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8017129e6b63..00bb251c6451 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -246,6 +246,7 @@ extern int root_mountflags; * able to scatter it around anywhere in the kernel. */ void check_for_bios_corruption(void); +void start_periodic_check_for_corruption(void); #else static inline void check_for_bios_corruption(void) { From 9f077871ce7237e2387fc76542b3b4033cb05e49 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 01:51:34 -0700 Subject: [PATCH 079/138] x86: clean up memory corruption check and add more kernel parameters The corruption check is enabled in Kconfig by default, but disabled at runtime. This patch adds several kernel parameters to control the corruption check's behaviour; these are documented in kernel-parameters.txt. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 28 ++++++++-- arch/x86/Kconfig | 26 ++++++++-- arch/x86/kernel/setup.c | 80 +++++++++++++++++++++-------- 3 files changed, 106 insertions(+), 28 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index df48af505d15..6a2629d00598 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -360,11 +360,6 @@ and is between 256 and 4096 characters. It is defined in the file Format: ,, See header of drivers/net/hamradio/baycom_ser_hdx.c. - bios_corruption_check=0/1 [X86] - Some BIOSes seem to corrupt the first 64k of memory - when doing things like suspend/resume. Setting this - option will scan the memory looking for corruption. - boot_delay= Milliseconds to delay each printk during boot. Values larger than 10 seconds (10000) are changed to no delay (0). @@ -1233,6 +1228,29 @@ and is between 256 and 4096 characters. It is defined in the file or memmap=0x10000$0x18690000 + memory_corruption_check=0/1 [X86] + Some BIOSes seem to corrupt the first 64k of + memory when doing things like suspend/resume. + Setting this option will scan the memory + looking for corruption. Enabling this will + both detect corruption and prevent the kernel + from using the memory being corrupted. + However, its intended as a diagnostic tool; if + repeatable BIOS-originated corruption always + affects the same memory, you can use memmap= + to prevent the kernel from using that memory. + + memory_corruption_check_size=size [X86] + By default it checks for corruption in the low + 64k, making this memory unavailable for normal + use. Use this parameter to scan for + corruption in more or less memory. + + memory_corruption_check_period=seconds [X86] + By default it checks for corruption every 60 + seconds. Use this parameter to check at some + other rate. 0 disables periodic checking. + memtest= [KNL,X86] Enable memtest Format: range: 0,4 : pattern number diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1bb52e2ca02e..cbee4199689c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -201,9 +201,6 @@ config X86_TRAMPOLINE depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP) default y -config X86_CHECK_BIOS_CORRUPTION - def_bool y - config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" @@ -1062,6 +1059,29 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config X86_CHECK_BIOS_CORRUPTION + bool "Check for low memory corruption" + default y + help + Periodically check for memory corruption in low memory, which + is suspected to be caused by BIOS. Even when enabled in the + configuration, it is disabled at runtime. Enable it by + setting "memory_corruption_check=1" on the kernel command + line. By default it scans the low 64k of memory every 60 + seconds; see the memory_corruption_check_size and + memory_corruption_check_period parameters in + Documentation/kernel-parameters.txt to adjust this. + + When enabled with the default parameters, this option has + almost no overhead, as it reserves a relatively small amount + of memory and scans it infrequently. It both detects corruption + and prevents it from affecting the running system. + + It is, however, intended as a diagnostic tool; if repeatable + BIOS-originated corruption always affects the same memory, + you can use memmap= to prevent the kernel from using that + memory. + config MATH_EMULATION bool prompt "Math emulation" if X86_32 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c239b3780973..27ae91288855 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -586,22 +586,71 @@ struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; */ #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION #define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = 0; +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + static struct e820entry scan_areas[MAX_SCAN_AREAS]; static int num_scan_areas; + +static int set_corruption_check(char *arg) +{ + char *end; + + memory_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check", set_corruption_check); + +static int set_corruption_check_period(char *arg) +{ + char *end; + + corruption_check_period = simple_strtoul(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + static void __init setup_bios_corruption_check(void) { u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ - while(addr < 0x10000 && num_scan_areas < MAX_SCAN_AREAS) { + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { u64 size; addr = find_e820_area_size(addr, &size, PAGE_SIZE); if (addr == 0) break; - if ((addr + size) > 0x10000) - size = 0x10000 - addr; + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; if (size == 0) break; @@ -617,12 +666,11 @@ static void __init setup_bios_corruption_check(void) addr += size; } - printk(KERN_INFO "scanning %d areas for BIOS corruption\n", + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas); update_e820(); } -static int __read_mostly bios_corruption_check = 1; static struct timer_list periodic_check_timer; void check_for_bios_corruption(void) @@ -630,7 +678,7 @@ void check_for_bios_corruption(void) int i; int corruption = 0; - if (!bios_corruption_check) + if (!memory_corruption_check) return; for(i = 0; i < num_scan_areas; i++) { @@ -647,35 +695,27 @@ void check_for_bios_corruption(void) } } - if (corruption) - dump_stack(); + WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); } static void periodic_check_for_corruption(unsigned long data) { check_for_bios_corruption(); - mod_timer(&periodic_check_timer, jiffies + 60*HZ); + mod_timer(&periodic_check_timer, jiffies + corruption_check_period*HZ); } void start_periodic_check_for_corruption(void) { - if (!bios_corruption_check) + if (!memory_corruption_check || corruption_check_period == 0) return; + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + init_timer(&periodic_check_timer); periodic_check_timer.function = &periodic_check_for_corruption; periodic_check_for_corruption(0); } - -static int set_bios_corruption_check(char *arg) -{ - char *end; - - bios_corruption_check = simple_strtol(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("bios_corruption_check", set_bios_corruption_check); #endif /* From c885df50f571faf9fd9f395361cfff1b3a16e06e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 02:37:32 -0700 Subject: [PATCH 080/138] x86: default corruption check to off, but put parameter default in Kconfig Default the low memory corruption check to off, but make the default setting of the memory_corruption_check kernel parameter a config parameter. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 9 ++++++++- arch/x86/kernel/setup.c | 13 ++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cbee4199689c..7820d447bb8d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1061,7 +1061,6 @@ config HIGHPTE config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" - default y help Periodically check for memory corruption in low memory, which is suspected to be caused by BIOS. Even when enabled in the @@ -1082,6 +1081,14 @@ config X86_CHECK_BIOS_CORRUPTION you can use memmap= to prevent the kernel from using that memory. +config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + bool "Set the default setting of memory_corruption_check" + depends on X86_CHECK_BIOS_CORRUPTION + default y + help + Set whether the default state of memory_corruption_check is + on or off. + config MATH_EMULATION bool prompt "Math emulation" if X86_32 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 27ae91288855..ec7e56c1b984 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -587,7 +587,8 @@ struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION #define MAX_SCAN_AREAS 8 -static int __read_mostly memory_corruption_check = 0; +static int __read_mostly memory_corruption_check = -1; + static unsigned __read_mostly corruption_check_size = 64*1024; static unsigned __read_mostly corruption_check_period = 60; /* seconds */ @@ -634,6 +635,16 @@ static void __init setup_bios_corruption_check(void) { u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + if (corruption_check_size == 0) memory_corruption_check = 0; From 2737146b3aa2cf8b5d5ae87a18c49fe1c374528b Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 8 Sep 2008 13:43:33 +0100 Subject: [PATCH 081/138] x86, xen: fix build when !CONFIG_HOTPLUG_CPU Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index be5cbb2b7c60..bf51a466d62c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -332,6 +332,7 @@ static void xen_smp_cpus_done(unsigned int max_cpus) { } +#ifdef CONFIG_HOTPLUG_CPU int xen_cpu_disable(void) { unsigned int cpu = smp_processor_id(); @@ -368,6 +369,23 @@ void xen_play_dead(void) cpu_bringup(); } +#else /* !CONFIG_HOTPLUG_CPU */ +int xen_cpu_disable(void) +{ + return -ENOSYS; +} + +void xen_cpu_die(unsigned int cpu) +{ + BUG(); +} + +void xen_play_dead(void) +{ + BUG(); +} + +#endif static void stop_self(void *v) { int cpu = smp_processor_id(); From 26fd10517e810dd59ea050b052de24a75ee6dc07 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 8 Sep 2008 13:43:34 +0100 Subject: [PATCH 082/138] xen: make CPU hotplug functions static There's no need for these functions to be accessed from outside of xen/smp.c Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 12 ++++++------ arch/x86/xen/xen-ops.h | 4 ---- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index bf51a466d62c..d77da613b1d2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -333,7 +333,7 @@ static void xen_smp_cpus_done(unsigned int max_cpus) } #ifdef CONFIG_HOTPLUG_CPU -int xen_cpu_disable(void) +static int xen_cpu_disable(void) { unsigned int cpu = smp_processor_id(); if (cpu == 0) @@ -345,7 +345,7 @@ int xen_cpu_disable(void) return 0; } -void xen_cpu_die(unsigned int cpu) +static void xen_cpu_die(unsigned int cpu) { while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { current->state = TASK_UNINTERRUPTIBLE; @@ -362,7 +362,7 @@ void xen_cpu_die(unsigned int cpu) alternatives_smp_switch(0); } -void xen_play_dead(void) +static void xen_play_dead(void) { play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); @@ -370,17 +370,17 @@ void xen_play_dead(void) } #else /* !CONFIG_HOTPLUG_CPU */ -int xen_cpu_disable(void) +static int xen_cpu_disable(void) { return -ENOSYS; } -void xen_cpu_die(unsigned int cpu) +static void xen_cpu_die(unsigned int cpu) { BUG(); } -void xen_play_dead(void) +static void xen_play_dead(void) { BUG(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8dbd97fd7f18..d7422dc2a55c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -51,10 +51,6 @@ void xen_mark_init_mm_pinned(void); void __init xen_setup_vcpu_info_placement(void); -void xen_play_dead(void); -void xen_cpu_die(unsigned int cpu); -int xen_cpu_disable(void); - #ifdef CONFIG_SMP void xen_smp_init(void); From 9c3254ad42e7925c3a3907a072185273705bd7b2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 14:54:41 -0700 Subject: [PATCH 083/138] x86: fix compile error with corruption checking disabled Fix compile error: arch/x86/mm/init_32.c: In function 'mem_init': arch/x86/mm/init_32.c:908: error: implicit declaration of function 'start_periodic_check_for_corruption' Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 00bb251c6451..50873b211788 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -251,6 +251,10 @@ void start_periodic_check_for_corruption(void); static inline void check_for_bios_corruption(void) { } + +static inline void start_periodic_check_for_corruption(void) +{ +} #endif /* Values used for system_state */ From 43789e21638626d826c0e8d62e50ceb76b9d61ed Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 8 Sep 2008 14:43:11 -0700 Subject: [PATCH 084/138] kbuild: fix cc-option and cc-option-yn David Sanders wrote: > I'm getting this error: > as: unrecognized option `-mtune=generic32' > I have binutils 2.17. Use -c instead of -S in cc-option and cc-option-yn, so we can probe options related to the assembler. Signed-off-by: H. Peter Anvin Cc: Sam Ravnborg Cc: kbuild devel Signed-off-by: Ingo Molnar --- scripts/Kbuild.include | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index d64e6badc942..982dcae7bbe2 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -105,12 +105,12 @@ as-instr = $(call try-run,\ # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) cc-option = $(call try-run,\ - $(CC) $(KBUILD_CFLAGS) $(1) -S -xc /dev/null -o "$$TMP",$(1),$(2)) + $(CC) $(KBUILD_CFLAGS) $(1) -c -xc /dev/null -o "$$TMP",$(1),$(2)) # cc-option-yn # Usage: flag := $(call cc-option-yn,-march=winchip-c6) cc-option-yn = $(call try-run,\ - $(CC) $(KBUILD_CFLAGS) $(1) -S -xc /dev/null -o "$$TMP",y,n) + $(CC) $(KBUILD_CFLAGS) $(1) -c -xc /dev/null -o "$$TMP",y,n) # cc-option-align # Prefix align with either -falign or -malign From 28f7e66fc1da53997a545684b21b91fb3ca3f321 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 8 Sep 2008 12:01:48 -0700 Subject: [PATCH 085/138] x86: prevent binutils from being "smart" and generating NOPLs for us binutils, contrary to documented behaviour, will generate long NOPs (a P6-or-higher instruction which is broken on at least some VIA chips, Virtual PC/Virtual Server, and some versions of Qemu) depending on the -mtune= option, which is not supposed to change architectural behaviour. Pass an explicit override to the assembler, in case ends up passing the -mtune= parameter to gas (gcc 4.3.0 does not appear to.) Signed-off-by: H. Peter Anvin --- arch/x86/Makefile_32.cpu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index e372b584e919..b72b4f753113 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -45,3 +45,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) +# Bug fix for binutils: this option is required in order to keep +# binutils from generating NOPL instructions against our will. +ifneq ($(CONFIG_X86_P6_NOP),y) +cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,) +endif From b2994ef0de252d41f1511e5f87704bedda12dabc Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 9 Sep 2008 17:18:50 -0700 Subject: [PATCH 086/138] x86: signal_64.c: clean up signal_fault() clean up and make signal_fault() same as 32bit. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 8fbdd23d5cc6..552a331313ff 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -473,12 +473,14 @@ void do_notify_resume(struct pt_regs *regs, void *unused, void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; + if (show_unhandled_signals && printk_ratelimit()) { - printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", - me->comm, me->pid, where, frame, regs->ip, - regs->sp, regs->orig_ax); + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); - printk("\n"); + printk(KERN_CONT "\n"); } force_sig(SIGSEGV, me); From 0c40ed71736c20f447843b3c96b715bb9c4904d7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 9 Sep 2008 17:21:17 -0700 Subject: [PATCH 087/138] x86: signal_64.c: arg for restore_i387_xstate() is void __user * restore_i387_xstate() is declared as: int restore_i387_xstate(void __user *buf); so, make the variable buf void __user *. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 552a331313ff..321da93f2fb7 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -94,7 +94,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, } { - struct _fpstate __user *buf; + void __user *buf; + err |= __get_user(buf, &sc->fpstate); err |= restore_i387_xstate(buf); } From 764e8d128f9343027cf09afe8a145e8ff186e129 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 9 Sep 2008 17:22:12 -0700 Subject: [PATCH 088/138] x86: signal_64.c: make handle_signal() similar Make handle_signal() same as 32bit. No change in functionality intended. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 57 +++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 321da93f2fb7..43e6862fc7a2 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -338,39 +338,40 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret == 0) { - /* - * This has nothing to do with segment registers, - * despite the name. This magic affects uaccess.h - * macros' behavior. Reset it to the normal setting. - */ - set_fs(USER_DS); + if (ret) + return ret; - /* - * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~X86_EFLAGS_TF; + /* + * Clear the direction flag as per the ABI for function entry. + */ + regs->flags &= ~X86_EFLAGS_DF; - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->flags &= ~X86_EFLAGS_TF; - tracehook_signal_handler(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - } + spin_lock_irq(¤t->sighand->siglock); + sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(¤t->blocked, sig); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); - return ret; + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); + + return 0; } #define NR_restart_syscall \ From f7d0b926ac8c8ec0c7a83ee69409bd2e6bb39f81 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 9 Sep 2008 15:43:22 -0700 Subject: [PATCH 089/138] mm: define USE_SPLIT_PTLOCKS rather than repeating expression Define USE_SPLIT_PTLOCKS as a constant expression rather than repeating "NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS" all over the place. Signed-off-by: Jeremy Fitzhardinge Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 2 +- include/linux/mm.h | 6 +++--- include/linux/mm_types.h | 10 ++++++---- include/linux/sched.h | 6 +++--- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469da696..2e1b64088490 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -646,7 +646,7 @@ static spinlock_t *lock_pte(struct page *page) { spinlock_t *ptl = NULL; -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#if USE_SPLIT_PTLOCKS ptl = __pte_lockptr(page); spin_lock(ptl); #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 72a15dc26bbf..4194bf8e4f6c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -919,7 +919,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#if USE_SPLIT_PTLOCKS /* * We tuck a spinlock to guard each pagetable page into its struct page, * at page->private, with BUILD_BUG_ON to make sure that this will not @@ -932,14 +932,14 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } while (0) #define pte_lock_deinit(page) ((page)->mapping = NULL) #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) -#else +#else /* !USE_SPLIT_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ #define pte_lock_init(page) do {} while (0) #define pte_lock_deinit(page) do {} while (0) #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) -#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +#endif /* USE_SPLIT_PTLOCKS */ static inline void pgtable_page_ctor(struct page *page) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bf334138c7c1..9d49fa36bbef 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -21,11 +21,13 @@ struct address_space; -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) + +#if USE_SPLIT_PTLOCKS typedef atomic_long_t mm_counter_t; -#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +#else /* !USE_SPLIT_PTLOCKS */ typedef unsigned long mm_counter_t; -#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +#endif /* !USE_SPLIT_PTLOCKS */ /* * Each physical page in the system has a struct page associated with @@ -65,7 +67,7 @@ struct page { * see PAGE_MAPPING_ANON below. */ }; -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#if USE_SPLIT_PTLOCKS spinlock_t ptl; #endif struct kmem_cache *slab; /* SLUB: Pointer to slab */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d9120c5ad15..272c35309df2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -352,7 +352,7 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, extern void arch_unmap_area(struct mm_struct *, unsigned long); extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#if USE_SPLIT_PTLOCKS /* * The mm counters are not protected by its page_table_lock, * so must be incremented atomically. @@ -363,7 +363,7 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) -#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +#else /* !USE_SPLIT_PTLOCKS */ /* * The mm counters are protected by its page_table_lock, * so can be incremented directly. @@ -374,7 +374,7 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define inc_mm_counter(mm, member) (mm)->_##member++ #define dec_mm_counter(mm, member) (mm)->_##member-- -#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +#endif /* !USE_SPLIT_PTLOCKS */ #define get_mm_rss(mm) \ (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) From 6a9e91846bf52cc70a0417de19fdfac224c435c4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 9 Sep 2008 15:43:25 -0700 Subject: [PATCH 090/138] xen: fix pinning when not using split pte locks We only pin PTE pages when using split PTE locks, so don't do the pin/unpin when attaching/detaching pte pages to a pinned pagetable. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b106e825d266..8ca2f88bde1e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -826,7 +826,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else /* make sure there are no stray mappings of @@ -894,7 +894,7 @@ static void xen_release_ptpage(u32 pfn, unsigned level) if (PagePinned(page)) { if (!PageHighMem(page)) { - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } From a0a29b62a9cac6b7d83b7514679f2ed8d33d4372 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Thu, 11 Sep 2008 23:27:52 +0200 Subject: [PATCH 091/138] x86, microcode rework, v2 this is a rework of the microcode splitup in tip/x86/microcode (1) I think this new interface is cleaner (look at the changes in 'struct microcode_ops' in microcode.h); (2) it's -64 lines of code; Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 86 ++++---- arch/x86/kernel/microcode_amd.c | 329 ++++++++++++++---------------- arch/x86/kernel/microcode_intel.c | 220 +++++++++----------- include/asm-x86/microcode.h | 17 +- 4 files changed, 294 insertions(+), 358 deletions(-) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index b2f84ce5eed3..902dada2eb6d 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -110,50 +110,28 @@ struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); #ifdef CONFIG_MICROCODE_OLD_INTERFACE -void __user *user_buffer; /* user area microcode data buffer */ -EXPORT_SYMBOL_GPL(user_buffer); -unsigned int user_buffer_size; /* it's size */ -EXPORT_SYMBOL_GPL(user_buffer_size); - -static int do_microcode_update(void) +static int do_microcode_update(const void __user *buf, size_t size) { - long cursor = 0; - int error = 0; - void *new_mc = NULL; - int cpu; cpumask_t old; + int error = 0; + int cpu; old = current->cpus_allowed; - while ((cursor = microcode_ops->get_next_ucode(&new_mc, cursor)) > 0) { - if (microcode_ops->microcode_sanity_check != NULL) - error = microcode_ops->microcode_sanity_check(new_mc); - if (error) - goto out; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (!uci->valid) - continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->get_matching_microcode(new_mc, - cpu); - if (error < 0) - goto out; - if (error == 1) - microcode_ops->apply_microcode(cpu); - } - vfree(new_mc); + if (!uci->valid) + continue; + + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + error = microcode_ops->request_microcode_user(cpu, buf, size); + if (error < 0) + goto out; + if (!error) + microcode_ops->apply_microcode(cpu); } out: - if (cursor > 0) - vfree(new_mc); - if (cursor < 0) - error = cursor; set_cpus_allowed_ptr(current, &old); return error; } @@ -178,10 +156,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, get_online_cpus(); mutex_lock(µcode_mutex); - user_buffer = (void __user *) buf; - user_buffer_size = (int) len; - - ret = do_microcode_update(); + ret = do_microcode_update(buf, len); if (!ret) ret = (ssize_t)len; @@ -231,7 +206,6 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); /* fake device for request_firmware */ struct platform_device *microcode_pdev; -EXPORT_SYMBOL_GPL(microcode_pdev); static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, @@ -252,8 +226,12 @@ static ssize_t reload_store(struct sys_device *dev, if (cpu_online(cpu)) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); mutex_lock(µcode_mutex); - if (uci->valid) - err = microcode_ops->cpu_request_microcode(cpu); + if (uci->valid) { + err = microcode_ops->request_microcode_fw(cpu, + µcode_pdev->dev); + if (!err) + microcode_ops->apply_microcode(cpu); + } mutex_unlock(µcode_mutex); set_cpus_allowed_ptr(current, &old); } @@ -315,7 +293,7 @@ static void collect_cpu_info(int cpu) uci->valid = 1; } -static void microcode_resume_cpu(int cpu) +static int microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct cpu_signature nsig; @@ -323,7 +301,7 @@ static void microcode_resume_cpu(int cpu) pr_debug("microcode: CPU%d resumed\n", cpu); if (!uci->mc.valid_mc) - return; + return 1; /* * Let's verify that the 'cached' ucode does belong @@ -331,21 +309,22 @@ static void microcode_resume_cpu(int cpu) */ if (microcode_ops->collect_cpu_info(cpu, &nsig)) { microcode_fini_cpu(cpu); - return; + return -1; } if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { microcode_fini_cpu(cpu); /* Should we look for a new ucode here? */ - return; + return 1; } - microcode_ops->apply_microcode(cpu); + return 0; } void microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int err = 0; /* We should bind the task to the CPU */ BUG_ON(raw_smp_processor_id() != cpu); @@ -356,12 +335,17 @@ void microcode_update_cpu(int cpu) * otherwise just request a firmware: */ if (uci->valid) { - microcode_resume_cpu(cpu); + err = microcode_resume_cpu(cpu); } else { collect_cpu_info(cpu); if (uci->valid && system_state == SYSTEM_RUNNING) - microcode_ops->cpu_request_microcode(cpu); + err = microcode_ops->request_microcode_fw(cpu, + µcode_pdev->dev); } + + if (!err) + microcode_ops->apply_microcode(cpu); + mutex_unlock(µcode_mutex); } @@ -414,7 +398,7 @@ static int mc_sysdev_resume(struct sys_device *dev) return 0; pr_debug("microcode: CPU%d resumed\n", cpu); /* only CPU 0 will apply ucode here */ - microcode_ops->apply_microcode(0); + microcode_update_cpu(0); return 0; } diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index d606a05545ca..6815837a7753 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -59,7 +59,7 @@ MODULE_LICENSE("GPL v2"); /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); -struct equiv_cpu_entry *equiv_cpu_table; +static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { @@ -83,36 +83,37 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static int get_matching_microcode_amd(void *mc, int cpu) +static int get_matching_microcode(int cpu, void *mc, int rev) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_amd *mc_header = mc; - unsigned long total_size = get_totalsize(mc_header); - void *new_mc; struct pci_dev *nb_pci_dev, *sb_pci_dev; unsigned int current_cpu_id; unsigned int equiv_cpu_id = 0x00; unsigned int i = 0; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - - /* This is a tricky part. We might be called from a write operation */ - /* to the device file instead of the usual process of firmware */ - /* loading. This routine needs to be able to distinguish both */ -/* cases. This is done by checking if there alread is a equivalent */ - /* CPU table installed. If not, we're written through */ - /* /dev/cpu/microcode. */ -/* Since we ignore all checks. The error case in which going through */ -/* firmware loading and that table is not loaded has already been */ - /* checked earlier. */ + /* + * dimm: do we need this? Why an update via /dev/... is different + * from the one via firmware? + * + * This is a tricky part. We might be called from a write operation + * to the device file instead of the usual process of firmware + * loading. This routine needs to be able to distinguish both + * cases. This is done by checking if there alread is a equivalent + * CPU table installed. If not, we're written through + * /dev/cpu/microcode. + * Since we ignore all checks. The error case in which going through + * firmware loading and that table is not loaded has already been + * checked earlier. + */ + BUG_ON(equiv_cpu_table == NULL); +#if 0 if (equiv_cpu_table == NULL) { printk(KERN_INFO "microcode: CPU%d microcode update with " "version 0x%x (current=0x%x)\n", cpu, mc_header->patch_id, uci->cpu_sig.rev); goto out; } - +#endif current_cpu_id = cpuid_eax(0x00000001); while (equiv_cpu_table[i].installed_cpu != 0) { @@ -175,27 +176,9 @@ static int get_matching_microcode_amd(void *mc, int cpu) pci_dev_put(sb_pci_dev); } - if (mc_header->patch_id <= uci->cpu_sig.rev) + if (mc_header->patch_id <= rev) return 0; - printk(KERN_INFO "microcode: CPU%d found a matching microcode " - "update with version 0x%x (current=0x%x)\n", - cpu, mc_header->patch_id, uci->cpu_sig.rev); - -out: - new_mc = vmalloc(UCODE_MAX_SIZE); - if (!new_mc) { - printk(KERN_ERR "microcode: error, can't allocate memory\n"); - return -ENOMEM; - } - memset(new_mc, 0, UCODE_MAX_SIZE); - - /* free previous update file */ - vfree(uci->mc.mc_amd); - - memcpy(new_mc, mc, total_size); - - uci->mc.mc_amd = new_mc; return 1; } @@ -245,104 +228,65 @@ static void apply_microcode_amd(int cpu) uci->cpu_sig.rev = rev; } -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -extern void __user *user_buffer; /* user area microcode data buffer */ -extern unsigned int user_buffer_size; /* it's size */ - -static long get_next_ucode_amd(void **mc, long offset) +static void * get_next_ucode(u8 *buf, unsigned int size, + int (*get_ucode_data)(void *, const void *, size_t), + unsigned int *mc_size) { - struct microcode_header_amd mc_header; - unsigned long total_size; + unsigned int total_size; +#define UCODE_UNKNOWN_HDR 8 + u8 hdr[UCODE_UNKNOWN_HDR]; + void *mc; - /* No more data */ - if (offset >= user_buffer_size) - return 0; - if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - return -EFAULT; - } - total_size = get_totalsize(&mc_header); - if (offset + total_size > user_buffer_size) { - printk(KERN_ERR "microcode: error! Bad total size in microcode " - "data file\n"); - return -EINVAL; - } - *mc = vmalloc(UCODE_MAX_SIZE); - if (!*mc) - return -ENOMEM; - memset(*mc, 0, UCODE_MAX_SIZE); + if (get_ucode_data(hdr, buf, UCODE_UNKNOWN_HDR)) + return NULL; - if (copy_from_user(*mc, user_buffer + offset, total_size)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - vfree(*mc); - return -EFAULT; - } - return offset + total_size; -} -#else -#define get_next_ucode_amd() NULL -#endif - -static long get_next_ucode_from_buffer_amd(void **mc, void *buf, - unsigned long size, long offset) -{ - struct microcode_header_amd *mc_header; - unsigned long total_size; - unsigned char *buf_pos = buf; - - /* No more data */ - if (offset >= size) - return 0; - - if (buf_pos[offset] != UCODE_UCODE_TYPE) { + if (hdr[0] != UCODE_UCODE_TYPE) { printk(KERN_ERR "microcode: error! " "Wrong microcode payload type field\n"); - return -EINVAL; + return NULL; } - mc_header = (struct microcode_header_amd *)(&buf_pos[offset+8]); + /* Why not by means of get_totalsize(hdr)? */ + total_size = (unsigned long) (hdr[4] + (hdr[5] << 8)); - total_size = (unsigned long) (buf_pos[offset+4] + - (buf_pos[offset+5] << 8)); + printk(KERN_INFO "microcode: size %u, total_size %u\n", + size, total_size); - printk(KERN_INFO "microcode: size %lu, total_size %lu, offset %ld\n", - size, total_size, offset); - - if (offset + total_size > size) { + if (total_size > size || total_size > UCODE_MAX_SIZE) { printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - return -EINVAL; + return NULL; } - *mc = vmalloc(UCODE_MAX_SIZE); - if (!*mc) { - printk(KERN_ERR "microcode: error! " - "Can not allocate memory for microcode patch\n"); - return -ENOMEM; + mc = vmalloc(UCODE_MAX_SIZE); + if (mc) { + memset(mc, 0, UCODE_MAX_SIZE); + if (get_ucode_data(mc, buf + UCODE_UNKNOWN_HDR, total_size)) { + vfree(mc); + mc = NULL; + } else + *mc_size = total_size + UCODE_UNKNOWN_HDR; } - - memset(*mc, 0, UCODE_MAX_SIZE); - memcpy(*mc, buf + offset + 8, total_size); - - return offset + total_size + 8; +#undef UCODE_UNKNOWN_HDR + return mc; } -static long install_equiv_cpu_table(void *buf, unsigned long size, long offset) + +static int install_equiv_cpu_table(u8 *buf, + int (*get_ucode_data)(void *, const void *, size_t)) { - unsigned int *buf_pos = buf; +#define UCODE_HEADER_SIZE 12 + u8 *hdr[UCODE_HEADER_SIZE]; + unsigned int *buf_pos = (unsigned int *)hdr; + unsigned long size; - /* No more data */ - if (offset >= size) + if (get_ucode_data(&hdr, buf, UCODE_HEADER_SIZE)) return 0; - if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE) { - printk(KERN_ERR "microcode: error! " - "Wrong microcode equivalnet cpu table type field\n"); - return 0; - } + size = buf_pos[2]; - if (size == 0) { + if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { printk(KERN_ERR "microcode: error! " - "Wrong microcode equivalnet cpu table length\n"); + "Wrong microcode equivalnet cpu table\n"); return 0; } @@ -352,79 +296,118 @@ static long install_equiv_cpu_table(void *buf, unsigned long size, long offset) return 0; } - memset(equiv_cpu_table, 0, size); - memcpy(equiv_cpu_table, &buf_pos[3], size); + buf += UCODE_HEADER_SIZE; + if (get_ucode_data(equiv_cpu_table, buf, size)) { + vfree(equiv_cpu_table); + return 0; + } - return size + 12; /* add header length */ + return size + UCODE_HEADER_SIZE; /* add header length */ +#undef UCODE_HEADER_SIZE } -/* fake device for request_firmware */ -extern struct platform_device *microcode_pdev; - -static int cpu_request_microcode_amd(int cpu) +static void free_equiv_cpu_table(void) { - char name[30]; - const struct firmware *firmware; - void *buf; - unsigned int *buf_pos; - unsigned long size; - long offset = 0; - int error; - void *mc; - - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - - sprintf(name, "amd-ucode/microcode_amd.bin"); - error = request_firmware(&firmware, "amd-ucode/microcode_amd.bin", - µcode_pdev->dev); - if (error) { - printk(KERN_ERR "microcode: ucode data file %s load failed\n", - name); - return error; + if (equiv_cpu_table) { + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; } +} - buf_pos = (unsigned int *)firmware->data; - buf = (void *)firmware->data; - size = firmware->size; - - if (buf_pos[0] != UCODE_MAGIC) { - printk(KERN_ERR "microcode: error! Wrong microcode patch file magic\n"); - return -EINVAL; - } - - offset = install_equiv_cpu_table(buf, buf_pos[2], offset); +static int generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover; + unsigned long offset; + offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); if (!offset) { printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); return -EINVAL; } - while ((offset = - get_next_ucode_from_buffer_amd(&mc, buf, size, offset)) > 0) { - error = get_matching_microcode_amd(mc, cpu); - if (error < 0) + ucode_ptr += offset; + leftover = size - offset; + + while (leftover) { + unsigned int mc_size; + struct microcode_header_amd *mc_header; + + mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); + if (!mc) break; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - if (error == 1) { - apply_microcode_amd(cpu); - error = 0; - } - vfree(mc); + + mc_header = (struct microcode_header_amd *)mc; + if (get_matching_microcode(cpu, mc, new_rev)) { + new_rev = mc_header->patch_id; + new_mc = mc; + } else + vfree(mc); + + ucode_ptr += mc_size; + leftover -= mc_size; } - if (offset > 0) { - vfree(mc); - vfree(equiv_cpu_table); - equiv_cpu_table = NULL; + + if (new_mc) { + if (!leftover) { + if (uci->mc.mc_amd) + vfree(uci->mc.mc_amd); + uci->mc.mc_amd = (struct microcode_amd *)new_mc; + pr_debug("microcode: CPU%d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", + cpu, uci->mc.mc_amd->hdr.patch_id, uci->cpu_sig.rev); + } else + vfree(new_mc); } - if (offset < 0) - error = offset; + + free_equiv_cpu_table(); + + return (int)leftover; +} + +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static int request_microcode_fw(int cpu, struct device *device) +{ + const char *fw_name = "amd-ucode/microcode_amd.bin"; + const struct firmware *firmware; + int ret; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + ret = request_firmware(&firmware, fw_name, device); + if (ret) { + printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); + return ret; + } + + ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, + &get_ucode_fw); + release_firmware(firmware); - return error; + return ret; +} + +static int get_ucode_user(void *to, const void *from, size_t n) +{ + return copy_from_user(to, from, n); +} + +static int request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); } static void microcode_fini_cpu_amd(int cpu) @@ -436,10 +419,8 @@ static void microcode_fini_cpu_amd(int cpu) } static struct microcode_ops microcode_amd_ops = { - .get_next_ucode = get_next_ucode_amd, - .get_matching_microcode = get_matching_microcode_amd, - .microcode_sanity_check = NULL, - .cpu_request_microcode = cpu_request_microcode_amd, + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, .microcode_fini_cpu = microcode_fini_cpu_amd, diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index c9b53202ba3d..f4930b55c6a0 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -155,15 +155,15 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) return 0; } -static inline int microcode_update_match(int cpu_num, - struct microcode_header_intel *mc_header, int sig, int pf) +static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; +} - if (!sigmatch(sig, uci->cpu_sig.sig, pf, uci->cpu_sig.pf) - || mc_header->rev <= uci->cpu_sig.rev) - return 0; - return 1; +static inline int +update_match_revision(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 0 : 1; } static int microcode_sanity_check(void *mc) @@ -248,51 +248,36 @@ static int microcode_sanity_check(void *mc) /* * return 0 - no update found * return 1 - found update - * return < 0 - error */ -static int get_matching_microcode(void *mc, int cpu) +static int +get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); int ext_sigcount, i; struct extended_signature *ext_sig; - void *new_mc; - if (microcode_update_match(cpu, mc_header, - mc_header->sig, mc_header->pf)) - goto find; + if (!update_match_revision(mc_header, rev)) + return 0; + if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf)) + return 1; + + /* Look for ext. headers: */ if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) return 0; ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; ext_sigcount = ext_header->count; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + for (i = 0; i < ext_sigcount; i++) { - if (microcode_update_match(cpu, mc_header, - ext_sig->sig, ext_sig->pf)) - goto find; + if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf)) + return 1; ext_sig++; } return 0; -find: - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, mc_header->rev, uci->cpu_sig.rev); - new_mc = vmalloc(total_size); - if (!new_mc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - return -ENOMEM; - } - - /* free previous update file */ - vfree(uci->mc.mc_intel); - - memcpy(new_mc, mc, total_size); - uci->mc.mc_intel = new_mc; - return 1; } static void apply_microcode(int cpu) @@ -300,7 +285,7 @@ static void apply_microcode(int cpu) unsigned long flags; unsigned int val[2]; int cpu_num = raw_smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -338,116 +323,105 @@ static void apply_microcode(int cpu) uci->cpu_sig.rev = val[1]; } -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -extern void __user *user_buffer; /* user area microcode data buffer */ -extern unsigned int user_buffer_size; /* it's size */ - -static long get_next_ucode(void **mc, long offset) +static int generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) { - struct microcode_header_intel mc_header; - unsigned long total_size; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover = size; - /* No more data */ - if (offset >= user_buffer_size) - return 0; - if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - return -EFAULT; - } - total_size = get_totalsize(&mc_header); - if (offset + total_size > user_buffer_size) { - printk(KERN_ERR "microcode: error! Bad total size in microcode " - "data file\n"); - return -EINVAL; - } - *mc = vmalloc(total_size); - if (!*mc) - return -ENOMEM; - if (copy_from_user(*mc, user_buffer + offset, total_size)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - vfree(*mc); - return -EFAULT; - } - return offset + total_size; -} -#endif + while (leftover) { + struct microcode_header_intel mc_header; + unsigned int mc_size; -static long get_next_ucode_from_buffer(void **mc, const u8 *buf, - unsigned long size, long offset) -{ - struct microcode_header_intel *mc_header; - unsigned long total_size; + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) + break; - /* No more data */ - if (offset >= size) - return 0; - mc_header = (struct microcode_header_intel *)(buf + offset); - total_size = get_totalsize(mc_header); + mc_size = get_totalsize(&mc_header); + if (!mc_size || mc_size > leftover) { + printk(KERN_ERR "microcode: error!" + "Bad data in microcode data file\n"); + break; + } - if (offset + total_size > size) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - return -EINVAL; + mc = vmalloc(mc_size); + if (!mc) + break; + + if (get_ucode_data(mc, ucode_ptr, mc_size) || + microcode_sanity_check(mc) < 0) { + vfree(mc); + break; + } + + if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { + new_rev = mc_header.rev; + new_mc = mc; + } else + vfree(mc); + + ucode_ptr += mc_size; + leftover -= mc_size; } - *mc = vmalloc(total_size); - if (!*mc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - return -ENOMEM; + if (new_mc) { + if (!leftover) { + if (uci->mc.mc_intel) + vfree(uci->mc.mc_intel); + uci->mc.mc_intel = (struct microcode_intel *)new_mc; + pr_debug("microcode: CPU%d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", + cpu, uci->mc.mc_intel->hdr.rev, uci->cpu_sig.rev); + } else + vfree(new_mc); } - memcpy(*mc, buf + offset, total_size); - return offset + total_size; + + return (int)leftover; } -/* fake device for request_firmware */ -extern struct platform_device *microcode_pdev; +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} -static int cpu_request_microcode(int cpu) +static int request_microcode_fw(int cpu, struct device *device) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - const u8 *buf; - unsigned long size; - long offset = 0; - int error; - void *mc; + int ret; /* We should bind the task to the CPU */ BUG_ON(cpu != raw_smp_processor_id()); sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); - error = request_firmware(&firmware, name, µcode_pdev->dev); - if (error) { + ret = request_firmware(&firmware, name, device); + if (ret) { pr_debug("microcode: data file %s load failed\n", name); - return error; + return ret; } - buf = firmware->data; - size = firmware->size; - while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) - > 0) { - error = microcode_sanity_check(mc); - if (error) - break; - error = get_matching_microcode(mc, cpu); - if (error < 0) - break; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - if (error == 1) { - apply_microcode(cpu); - error = 0; - } - vfree(mc); - } - if (offset > 0) - vfree(mc); - if (offset < 0) - error = offset; + + ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, + &get_ucode_fw); + release_firmware(firmware); - return error; + return ret; +} + +static int get_ucode_user(void *to, const void *from, size_t n) +{ + return copy_from_user(to, from, n); +} + +static int request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); } static void microcode_fini_cpu(int cpu) @@ -459,10 +433,8 @@ static void microcode_fini_cpu(int cpu) } static struct microcode_ops microcode_intel_ops = { - .get_next_ucode = get_next_ucode, - .get_matching_microcode = get_matching_microcode, - .microcode_sanity_check = microcode_sanity_check, - .cpu_request_microcode = cpu_request_microcode, + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode, .microcode_fini_cpu = microcode_fini_cpu, diff --git a/include/asm-x86/microcode.h b/include/asm-x86/microcode.h index 7ceff48fa657..e2887facdb4a 100644 --- a/include/asm-x86/microcode.h +++ b/include/asm-x86/microcode.h @@ -5,17 +5,16 @@ extern int microcode_init(void *opaque, struct module *module); extern void microcode_exit(void); struct cpu_signature; +struct device; struct microcode_ops { - long (*get_next_ucode)(void **mc, long offset); - long (*microcode_get_next_ucode)(void **mc, long offset); - int (*get_matching_microcode)(void *mc, int cpu); - int (*microcode_sanity_check)(void *mc); - int (*cpu_request_microcode)(int cpu); - int (*collect_cpu_info)(int cpu_num, struct cpu_signature *csig); - void (*apply_microcode)(int cpu); - void (*microcode_fini_cpu)(int cpu); - void (*clear_patch)(void *data); + int (*request_microcode_user) (int cpu, const void __user *buf, size_t size); + int (*request_microcode_fw) (int cpu, struct device *device); + + void (*apply_microcode) (int cpu); + + int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); + void (*microcode_fini_cpu) (int cpu); }; struct microcode_header_intel { From a1c75cc5018f17ff6d80ce45a13435b1536f76db Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 14:50:26 +0200 Subject: [PATCH 092/138] x86, microcode rework, v2, fix based on patch from Dmitry Adamushko. - add missing vfree() - update debug printks Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 4 +--- arch/x86/kernel/microcode_amd.c | 6 ++++-- arch/x86/kernel/microcode_intel.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 902dada2eb6d..0c2634f4fd7c 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -70,8 +70,6 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ - -/* #define DEBUG pr_debug */ #include #include #include @@ -396,7 +394,7 @@ static int mc_sysdev_resume(struct sys_device *dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d resumed\n", cpu); + /* only CPU 0 will apply ucode here */ microcode_update_cpu(0); return 0; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 6815837a7753..48aec9f48e4f 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -92,7 +92,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) unsigned int i = 0; /* - * dimm: do we need this? Why an update via /dev/... is different + * FIXME! dimm: do we need this? Why an update via /dev/... is different * from the one via firmware? * * This is a tricky part. We might be called from a write operation @@ -246,7 +246,7 @@ static void * get_next_ucode(u8 *buf, unsigned int size, return NULL; } - /* Why not by means of get_totalsize(hdr)? */ + /* FIXME! dimm: Why not by means of get_totalsize(hdr)? */ total_size = (unsigned long) (hdr[4] + (hdr[5] << 8)); printk(KERN_INFO "microcode: size %u, total_size %u\n", @@ -342,6 +342,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, mc_header = (struct microcode_header_amd *)mc; if (get_matching_microcode(cpu, mc, new_rev)) { + if (new_mc) + vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; } else diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index f4930b55c6a0..48ed3cef58c1 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,8 +70,6 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ - -/* #define DEBUG */ /* pr_debug */ #include #include #include @@ -356,6 +354,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, } if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { + if (new_mc) + vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; } else From 3d0aedd9538e6be8afec1a9d8b084bf90bc91495 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 12 Sep 2008 17:01:09 -0700 Subject: [PATCH 093/138] x86: signal: put give_sigsegv of setup frames together When setup frame fails, force_sigsegv is called and returns -EFAULT. There is similar code in ia32_setup_frame(), ia32_setup_rt_frame(), __setup_frame() and __setup_rt_frame(). Make them identical. No change in functionality intended. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 26 +++++++++----------------- arch/x86/kernel/signal_32.c | 31 ++++++++++++++----------------- arch/x86/kernel/signal_64.c | 17 +++++++++-------- 3 files changed, 32 insertions(+), 42 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 8d64c1bc8474..5f42cfcc1c5a 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -444,21 +444,21 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; err |= __put_user(sig, &frame->sig); if (err) - goto give_sigsegv; + return -EFAULT; err |= ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); if (err) - goto give_sigsegv; + return -EFAULT; if (_COMPAT_NSIG_WORDS > 1) { err |= __copy_to_user(frame->extramask, &set->sig[1], sizeof(frame->extramask)); if (err) - goto give_sigsegv; + return -EFAULT; } if (ka->sa.sa_flags & SA_RESTORER) { @@ -479,7 +479,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, */ err |= __copy_to_user(frame->retcode, &code, 8); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -502,10 +502,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, #endif return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -533,14 +529,14 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; err |= __put_user(sig, &frame->sig); err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); err |= copy_siginfo_to_user32(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; /* Create the ucontext. */ if (cpu_has_xsave) @@ -556,7 +552,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return -EFAULT; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -571,7 +567,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, */ err |= __copy_to_user(frame->retcode, &code, 8); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -599,8 +595,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, #endif return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index b668efc18eae..1c22e0067fe7 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -349,21 +349,21 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; err = __put_user(sig, &frame->sig); if (err) - goto give_sigsegv; + return -EFAULT; err = setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); if (err) - goto give_sigsegv; + return -EFAULT; if (_NSIG_WORDS > 1) { err = __copy_to_user(&frame->extramask, &set->sig[1], sizeof(frame->extramask)); if (err) - goto give_sigsegv; + return -EFAULT; } if (current->mm->context.vdso) @@ -388,7 +388,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -403,10 +403,6 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -420,14 +416,14 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= copy_siginfo_to_user(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; /* Create the ucontext. */ if (cpu_has_xsave) @@ -443,7 +439,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -463,7 +459,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -478,10 +474,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } /* @@ -506,6 +498,11 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, else ret = __setup_frame(usig, ka, set, regs); + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + return ret; } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 43e6862fc7a2..3b79e179ba3a 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -215,12 +215,12 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; if (ka->sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; } /* Create the ucontext. */ @@ -248,11 +248,11 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); } else { /* could use a vstub here */ - goto give_sigsegv; + return -EFAULT; } if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->di = sig; @@ -272,10 +272,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } /* @@ -297,6 +293,11 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, #endif ret = __setup_rt_frame(sig, ka, info, set, regs); + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + return ret; } From 2ba48e16e78216bb5b9fd08a088bfefda478df25 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 12 Sep 2008 17:02:53 -0700 Subject: [PATCH 094/138] x86: signal: remove unneeded err handling This patch eliminates unused or unneeded variable handling. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 11 ++++------- arch/x86/kernel/signal_32.c | 11 ++++------- arch/x86/kernel/signal_64.c | 5 ++--- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 5f42cfcc1c5a..e47bed2440ee 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -446,18 +446,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; - err |= __put_user(sig, &frame->sig); - if (err) + if (__put_user(sig, &frame->sig)) return -EFAULT; - err |= ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); - if (err) + if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) return -EFAULT; if (_COMPAT_NSIG_WORDS > 1) { - err |= __copy_to_user(frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - if (err) + if (__copy_to_user(frame->extramask, &set->sig[1], + sizeof(frame->extramask))) return -EFAULT; } diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 1c22e0067fe7..d433861a6599 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -351,18 +351,15 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; - err = __put_user(sig, &frame->sig); - if (err) + if (__put_user(sig, &frame->sig)) return -EFAULT; - err = setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); - if (err) + if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) return -EFAULT; if (_NSIG_WORDS > 1) { - err = __copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - if (err) + if (__copy_to_user(&frame->extramask, &set->sig[1], + sizeof(frame->extramask))) return -EFAULT; } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 3b79e179ba3a..a21c85197295 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -210,7 +210,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; if (save_i387_xstate(fp) < 0) - err |= -1; + return -EFAULT; } else frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; @@ -218,8 +218,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return -EFAULT; if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); - if (err) + if (copy_siginfo_to_user(&frame->info, info)) return -EFAULT; } From e6babb6b7fed93c93f8fc5ef8ebd3a474fc2df3e Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 12 Sep 2008 17:03:31 -0700 Subject: [PATCH 095/138] x86: signal: introduce do_rt_sigreturn() introduce do_rt_sigreturn(), to collect common part of sys_rt_sigreturn(). No change in functionality intended. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 12 +++++++++--- arch/x86/kernel/signal_64.c | 11 ++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d433861a6599..da3cf3270f83 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -215,9 +215,8 @@ badframe: return 0; } -asmlinkage int sys_rt_sigreturn(unsigned long __unused) +static long do_rt_sigreturn(struct pt_regs *regs) { - struct pt_regs *regs = (struct pt_regs *)&__unused; struct rt_sigframe __user *frame; unsigned long ax; sigset_t set; @@ -243,10 +242,17 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) return ax; badframe: - signal_fault(regs, frame, "rt sigreturn"); + signal_fault(regs, frame, "rt_sigreturn"); return 0; } +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *)&__unused; + + return do_rt_sigreturn(regs); +} + /* * Set up a signal frame. */ diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index a21c85197295..bf77d4789a2d 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -104,11 +104,11 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, return err; } -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +static long do_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; - sigset_t set; unsigned long ax; + sigset_t set; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -131,10 +131,15 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) return ax; badframe: - signal_fault(regs, frame, "sigreturn"); + signal_fault(regs, frame, "rt_sigreturn"); return 0; } +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} + /* * Set up a signal frame. */ From 5649b7c30316a51792808422ac03ee825d26aa5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Sep 2008 09:29:09 +0200 Subject: [PATCH 096/138] x86: add DMI quirk for AMI BIOS which corrupts address 0xc000 during resume Alan Jenkins and Andy Wettstein reported a suspend/resume memory corruption bug and extensively documented it here: http://bugzilla.kernel.org/show_bug.cgi?id=11237 The bug is that the BIOS overwrites 1K of memory at 0xc000 physical, without registering it in e820 as reserved or giving the kernel any idea about this. Detect AMI BIOSen and reserve that 1K. We paint this bug around with a very broad brush (reserving that 1K on all AMI BIOS systems), as the bug was extremely hard to find and needed several weeks and lots of debugging and patching. The bug was found via the CONFIG_X86_CHECK_BIOS_CORRUPTION=y debug feature, if similar bugs are suspected then this feature can be enabled on other systems as well to scan low memory for corrupted memory. Reported-by: Alan Jenkins Reported-by: Andy Wettstein Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ec7e56c1b984..3109ca37a67c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -729,6 +729,29 @@ void start_periodic_check_for_corruption(void) } #endif +static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE + "%s detected: BIOS corrupts 0xc000, working it around.\n", + d->ident); + + reserve_early(0xc000, 0xc400, "BIOS quirk"); + + return 0; +} + +/* List of systems that have known low memory corruption BIOS problems */ +static struct dmi_system_id __initdata bad_bios_dmi_table[] = { + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + }, + }, + {} +}; + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -752,6 +775,8 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + dmi_check_system(bad_bios_dmi_table); + early_cpu_init(); early_ioremap_init(); @@ -1037,3 +1062,5 @@ void __init setup_arch(char **cmdline_p) #endif #endif } + + From 1e22436eba84edfec9c25e5a25d09062c4f91ca9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Sep 2008 09:58:02 +0200 Subject: [PATCH 097/138] x86: reserve low 64K on AMI and Phoenix BIOS boxen there's multiple reports about suspend/resume related low memory corruption in this bugzilla: http://bugzilla.kernel.org/show_bug.cgi?id=11237 the common pattern is that the corruption is caused by the BIOS, and that it affects some portion of the first 64K of physical RAM. So add a DMI quirk This will waste 64K RAM on 'good' systems too, but without knowing the exact nature of this BIOS memory corruption this is the safest approach. This might as well solve a wide range of suspend/resume breakages under Linux. Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3109ca37a67c..33719544a224 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -732,10 +732,10 @@ void start_periodic_check_for_corruption(void) static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE - "%s detected: BIOS corrupts 0xc000, working it around.\n", + "%s detected: BIOS may corrupt low RAM, working it around.\n", d->ident); - reserve_early(0xc000, 0xc400, "BIOS quirk"); + reserve_early(0x0, 0x10000, "BIOS quirk"); return 0; } @@ -749,6 +749,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), + }, + }, {} }; From fc38151947477596aa27df6c4306ad6008dc6711 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Sep 2008 10:07:34 +0200 Subject: [PATCH 098/138] x86: add X86_RESERVE_LOW_64K This bugzilla: http://bugzilla.kernel.org/show_bug.cgi?id=11237 Documents a wide range of systems where the BIOS utilizes the first 64K of physical memory during suspend/resume and other hardware events. Currently we reserve this memory on all AMI and Phoenix BIOS systems. Life is too short to hunt subtle memory corruption problems like this, so we try to be robust by default. Still, allow this to be overriden: allow users who want that first 64K of memory to be available to the kernel disable the quirk, via CONFIG_X86_RESERVE_LOW_64K=n. Also, allow the early reservation to overlap with other early reservations. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 20 ++++++++++++++++++++ arch/x86/kernel/setup.c | 4 +++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7820d447bb8d..633f25dd9ee2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1089,6 +1089,26 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK Set whether the default state of memory_corruption_check is on or off. +config X86_RESERVE_LOW_64K + bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" + default y + help + Reserve the first 64K of physical RAM on BIOSes that are known + to potentially corrupt that memory range. A numbers of BIOSes are + known to utilize this area during suspend/resume, so it must not + be used by the kernel. + + Set this to N if you are absolutely sure that you trust the BIOS + to get all its memory reservations and usages right. + + If you have doubts about the BIOS (e.g. suspend/resume does not + work or there's kernel crashes after certain hardware hotplug + events) and it's not AMI or Phoenix, then you might want to enable + X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical + corruption patterns. + + Say Y if unsure. + config MATH_EMULATION bool prompt "Math emulation" if X86_32 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 33719544a224..786c1886ae53 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -735,13 +735,14 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) "%s detected: BIOS may corrupt low RAM, working it around.\n", d->ident); - reserve_early(0x0, 0x10000, "BIOS quirk"); + reserve_early_overlap_ok(0x0, 0x10000, "BIOS quirk"); return 0; } /* List of systems that have known low memory corruption BIOS problems */ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { +#ifdef CONFIG_X86_RESERVE_LOW_64K { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", @@ -757,6 +758,7 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { }, }, {} +#endif }; /* From 5871c6b0a52bb052c3891a787655043b90ae18e3 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 20 Sep 2008 20:35:44 -0700 Subject: [PATCH 099/138] x86: use round_jiffies() for the corruption check timer the exact timing of the corruption check isn't too important (it's once a minute timer), use round_jiffies() to align it and avoid extra wakeups. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 786c1886ae53..161f1b33ecec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -712,7 +712,7 @@ void check_for_bios_corruption(void) static void periodic_check_for_corruption(unsigned long data) { check_for_bios_corruption(); - mod_timer(&periodic_check_timer, jiffies + corruption_check_period*HZ); + mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); } void start_periodic_check_for_corruption(void) From 2216d199b1430d1c0affb1498a9ebdbd9c0de439 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Sep 2008 02:52:26 -0700 Subject: [PATCH 100/138] x86: fix CONFIG_X86_RESERVE_LOW_64K=y The bad_bios_dmi_table() quirk never triggered because we do DMI setup too late. Move it a bit earlier. Also change the CONFIG_X86_RESERVE_LOW_64K quirk to operate on the e820 table directly instead of messing with early reservations - this handles overlaps (which do occur in this low range of RAM) more gracefully. Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 161f1b33ecec..d29951c11be0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -735,7 +735,8 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) "%s detected: BIOS may corrupt low RAM, working it around.\n", d->ident); - reserve_early_overlap_ok(0x0, 0x10000, "BIOS quirk"); + e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); return 0; } @@ -784,8 +785,6 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif - dmi_check_system(bad_bios_dmi_table); - early_cpu_init(); early_ioremap_init(); @@ -880,6 +879,10 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + dmi_scan_machine(); + + dmi_check_system(bad_bios_dmi_table); + #ifdef CONFIG_X86_32 probe_roms(); #endif @@ -967,8 +970,6 @@ void __init setup_arch(char **cmdline_p) vsmp_init(); #endif - dmi_scan_machine(); - io_delay_init(); /* From a8b71a2810386a5ac8f43d2095fe3355f0d8db37 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 23 Sep 2008 00:35:33 -0700 Subject: [PATCH 101/138] x86: fix macro with bad_bios_dmi_table DMI tables need a blank NULL tail. fixes the crash on Ingo's test box. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d29951c11be0..3ce3edfcc3cd 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -758,8 +758,8 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), }, }, - {} #endif + {} }; /* From 18dbc9160507dc7df998e00cd1dcd7889557307b Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Tue, 23 Sep 2008 12:08:44 +0200 Subject: [PATCH 102/138] x86: moved microcode.c to microcode_intel.c Combine both generic and arch-specific parts of microcode into a single module (arch-specific parts are config-dependent). Also while we are at it, move arch-specific parts from microcode.h into their respective arch-specific .c files. Signed-off-by: Dmitry Adamushko Cc: "Peter Oruba" Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 10 +- arch/x86/kernel/Makefile | 8 +- arch/x86/kernel/microcode_amd.c | 72 ++++++++------ .../kernel/{microcode.c => microcode_core.c} | 26 ++--- arch/x86/kernel/microcode_intel.c | 79 +++++++++------ include/asm-x86/microcode.h | 99 +++++-------------- 6 files changed, 138 insertions(+), 156 deletions(-) rename arch/x86/kernel/{microcode.c => microcode_core.c} (96%) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0e5bf1eddcea..2e6080951350 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -801,7 +801,7 @@ config MICROCODE module will be called microcode. config MICROCODE_INTEL - tristate "Intel microcode patch loading support" + bool "Intel microcode patch loading support" depends on MICROCODE default MICROCODE select FW_LOADER @@ -813,20 +813,14 @@ config MICROCODE_INTEL Intel ingredients for this driver, check: . - This driver is only available as a module: the module - will be called microcode_intel. - config MICROCODE_AMD - tristate "AMD microcode patch loading support" + bool "AMD microcode patch loading support" depends on MICROCODE select FW_LOADER --help--- If you select this option, microcode patch loading support for AMD processors will be enabled. - This driver is only available as a module: the module - will be called microcode_amd. - config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index be454f348c3b..f891996f6849 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -51,9 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o -obj-$(CONFIG_MICROCODE_AMD) += microcode_amd.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o @@ -101,6 +98,11 @@ scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +microcode-y := microcode_core.o +microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o +microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o +obj-$(CONFIG_MICROCODE) += microcode.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 48aec9f48e4f..03ea4e52e87a 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -46,6 +46,35 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 +struct equiv_cpu_entry { + unsigned int installed_cpu; + unsigned int fixed_errata_mask; + unsigned int fixed_errata_compare; + unsigned int equiv_cpu; +}; + +struct microcode_header_amd { + unsigned int data_code; + unsigned int patch_id; + unsigned char mc_patch_data_id[2]; + unsigned char mc_patch_data_len; + unsigned char init_flag; + unsigned int mc_patch_data_checksum; + unsigned int nb_dev_id; + unsigned int sb_dev_id; + unsigned char processor_rev_id[2]; + unsigned char nb_rev_id; + unsigned char sb_rev_id; + unsigned char bios_api_rev; + unsigned char reserved1[3]; + unsigned int match_reg[8]; +}; + +struct microcode_amd { + struct microcode_header_amd hdr; + unsigned int mpb[0]; +}; + #define UCODE_MAX_SIZE (2048) #define DEFAULT_UCODE_DATASIZE (896) #define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) @@ -189,17 +218,18 @@ static void apply_microcode_amd(int cpu) unsigned int rev; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + struct microcode_amd *mc_amd = uci->mc; unsigned long addr; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); - if (uci->mc.mc_amd == NULL) + if (mc_amd == NULL) return; spin_lock_irqsave(µcode_update_lock, flags); - addr = (unsigned long)&uci->mc.mc_amd->hdr.data_code; + addr = (unsigned long)&mc_amd->hdr.data_code; edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); @@ -214,16 +244,16 @@ static void apply_microcode_amd(int cpu) spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ - if (rev != uci->mc.mc_amd->hdr.patch_id) { + if (rev != mc_amd->hdr.patch_id) { printk(KERN_ERR "microcode: CPU%d update from revision " "0x%x to 0x%x failed\n", cpu_num, - uci->mc.mc_amd->hdr.patch_id, rev); + mc_amd->hdr.patch_id, rev); return; } printk(KERN_INFO "microcode: CPU%d updated from revision " "0x%x to 0x%x \n", - cpu_num, uci->cpu_sig.rev, uci->mc.mc_amd->hdr.patch_id); + cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); uci->cpu_sig.rev = rev; } @@ -355,12 +385,12 @@ static int generic_load_microcode(int cpu, void *data, size_t size, if (new_mc) { if (!leftover) { - if (uci->mc.mc_amd) - vfree(uci->mc.mc_amd); - uci->mc.mc_amd = (struct microcode_amd *)new_mc; + if (uci->mc) + vfree(uci->mc); + uci->mc = new_mc; pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", - cpu, uci->mc.mc_amd->hdr.patch_id, uci->cpu_sig.rev); + cpu, new_rev, uci->cpu_sig.rev); } else vfree(new_mc); } @@ -416,8 +446,8 @@ static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - vfree(uci->mc.mc_amd); - uci->mc.mc_amd = NULL; + vfree(uci->mc); + uci->mc = NULL; } static struct microcode_ops microcode_amd_ops = { @@ -428,23 +458,7 @@ static struct microcode_ops microcode_amd_ops = { .microcode_fini_cpu = microcode_fini_cpu_amd, }; -static int __init microcode_amd_module_init(void) +struct microcode_ops * __init init_amd_microcode(void) { - struct cpuinfo_x86 *c = &cpu_data(0); - - equiv_cpu_table = NULL; - if (c->x86_vendor != X86_VENDOR_AMD) { - printk(KERN_ERR "microcode: CPU platform is not AMD-capable\n"); - return -ENODEV; - } - - return microcode_init(µcode_amd_ops, THIS_MODULE); + return µcode_amd_ops; } - -static void __exit microcode_amd_module_exit(void) -{ - microcode_exit(); -} - -module_init(microcode_amd_module_init) -module_exit(microcode_amd_module_exit) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode_core.c similarity index 96% rename from arch/x86/kernel/microcode.c rename to arch/x86/kernel/microcode_core.c index 0c2634f4fd7c..ff031dbccdf6 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode_core.c @@ -298,7 +298,7 @@ static int microcode_resume_cpu(int cpu) pr_debug("microcode: CPU%d resumed\n", cpu); - if (!uci->mc.valid_mc) + if (!uci->mc) return 1; /* @@ -443,17 +443,20 @@ static struct notifier_block __refdata mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; -int microcode_init(void *opaque, struct module *module) +static int __init microcode_init(void) { - struct microcode_ops *ops = (struct microcode_ops *)opaque; + struct cpuinfo_x86 *c = &cpu_data(0); int error; - if (microcode_ops) { - printk(KERN_ERR "microcode: already loaded the other module\n"); - return -EEXIST; - } + if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor != X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); - microcode_ops = ops; + if (!microcode_ops) { + printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + return -ENODEV; + } error = microcode_dev_init(); if (error) @@ -483,9 +486,8 @@ int microcode_init(void *opaque, struct module *module) return 0; } -EXPORT_SYMBOL_GPL(microcode_init); -void __exit microcode_exit(void) +static void __exit microcode_exit(void) { microcode_dev_exit(); @@ -502,4 +504,6 @@ void __exit microcode_exit(void) printk(KERN_INFO "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } -EXPORT_SYMBOL_GPL(microcode_exit); + +module_init(microcode_init); +module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 48ed3cef58c1..622dc4a21784 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -97,6 +97,38 @@ MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); +struct microcode_header_intel { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + +struct microcode_intel { + struct microcode_header_intel hdr; + unsigned int bits[0]; +}; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; + #define DEFAULT_UCODE_DATASIZE (2000) #define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) @@ -284,11 +316,12 @@ static void apply_microcode(int cpu) unsigned int val[2]; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct microcode_intel *mc_intel = uci->mc; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); - if (uci->mc.mc_intel == NULL) + if (mc_intel == NULL) return; /* serialize access to the physical write to MSR 0x79 */ @@ -296,8 +329,8 @@ static void apply_microcode(int cpu) /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) uci->mc.mc_intel->bits, - (unsigned long) uci->mc.mc_intel->bits >> 16 >> 16); + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ @@ -307,7 +340,7 @@ static void apply_microcode(int cpu) rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); spin_unlock_irqrestore(µcode_update_lock, flags); - if (val[1] != uci->mc.mc_intel->hdr.rev) { + if (val[1] != mc_intel->hdr.rev) { printk(KERN_ERR "microcode: CPU%d update from revision " "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); return; @@ -315,9 +348,9 @@ static void apply_microcode(int cpu) printk(KERN_INFO "microcode: CPU%d updated from revision " "0x%x to 0x%x, date = %04x-%02x-%02x \n", cpu_num, uci->cpu_sig.rev, val[1], - uci->mc.mc_intel->hdr.date & 0xffff, - uci->mc.mc_intel->hdr.date >> 24, - (uci->mc.mc_intel->hdr.date >> 16) & 0xff); + mc_intel->hdr.date & 0xffff, + mc_intel->hdr.date >> 24, + (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; } @@ -367,12 +400,12 @@ static int generic_load_microcode(int cpu, void *data, size_t size, if (new_mc) { if (!leftover) { - if (uci->mc.mc_intel) - vfree(uci->mc.mc_intel); - uci->mc.mc_intel = (struct microcode_intel *)new_mc; + if (uci->mc) + vfree(uci->mc); + uci->mc = (struct microcode_intel *)new_mc; pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", - cpu, uci->mc.mc_intel->hdr.rev, uci->cpu_sig.rev); + cpu, new_rev, uci->cpu_sig.rev); } else vfree(new_mc); } @@ -428,11 +461,11 @@ static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - vfree(uci->mc.mc_intel); - uci->mc.mc_intel = NULL; + vfree(uci->mc); + uci->mc = NULL; } -static struct microcode_ops microcode_intel_ops = { +struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, @@ -440,22 +473,8 @@ static struct microcode_ops microcode_intel_ops = { .microcode_fini_cpu = microcode_fini_cpu, }; -static int __init microcode_intel_module_init(void) +struct microcode_ops * __init init_intel_microcode(void) { - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_INTEL) { - printk(KERN_ERR "microcode: CPU platform is not Intel-capable\n"); - return -ENODEV; - } - - return microcode_init(µcode_intel_ops, THIS_MODULE); + return µcode_intel_ops; } -static void __exit microcode_intel_module_exit(void) -{ - microcode_exit(); -} - -module_init(microcode_intel_module_init) -module_exit(microcode_intel_module_exit) diff --git a/include/asm-x86/microcode.h b/include/asm-x86/microcode.h index e2887facdb4a..62c793bb70ca 100644 --- a/include/asm-x86/microcode.h +++ b/include/asm-x86/microcode.h @@ -1,10 +1,12 @@ #ifndef ASM_X86__MICROCODE_H #define ASM_X86__MICROCODE_H -extern int microcode_init(void *opaque, struct module *module); -extern void microcode_exit(void); +struct cpu_signature { + unsigned int sig; + unsigned int pf; + unsigned int rev; +}; -struct cpu_signature; struct device; struct microcode_ops { @@ -17,82 +19,29 @@ struct microcode_ops { void (*microcode_fini_cpu) (int cpu); }; -struct microcode_header_intel { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode_intel { - struct microcode_header_intel hdr; - unsigned int bits[0]; -}; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[0]; -}; - -struct equiv_cpu_entry { - unsigned int installed_cpu; - unsigned int fixed_errata_mask; - unsigned int fixed_errata_compare; - unsigned int equiv_cpu; -}; - -struct microcode_header_amd { - unsigned int data_code; - unsigned int patch_id; - unsigned char mc_patch_data_id[2]; - unsigned char mc_patch_data_len; - unsigned char init_flag; - unsigned int mc_patch_data_checksum; - unsigned int nb_dev_id; - unsigned int sb_dev_id; - unsigned char processor_rev_id[2]; - unsigned char nb_rev_id; - unsigned char sb_rev_id; - unsigned char bios_api_rev; - unsigned char reserved1[3]; - unsigned int match_reg[8]; -}; - -struct microcode_amd { - struct microcode_header_amd hdr; - unsigned int mpb[0]; -}; - -struct cpu_signature { - unsigned int sig; - unsigned int pf; - unsigned int rev; -}; - struct ucode_cpu_info { struct cpu_signature cpu_sig; int valid; - union { - struct microcode_intel *mc_intel; - struct microcode_amd *mc_amd; - void *valid_mc; - } mc; + void *mc; }; extern struct ucode_cpu_info ucode_cpu_info[]; +#ifdef CONFIG_MICROCODE_INTEL +extern struct microcode_ops * __init init_intel_microcode(void); +#else +static inline struct microcode_ops * __init init_intel_microcode(void) +{ + return NULL; +} +#endif /* CONFIG_MICROCODE_INTEL */ + +#ifdef CONFIG_MICROCODE_AMD +extern struct microcode_ops * __init init_amd_microcode(void); +#else +static inline struct microcode_ops * __init init_amd_microcode(void) +{ + return NULL; +} +#endif + #endif /* ASM_X86__MICROCODE_H */ From da654b74bda14c45a7d98c731bf3c1a43b6b74e2 Mon Sep 17 00:00:00 2001 From: Srinivasa Ds Date: Tue, 23 Sep 2008 15:23:52 +0530 Subject: [PATCH 103/138] signals: demultiplexing SIGTRAP signal Currently a SIGTRAP can denote any one of below reasons. - Breakpoint hit - H/W debug register hit - Single step - Signal sent through kill() or rasie() Architectures like powerpc/parisc provides infrastructure to demultiplex SIGTRAP signal by passing down the information for receiving SIGTRAP through si_code of siginfot_t structure. Here is an attempt is generalise this infrastructure by extending it to x86 and x86_64 archs. Signed-off-by: Srinivasa DS Cc: Roland McGrath Cc: akpm@linux-foundation.org Cc: paulus@samba.org Cc: linuxppc-dev@ozlabs.org Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/siginfo.h | 5 ----- arch/powerpc/include/asm/siginfo.h | 5 ----- arch/x86/kernel/ptrace.c | 7 ++++--- arch/x86/kernel/traps_32.c | 4 +++- arch/x86/kernel/traps_64.c | 2 +- include/asm-generic/siginfo.h | 2 ++ include/asm-parisc/siginfo.h | 5 ----- include/asm-x86/ptrace.h | 2 +- include/asm-x86/traps.h | 10 ++++++++++ 9 files changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/ia64/include/asm/siginfo.h b/arch/ia64/include/asm/siginfo.h index 9294e4b0c8bc..118d42979003 100644 --- a/arch/ia64/include/asm/siginfo.h +++ b/arch/ia64/include/asm/siginfo.h @@ -113,11 +113,6 @@ typedef struct siginfo { #undef NSIGSEGV #define NSIGSEGV 3 -/* - * SIGTRAP si_codes - */ -#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */ -#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint or watchpoint */ #undef NSIGTRAP #define NSIGTRAP 4 diff --git a/arch/powerpc/include/asm/siginfo.h b/arch/powerpc/include/asm/siginfo.h index 12f1bce037be..49495b0534ed 100644 --- a/arch/powerpc/include/asm/siginfo.h +++ b/arch/powerpc/include/asm/siginfo.h @@ -15,11 +15,6 @@ #include -/* - * SIGTRAP si_codes - */ -#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */ -#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint or watchpoint */ #undef NSIGTRAP #define NSIGTRAP 4 diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 9e43a48ad6e0..bf45cdf1aaca 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1358,7 +1358,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) { struct siginfo info; @@ -1367,7 +1368,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) memset(&info, 0, sizeof(info)); info.si_signo = SIGTRAP; - info.si_code = TRAP_BRKPT; + info.si_code = si_code; /* User-mode ip? */ info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; @@ -1454,5 +1455,5 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) */ if (test_thread_flag(TIF_SINGLESTEP) && tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) - send_sigtrap(current, regs, 0); + send_sigtrap(current, regs, 0, TRAP_BRKPT); } diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index da5a5964fccb..0429c5de5ea9 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -891,6 +891,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; unsigned int condition; + int si_code; trace_hardirqs_fixup(); @@ -935,8 +936,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_TF_reenable; } + si_code = get_si_code((unsigned long)condition); /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code); + send_sigtrap(tsk, regs, error_code, si_code); /* * Disable additional traps. They'll be re-enabled when diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 56d6f1147785..011d8e1fac6e 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -941,7 +941,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs *regs, tsk->thread.error_code = error_code; info.si_signo = SIGTRAP; info.si_errno = 0; - info.si_code = TRAP_BRKPT; + info.si_code = get_si_code(condition); info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; force_sig_info(SIGTRAP, &info, tsk); diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h index 8786e01e0db8..969570167e9e 100644 --- a/include/asm-generic/siginfo.h +++ b/include/asm-generic/siginfo.h @@ -199,6 +199,8 @@ typedef struct siginfo { */ #define TRAP_BRKPT (__SI_FAULT|1) /* process breakpoint */ #define TRAP_TRACE (__SI_FAULT|2) /* process trace trap */ +#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */ +#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint/watchpoint */ #define NSIGTRAP 2 /* diff --git a/include/asm-parisc/siginfo.h b/include/asm-parisc/siginfo.h index d4909f55fe35..d7034728f377 100644 --- a/include/asm-parisc/siginfo.h +++ b/include/asm-parisc/siginfo.h @@ -3,11 +3,6 @@ #include -/* - * SIGTRAP si_codes - */ -#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */ -#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint or watchpoint */ #undef NSIGTRAP #define NSIGTRAP 4 diff --git a/include/asm-x86/ptrace.h b/include/asm-x86/ptrace.h index fad807769910..c2f368273079 100644 --- a/include/asm-x86/ptrace.h +++ b/include/asm-x86/ptrace.h @@ -143,7 +143,7 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); #ifdef CONFIG_X86_32 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code); + int error_code, int si_code); #endif void signal_fault(struct pt_regs *regs, void __user *frame, char *where); diff --git a/include/asm-x86/traps.h b/include/asm-x86/traps.h index 2ccebc6fb0b0..4b1e90409251 100644 --- a/include/asm-x86/traps.h +++ b/include/asm-x86/traps.h @@ -36,6 +36,16 @@ void do_invalid_op(struct pt_regs *, long); void do_general_protection(struct pt_regs *, long); void do_nmi(struct pt_regs *, long); +static inline int get_si_code(unsigned long condition) +{ + if (condition & DR_STEP) + return TRAP_TRACE; + else if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) + return TRAP_HWBKPT; + else + return TRAP_BRKPT; +} + extern int panic_on_unrecovered_nmi; extern int kstack_depth_to_print; From b6cffde1a20409f9720d5c9aba28d3efd3a4f04e Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Wed, 17 Sep 2008 15:05:52 +0200 Subject: [PATCH 104/138] x86, microcode rework, v2, renaming Renaming based on patch from Dmitry Adamushko. Made code more readable by renaming define and variables related to microcode _container_file_ header to make it distinguishable from microcode _patch_ header. Signed-off-by: Peter Oruba Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 03ea4e52e87a..62058e9c8b4e 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -304,12 +304,12 @@ static void * get_next_ucode(u8 *buf, unsigned int size, static int install_equiv_cpu_table(u8 *buf, int (*get_ucode_data)(void *, const void *, size_t)) { -#define UCODE_HEADER_SIZE 12 - u8 *hdr[UCODE_HEADER_SIZE]; - unsigned int *buf_pos = (unsigned int *)hdr; +#define UCODE_CONTAINER_HEADER_SIZE 12 + u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; + unsigned int *buf_pos = (unsigned int *)container_hdr; unsigned long size; - if (get_ucode_data(&hdr, buf, UCODE_HEADER_SIZE)) + if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) return 0; size = buf_pos[2]; @@ -326,14 +326,14 @@ static int install_equiv_cpu_table(u8 *buf, return 0; } - buf += UCODE_HEADER_SIZE; + buf += UCODE_CONTAINER_HEADER_SIZE; if (get_ucode_data(equiv_cpu_table, buf, size)) { vfree(equiv_cpu_table); return 0; } - return size + UCODE_HEADER_SIZE; /* add header length */ -#undef UCODE_HEADER_SIZE + return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ +#undef UCODE_CONTAINER_HEADER_SIZE } static void free_equiv_cpu_table(void) From d4738792fb86600b6cb7220459d9c47e819b3580 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Wed, 17 Sep 2008 15:39:18 +0200 Subject: [PATCH 105/138] x86, microcode rework, v2, renaming cont. Renaming based on patch from Dmitry Adamushko. Further clarification by renaming define and variable related to microcode container file. Signed-off-by: Peter Oruba Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 62058e9c8b4e..829415208ff8 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -263,21 +263,20 @@ static void * get_next_ucode(u8 *buf, unsigned int size, unsigned int *mc_size) { unsigned int total_size; -#define UCODE_UNKNOWN_HDR 8 - u8 hdr[UCODE_UNKNOWN_HDR]; +#define UCODE_CONTAINER_SECTION_HDR 8 + u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; void *mc; - if (get_ucode_data(hdr, buf, UCODE_UNKNOWN_HDR)) + if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) return NULL; - if (hdr[0] != UCODE_UCODE_TYPE) { + if (section_hdr[0] != UCODE_UCODE_TYPE) { printk(KERN_ERR "microcode: error! " "Wrong microcode payload type field\n"); return NULL; } - /* FIXME! dimm: Why not by means of get_totalsize(hdr)? */ - total_size = (unsigned long) (hdr[4] + (hdr[5] << 8)); + total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); printk(KERN_INFO "microcode: size %u, total_size %u\n", size, total_size); @@ -290,13 +289,13 @@ static void * get_next_ucode(u8 *buf, unsigned int size, mc = vmalloc(UCODE_MAX_SIZE); if (mc) { memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_UNKNOWN_HDR, total_size)) { + if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { vfree(mc); mc = NULL; } else - *mc_size = total_size + UCODE_UNKNOWN_HDR; + *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; } -#undef UCODE_UNKNOWN_HDR +#undef UCODE_CONTAINER_SECTION_HDR return mc; } From e8d3f455de4f42d4bab2f6f1aeb2cf3bd18eb508 Mon Sep 17 00:00:00 2001 From: Srinivasa Ds Date: Tue, 23 Sep 2008 15:23:52 +0530 Subject: [PATCH 106/138] signals: demultiplexing SIGTRAP signal, fix fix build breakage, missing header file. Signed-off-by: Srinivasa DS Signed-off-by: Ingo Molnar --- include/asm-x86/traps.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-x86/traps.h b/include/asm-x86/traps.h index 4b1e90409251..7a692baa51ae 100644 --- a/include/asm-x86/traps.h +++ b/include/asm-x86/traps.h @@ -1,6 +1,8 @@ #ifndef ASM_X86__TRAPS_H #define ASM_X86__TRAPS_H +#include + /* Common in X86_32 and X86_64 */ asmlinkage void divide_error(void); asmlinkage void debug(void); From 5fd933303bd1efacbd0acbe452ba9b889440eb40 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 23 Sep 2008 17:19:44 -0700 Subject: [PATCH 107/138] x86: signal: cosmetic unification of do_signal() Make do_signal() same. Thia patch modifies only comments in signal_64.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index bf77d4789a2d..5a5fbc3b1eea 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -410,7 +410,8 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* Re-enable any watchpoints before delivering the + /* + * Re-enable any watchpoints before delivering the * signal to user space. The processor register will * have been cleared if the watchpoint triggered * inside the kernel. @@ -418,7 +419,7 @@ static void do_signal(struct pt_regs *regs) if (current->thread.debugreg7) set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ + /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* * A signal was successfully delivered; the saved @@ -441,6 +442,7 @@ static void do_signal(struct pt_regs *regs) regs->ax = regs->orig_ax; regs->ip -= 2; break; + case -ERESTART_RESTARTBLOCK: regs->ax = NR_restart_syscall; regs->ip -= 2; From ee847c54ba7fc09f85f13a5bf18f45ea6c19aa83 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 23 Sep 2008 17:21:45 -0700 Subject: [PATCH 108/138] x86: signal: cosmetic unification of do_notify_resume() Make do_notify_resume() same. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 8 ++++++++ arch/x86/kernel/signal_64.c | 16 ++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index da3cf3270f83..b94463f264b4 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -663,6 +663,12 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_user(); +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); @@ -672,7 +678,9 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } +#ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 5a5fbc3b1eea..9087752f4109 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -460,14 +460,18 @@ static void do_signal(struct pt_regs *regs) } } -void do_notify_resume(struct pt_regs *regs, void *unused, - __u32 thread_info_flags) +/* + * notification of userspace execution resumption + * - triggered by the TIF_WORK_MASK flags + */ +void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_MCE +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_user(); -#endif /* CONFIG_X86_MCE */ +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) @@ -477,6 +481,10 @@ void do_notify_resume(struct pt_regs *regs, void *unused, clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } + +#ifdef CONFIG_X86_32 + clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) From 86d3237cd1a09136b4fb3a1d73d3c3fd6331cb14 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 23 Sep 2008 17:22:32 -0700 Subject: [PATCH 109/138] x86: signal: cosmetic unification of handle_signal() Make handle_signal() same. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 9 +++++++++ arch/x86/kernel/signal_64.c | 2 ++ 2 files changed, 11 insertions(+) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index b94463f264b4..bb05917f232c 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -550,6 +550,15 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, if (ret) return ret; +#ifdef CONFIG_X86_64 + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); +#endif + /* * Clear the direction flag as per the ABI for function entry. */ diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 9087752f4109..963236f2c3c1 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -346,12 +346,14 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, if (ret) return ret; +#ifdef CONFIG_X86_64 /* * This has nothing to do with segment registers, * despite the name. This magic affects uaccess.h * macros' behavior. Reset it to the normal setting. */ set_fs(USER_DS); +#endif /* * Clear the direction flag as per the ABI for function entry. From 2f9284e4e3be7b0b4d8d0638f9805603069a762d Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Tue, 23 Sep 2008 22:56:35 +0200 Subject: [PATCH 110/138] x86, microcode_amd: cleanup, mark request_microcode_user() as unsupported (1) mark mc_size in generic_load_microcode() as unitialized_var to avoid gcc's (false) warning; (2) mark request_microcode_user() as unsupported. The required changes can be added later. Note, we don't break any user-space interfaces here, as there were no kernels with support for AMD-specific ucode update yet. The ucode has to be updated via 'firmware'. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 36 ++++----------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 829415208ff8..7a1f8eeac2c7 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -120,29 +120,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) unsigned int equiv_cpu_id = 0x00; unsigned int i = 0; - /* - * FIXME! dimm: do we need this? Why an update via /dev/... is different - * from the one via firmware? - * - * This is a tricky part. We might be called from a write operation - * to the device file instead of the usual process of firmware - * loading. This routine needs to be able to distinguish both - * cases. This is done by checking if there alread is a equivalent - * CPU table installed. If not, we're written through - * /dev/cpu/microcode. - * Since we ignore all checks. The error case in which going through - * firmware loading and that table is not loaded has already been - * checked earlier. - */ BUG_ON(equiv_cpu_table == NULL); -#if 0 - if (equiv_cpu_table == NULL) { - printk(KERN_INFO "microcode: CPU%d microcode update with " - "version 0x%x (current=0x%x)\n", - cpu, mc_header->patch_id, uci->cpu_sig.rev); - goto out; - } -#endif current_cpu_id = cpuid_eax(0x00000001); while (equiv_cpu_table[i].installed_cpu != 0) { @@ -362,7 +340,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size, leftover = size - offset; while (leftover) { - unsigned int mc_size; + unsigned int uninitialized_var(mc_size); struct microcode_header_amd *mc_header; mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); @@ -428,17 +406,11 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } -static int get_ucode_user(void *to, const void *from, size_t n) -{ - return copy_from_user(to, from, n); -} - static int request_microcode_user(int cpu, const void __user *buf, size_t size) { - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - - return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); + printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" + "is not supported\n"); + return -1; } static void microcode_fini_cpu_amd(int cpu) From 82b078659ed04e1ecdebf8326e189cf76ed361af Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Wed, 24 Sep 2008 11:50:35 +0200 Subject: [PATCH 111/138] x86: microcode patch loader bugfix Corrected CPU vendor check condition for AMD microcode patch loader initialization. Signed-off-by: Peter Oruba Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index ff031dbccdf6..8db2eb55e9e2 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -450,7 +450,7 @@ static int __init microcode_init(void) if (c->x86_vendor == X86_VENDOR_INTEL) microcode_ops = init_intel_microcode(); - else if (c->x86_vendor != X86_VENDOR_AMD) + else if (c->x86_vendor == X86_VENDOR_AMD) microcode_ops = init_amd_microcode(); if (!microcode_ops) { From 8d8c13bdb53977319593d9efc4971a4cacc8bd03 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 24 Sep 2008 19:10:29 -0700 Subject: [PATCH 112/138] x86: signal_32.c: introduce signr_convert() Introduce signr_convert(). This function will help unification of setup_rt_frame(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index bb05917f232c..b1bc90f19b9a 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -482,18 +482,21 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* * OK, we're invoking a handler: */ +static int signr_convert(int sig) +{ + struct thread_info *info = current_thread_info(); + + if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) + return info->exec_domain->signal_invmap[sig]; + return sig; +} + static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { + int usig = signr_convert(sig); int ret; - int usig; - - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) From b94fd69827b2104a2a4d9d0bc05a8e908a937ae8 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 24 Sep 2008 19:12:54 -0700 Subject: [PATCH 113/138] x86: signal_64.c: introduce helper function signr_convert() This helper function is for unification of setup_rt_frame(). No effect in binary. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 963236f2c3c1..c5d80024a8f3 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -281,18 +281,24 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* * OK, we're invoking a handler */ +static int signr_convert(int sig) +{ + return sig; +} + static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { + int usig = signr_convert(sig); int ret; #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(sig, ka, info, set, regs); + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); else - ret = ia32_setup_frame(sig, ka, set, regs); + ret = ia32_setup_frame(usig, ka, set, regs); } else #endif ret = __setup_rt_frame(sig, ka, info, set, regs); From 455edbc423db282bb64dec2d7c8968498ea8e619 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 24 Sep 2008 19:13:11 -0700 Subject: [PATCH 114/138] x86: signal: introduce helper macro is_ia32 Introduce new macro is_ia32 for unification of setup_rt_frame(). No effect in binary, compiler will optimize. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 13 +++++++++---- arch/x86/kernel/signal_64.c | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index b1bc90f19b9a..cf62f70cc2a6 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -491,6 +491,8 @@ static int signr_convert(int sig) return sig; } +#define is_ia32 1 + static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) @@ -499,10 +501,13 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, int ret; /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - ret = __setup_rt_frame(usig, ka, info, set, regs); - else - ret = __setup_frame(usig, ka, set, regs); + if (is_ia32) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = __setup_rt_frame(usig, ka, info, set, regs); + else + ret = __setup_frame(usig, ka, set, regs); + } else + ret = __setup_rt_frame(sig, ka, info, set, regs); if (ret) { force_sigsegv(sig, current); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index c5d80024a8f3..53f86d95985b 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -286,6 +286,12 @@ static int signr_convert(int sig) return sig; } +#ifdef CONFIG_IA32_EMULATION +#define is_ia32 test_thread_flag(TIF_IA32) +#else +#define is_ia32 0 +#endif + static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) @@ -293,15 +299,14 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, int usig = signr_convert(sig); int ret; -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { + /* Set up the stack frame */ + if (is_ia32) { if (ka->sa.sa_flags & SA_SIGINFO) ret = ia32_setup_rt_frame(usig, ka, info, set, regs); else ret = ia32_setup_frame(usig, ka, set, regs); } else -#endif - ret = __setup_rt_frame(sig, ka, info, set, regs); + ret = __setup_rt_frame(sig, ka, info, set, regs); if (ret) { force_sigsegv(sig, current); From 4694d2391221e14fe671602fe34ea7f24f5a561f Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 24 Sep 2008 19:13:29 -0700 Subject: [PATCH 115/138] x86: signal_32.c: introduce macro ia32_setup_frame and ia32_setup_rt_frame Make 32-bit setup_rt_frame() look like 64-bit version for unification. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index cf62f70cc2a6..4337cd510f0a 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -492,6 +492,8 @@ static int signr_convert(int sig) } #define is_ia32 1 +#define ia32_setup_frame __setup_frame +#define ia32_setup_rt_frame __setup_rt_frame static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -503,9 +505,9 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* Set up the stack frame */ if (is_ia32) { if (ka->sa.sa_flags & SA_SIGINFO) - ret = __setup_rt_frame(usig, ka, info, set, regs); + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); else - ret = __setup_frame(usig, ka, set, regs); + ret = ia32_setup_frame(usig, ka, set, regs); } else ret = __setup_rt_frame(sig, ka, info, set, regs); From 08115ab4d98cb577a83971ebd57cdfbcc6f50b68 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Mon, 29 Sep 2008 18:24:23 -0400 Subject: [PATCH 116/138] xen: make CONFIG_XEN_SAVE_RESTORE depend on CONFIG_XEN Xen options need to depend on XEN. Also, add newline at end of file. Without this patch you need to disable CONFIG_PM in order to disable CPU hotplugging. Signed-off-by: Chuck Ebbert Acked-by Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index d3e68465ace9..87b9ab166423 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -26,7 +26,7 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool - depends on PM + depends on XEN && PM default y config XEN_DEBUG_FS @@ -35,4 +35,4 @@ config XEN_DEBUG_FS default n help Enable statistics output and various tuning options in debugfs. - Enabling this option may incur a significant performance overhead. \ No newline at end of file + Enabling this option may incur a significant performance overhead. From bff0aa4b8f8a1c8492e99c522d044bb98a6ec098 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 30 Sep 2008 20:28:20 -0700 Subject: [PATCH 117/138] x86: ia32_signal.c: remove unnecessary cast to u32 __put_user() looks type of the 2nd parameter, so casting the 1st parameter is not necessary. text data bss dec hex filename 6227 0 8 6235 185b ia32_signal.o.new 6227 0 8 6235 185b ia32_signal.o.old Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index e47bed2440ee..690a480c68c5 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -351,21 +351,21 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, savesegment(es, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->es); - err |= __put_user((u32)regs->di, &sc->di); - err |= __put_user((u32)regs->si, &sc->si); - err |= __put_user((u32)regs->bp, &sc->bp); - err |= __put_user((u32)regs->sp, &sc->sp); - err |= __put_user((u32)regs->bx, &sc->bx); - err |= __put_user((u32)regs->dx, &sc->dx); - err |= __put_user((u32)regs->cx, &sc->cx); - err |= __put_user((u32)regs->ax, &sc->ax); - err |= __put_user((u32)regs->cs, &sc->cs); - err |= __put_user((u32)regs->ss, &sc->ss); + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); + err |= __put_user(regs->cs, &sc->cs); + err |= __put_user(regs->ss, &sc->ss); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user((u32)regs->ip, &sc->ip); - err |= __put_user((u32)regs->flags, &sc->flags); - err |= __put_user((u32)regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->sp, &sc->sp_at_signal); tmp = save_i387_xstate_ia32(fpstate); if (tmp < 0) From 7b9cee16ffb495558c1e3ada55cba906e520006e Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 30 Sep 2008 20:30:51 -0700 Subject: [PATCH 118/138] x86: ia32_signal.c remove unnecessary function calls the below 2 functions are called in save_i387_xstate_ia32() - clear_used_math(); - stts(); Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 690a480c68c5..4bc02b23674b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -370,12 +370,9 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, tmp = save_i387_xstate_ia32(fpstate); if (tmp < 0) err = -EFAULT; - else { - clear_used_math(); - stts(); + else err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), &sc->fpstate); - } /* non-iBCS2 extensions.. */ err |= __put_user(mask, &sc->oldmask); From fd1452ebf257317f24e0e285a17a2ec2ce3e6df7 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Thu, 2 Oct 2008 16:56:19 +0300 Subject: [PATCH 119/138] x86/microcode: fix sleeping function called from invalid context at kernel/mutex.c Fix the following "sleeping function called from invalid context" bug: ... __might_sleep mutex_lock_nested microcode_update_cpu mc_sysdev_resume __sysdev_resume sysdev_resume device_power_up ... Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 8db2eb55e9e2..936d8d55f230 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -324,10 +324,6 @@ void microcode_update_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu); - - mutex_lock(µcode_mutex); /* * Check if the system resume is in progress (uci->valid != NULL), * otherwise just request a firmware: @@ -340,11 +336,8 @@ void microcode_update_cpu(int cpu) err = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); } - if (!err) microcode_ops->apply_microcode(cpu); - - mutex_unlock(µcode_mutex); } static void microcode_init_cpu(int cpu) @@ -352,7 +345,13 @@ static void microcode_init_cpu(int cpu) cpumask_t old = current->cpus_allowed; set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu); + + mutex_lock(µcode_mutex); microcode_update_cpu(cpu); + mutex_unlock(µcode_mutex); + set_cpus_allowed_ptr(current, &old); } From db053b86f4b1ec790da2dafe2acb93be76288bb9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Oct 2008 16:41:31 -0700 Subject: [PATCH 120/138] xen: clean up x86-64 warnings There are a couple of Xen features which rely on directly accessing per-cpu data via a segment register, which is not yet available on x86-64. In the meantime, just disable direct access to the vcpu info structure; this leaves some of the code as dead, but it will come to life in time, and the warnings are suppressed. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 61 +++++++-------------------------------- arch/x86/xen/xen-asm_64.S | 20 +++++++++++-- 2 files changed, 27 insertions(+), 54 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8ca2f88bde1e..85692c9f6496 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -112,7 +112,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 1; +static int have_vcpu_info_placement = +#ifdef CONFIG_X86_32 + 1 +#else + 0 +#endif + ; + static void xen_vcpu_setup(int cpu) { @@ -941,6 +948,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) } #endif +#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { /* If there's an existing pte, then don't allow _PAGE_RW to be set */ @@ -959,6 +967,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) xen_set_pte(ptep, pte); } +#endif static __init void xen_pagetable_setup_start(pgd_t *base) { @@ -1025,7 +1034,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ -#ifdef CONFIG_X86_32 if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); @@ -1035,7 +1043,6 @@ void xen_setup_vcpu_info_placement(void) pv_irq_ops.irq_enable = xen_irq_enable_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } -#endif } static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, @@ -1056,12 +1063,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { -#ifdef CONFIG_X86_32 SITE(pv_irq_ops, irq_enable); SITE(pv_irq_ops, irq_disable); SITE(pv_irq_ops, save_fl); SITE(pv_irq_ops, restore_fl); -#endif /* CONFIG_X86_32 */ #undef SITE patch_site: @@ -1399,48 +1404,11 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } -#ifdef CONFIG_X86_64 -static void walk(pgd_t *pgd, unsigned long addr) -{ - unsigned l4idx = pgd_index(addr); - unsigned l3idx = pud_index(addr); - unsigned l2idx = pmd_index(addr); - unsigned l1idx = pte_index(addr); - pgd_t l4; - pud_t l3; - pmd_t l2; - pte_t l1; - - xen_raw_printk("walk %p, %lx -> %d %d %d %d\n", - pgd, addr, l4idx, l3idx, l2idx, l1idx); - - l4 = pgd[l4idx]; - xen_raw_printk(" l4: %016lx\n", l4.pgd); - xen_raw_printk(" %016lx\n", pgd_val(l4)); - - l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx]; - xen_raw_printk(" l3: %016lx\n", l3.pud); - xen_raw_printk(" %016lx\n", pud_val(l3)); - - l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx]; - xen_raw_printk(" l2: %016lx\n", l2.pmd); - xen_raw_printk(" %016lx\n", pmd_val(l2)); - - l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx]; - xen_raw_printk(" l1: %016lx\n", l1.pte); - xen_raw_printk(" %016lx\n", pte_val(l1)); -} -#endif - static void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n", - addr, pfn, get_phys_to_machine(pfn), - pgprot_val(prot), pte.pte); - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } @@ -1698,15 +1666,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("about to get started...\n"); -#if 0 - xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n", - &boot_params, __pa_symbol(&boot_params), - __va(__pa_symbol(&boot_params))); - - walk(pgd, &boot_params); - walk(pgd, __va(__pa(&boot_params))); -#endif - /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 3b9bda46487a..05794c566e87 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -26,8 +26,15 @@ /* Pseudo-flag used for virtual NMI, which we don't implement yet */ #define XEN_EFLAGS_NMI 0x80000000 -#if 0 -#include +#if 1 +/* + x86-64 does not yet support direct access to percpu variables + via a segment override, so we just need to make sure this code + never gets used + */ +#define BUG ud2a +#define PER_CPU_VAR(var, off) 0xdeadbeef +#endif /* Enable events. This clears the event mask and tests the pending @@ -35,6 +42,8 @@ events, then enter the hypervisor to get them handled. */ ENTRY(xen_irq_enable_direct) + BUG + /* Unmask events */ movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) @@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct) non-zero. */ ENTRY(xen_irq_disable_direct) + BUG + movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) ENDPATCH(xen_irq_disable_direct) ret @@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct) Xen and x86 use opposite senses (mask vs enable). */ ENTRY(xen_save_fl_direct) + BUG + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) setz %ah addb %ah,%ah @@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct) if so. */ ENTRY(xen_restore_fl_direct) + BUG + testb $X86_EFLAGS_IF>>8, %ah setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) /* Preempt here doesn't matter because that will deal with @@ -133,7 +148,6 @@ check_events: pop %rcx pop %rax ret -#endif ENTRY(xen_adjust_exception_frame) mov 8+0(%rsp),%rcx From d19c8e516e0a17e049bcfbe96f86e040254ddf14 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Oct 2008 16:42:35 -0700 Subject: [PATCH 121/138] xen: remove unused balloon.h The balloon driver doesn't have any externally callable functions at the moment, so remove the (effectively empty) header. We can add it back if we need to. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- drivers/xen/balloon.c | 1 - include/xen/balloon.h | 61 ------------------------------------------- 2 files changed, 62 deletions(-) delete mode 100644 include/xen/balloon.h diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index a51f3e17a5fd..8c83abc73400 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -53,7 +53,6 @@ #include #include -#include #include #include #include diff --git a/include/xen/balloon.h b/include/xen/balloon.h deleted file mode 100644 index fe43b0f3c86a..000000000000 --- a/include/xen/balloon.h +++ /dev/null @@ -1,61 +0,0 @@ -/****************************************************************************** - * balloon.h - * - * Xen balloon driver - enables returning/claiming memory to/from Xen. - * - * Copyright (c) 2003, B Dragovic - * Copyright (c) 2003-2004, M Williamson, K Fraser - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef __XEN_BALLOON_H__ -#define __XEN_BALLOON_H__ - -#include - -#if 0 -/* - * Inform the balloon driver that it should allow some slop for device-driver - * memory activities. - */ -void balloon_update_driver_allowance(long delta); - -/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */ -struct page **alloc_empty_pages_and_pagevec(int nr_pages); -void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages); - -void balloon_release_driver_page(struct page *page); - -/* - * Prevent the balloon driver from changing the memory reservation during - * a driver critical region. - */ -extern spinlock_t balloon_lock; -#define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags) -#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags) -#endif - -#endif /* __XEN_BALLOON_H__ */ From a2e8d3dcfd420177aaa0c53aca60a869bad75f4b Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 2 Oct 2008 22:09:20 -0700 Subject: [PATCH 122/138] x86: signal: move macros out from restore_sigcontext() move macros, COPY, COPY_SEG*, GET_SEG, out from restore_sigcontext(). x86_64: introduce COPY_SEG_STRICT for cs. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 38 ++++++++++++++++++++----------------- arch/x86/kernel/signal_64.c | 18 +++++++++++------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 4337cd510f0a..545448b7aeba 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -113,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx) return do_sigaltstack(uss, uoss, regs->sp); } +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp; \ +} + +#define COPY_SEG_STRICT(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + +#define GET_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + loadsegment(seg, tmp); \ +} /* * Do a signal return; undo the signal stack. @@ -126,23 +147,6 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) - -#define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp; } - -#define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp|3; } - -#define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - loadsegment(seg, tmp); } - GET_SEG(gs); COPY_SEG(fs); COPY_SEG(es); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 53f86d95985b..feff4a91d095 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -52,6 +52,16 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, return do_sigaltstack(uss, uoss, regs->sp); } +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG_STRICT(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + /* * Do a signal return; undo the signal stack. */ @@ -64,8 +74,6 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) (err |= __get_user(regs->x, &sc->x)) - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); COPY(r8); @@ -80,11 +88,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Kernel saves and restores only the CS segment register on signals, * which is the bare minimum needed to allow mixed 32/64-bit code. * App's signal handler can save/restore other segments if needed. */ - { - unsigned cs; - err |= __get_user(cs, &sc->cs); - regs->cs = cs | 3; /* Force into user mode */ - } + COPY_SEG_STRICT(cs); { unsigned int tmpflags; From 69e13ad56f9e2cd81c4f8bfd6267211c10c14c08 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 2 Oct 2008 22:18:47 -0700 Subject: [PATCH 123/138] x86: signal: remove indent in restore_sigcontext() remove braces and indent for flags and fpstate in restore_sigcontext(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 21 +++++++-------------- arch/x86/kernel/signal_64.c | 19 +++++++------------ 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 545448b7aeba..d6dd057d0f22 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -142,6 +142,8 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) { + void __user *buf; + unsigned int tmpflags; unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ @@ -156,21 +158,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); - { - unsigned int tmpflags; + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | - (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - } - - { - void __user *buf; - - err |= __get_user(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); - } + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); err |= __get_user(*pax, &sc->ax); return err; diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index feff4a91d095..a5c9627f4db9 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -69,6 +69,8 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) { + void __user *buf; + unsigned int tmpflags; unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ @@ -90,19 +92,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, * App's signal handler can save/restore other segments if needed. */ COPY_SEG_STRICT(cs); - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - } + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ - { - void __user *buf; - - err |= __get_user(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); - } + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); err |= __get_user(*pax, &sc->ax); return err; From 6cdcdb99cf7c2e1835fc5b471864d21161c3e679 Mon Sep 17 00:00:00 2001 From: Andrey Borzenkov Date: Fri, 3 Oct 2008 21:08:49 +0400 Subject: [PATCH 124/138] x86 setup: fix ghost entries under /sys/firmware/edd take 3 Some BIOSes do not indicate error when trying to read from non- existing device. Zero buffer before reading and check that we possibly have valid MBR by looking for MBR magic. This was fixed in different way for edd.S in http://marc.info/?l=linux-kernel&m=114087765422490&w=2, but lost again when edd.S was rewritten in C. Signed-off-by: Andrey Borzenkov < arvidjaar@mail.ru> Signed-off-by: H. Peter Anvin --- arch/x86/boot/edd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index bf4ae6ff518e..067e28cd3c5f 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -43,6 +43,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) char *mbrbuf_ptr, *mbrbuf_end; u32 buf_base, mbr_base; extern char _end[]; + u16 mbr_magic; sector_size = ei->params.bytes_per_sector; if (!sector_size) @@ -60,11 +61,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) return -1; + memset(mbrbuf_ptr, 0, sector_size); if (read_mbr(devno, mbrbuf_ptr)) return -1; *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; - return 0; + mbr_magic = *(u16 *)&mbrbuf_ptr[510]; + + /* check for valid MBR magic */ + return mbr_magic == 0xAA55 ? 0 : -1; } static int get_edd_info(u8 devno, struct edd_info *ei) From 98920dc3d1113b883cbc73e3293446d3525c6042 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 3 Oct 2008 10:22:33 -0700 Subject: [PATCH 125/138] Revert "x86: fix ghost EDD devices in /sys again" This reverts commit 464f04c9e9b3b1c4f5ffb89c51d8ba2a2034c846. Obsoleted by commit 6cdcdb99cf7c2e1835fc5b471864d21161c3e679. Signed-off-by: H. Peter Anvin --- arch/x86/boot/edd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 067e28cd3c5f..1aae8f3e5ca1 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -32,9 +32,7 @@ static int read_mbr(u8 devno, void *buf) : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx) : : "esi", "edi", "memory"); - /* Some BIOSes do not set carry flag on error but still return - * error in AH. The condition below is expected to catch both */ - return -!!ax; /* 0 or -1 */ + return -(u8)ax; /* 0 or -1 */ } static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) From ba31a5f88b6f907e715ff43db06403e12465b703 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sat, 4 Oct 2008 21:21:44 +0200 Subject: [PATCH 126/138] x86 setup: remove DEF_INITSEG and DEF_SETUPSEG Since v.2.6.23 DEF_INITSEG and DEF_SETUPSEG are unused. Commit c397368 ("Remove old i386 setup code") dropped their usage for i386. They did not return in the x86 tree. (Something similar must have happened for x86_64.) Remove these. Signed-off-by: Paul Bolle Signed-off-by: H. Peter Anvin --- include/asm-x86/boot.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/asm-x86/boot.h b/include/asm-x86/boot.h index 2faed7ecb092..7b287df4ab55 100644 --- a/include/asm-x86/boot.h +++ b/include/asm-x86/boot.h @@ -2,9 +2,7 @@ #define _ASM_BOOT_H /* Don't touch these, unless you really know what you're doing. */ -#define DEF_INITSEG 0x9000 #define DEF_SYSSEG 0x1000 -#define DEF_SETUPSEG 0x9020 #define DEF_SYSSIZE 0x7F00 /* Internal svga startup constants */ From d960c9ce47c5360b39e47c064818e91c89df0e21 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sat, 4 Oct 2008 21:18:51 +0200 Subject: [PATCH 127/138] x86 setup: remove IMAGE_OFFSET After commit 968de4f ("i386: Relocatable kernel support") IMAGE_OFFSET wasn't actually used anymore in the (current) X86 build system. Now remove its last traces. Signed-off-by: Paul Bolle Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index cceba1f46365..cd48c7210016 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ KBUILD_CFLAGS += $(call cc-option,-m32) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -$(obj)/zImage: IMAGE_OFFSET := 0x1000 $(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -$(obj)/bzImage: IMAGE_OFFSET := 0x100000 $(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ $(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ $(obj)/bzImage: BUILDFLAGS := -b @@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE $(call if_changed,objcopy) $(obj)/compressed/vmlinux: FORCE - $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ + $(Q)$(MAKE) $(build)=$(obj)/compressed $@ # Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel FDARGS = From 2407390bd20de38740eef87eab4fe3d1deafdbdd Mon Sep 17 00:00:00 2001 From: Michal Januszewski Date: Sun, 5 Oct 2008 12:16:04 +0200 Subject: [PATCH 128/138] x86: replace a magic number with a named constant in the VESA boot code Replace a magic number with a named constant in the VESA boot code. Signed-off-by: Michal Januszewski Cc: linux-fbdev-devel@lists.sourceforge.net Signed-off-by: Ingo Molnar --- arch/x86/boot/video-vesa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 401ad998ad08..1e6fe0214c85 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -224,7 +224,7 @@ static void vesa_store_pm_info(void) static void vesa_store_mode_params_graphics(void) { /* Tell the kernel we're in VESA graphics mode */ - boot_params.screen_info.orig_video_isVGA = 0x23; + boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB; /* Mode parameters */ boot_params.screen_info.vesa_attributes = vminfo.mode_attr; From 33fb0e4eb53f16af312f9698f974e2e64af39c12 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 7 Oct 2008 00:11:22 +0200 Subject: [PATCH 129/138] x86: SB450: skip IRQ0 override if it is not routed to INT2 of IOAPIC On some HP nx6... laptops (e.g. nx6325) BIOS reports an IRQ0 override but the SB450 chipset is configured such that timer interrupts goe to INT0 of IOAPIC. Check IRQ0 routing and if it is routed to INT0 of IOAPIC skip the timer override. [ This more generic PCI ID based quirk should alleviate the need for dmi_ignore_irq0_timer_override DMI quirks. ] Signed-off-by: Andreas Herrmann Acked-by: "Maciej W. Rozycki" Tested-by: Dmitry Torokhov Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 4353cf5e6fac..6b839b147644 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -95,6 +95,52 @@ static void __init nvidia_bugs(int num, int slot, int func) } +static u32 ati_ixp4x0_rev(int num, int slot, int func) +{ + u32 d; + u8 b; + + b = read_pci_config_byte(num, slot, func, 0xac); + b &= ~(1<<5); + write_pci_config_byte(num, slot, func, 0xac, b); + + d = read_pci_config(num, slot, func, 0x70); + d |= 1<<8; + write_pci_config(num, slot, func, 0x70, d); + + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + return d; +} + +static void __init ati_bugs(int num, int slot, int func) +{ +#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC) + u32 d; + u8 b; + + if (acpi_use_timer_override) + return; + + d = ati_ixp4x0_rev(num, slot, func); + if (d < 0x82) + acpi_skip_timer_override = 1; + else { + /* check for IRQ0 interrupt swap */ + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x2)) + acpi_skip_timer_override = 1; + } + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB4X0 revision 0x%x\n", d); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +#endif +} + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -114,6 +160,8 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, {} }; From 8d89adf44cf750e49691ba5b744b2ad77a05e997 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 7 Oct 2008 06:47:52 +0200 Subject: [PATCH 130/138] x86: SB450: deprioritize DMI quirks This PCI ID based quick should be a full solution for the IRQ0 override related slowdown problem on SB450 based systems: 33fb0e4: x86: SB450: skip IRQ0 override if it is not routed to INT2 of IOAPIC Emit a warning in those cases where the DMI quirk triggers but the PCI ID based quirk didnt. If this warning does not trigger then we can phase out the DMI quirks. Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c102af85df9c..096102d9b24e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1421,8 +1421,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d) */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); - acpi_skip_timer_override = 1; + /* + * The ati_ixp4x0_rev() early PCI quirk should have set + * the acpi_skip_timer_override flag already: + */ + if (!acpi_skip_timer_override) { + WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); + pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", + d->ident); + acpi_skip_timer_override = 1; + } return 0; } From f364eadab59b316ea0bd9f9bc01af0ad89065569 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 7 Oct 2008 14:04:27 -0700 Subject: [PATCH 131/138] x86: xsave: fix error condition in save_i387_xstate() Actually return failure on error. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index ed5274a5bb0f..448fde96963c 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -121,6 +121,8 @@ int save_i387_xstate(void __user *buf) err |= __put_user(FP_XSTATE_MAGIC2, (__u32 __user *) (buf + sig_xstate_size - FP_XSTATE_MAGIC2_SIZE)); + if (err) + return err; } return 1; From 04944b793e18ece23f63c0252646b310c1845940 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 7 Oct 2008 14:04:28 -0700 Subject: [PATCH 132/138] x86: xsave: set FP, SSE bits in the xsave header in the user sigcontext If a processor implementation discern that a processor state component is in its initialized state, it may modify the corresponding bit in the xsave header.xstate_bv as '0'. State in the memory layout setup by 'xsave' will be consistent with the bit values in the header. During signal handling, legacy applications may change the FP/SSE bits in the sigcontext memory layout without touching the FP/SSE header bits in the xsave header. So always set FP/SSE bits in the xsave header while saving the sigcontext state to the user space. During signal return, this will enable the kernel to capture any changes to the FP/SSE bits by the legacy applications which don't touch xsave headers. xsave aware apps can change the xstate_bv in the xsave header aswell as change any contents in the memory layout. xrestor as part of sigreturn will capture all the changes. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i387.c | 14 ++++++++++++++ arch/x86/kernel/xsave.c | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 45723f1fe198..1f20608d4ca8 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -468,9 +468,23 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) static int save_i387_xsave(void __user *buf) { + struct task_struct *tsk = current; struct _fpstate_ia32 __user *fx = buf; int err = 0; + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. + * This will enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware applications can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + if (save_i387_fxsave(fx) < 0) return -1; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 448fde96963c..2f98323716d9 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -114,6 +114,8 @@ int save_i387_xstate(void __user *buf) if (task_thread_info(tsk)->status & TS_XSAVE) { struct _fpstate __user *fx = buf; + struct _xstate __user *x = buf; + u64 xstate_bv; err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, sizeof(struct _fpx_sw_bytes)); @@ -121,6 +123,29 @@ int save_i387_xstate(void __user *buf) err |= __put_user(FP_XSTATE_MAGIC2, (__u32 __user *) (buf + sig_xstate_size - FP_XSTATE_MAGIC2_SIZE)); + + /* + * Read the xstate_bv which we copied (directly from the cpu or + * from the state in task struct) to the user buffers and + * set the FP/SSE bits. + */ + err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware apps can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xstate_bv |= XSTATE_FPSSE; + + err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv); + if (err) return err; } From eefb47f6a1e855653d275cb90592a3587ea93a09 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 8 Oct 2008 13:01:39 -0700 Subject: [PATCH 133/138] xen: use spin_lock_nest_lock when pinning a pagetable When pinning/unpinning a pagetable with split pte locks, we can end up holding multiple pte locks at once (we need to hold the locks while there's a pending batched hypercall affecting the pte page). Because all the pte locks are in the same lock class, lockdep thinks that we're potentially taking a lock recursively. This warning is spurious because we always take the pte locks while holding mm->page_table_lock. lockdep now has spin_lock_nest_lock to express this kind of dominant lock use, so use it here so that lockdep knows what's going on. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 74 ++++++++++++++++++++++++++++++---------------- arch/x86/xen/mmu.h | 3 -- 2 files changed, 48 insertions(+), 29 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 64e58681767e..ae173f6edd8b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -651,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) * For 64-bit, we must skip the Xen hole in the middle of the address * space, just after the big x86-64 virtual hole. */ -static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), +static int xen_pgd_walk(struct mm_struct *mm, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), unsigned long limit) { + pgd_t *pgd = mm->pgd; int flush = 0; unsigned hole_low, hole_high; unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; @@ -698,7 +701,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pud = pud_offset(&pgd[pgdidx], 0); if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), PT_PUD); + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { pmd_t *pmd; @@ -713,7 +716,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pmd = pmd_offset(&pud[pudidx], 0); if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), PT_PMD); + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { struct page *pte; @@ -727,7 +730,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), continue; pte = pmd_page(pmd[pmdidx]); - flush |= (*func)(pte, PT_PTE); + flush |= (*func)(mm, pte, PT_PTE); } } } @@ -735,20 +738,20 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), out: /* Do the top level last, so that the callbacks can use it as a cue to do final things like tlb flushes. */ - flush |= (*func)(virt_to_page(pgd), PT_PGD); + flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); return flush; } /* If we're using split pte locks, then take the page's lock and return a pointer to it. Otherwise return NULL. */ -static spinlock_t *xen_pte_lock(struct page *page) +static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; #if USE_SPLIT_PTLOCKS ptl = __pte_lockptr(page); - spin_lock(ptl); + spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif return ptl; @@ -772,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); } -static int xen_pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestSetPagePinned(page); int flush; @@ -813,7 +817,7 @@ static int xen_pin_page(struct page *page, enum pt_level level) */ ptl = NULL; if (level == PT_PTE) - ptl = xen_pte_lock(page); + ptl = xen_pte_lock(page, mm); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), @@ -834,11 +838,11 @@ static int xen_pin_page(struct page *page, enum pt_level level) /* This is called just after a mm has been created, but it has not been used yet. We need to make sure that its pagetable is all read-only, and can be pinned. */ -void xen_pgd_pin(pgd_t *pgd) +static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); - if (xen_pgd_walk(pgd, xen_pin_page, USER_LIMIT)) { + if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); @@ -852,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd) xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); if (user_pgd) { - xen_pin_page(virt_to_page(user_pgd), PT_PGD); + xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ - xen_pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); #endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); #endif /* CONFIG_X86_64 */ xen_mc_issue(0); } +static void xen_pgd_pin(struct mm_struct *mm) +{ + __xen_pgd_pin(mm, mm->pgd); +} + /* * On save, we need to pin all pagetables to make sure they get their * mfns turned into pfns. Search the list for any unpinned pgds and pin * them (unpinned pgds are not currently in use, probably because the * process is under construction or destruction). + * + * Expected to be called in stop_machine() ("equivalent to taking + * every spinlock in the system"), so the locking doesn't really + * matter all that much. */ void xen_mm_pin_all(void) { @@ -881,7 +895,7 @@ void xen_mm_pin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (!PagePinned(page)) { - xen_pgd_pin((pgd_t *)page_address(page)); + __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); SetPageSavePinned(page); } } @@ -894,7 +908,8 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int xen_mark_pinned(struct page *page, enum pt_level level) +static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, + enum pt_level level) { SetPagePinned(page); return 0; @@ -902,10 +917,11 @@ static __init int xen_mark_pinned(struct page *page, enum pt_level level) void __init xen_mark_init_mm_pinned(void) { - xen_pgd_walk(init_mm.pgd, xen_mark_pinned, FIXADDR_TOP); + xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } -static int xen_unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestClearPagePinned(page); @@ -923,7 +939,7 @@ static int xen_unpin_page(struct page *page, enum pt_level level) * partially-pinned state. */ if (level == PT_PTE) { - ptl = xen_pte_lock(page); + ptl = xen_pte_lock(page, mm); if (ptl) xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); @@ -945,7 +961,7 @@ static int xen_unpin_page(struct page *page, enum pt_level level) } /* Release a pagetables pages back as normal RW */ -static void xen_pgd_unpin(pgd_t *pgd) +static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); @@ -957,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd) if (user_pgd) { xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); - xen_unpin_page(virt_to_page(user_pgd), PT_PGD); + xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } #endif #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - xen_unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); #endif - xen_pgd_walk(pgd, xen_unpin_page, USER_LIMIT); + xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } +static void xen_pgd_unpin(struct mm_struct *mm) +{ + __xen_pgd_unpin(mm, mm->pgd); +} + /* * On resume, undo any pinning done at save, so that the rest of the * kernel doesn't see any unexpected pinned pagetables. @@ -986,7 +1008,7 @@ void xen_mm_unpin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { BUG_ON(!PagePinned(page)); - xen_pgd_unpin((pgd_t *)page_address(page)); + __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); ClearPageSavePinned(page); } } @@ -997,14 +1019,14 @@ void xen_mm_unpin_all(void) void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); - xen_pgd_pin(next->pgd); + xen_pgd_pin(next); spin_unlock(&next->page_table_lock); } void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { spin_lock(&mm->page_table_lock); - xen_pgd_pin(mm->pgd); + xen_pgd_pin(mm); spin_unlock(&mm->page_table_lock); } @@ -1095,7 +1117,7 @@ void xen_exit_mmap(struct mm_struct *mm) /* pgd may not be pinned in the error exit path of execve */ if (xen_page_pinned(mm->pgd)) - xen_pgd_unpin(mm->pgd); + xen_pgd_unpin(mm); spin_unlock(&mm->page_table_lock); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 0f59bd03f9e3..98d71659da5a 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); void xen_exit_mmap(struct mm_struct *mm); -void xen_pgd_pin(pgd_t *pgd); -//void xen_pgd_unpin(pgd_t *pgd); - pteval_t xen_pte_val(pte_t); pmdval_t xen_pmd_val(pmd_t); pgdval_t xen_pgd_val(pgd_t); From 5dc64a3442b98eaa0e3730c35fcf00cf962a93e7 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 10 Oct 2008 11:27:38 +0100 Subject: [PATCH 134/138] xen: do not reserve 2 pages of padding between hypervisor and fixmap. When reserving space for the hypervisor the Xen paravirt backend adds an extra two pages (this was carried forward from the 2.6.18-xen tree which had them "for safety"). Depending on various CONFIG options this can cause the boot time fixmaps to span multiple PMDs which is not supported and triggers a WARN in early_ioremap_init(). This was exposed by 2216d199b1430d1c0affb1498a9ebdbd9c0de439 which moved the dmi table parsing earlier. x86: fix CONFIG_X86_RESERVE_LOW_64K=y The bad_bios_dmi_table() quirk never triggered because we do DMI setup too late. Move it a bit earlier. There is no real reason to reserve these two extra pages and the fixmap already incorporates FIX_HOLE which serves the same purpose. None of the other callers of reserve_top_address do this. Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 85692c9f6496..977a54255fb4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1370,7 +1370,7 @@ static void __init xen_reserve_top(void) if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) top = pp.virt_start; - reserve_top_address(-top + 2 * PAGE_SIZE); + reserve_top_address(-top); #endif /* CONFIG_X86_32 */ } From 325af5fb1418c79953db0954556de048e061d8b6 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 8 Aug 2008 15:58:39 -0700 Subject: [PATCH 135/138] x86: ioperm user_regset This adds a user_regset type for the x86 io permissions bitmap. This makes it appear in core dumps (when ioperm has been used). It will also make it visible to debuggers in the future. Signed-off-by: Roland McGrath Signed-off-by: H. Peter Anvin [conflict resolutions: Signed-off-by: Ingo Molnar ] --- arch/x86/kernel/ptrace.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/elf.h | 1 + 2 files changed, 38 insertions(+) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e375b658efc3..4e1ef66c2ea4 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -40,7 +40,9 @@ enum x86_regset { REGSET_GENERAL, REGSET_FP, REGSET_XFP, + REGSET_IOPERM64 = REGSET_XFP, REGSET_TLS, + REGSET_IOPERM32, }; /* @@ -555,6 +557,29 @@ static int ptrace_set_debugreg(struct task_struct *child, return 0; } +/* + * These access the current or another (stopped) task's io permission + * bitmap for debugging or core dump. + */ +static int ioperm_active(struct task_struct *target, + const struct user_regset *regset) +{ + return target->thread.io_bitmap_max / regset->size; +} + +static int ioperm_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (!target->thread.io_bitmap_ptr) + return -ENXIO; + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + target->thread.io_bitmap_ptr, + 0, IO_BITMAP_BYTES); +} + #ifdef CONFIG_X86_PTRACE_BTS /* * The configuration for a particular BTS hardware implementation. @@ -1385,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = { .size = sizeof(long), .align = sizeof(long), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_IOPERM64] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_LONGS, + .size = sizeof(long), .align = sizeof(long), + .active = ioperm_active, .get = ioperm_get + }, }; static const struct user_regset_view user_x86_64_view = { @@ -1431,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = { .active = regset_tls_active, .get = regset_tls_get, .set = regset_tls_set }, + [REGSET_IOPERM32] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_BYTES / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = ioperm_active, .get = ioperm_get + }, }; static const struct user_regset_view user_x86_32_view = { diff --git a/include/linux/elf.h b/include/linux/elf.h index edc3dac3f02f..0b61ca41a044 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -360,6 +360,7 @@ typedef struct elf64_shdr { #define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ +#define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ /* Note header in a PT_NOTE section */ From 46eaa6702016e3ac9a188172a2c309d6ca1be1cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Oct 2008 15:06:29 +0200 Subject: [PATCH 136/138] x86: memory corruption check - cleanup Move the prototypes from the generic kernel.h header to the more appropriate include/asm-x86/bios_ebda.h header file. Also, remove the check from the power management code - this is a pure x86 matter for now. Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 1 + arch/x86/mm/init_64.c | 1 + drivers/base/power/main.c | 1 - include/asm-x86/bios_ebda.h | 17 +++++++++++++++++ include/linux/kernel.h | 17 ----------------- 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7e05462ffb11..bbe044dbe014 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d84d3e91d348..3e10054c5731 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index bf6d3554e506..273a944d4040 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -254,7 +254,6 @@ static char *pm_verb(int event) static void pm_dev_dbg(struct device *dev, pm_message_t state, char *info) { - check_for_bios_corruption(); dev_dbg(dev, "%s%s%s\n", info, pm_verb(state.event), ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? ", may wakeup" : ""); diff --git a/include/asm-x86/bios_ebda.h b/include/asm-x86/bios_ebda.h index ec42ed874591..79b4b88505d7 100644 --- a/include/asm-x86/bios_ebda.h +++ b/include/asm-x86/bios_ebda.h @@ -16,4 +16,21 @@ static inline unsigned int get_bios_ebda(void) void reserve_ebda_region(void); +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +/* + * This is obviously not a great place for this, but we want to be + * able to scatter it around anywhere in the kernel. + */ +void check_for_bios_corruption(void); +void start_periodic_check_for_corruption(void); +#else +static inline void check_for_bios_corruption(void) +{ +} + +static inline void start_periodic_check_for_corruption(void) +{ +} +#endif + #endif /* ASM_X86__BIOS_EBDA_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 50873b211788..2651f805ba6d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -240,23 +240,6 @@ extern const char *print_tainted(void); extern void add_taint(unsigned); extern int root_mountflags; -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -/* - * This is obviously not a great place for this, but we want to be - * able to scatter it around anywhere in the kernel. - */ -void check_for_bios_corruption(void); -void start_periodic_check_for_corruption(void); -#else -static inline void check_for_bios_corruption(void) -{ -} - -static inline void start_periodic_check_for_corruption(void) -{ -} -#endif - /* Values used for system_state */ extern enum system_states { SYSTEM_BOOTING, From 9f482807a6bd7e2aa1ed0d8cfc48463ec4ca3568 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Aug 2008 12:59:32 +0200 Subject: [PATCH 137/138] x86, fpu: check __clear_user() return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix warning: arch/x86/kernel/xsave.c: In function ‘save_i387_xstate’: arch/x86/kernel/xsave.c:98: warning: ignoring return value of ‘__clear_user’, declared with attribute warn_unused_result check the return value and act on it. We should not be ignoring faults at this point. Signed-off-by: Ingo Molnar --- arch/x86/kernel/xsave.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 2f98323716d9..9abac8a9d823 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -95,7 +95,9 @@ int save_i387_xstate(void __user *buf) * Start with clearing the user buffer. This will present a * clean context for the bytes not touched by the fxsave/xsave. */ - __clear_user(buf, sig_xstate_size); + err = __clear_user(buf, sig_xstate_size); + if (err) + return err; if (task_thread_info(tsk)->status & TS_XSAVE) err = xsave_user(buf); From 45e96f26f257bd873017c6244a6cafd27f6f5439 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Aug 2008 10:37:14 +0200 Subject: [PATCH 138/138] warnings: fix arch/x86/kernel/early_printk.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix warning: arch/x86/kernel/early_printk.c:993: warning: ‘enable_debug_console’ defined but not used Eliminate dead code. Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 02fc5b40c14b..34ad997d3834 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -989,22 +989,4 @@ static int __init setup_early_printk(char *buf) return 0; } -static void __init enable_debug_console(char *buf) -{ -#ifdef DBGP_DEBUG - struct console *old_early_console = NULL; - - if (early_console_initialized && early_console) { - old_early_console = early_console; - unregister_console(early_console); - early_console_initialized = 0; - } - - setup_early_printk(buf); - - if (early_console == old_early_console && old_early_console) - register_console(old_early_console); -#endif -} - early_param("earlyprintk", setup_early_printk);