From ded797547548a5b8e7b92383a41e4c0e6b0ecb7f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 24 Sep 2013 00:50:25 +0200
Subject: [PATCH 1/8] irq: Force hardirq exit's softirq processing on its own
 stack

The commit facd8b80c67a3cf64a467c4a2ac5fb31f2e6745b ("irq: Sanitize
invoke_softirq") converted irq exit calls of do_softirq() to
__do_softirq() on all architectures, assuming it was only used there
for its irq disablement properties.

But as a side effect, the softirqs processed at the end of the hardirq
are always called on the inline current stack that is used by
irq_exit(), instead of the softirq stack provided by the archs that
override do_softirq().

The result is mostly safe if the architecture runs irq_exit() on a
separate irq stack, because then softirqs are processed on that same
stack, which is near empty at this stage (assuming hardirqs aren't
nesting).

Otherwise irq_exit() runs on the task stack, and so does the softirq.
The interrupted call stack can be randomly deep already and the
softirq can dig through it even further. To add insult to injury, this
softirq can be interrupted by a new hardirq, maximizing the chances
for a stack overrun, as reported on powerpc for example:

do_IRQ: stack overflow: 1920
CPU: 0 PID: 1602 Comm: qemu-system-ppc Not tainted 3.10.4-300.1.fc19.ppc64p7 #1
Call Trace:
[c0000000050a8740] .show_stack+0x130/0x200 (unreliable)
[c0000000050a8810] .dump_stack+0x28/0x3c
[c0000000050a8880] .do_IRQ+0x2b8/0x2c0
[c0000000050a8930] hardware_interrupt_common+0x154/0x180
--- Exception: 501 at .cp_start_xmit+0x3a4/0x820 [8139cp]
    LR = .cp_start_xmit+0x390/0x820 [8139cp]
[c0000000050a8d40] .dev_hard_start_xmit+0x394/0x640
[c0000000050a8e00] .sch_direct_xmit+0x110/0x260
[c0000000050a8ea0] .dev_queue_xmit+0x260/0x630
[c0000000050a8f40] .br_dev_queue_push_xmit+0xc4/0x130 [bridge]
[c0000000050a8fc0] .br_dev_xmit+0x198/0x270 [bridge]
[c0000000050a9070] .dev_hard_start_xmit+0x394/0x640
[c0000000050a9130] .dev_queue_xmit+0x428/0x630
[c0000000050a91d0] .ip_finish_output+0x2a4/0x550
[c0000000050a9290] .ip_local_out+0x50/0x70
[c0000000050a9310] .ip_queue_xmit+0x148/0x420
[c0000000050a93b0] .tcp_transmit_skb+0x4e4/0xaf0
[c0000000050a94a0] .__tcp_ack_snd_check+0x7c/0xf0
[c0000000050a9520] .tcp_rcv_established+0x1e8/0x930
[c0000000050a95f0] .tcp_v4_do_rcv+0x21c/0x570
[c0000000050a96c0] .tcp_v4_rcv+0x734/0x930
[c0000000050a97a0] .ip_local_deliver_finish+0x184/0x360
[c0000000050a9840] .ip_rcv_finish+0x148/0x400
[c0000000050a98d0] .__netif_receive_skb_core+0x4f8/0xb00
[c0000000050a99d0] .netif_receive_skb+0x44/0x110
[c0000000050a9a70] .br_handle_frame_finish+0x2bc/0x3f0 [bridge]
[c0000000050a9b20] .br_nf_pre_routing_finish+0x2ac/0x420 [bridge]
[c0000000050a9bd0] .br_nf_pre_routing+0x4dc/0x7d0 [bridge]
[c0000000050a9c70] .nf_iterate+0x114/0x130
[c0000000050a9d30] .nf_hook_slow+0xb4/0x1e0
[c0000000050a9e00] .br_handle_frame+0x290/0x330 [bridge]
[c0000000050a9ea0] .__netif_receive_skb_core+0x34c/0xb00
[c0000000050a9fa0] .netif_receive_skb+0x44/0x110
[c0000000050aa040] .napi_gro_receive+0xe8/0x120
[c0000000050aa0c0] .cp_rx_poll+0x31c/0x590 [8139cp]
[c0000000050aa1d0] .net_rx_action+0x1dc/0x310
[c0000000050aa2b0] .__do_softirq+0x158/0x330
[c0000000050aa3b0] .irq_exit+0xc8/0x110
[c0000000050aa430] .do_IRQ+0xdc/0x2c0
[c0000000050aa4e0] hardware_interrupt_common+0x154/0x180
--- Exception: 501 at .bad_range+0x1c/0x110
    LR = .get_page_from_freelist+0x908/0xbb0
[c0000000050aa7d0] .list_del+0x18/0x50 (unreliable)
[c0000000050aa850] .get_page_from_freelist+0x908/0xbb0
[c0000000050aa9e0] .__alloc_pages_nodemask+0x21c/0xae0
[c0000000050aaba0] .alloc_pages_vma+0xd0/0x210
[c0000000050aac60] .handle_pte_fault+0x814/0xb70
[c0000000050aad50] .__get_user_pages+0x1a4/0x640
[c0000000050aae60] .get_user_pages_fast+0xec/0x160
[c0000000050aaf10] .__gfn_to_pfn_memslot+0x3b0/0x430 [kvm]
[c0000000050aafd0] .kvmppc_gfn_to_pfn+0x64/0x130 [kvm]
[c0000000050ab070] .kvmppc_mmu_map_page+0x94/0x530 [kvm]
[c0000000050ab190] .kvmppc_handle_pagefault+0x174/0x610 [kvm]
[c0000000050ab270] .kvmppc_handle_exit_pr+0x464/0x9b0 [kvm]
[c0000000050ab320] kvm_start_lightweight+0x1ec/0x1fc [kvm]
[c0000000050ab4f0] .kvmppc_vcpu_run_pr+0x168/0x3b0 [kvm]
[c0000000050ab9c0] .kvmppc_vcpu_run+0xc8/0xf0 [kvm]
[c0000000050aba50] .kvm_arch_vcpu_ioctl_run+0x5c/0x1a0 [kvm]
[c0000000050abae0] .kvm_vcpu_ioctl+0x478/0x730 [kvm]
[c0000000050abc90] .do_vfs_ioctl+0x4ec/0x7c0
[c0000000050abd80] .SyS_ioctl+0xd4/0xf0
[c0000000050abe30] syscall_exit+0x0/0x98

Since this is a regression, this patch proposes a minimalistic and
low-risk solution by blindly forcing the hardirq exit processing of
softirqs on the softirq stack. This way we should significantly reduce
the opportunities for task stack overflows dug by softirqs.

Longer term solutions may involve extending the hardirq stack coverage
to irq_exit(), etc...

Reported-by: Benjamin Herrenschmidt
Acked-by: Linus Torvalds
Signed-off-by: Frederic Weisbecker
Cc: #3.9..
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Paul Mackerras
Cc: James Hogan
Cc: James E.J. Bottomley
Cc: Helge Deller
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Cc: David S. Miller
Cc: Andrew Morton
---
 kernel/softirq.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 53cc09ceb0b8..d7d498d8cc4f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -328,10 +328,19 @@ void irq_enter(void)
 
 static inline void invoke_softirq(void)
 {
-	if (!force_irqthreads)
-		__do_softirq();
-	else
+	if (!force_irqthreads) {
+		/*
+		 * We can safely execute softirq on the current stack if
+		 * it is the irq stack, because it should be near empty
+		 * at this stage. But we have no way to know if the arch
+		 * calls irq_exit() on the irq stack. So call softirq
+		 * in its own stack to prevent from any overrun on top
+		 * of a potentially deep task stack.
+		 */
+		do_softirq();
+	} else {
 		wakeup_softirqd();
+	}
 }
 
 static inline void tick_irq_exit(void)
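
For context, here is the hardirq exit path this patch affects, condensed
from kernel/softirq.c of that era (tracing, tick and RCU details elided;
a reading aid, not part of the patch):

	void irq_exit(void)
	{
		local_irq_disable();    /* on archs that don't already guarantee it */
		account_irq_exit_time(current);
		sub_preempt_count(HARDIRQ_OFFSET);      /* leave hardirq context */
		if (!in_interrupt() && local_softirq_pending())
			invoke_softirq();       /* with this patch: do_softirq(),
						 * which switches to the softirq
						 * stack */
		tick_irq_exit();
		rcu_irq_exit();
	}

Note that the pending check runs with interrupts disabled; patch 3
relies on this detail to skip do_softirq()'s duplicate checks.
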
From 7d65f4a6553203da6a22097821d151fbbe7e4956 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 5 Sep 2013 15:49:45 +0200
Subject: [PATCH 2/8] irq: Consolidate do_softirq() arch-overridden
 implementations

All arch-overridden implementations of do_softirq() share the following
common code: disable irqs (to avoid races with the pending check),
check if there are softirqs pending, then execute __do_softirq() on
a specific stack.

Consolidate the common parts such that archs only worry about the
stack switch.

Acked-by: Linus Torvalds
Signed-off-by: Frederic Weisbecker
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Paul Mackerras
Cc: James Hogan
Cc: James E.J. Bottomley
Cc: Helge Deller
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Cc: David S. Miller
Cc: Andrew Morton
---
 arch/metag/kernel/irq.c    | 48 ++++++++++++----------------
 arch/parisc/kernel/irq.c   | 17 ++----------
 arch/powerpc/kernel/irq.c  | 17 +----------
 arch/s390/kernel/irq.c     | 50 ++++++++++++++---------------------
 arch/sh/kernel/irq.c       | 53 ++++++++++++++------------------------
 arch/sparc/kernel/irq_64.c | 31 +++++++---------------
 arch/x86/kernel/entry_64.S |  4 +--
 arch/x86/kernel/irq_32.c   | 30 ++++++---------------
 arch/x86/kernel/irq_64.c   | 21 ---------------
 include/linux/interrupt.h  | 11 ++++++++
 kernel/softirq.c           |  8 +++---
 11 files changed, 93 insertions(+), 197 deletions(-)

diff --git a/arch/metag/kernel/irq.c b/arch/metag/kernel/irq.c
index 2a2c9d55187e..3b4b7f6c0950 100644
--- a/arch/metag/kernel/irq.c
+++ b/arch/metag/kernel/irq.c
@@ -159,44 +159,30 @@ void irq_ctx_exit(int cpu)
 
 extern asmlinkage void __do_softirq(void);
 
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-	unsigned long flags;
 	struct thread_info *curctx;
 	union irq_ctx *irqctx;
 	u32 *isp;
 
-	if (in_interrupt())
-		return;
+	curctx = current_thread_info();
+	irqctx = softirq_ctx[smp_processor_id()];
+	irqctx->tinfo.task = curctx->task;
 
-	local_irq_save(flags);
+	/* build the stack frame on the softirq stack */
+	isp = (u32 *) ((char *)irqctx + sizeof(struct thread_info));
 
-	if (local_softirq_pending()) {
-		curctx = current_thread_info();
-		irqctx = softirq_ctx[smp_processor_id()];
-		irqctx->tinfo.task = curctx->task;
-
-		/* build the stack frame on the softirq stack */
-		isp = (u32 *) ((char *)irqctx + sizeof(struct thread_info));
-
-		asm volatile (
-			"MOV   D0.5,%0\n"
-			"SWAP  A0StP,D0.5\n"
-			"CALLR D1RtP,___do_softirq\n"
-			"MOV   A0StP,D0.5\n"
-			:
-			: "r" (isp)
-			: "memory", "cc", "D1Ar1", "D0Ar2", "D1Ar3", "D0Ar4",
-			  "D1Ar5", "D0Ar6", "D0Re0", "D1Re0", "D0.4", "D1RtP",
-			  "D0.5"
-			);
-		/*
-		 * Shouldn't happen, we returned above if in_interrupt():
-		 */
-		WARN_ON_ONCE(softirq_count());
-	}
-
-	local_irq_restore(flags);
+	asm volatile (
+		"MOV   D0.5,%0\n"
+		"SWAP  A0StP,D0.5\n"
+		"CALLR D1RtP,___do_softirq\n"
+		"MOV   A0StP,D0.5\n"
+		:
+		: "r" (isp)
+		: "memory", "cc", "D1Ar1", "D0Ar2", "D1Ar3", "D0Ar4",
+		  "D1Ar5", "D0Ar6", "D0Re0", "D1Re0", "D0.4", "D1RtP",
+		  "D0.5"
+		);
 }
 
 #endif

diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 2e6443b1e922..ef5927685299 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -499,22 +499,9 @@ static void execute_on_irq_stack(void *func, unsigned long param1)
 	*irq_stack_in_use = 1;
 }
 
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-	__u32 pending;
-	unsigned long flags;
-
-	if (in_interrupt())
-		return;
-
-	local_irq_save(flags);
-
-	pending = local_softirq_pending();
-
-	if (pending)
-		execute_on_irq_stack(__do_softirq, 0);
-
-	local_irq_restore(flags);
+	execute_on_irq_stack(__do_softirq, 0);
 }
 #endif /* CONFIG_IRQSTACKS */
 
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 57d286a78f86..5c4adfc6a6d0 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -593,7 +593,7 @@ void irq_ctx_init(void)
 	}
 }
 
-static inline void do_softirq_onstack(void)
+void do_softirq_own_stack(void)
 {
 	struct thread_info *curtp, *irqtp;
 
@@ -611,21 +611,6 @@ static inline void do_softirq_onstack(void)
 	set_bits(irqtp->flags, &curtp->flags);
 }
 
-void do_softirq(void)
-{
-	unsigned long flags;
-
-	if (in_interrupt())
-		return;
-
-	local_irq_save(flags);
-
-	if (local_softirq_pending())
-		do_softirq_onstack();
-
-	local_irq_restore(flags);
-}
-
 irq_hw_number_t virq_to_hw(unsigned int virq)
 {
 	struct irq_data *irq_data = irq_get_irq_data(virq);
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 8ac2097f13d4..bb27a262c44a 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -157,39 +157,29 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 /*
  * Switch to the asynchronous interrupt stack for softirq execution.
  */
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-	unsigned long flags, old, new;
+	unsigned long old, new;
 
-	if (in_interrupt())
-		return;
-
-	local_irq_save(flags);
-
-	if (local_softirq_pending()) {
-		/* Get current stack pointer. */
-		asm volatile("la %0,0(15)" : "=a" (old));
-		/* Check against async. stack address range. */
-		new = S390_lowcore.async_stack;
-		if (((new - old) >> (PAGE_SHIFT + THREAD_ORDER)) != 0) {
-			/* Need to switch to the async. stack. */
-			new -= STACK_FRAME_OVERHEAD;
-			((struct stack_frame *) new)->back_chain = old;
-
-			asm volatile("   la    15,0(%0)\n"
-				     "   basr  14,%2\n"
-				     "   la    15,0(%1)\n"
-				     : : "a" (new), "a" (old),
-					 "a" (__do_softirq)
-				     : "0", "1", "2", "3", "4", "5", "14",
-				       "cc", "memory" );
-		} else {
-			/* We are already on the async stack. */
-			__do_softirq();
-		}
+	/* Get current stack pointer. */
+	asm volatile("la %0,0(15)" : "=a" (old));
+	/* Check against async. stack address range. */
+	new = S390_lowcore.async_stack;
+	if (((new - old) >> (PAGE_SHIFT + THREAD_ORDER)) != 0) {
+		/* Need to switch to the async. stack. */
+		new -= STACK_FRAME_OVERHEAD;
+		((struct stack_frame *) new)->back_chain = old;
+		asm volatile("   la    15,0(%0)\n"
+			     "   basr  14,%2\n"
+			     "   la    15,0(%1)\n"
+			     : : "a" (new), "a" (old),
+				 "a" (__do_softirq)
+			     : "0", "1", "2", "3", "4", "5", "14",
+			       "cc", "memory" );
+	} else {
+		/* We are already on the async stack. */
+		__do_softirq();
 	}
-
-	local_irq_restore(flags);
 }
 
 /*
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 063af10ff3c1..0833736afa32 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -149,47 +149,32 @@ void irq_ctx_exit(int cpu)
 	hardirq_ctx[cpu] = NULL;
 }
 
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-	unsigned long flags;
 	struct thread_info *curctx;
 	union irq_ctx *irqctx;
 	u32 *isp;
 
-	if (in_interrupt())
-		return;
+	curctx = current_thread_info();
+	irqctx = softirq_ctx[smp_processor_id()];
+	irqctx->tinfo.task = curctx->task;
+	irqctx->tinfo.previous_sp = current_stack_pointer;
 
-	local_irq_save(flags);
+	/* build the stack frame on the softirq stack */
+	isp = (u32 *)((char *)irqctx + sizeof(*irqctx));
 
-	if (local_softirq_pending()) {
-		curctx = current_thread_info();
-		irqctx = softirq_ctx[smp_processor_id()];
-		irqctx->tinfo.task = curctx->task;
-		irqctx->tinfo.previous_sp = current_stack_pointer;
-
-		/* build the stack frame on the softirq stack */
-		isp = (u32 *)((char *)irqctx + sizeof(*irqctx));
-
-		__asm__ __volatile__ (
-			"mov	r15, r9		\n"
-			"jsr	@%0		\n"
-			/* switch to the softirq stack */
-			" mov	%1, r15		\n"
-			/* restore the thread stack */
-			"mov	r9, r15		\n"
-			: /* no outputs */
-			: "r" (__do_softirq), "r" (isp)
-			: "memory", "r0", "r1", "r2", "r3", "r4",
-			  "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
-		);
-
-		/*
-		 * Shouldn't happen, we returned above if in_interrupt():
-		 */
-		WARN_ON_ONCE(softirq_count());
-	}
-
-	local_irq_restore(flags);
+	__asm__ __volatile__ (
+		"mov	r15, r9		\n"
+		"jsr	@%0		\n"
+		/* switch to the softirq stack */
+		" mov	%1, r15		\n"
+		/* restore the thread stack */
+		"mov	r9, r15		\n"
+		: /* no outputs */
+		: "r" (__do_softirq), "r" (isp)
+		: "memory", "r0", "r1", "r2", "r3", "r4",
+		  "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
+	);
 }
 #else
 static inline void handle_one_irq(unsigned int irq)

diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index d4840cec2c55..666193f4e8bb 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -698,30 +698,19 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
-void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-	unsigned long flags;
+	void *orig_sp, *sp = softirq_stack[smp_processor_id()];
 
-	if (in_interrupt())
-		return;
+	sp += THREAD_SIZE - 192 - STACK_BIAS;
 
-	local_irq_save(flags);
-
-	if (local_softirq_pending()) {
-		void *orig_sp, *sp = softirq_stack[smp_processor_id()];
-
-		sp += THREAD_SIZE - 192 - STACK_BIAS;
-
-		__asm__ __volatile__("mov %%sp, %0\n\t"
-				     "mov %1, %%sp"
-				     : "=&r" (orig_sp)
-				     : "r" (sp));
-		__do_softirq();
-		__asm__ __volatile__("mov %0, %%sp"
-				     : : "r" (orig_sp));
-	}
-
-	local_irq_restore(flags);
+	__asm__ __volatile__("mov %%sp, %0\n\t"
+			     "mov %1, %%sp"
+			     : "=&r" (orig_sp)
+			     : "r" (sp));
+	__do_softirq();
+	__asm__ __volatile__("mov %0, %%sp"
+			     : : "r" (orig_sp));
 }
 
 #ifdef CONFIG_HOTPLUG_CPU

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b077f4cc225a..083da7c2f40d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1342,7 +1342,7 @@ bad_gs:
 	.previous
 
 /* Call softirq on interrupt stack. Interrupts are off. */
-ENTRY(call_softirq)
+ENTRY(do_softirq_own_stack)
 	CFI_STARTPROC
 	pushq_cfi %rbp
 	CFI_REL_OFFSET rbp,0
@@ -1359,7 +1359,7 @@ ENTRY(call_softirq)
 	decl PER_CPU_VAR(irq_count)
 	ret
 	CFI_ENDPROC
-END(call_softirq)
+END(do_softirq_own_stack)
 
 #ifdef CONFIG_XEN
 zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 4186755f1d7c..8a5bb01dbc0e 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -149,35 +149,21 @@ void irq_ctx_init(int cpu)
 	       cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
 }
 
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-	unsigned long flags;
 	struct thread_info *curctx;
 	union irq_ctx *irqctx;
 	u32 *isp;
 
-	if (in_interrupt())
-		return;
+	curctx = current_thread_info();
+	irqctx = __this_cpu_read(softirq_ctx);
+	irqctx->tinfo.task = curctx->task;
+	irqctx->tinfo.previous_esp = current_stack_pointer;
 
-	local_irq_save(flags);
+	/* build the stack frame on the softirq stack */
+	isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
 
-	if (local_softirq_pending()) {
-		curctx = current_thread_info();
-		irqctx = __this_cpu_read(softirq_ctx);
-		irqctx->tinfo.task = curctx->task;
-		irqctx->tinfo.previous_esp = current_stack_pointer;
-
-		/* build the stack frame on the softirq stack */
-		isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
-
-		call_on_stack(__do_softirq, isp);
-		/*
-		 * Shouldn't happen, we returned above if in_interrupt():
-		 */
-		WARN_ON_ONCE(softirq_count());
-	}
-
-	local_irq_restore(flags);
+	call_on_stack(__do_softirq, isp);
 }
 
 bool handle_irq(unsigned irq, struct pt_regs *regs)

diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index d04d3ecded62..4d1c746892eb 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -87,24 +87,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
 	generic_handle_irq_desc(irq, desc);
 	return true;
 }
-
-
-extern void call_softirq(void);
-
-asmlinkage void do_softirq(void)
-{
-	__u32 pending;
-	unsigned long flags;
-
-	if (in_interrupt())
-		return;
-
-	local_irq_save(flags);
-	pending = local_softirq_pending();
-	/* Switch to interrupt stack */
-	if (pending) {
-		call_softirq();
-		WARN_ON_ONCE(softirq_count());
-	}
-	local_irq_restore(flags);
-}

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5e865b554940..c9e831dc80bc 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -19,6 +19,7 @@
 #include <linux/atomic.h>
 #include <asm/ptrace.h>
+#include <asm/irq.h>
 
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
@@ -374,6 +375,16 @@ struct softirq_action
 
 asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
+
+#ifdef __ARCH_HAS_DO_SOFTIRQ
+void do_softirq_own_stack(void);
+#else
+static inline void do_softirq_own_stack(void)
+{
+	__do_softirq();
+}
+#endif
+
 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
 extern void __raise_softirq_irqoff(unsigned int nr);

diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7d498d8cc4f..26ee72725d29 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -29,7 +29,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
-#include <asm/irq.h>
 
 /*
    - No shared variables, all the data are CPU local.
   - If a softirq needs serialization, let it serialize itself
@@ -283,7 +282,7 @@ restart:
 	tsk_restore_flags(current, old_flags, PF_MEMALLOC);
 }
 
-#ifndef __ARCH_HAS_DO_SOFTIRQ
+
 
 asmlinkage void do_softirq(void)
 {
@@ -298,13 +297,12 @@ asmlinkage void do_softirq(void)
 	pending = local_softirq_pending();
 
 	if (pending)
-		__do_softirq();
+		do_softirq_own_stack();
+	WARN_ON_ONCE(softirq_count());
 
 	local_irq_restore(flags);
 }
 
-#endif
-
 /*
  * Enter an interrupt context.
 */
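
After this consolidation, an architecture that defines
__ARCH_HAS_DO_SOFTIRQ only has to supply the stack switch itself. A
hypothetical minimal backend could look as follows; note that
this_cpu_softirq_stack_top() and arch_call_on_stack() are illustrative
names, not functions from any in-tree port:

	/*
	 * Hypothetical arch backend sketch: the irq disabling and the
	 * pending check now live in the generic do_softirq(), so only
	 * the jump onto the per-CPU softirq stack remains here.
	 */
	void do_softirq_own_stack(void)
	{
		void *isp = this_cpu_softirq_stack_top();  /* hypothetical helper */

		arch_call_on_stack(__do_softirq, isp);     /* hypothetical asm trampoline */
	}
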
From be6e1016440860fc4ec098b2d0aed3d0397b5d6e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 24 Sep 2013 16:39:41 +0200
Subject: [PATCH 3/8] irq: Optimize call to softirq on hardirq exit

Before processing softirqs on hardirq exit, we already do the check
for pending softirqs while hardirqs are guaranteed to be disabled. So
we can take a shortcut and safely jump to the arch-specific
implementation directly.

Acked-by: Linus Torvalds
Signed-off-by: Frederic Weisbecker
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Paul Mackerras
Cc: James Hogan
Cc: James E.J. Bottomley
Cc: Helge Deller
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Cc: David S. Miller
Cc: Andrew Morton
---
 kernel/softirq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 26ee72725d29..17c5cd2e3dae 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -335,7 +335,7 @@ static inline void invoke_softirq(void)
 		 * in its own stack to prevent from any overrun on top
 		 * of a potentially deep task stack.
 		 */
-		do_softirq();
+		do_softirq_own_stack();
 	} else {
 		wakeup_softirqd();
 	}
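
Two call paths remain after patches 2 and 3, condensed here from the
patched kernel/softirq.c (debug checks omitted):

	/* Task context, e.g. via local_bh_enable(): must disable irqs
	 * and check for pending work itself. */
	asmlinkage void do_softirq(void)
	{
		unsigned long flags;

		if (in_interrupt())
			return;

		local_irq_save(flags);
		if (local_softirq_pending())
			do_softirq_own_stack();
		local_irq_restore(flags);
	}

	/* Hardirq exit: irqs are off and pending work is already known,
	 * so skip the duplicate checks and switch stacks directly. */
	static inline void invoke_softirq(void)
	{
		if (!force_irqthreads)
			do_softirq_own_stack();
		else
			wakeup_softirqd();
	}
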
From 5d60d3e7c08a46643e902e39d9743cf394382151 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 24 Sep 2013 04:11:35 +0200
Subject: [PATCH 4/8] irq: Improve softirq debugging a bit

do_softirq() has a debug check that verifies that it is not nesting
inside softirq processing, nor miscounting the softirq part of the
preempt count.

But making sure that softirq processing doesn't nest is actually a
more generic concern that applies to any caller of __do_softirq().

So take it one step further and generalize that debug check to all
softirq processing.

Acked-by: Linus Torvalds
Signed-off-by: Frederic Weisbecker
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Paul Mackerras
Cc: James Hogan
Cc: James E.J. Bottomley
Cc: Helge Deller
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Cc: David S. Miller
Cc: Andrew Morton
---
 kernel/softirq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 17c5cd2e3dae..9f8092b82a94 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -133,7 +133,6 @@ EXPORT_SYMBOL(local_bh_disable);
 
 static void __local_bh_enable(unsigned int cnt)
 {
-	WARN_ON_ONCE(in_irq());
 	WARN_ON_ONCE(!irqs_disabled());
 
 	if (softirq_count() == cnt)
@@ -148,6 +147,7 @@ static void __local_bh_enable(unsigned int cnt)
  */
 void _local_bh_enable(void)
 {
+	WARN_ON_ONCE(in_irq());
 	__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
 }
 
@@ -279,6 +279,7 @@ restart:
 
 	account_irq_exit_time(current);
 	__local_bh_enable(SOFTIRQ_OFFSET);
+	WARN_ON_ONCE(in_interrupt());
 	tsk_restore_flags(current, old_flags, PF_MEMALLOC);
 }
 
@@ -299,7 +300,6 @@ asmlinkage void do_softirq(void)
 
 	if (pending)
 		do_softirq_own_stack();
-	WARN_ON_ONCE(softirq_count());
 
 	local_irq_restore(flags);
 }

From 0bed698a334766ed07bacd6cb33f0228003a7f61 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 5 Sep 2013 16:14:00 +0200
Subject: [PATCH 5/8] irq: Justify the various softirq stack choices

For clarity, comment the various stack choices for softirq processing,
whether we execute it from ksoftirqd or from local_bh_enable() calls.
Its use on irq_exit() is already commented.

Acked-by: Linus Torvalds
Signed-off-by: Frederic Weisbecker
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Paul Mackerras
Cc: James Hogan
Cc: James E.J. Bottomley
Cc: Helge Deller
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Cc: David S. Miller
Cc: Andrew Morton
---
 kernel/softirq.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9f8092b82a94..2b4328ea769f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -170,8 +170,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
 	 */
 	sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
 
-	if (unlikely(!in_interrupt() && local_softirq_pending()))
+	if (unlikely(!in_interrupt() && local_softirq_pending())) {
+		/*
+		 * Run softirq if any pending. And do it in its own stack
+		 * as we may be calling this deep in a task call stack already.
+		 */
 		do_softirq();
+	}
 
 	dec_preempt_count();
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -769,6 +774,10 @@ static void run_ksoftirqd(unsigned int cpu)
 {
 	local_irq_disable();
 	if (local_softirq_pending()) {
+		/*
+		 * We can safely run softirq on inline stack, as we are not deep
+		 * in the task stack here.
+		 */
 		__do_softirq();
 		rcu_note_context_switch(cpu);
 		local_irq_enable();
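
Summarizing the stack choices that this patch documents (a condensed
reading aid, phrased as a comment in the style of the series):

	/*
	 * Where __do_softirq() runs after patches 1-5:
	 *
	 *   irq_exit() -> invoke_softirq()     softirq (or irq) stack,
	 *                                      via do_softirq_own_stack()
	 *   local_bh_enable() -> do_softirq()  softirq stack too: the task
	 *                                      stack may already be deep
	 *   ksoftirqd -> run_ksoftirqd()       inline, on ksoftirqd's own
	 *                                      near-empty task stack
	 */
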
From cc1f027454929924471bea2f362431072e3c71be Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 24 Sep 2013 17:17:47 +0200
Subject: [PATCH 6/8] irq: Optimize softirq stack selection in irq exit

If irq_exit() is called on the arch's specified irq stack, it should
be safe to run softirqs inline on that same irq stack, as it is near
empty by the time we call irq_exit().

For example, if we use the same stack for both hard and soft irqs
here, the worst-case scenario is: hardirq -> softirq -> hardirq. But
then the softirq supersedes the first hardirq as the stack user, since
irq_exit() is called on a mostly empty stack. So the stack merge in
this case looks acceptable.

A stack overrun still has a chance to happen if hardirqs have more
opportunities to nest, but then it's another problem to solve.

So let's make the irq exit's softirq stack choice depend on a new
Kconfig symbol that can be defined when irq_exit() runs on the irq
stack. That way we can spare the stack switch on irq processing and
the cache issues that come with it.

Acked-by: Linus Torvalds
Signed-off-by: Frederic Weisbecker
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Paul Mackerras
Cc: James Hogan
Cc: James E.J. Bottomley
Cc: Helge Deller
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Cc: David S. Miller
Cc: Andrew Morton
---
 arch/Kconfig     | 10 ++++++++++
 kernel/softirq.c | 14 ++++++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index af2cc6eabcc7..ad95133f8fae 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -390,6 +390,16 @@ config HAVE_UNDERSCORE_SYMBOL_PREFIX
 	  Some architectures generate an _ in front of C symbols; things like
 	  module loading and assembly files need to know about this.
 
+config HAVE_IRQ_EXIT_ON_IRQ_STACK
+	bool
+	help
+	  Architecture doesn't only execute the irq handler on the irq stack
+	  but also irq_exit(). This way we can process softirqs on this irq
+	  stack instead of switching to a new one when we call __do_softirq()
+	  in the end of an hardirq.
+	  This spares a stack switch and improves cache usage on softirq
+	  processing.
+
 #
 # ABI hall of shame
 #

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2b4328ea769f..dacd0ab51df4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -332,15 +332,21 @@ void irq_enter(void)
 
 static inline void invoke_softirq(void)
 {
 	if (!force_irqthreads) {
+#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
 		/*
 		 * We can safely execute softirq on the current stack if
 		 * it is the irq stack, because it should be near empty
-		 * at this stage. But we have no way to know if the arch
-		 * calls irq_exit() on the irq stack. So call softirq
-		 * in its own stack to prevent from any overrun on top
-		 * of a potentially deep task stack.
+		 * at this stage.
+		 */
+		__do_softirq();
+#else
+		/*
+		 * Otherwise, irq_exit() is called on the task stack that can
+		 * be potentially deep already. So call softirq in its own stack
+		 * to prevent from any overrun.
 		 */
 		do_softirq_own_stack();
+#endif
 	} else {
 		wakeup_softirqd();
 	}
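
For a port that wants the optimization, opting in is a one-line Kconfig
select. A hypothetical fragment (MYARCH is illustrative; the real
selections for x86-64 and powerpc follow in patches 7 and 8):

	# Hypothetical arch/myarch/Kconfig fragment: only select this when
	# irq_exit() really runs on the irq stack on this architecture.
	config MYARCH
		bool
		select HAVE_IRQ_EXIT_ON_IRQ_STACK

With the symbol set, invoke_softirq() compiles down to the inline
__do_softirq() branch and the stack switch disappears.
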
From a2cd11f7d58a3a5390ee4cbcd8cff634a4d59173 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 24 Sep 2013 17:18:36 +0200
Subject: [PATCH 7/8] x86: Tell about irq stack coverage

x86-64 runs irq_exit() on the irq stack, so it can afford to run
softirqs on hardirq exit without switching stacks; the hardirq stack
is good enough for that.

x86-64 already runs softirqs on the hardirq stack anyway, so what we
mostly skip here is some needless per-cpu refcounting updates.

x86-32 is not concerned because it only runs the irq handler itself on
the irq stack.

Acked-by: Linus Torvalds
Signed-off-by: Frederic Weisbecker
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Paul Mackerras
Cc: James Hogan
Cc: James E.J. Bottomley
Cc: Helge Deller
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Cc: David S. Miller
Cc: Andrew Morton
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ee2fb9d37745..99eb1cc30dac 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -123,6 +123,7 @@ config X86
 	select COMPAT_OLD_SIGACTION if IA32_EMULATION
 	select RTC_LIB
 	select HAVE_DEBUG_STACKOVERFLOW
+	select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
 
 config INSTRUCTION_DECODER
 	def_bool y

From 62d26c8200a8382e1c67419ca3aff78d37898cc5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 24 Sep 2013 17:18:36 +0200
Subject: [PATCH 8/8] powerpc: Tell about irq stack coverage

Now that powerpc runs irq_exit() on the irq stack, let the softirq
core know about it, so that we spare the needless stack switch on irq
exit's softirq processing.

Acked-by: Benjamin Herrenschmidt
Acked-by: Linus Torvalds
Signed-off-by: Frederic Weisbecker
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Paul Mackerras
Cc: James Hogan
Cc: James E.J. Bottomley
Cc: Helge Deller
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Cc: David S. Miller
Cc: Andrew Morton
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 38f3b7e47ec5..b365d5cbb722 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,6 +138,7 @@ config PPC
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION if PPC32
 	select HAVE_DEBUG_STACKOVERFLOW
+	select HAVE_IRQ_EXIT_ON_IRQ_STACK
 
 config EARLY_PRINTK
 	bool
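
The net effect of the series on hardirq exit, per configuration (a
summary sketch; only x86-64 and powerpc select the new symbol in this
series):

	/*
	 * invoke_softirq() resolution after the series:
	 *
	 *   HAVE_IRQ_EXIT_ON_IRQ_STACK (x86-64, powerpc):
	 *       __do_softirq() inline, already on the irq stack
	 *   other archs:
	 *       do_softirq_own_stack(), switching to the softirq stack
	 *       (or the generic inline fallback without
	 *       __ARCH_HAS_DO_SOFTIRQ)
	 *   force_irqthreads:
	 *       wakeup_softirqd(), deferred to ksoftirqd
	 */
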