/*
 * Fast batching percpu counters.
 */

#include <linux/percpu_counter.h>
#include <linux/notifier.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/debugobjects.h>

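/*
 * Typical usage, as an illustrative sketch only (the variable names below are
 * made up; the percpu_counter_init(), percpu_counter_add(),
 * percpu_counter_read() and percpu_counter_sum() wrappers come from
 * <linux/percpu_counter.h>):
 *
 *	struct percpu_counter nr_things;
 *
 *	percpu_counter_init(&nr_things, 0, GFP_KERNEL);
 *	percpu_counter_add(&nr_things, 1);		// batched per-cpu fast path
 *	approx = percpu_counter_read(&nr_things);	// fast, approximate
 *	exact = percpu_counter_sum(&nr_things);		// slow, precise
 *	percpu_counter_destroy(&nr_things);
 */
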
#ifdef CONFIG_HOTPLUG_CPU
static LIST_HEAD(percpu_counters);
static DEFINE_SPINLOCK(percpu_counters_lock);
#endif

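/*
 * With CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER, debugobjects tracks each
 * counter's lifetime; the fixup_free hook below tears down a counter that is
 * freed while still active instead of leaking its per-cpu storage.
 */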
#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER

static struct debug_obj_descr percpu_counter_debug_descr;

static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
	struct percpu_counter *fbc = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		percpu_counter_destroy(fbc);
		debug_object_free(fbc, &percpu_counter_debug_descr);
		return 1;
	default:
		return 0;
	}
}

static struct debug_obj_descr percpu_counter_debug_descr = {
	.name		= "percpu_counter",
	.fixup_free	= percpu_counter_fixup_free,
};

static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{
	debug_object_init(fbc, &percpu_counter_debug_descr);
	debug_object_activate(fbc, &percpu_counter_debug_descr);
}

static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{
	debug_object_deactivate(fbc, &percpu_counter_debug_descr);
	debug_object_free(fbc, &percpu_counter_debug_descr);
}

#else	/* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{ }
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{ }
#endif	/* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */

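/*
 * Set the counter to @amount, zeroing every CPU's local delta.  The update is
 * done under fbc->lock, but adds running concurrently on other CPUs use the
 * lockless per-cpu fast path, so the resulting value is only reliable if the
 * caller excludes concurrent updaters.
 */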
void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
	int cpu;
	unsigned long flags;

	raw_spin_lock_irqsave(&fbc->lock, flags);
	for_each_possible_cpu(cpu) {
		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
		*pcount = 0;
	}
	fbc->count = amount;
	raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
EXPORT_SYMBOL(percpu_counter_set);

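/*
 * Add @amount to the counter.  The delta is accumulated in this CPU's local
 * counter and only folded into fbc->count, under fbc->lock, once it would
 * reach +/-@batch; the common case therefore stays lock-free and avoids
 * bouncing the shared cacheline.  Most callers use the percpu_counter_add()
 * wrapper, which passes the global percpu_counter_batch.
 */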
void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
{
	s64 count;

	preempt_disable();
	count = __this_cpu_read(*fbc->counters) + amount;
	if (count >= batch || count <= -batch) {
		unsigned long flags;

		raw_spin_lock_irqsave(&fbc->lock, flags);
		fbc->count += count;
		__this_cpu_sub(*fbc->counters, count - amount);
		raw_spin_unlock_irqrestore(&fbc->lock, flags);
	} else {
		this_cpu_add(*fbc->counters, amount);
	}
	preempt_enable();
}
EXPORT_SYMBOL(__percpu_counter_add);

/*
 * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive().
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
	s64 ret;
	int cpu;
	unsigned long flags;

	raw_spin_lock_irqsave(&fbc->lock, flags);
	ret = fbc->count;
	for_each_online_cpu(cpu) {
		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
		ret += *pcount;
	}
	raw_spin_unlock_irqrestore(&fbc->lock, flags);
	return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);

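/*
 * Initialize the counter to @amount.  The per-cpu storage is allocated with
 * the given @gfp mask, so GFP_KERNEL callers may sleep here.  On
 * CONFIG_HOTPLUG_CPU kernels the counter is also added to the global list so
 * the hotplug notifier can fold in the deltas of CPUs that go offline.
 * Callers normally go through the percpu_counter_init() wrapper, which
 * supplies the lockdep class key.
 */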
int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
			  struct lock_class_key *key)
{
	unsigned long flags __maybe_unused;

	raw_spin_lock_init(&fbc->lock);
	lockdep_set_class(&fbc->lock, key);
	fbc->count = amount;
	fbc->counters = alloc_percpu_gfp(s32, gfp);
	if (!fbc->counters)
		return -ENOMEM;

	debug_percpu_counter_activate(fbc);

#ifdef CONFIG_HOTPLUG_CPU
	INIT_LIST_HEAD(&fbc->list);
	spin_lock_irqsave(&percpu_counters_lock, flags);
	list_add(&fbc->list, &percpu_counters);
	spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
	return 0;
}
EXPORT_SYMBOL(__percpu_counter_init);

void percpu_counter_destroy(struct percpu_counter *fbc)
{
	unsigned long flags __maybe_unused;

	if (!fbc->counters)
		return;

	debug_percpu_counter_deactivate(fbc);

#ifdef CONFIG_HOTPLUG_CPU
	spin_lock_irqsave(&percpu_counters_lock, flags);
	list_del(&fbc->list);
	spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
	free_percpu(fbc->counters);
	fbc->counters = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy);

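/*
 * Each CPU folds its local delta into fbc->count before it reaches +/-batch,
 * so the approximate value returned by percpu_counter_read() stays within
 * roughly batch * num_online_cpus() of the precise sum.  The batch is scaled
 * with the number of online CPUs, with a floor of 32.
 */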
int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);

static void compute_batch_value(void)
{
	int nr = num_online_cpus();

	percpu_counter_batch = max(32, nr*2);
}

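/*
 * CPU hotplug callback: recompute the batch for the new number of online
 * CPUs and, when a CPU has gone away (CPU_DEAD, and also CPU_DEAD_FROZEN so
 * that counters stay accurate across suspend/resume), fold that CPU's local
 * deltas into fbc->count for every registered counter.
 */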
static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
					unsigned long action, void *hcpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	unsigned int cpu;
	struct percpu_counter *fbc;

	compute_batch_value();
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	cpu = (unsigned long)hcpu;
	spin_lock_irq(&percpu_counters_lock);
	list_for_each_entry(fbc, &percpu_counters, list) {
		s32 *pcount;
		unsigned long flags;

		raw_spin_lock_irqsave(&fbc->lock, flags);
		pcount = per_cpu_ptr(fbc->counters, cpu);
		fbc->count += *pcount;
		*pcount = 0;
		raw_spin_unlock_irqrestore(&fbc->lock, flags);
	}
	spin_unlock_irq(&percpu_counters_lock);
#endif
	return NOTIFY_OK;
}

/*
 * Compare counter against given value.
 * Return 1 if greater, 0 if equal and -1 if less.
 */
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
	s64 count;

	count = percpu_counter_read(fbc);
	/* Check to see if rough count will be sufficient for comparison */
	if (abs(count - rhs) > (batch * num_online_cpus())) {
		if (count > rhs)
			return 1;
		else
			return -1;
	}
	/* Need to use precise count */
	count = percpu_counter_sum(fbc);
	if (count > rhs)
		return 1;
	else if (count < rhs)
		return -1;
	else
		return 0;
}
EXPORT_SYMBOL(__percpu_counter_compare);

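/*
 * Boot-time setup: size the batch for the boot-time CPU count and register
 * the hotplug notifier that keeps it, and the counters, up to date.
 */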
static int __init percpu_counter_startup(void)
{
	compute_batch_value();
	hotcpu_notifier(percpu_counter_hotcpu_callback, 0);
	return 0;
}
module_init(percpu_counter_startup);