2008-03-08 05:55:58 +03:00
/*
 * Copyright (c) 2008 Intel Corporation
 * Author: Matthew Wilcox <willy@linux.intel.com>
 *
 * Distributed under the terms of the GNU GPL, version 2
 *
 * This file implements counting semaphores.
 * A counting semaphore may be acquired 'n' times before sleeping.
 * See mutex.c for single-acquisition sleeping locks which enforce
 * rules which allow code to be debugged more easily.
 */
/*
 * Some notes on the implementation:
 *
 * The spinlock controls access to the other members of the semaphore.
 * down_trylock() and up() can be called from interrupt context, so we
 * have to disable interrupts when taking the lock.  It turns out various
 * parts of the kernel expect to be able to use down() on a semaphore in
 * interrupt context when they know it will succeed, so we have to use
 * irqsave variants for down(), down_interruptible() and down_killable()
 * too.
 *
 * The ->count variable represents how many more tasks can acquire this
 * semaphore.  If it's zero, there may be tasks waiting on the wait_list.
 */
# include <linux/compiler.h>
# include <linux/kernel.h>
# include <linux/module.h>
# include <linux/sched.h>
# include <linux/semaphore.h>
# include <linux/spinlock.h>
2008-05-12 23:21:15 +04:00
# include <linux/ftrace.h>
2008-03-08 05:55:58 +03:00
/*
 * Out-of-line helpers for the contended paths.  They are defined further
 * down in this file and are called with sem->lock already held.
 */
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long jiffies);
static noinline void __up(struct semaphore *sem);
2008-04-11 23:23:52 +04:00
/**
* down - acquire the semaphore
* @ sem : the semaphore to be acquired
*
* Acquires the semaphore . If no more tasks are allowed to acquire the
* semaphore , calling this function will put the task to sleep until the
* semaphore is released .
*
* Use of this function is deprecated , please use down_interruptible ( ) or
* down_killable ( ) instead .
*/
2008-03-08 05:55:58 +03:00
void down ( struct semaphore * sem )
{
unsigned long flags ;
spin_lock_irqsave ( & sem - > lock , flags ) ;
2008-05-11 07:43:22 +04:00
if ( likely ( sem - > count > 0 ) )
sem - > count - - ;
else
2008-03-08 05:55:58 +03:00
__down ( sem ) ;
spin_unlock_irqrestore ( & sem - > lock , flags ) ;
}
EXPORT_SYMBOL ( down ) ;
2008-04-11 23:23:52 +04:00
/**
* down_interruptible - acquire the semaphore unless interrupted
* @ sem : the semaphore to be acquired
*
* Attempts to acquire the semaphore . If no more tasks are allowed to
* acquire the semaphore , calling this function will put the task to sleep .
* If the sleep is interrupted by a signal , this function will return - EINTR .
* If the semaphore is successfully acquired , this function returns 0.
*/
2008-03-08 05:55:58 +03:00
int down_interruptible ( struct semaphore * sem )
{
unsigned long flags ;
int result = 0 ;
spin_lock_irqsave ( & sem - > lock , flags ) ;
2008-05-11 07:43:22 +04:00
if ( likely ( sem - > count > 0 ) )
semaphore: fix
Yanmin Zhang reported:
| Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th
| regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton,
| and Itanium Montecito. Bisect located the patch below:
|
| 64ac24e738823161693bf791f87adc802cf529ff is first bad commit
| commit 64ac24e738823161693bf791f87adc802cf529ff
| Author: Matthew Wilcox <matthew@wil.cx>
| Date: Fri Mar 7 21:55:58 2008 -0500
|
| Generic semaphore implementation
|
| After I manually reverted the patch against 2.6.26-rc1 while fixing
| lots of conflicts/errors, aim7 regression became less than 2%.
i reproduced the AIM7 workload and can confirm Yanmin's findings that
-.26-rc1 regresses over .25 - by over 67% here.
Looking at the workload i found and fixed what i believe to be the real
bug causing the AIM7 regression: it was inefficient wakeup / scheduling
/ locking behavior of the new generic semaphore code, causing suboptimal
performance.
The problem comes from the following code. The new semaphore code does
this on down():
spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
spin_unlock_irqrestore(&sem->lock, flags);
and this on up():
spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
spin_unlock_irqrestore(&sem->lock, flags);
where __up() does:
list_del(&waiter->list);
waiter->up = 1;
wake_up_process(waiter->task);
and where __down() does this in essence:
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
waiter.up = 0;
for (;;) {
[...]
spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
the fastpath looks good and obvious, but note the following property of
the contended path: if there's a task on the ->wait_list, the up() of
the current owner will "pass over" ownership to that waiting task, in a
wake-one manner, via the waiter->up flag and by removing the waiter from
the wait list.
That is all and fine in principle, but as implemented in
kernel/semaphore.c it also creates a nasty, hidden source of contention!
The contention comes from the following property of the new semaphore
code: the new owner owns the semaphore exclusively, even if it is not
running yet.
So if the old owner, even if just a few instructions later, does a
down() [lock_kernel()] again, it will be blocked and will have to wait
on the new owner to eventually be scheduled (possibly on another CPU)!
Or if another task gets to lock_kernel() sooner than the "new owner"
scheduled, it will be blocked unnecessarily and for a very long time
when there are 2000 tasks running.
I.e. the implementation of the new semaphores code does wake-one and
lock ownership in a very restrictive way - it does not allow
opportunistic re-locking of the lock at all and keeps the scheduler from
picking task order intelligently.
This kind of scheduling, with 2000 AIM7 processes running, creates awful
cross-scheduling between those 2000 tasks, causes reduced parallelism, a
throttled runqueue length and a lot of idle time. With increasing number
of CPUs it causes an exponentially worse behavior in AIM7, as the chance
for a newly woken new-owner task to actually run anytime soon is less
and less likely.
Note that it takes just a tiny bit of contention for the 'new-semaphore
catastrophy' to happen: the wakeup latencies get added to whatever small
contention there is, and quickly snowball out of control!
I believe Yanmin's findings and numbers support this analysis too.
The best fix for this problem is to use the same scheduling logic that
the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and
wanted because we do not want to over-schedule), but also allow
opportunistic locking of the lock even if a wakee is already "in
flight".
The patch below implements this new logic. With this patch applied the
AIM7 regression is largely fixed on my quad testbox:
# v2.6.25 vanilla:
..................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 56096.4 91 207.5 789.7 0.4675
2000 55894.4 94 208.2 792.7 0.4658
# v2.6.26-rc1-166-gc0a1811 vanilla:
...................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 33230.6 83 350.3 784.5 0.2769
2000 31778.1 86 366.3 783.6 0.2648
# v2.6.26-rc1-166-gc0a1811 + semaphore-speedup:
...............................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 55707.1 92 209.0 795.6 0.4642
2000 55704.4 96 209.0 796.0 0.4642
i.e. a 67% speedup. We are now back to within 1% of the v2.6.25
performance levels and have zero idle time during the test, as expected.
Btw., interactivity also improved dramatically with the fix - for
example console-switching became almost instantaneous during this
workload (which after all is running 2000 tasks at once!), without the
patch it was stuck for a minute at times.
There's another nice side-effect of this speedup patch, the new generic
semaphore code got even smaller:
text data bss dec hex filename
1241 0 0 1241 4d9 semaphore.o.before
1207 0 0 1207 4b7 semaphore.o.after
(because the waiter.up complication got removed.)
Longer-term we should look into using the mutex code for the generic
semaphore code as well - but i's not easy due to legacies and it's
outside of the scope of v2.6.26 and outside the scope of this patch as
well.
Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 13:53:48 +04:00
sem - > count - - ;
2008-05-11 07:43:22 +04:00
else
result = __down_interruptible ( sem ) ;
2008-03-08 05:55:58 +03:00
spin_unlock_irqrestore ( & sem - > lock , flags ) ;
return result ;
}
EXPORT_SYMBOL ( down_interruptible ) ;
2008-04-11 23:23:52 +04:00
/**
* down_killable - acquire the semaphore unless killed
* @ sem : the semaphore to be acquired
*
* Attempts to acquire the semaphore . If no more tasks are allowed to
* acquire the semaphore , calling this function will put the task to sleep .
* If the sleep is interrupted by a fatal signal , this function will return
* - EINTR . If the semaphore is successfully acquired , this function returns
* 0.
*/
2008-03-14 20:19:33 +03:00
int down_killable ( struct semaphore * sem )
{
unsigned long flags ;
int result = 0 ;
spin_lock_irqsave ( & sem - > lock , flags ) ;
2008-05-11 07:43:22 +04:00
if ( likely ( sem - > count > 0 ) )
semaphore: fix
Yanmin Zhang reported:
| Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th
| regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton,
| and Itanium Montecito. Bisect located the patch below:
|
| 64ac24e738823161693bf791f87adc802cf529ff is first bad commit
| commit 64ac24e738823161693bf791f87adc802cf529ff
| Author: Matthew Wilcox <matthew@wil.cx>
| Date: Fri Mar 7 21:55:58 2008 -0500
|
| Generic semaphore implementation
|
| After I manually reverted the patch against 2.6.26-rc1 while fixing
| lots of conflicts/errors, aim7 regression became less than 2%.
i reproduced the AIM7 workload and can confirm Yanmin's findings that
-.26-rc1 regresses over .25 - by over 67% here.
Looking at the workload i found and fixed what i believe to be the real
bug causing the AIM7 regression: it was inefficient wakeup / scheduling
/ locking behavior of the new generic semaphore code, causing suboptimal
performance.
The problem comes from the following code. The new semaphore code does
this on down():
spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
spin_unlock_irqrestore(&sem->lock, flags);
and this on up():
spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
spin_unlock_irqrestore(&sem->lock, flags);
where __up() does:
list_del(&waiter->list);
waiter->up = 1;
wake_up_process(waiter->task);
and where __down() does this in essence:
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
waiter.up = 0;
for (;;) {
[...]
spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
the fastpath looks good and obvious, but note the following property of
the contended path: if there's a task on the ->wait_list, the up() of
the current owner will "pass over" ownership to that waiting task, in a
wake-one manner, via the waiter->up flag and by removing the waiter from
the wait list.
That is all and fine in principle, but as implemented in
kernel/semaphore.c it also creates a nasty, hidden source of contention!
The contention comes from the following property of the new semaphore
code: the new owner owns the semaphore exclusively, even if it is not
running yet.
So if the old owner, even if just a few instructions later, does a
down() [lock_kernel()] again, it will be blocked and will have to wait
on the new owner to eventually be scheduled (possibly on another CPU)!
Or if another task gets to lock_kernel() sooner than the "new owner"
scheduled, it will be blocked unnecessarily and for a very long time
when there are 2000 tasks running.
I.e. the implementation of the new semaphores code does wake-one and
lock ownership in a very restrictive way - it does not allow
opportunistic re-locking of the lock at all and keeps the scheduler from
picking task order intelligently.
This kind of scheduling, with 2000 AIM7 processes running, creates awful
cross-scheduling between those 2000 tasks, causes reduced parallelism, a
throttled runqueue length and a lot of idle time. With increasing number
of CPUs it causes an exponentially worse behavior in AIM7, as the chance
for a newly woken new-owner task to actually run anytime soon is less
and less likely.
Note that it takes just a tiny bit of contention for the 'new-semaphore
catastrophy' to happen: the wakeup latencies get added to whatever small
contention there is, and quickly snowball out of control!
I believe Yanmin's findings and numbers support this analysis too.
The best fix for this problem is to use the same scheduling logic that
the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and
wanted because we do not want to over-schedule), but also allow
opportunistic locking of the lock even if a wakee is already "in
flight".
The patch below implements this new logic. With this patch applied the
AIM7 regression is largely fixed on my quad testbox:
# v2.6.25 vanilla:
..................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 56096.4 91 207.5 789.7 0.4675
2000 55894.4 94 208.2 792.7 0.4658
# v2.6.26-rc1-166-gc0a1811 vanilla:
...................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 33230.6 83 350.3 784.5 0.2769
2000 31778.1 86 366.3 783.6 0.2648
# v2.6.26-rc1-166-gc0a1811 + semaphore-speedup:
...............................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 55707.1 92 209.0 795.6 0.4642
2000 55704.4 96 209.0 796.0 0.4642
i.e. a 67% speedup. We are now back to within 1% of the v2.6.25
performance levels and have zero idle time during the test, as expected.
Btw., interactivity also improved dramatically with the fix - for
example console-switching became almost instantaneous during this
workload (which after all is running 2000 tasks at once!), without the
patch it was stuck for a minute at times.
There's another nice side-effect of this speedup patch, the new generic
semaphore code got even smaller:
text data bss dec hex filename
1241 0 0 1241 4d9 semaphore.o.before
1207 0 0 1207 4b7 semaphore.o.after
(because the waiter.up complication got removed.)
Longer-term we should look into using the mutex code for the generic
semaphore code as well - but i's not easy due to legacies and it's
outside of the scope of v2.6.26 and outside the scope of this patch as
well.
Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 13:53:48 +04:00
sem - > count - - ;
2008-05-11 07:43:22 +04:00
else
result = __down_killable ( sem ) ;
2008-03-14 20:19:33 +03:00
spin_unlock_irqrestore ( & sem - > lock , flags ) ;
return result ;
}
EXPORT_SYMBOL ( down_killable ) ;
2008-03-08 05:55:58 +03:00
/**
* down_trylock - try to acquire the semaphore , without waiting
* @ sem : the semaphore to be acquired
*
* Try to acquire the semaphore atomically . Returns 0 if the mutex has
2008-04-11 23:23:52 +04:00
* been acquired successfully or 1 if it it cannot be acquired .
2008-03-08 05:55:58 +03:00
*
* NOTE : This return value is inverted from both spin_trylock and
* mutex_trylock ! Be careful about this when converting code .
*
* Unlike mutex_trylock , this function can be used from interrupt context ,
* and the semaphore can be released by any task or interrupt .
*/
int down_trylock ( struct semaphore * sem )
{
unsigned long flags ;
int count ;
spin_lock_irqsave ( & sem - > lock , flags ) ;
count = sem - > count - 1 ;
if ( likely ( count > = 0 ) )
sem - > count = count ;
spin_unlock_irqrestore ( & sem - > lock , flags ) ;
return ( count < 0 ) ;
}
EXPORT_SYMBOL ( down_trylock ) ;
2008-04-11 23:23:52 +04:00
/**
* down_timeout - acquire the semaphore within a specified time
* @ sem : the semaphore to be acquired
* @ jiffies : how long to wait before failing
*
* Attempts to acquire the semaphore . If no more tasks are allowed to
* acquire the semaphore , calling this function will put the task to sleep .
* If the semaphore is not released within the specified number of jiffies ,
* this function returns - ETIME . It returns 0 if the semaphore was acquired .
*/
2008-03-14 20:43:13 +03:00
int down_timeout ( struct semaphore * sem , long jiffies )
{
unsigned long flags ;
int result = 0 ;
spin_lock_irqsave ( & sem - > lock , flags ) ;
2008-05-11 07:43:22 +04:00
if ( likely ( sem - > count > 0 ) )
semaphore: fix
Yanmin Zhang reported:
| Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th
| regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton,
| and Itanium Montecito. Bisect located the patch below:
|
| 64ac24e738823161693bf791f87adc802cf529ff is first bad commit
| commit 64ac24e738823161693bf791f87adc802cf529ff
| Author: Matthew Wilcox <matthew@wil.cx>
| Date: Fri Mar 7 21:55:58 2008 -0500
|
| Generic semaphore implementation
|
| After I manually reverted the patch against 2.6.26-rc1 while fixing
| lots of conflicts/errors, aim7 regression became less than 2%.
i reproduced the AIM7 workload and can confirm Yanmin's findings that
-.26-rc1 regresses over .25 - by over 67% here.
Looking at the workload i found and fixed what i believe to be the real
bug causing the AIM7 regression: it was inefficient wakeup / scheduling
/ locking behavior of the new generic semaphore code, causing suboptimal
performance.
The problem comes from the following code. The new semaphore code does
this on down():
spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
spin_unlock_irqrestore(&sem->lock, flags);
and this on up():
spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
spin_unlock_irqrestore(&sem->lock, flags);
where __up() does:
list_del(&waiter->list);
waiter->up = 1;
wake_up_process(waiter->task);
and where __down() does this in essence:
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
waiter.up = 0;
for (;;) {
[...]
spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
the fastpath looks good and obvious, but note the following property of
the contended path: if there's a task on the ->wait_list, the up() of
the current owner will "pass over" ownership to that waiting task, in a
wake-one manner, via the waiter->up flag and by removing the waiter from
the wait list.
That is all and fine in principle, but as implemented in
kernel/semaphore.c it also creates a nasty, hidden source of contention!
The contention comes from the following property of the new semaphore
code: the new owner owns the semaphore exclusively, even if it is not
running yet.
So if the old owner, even if just a few instructions later, does a
down() [lock_kernel()] again, it will be blocked and will have to wait
on the new owner to eventually be scheduled (possibly on another CPU)!
Or if another task gets to lock_kernel() sooner than the "new owner"
scheduled, it will be blocked unnecessarily and for a very long time
when there are 2000 tasks running.
I.e. the implementation of the new semaphores code does wake-one and
lock ownership in a very restrictive way - it does not allow
opportunistic re-locking of the lock at all and keeps the scheduler from
picking task order intelligently.
This kind of scheduling, with 2000 AIM7 processes running, creates awful
cross-scheduling between those 2000 tasks, causes reduced parallelism, a
throttled runqueue length and a lot of idle time. With increasing number
of CPUs it causes an exponentially worse behavior in AIM7, as the chance
for a newly woken new-owner task to actually run anytime soon is less
and less likely.
Note that it takes just a tiny bit of contention for the 'new-semaphore
catastrophy' to happen: the wakeup latencies get added to whatever small
contention there is, and quickly snowball out of control!
I believe Yanmin's findings and numbers support this analysis too.
The best fix for this problem is to use the same scheduling logic that
the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and
wanted because we do not want to over-schedule), but also allow
opportunistic locking of the lock even if a wakee is already "in
flight".
The patch below implements this new logic. With this patch applied the
AIM7 regression is largely fixed on my quad testbox:
# v2.6.25 vanilla:
..................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 56096.4 91 207.5 789.7 0.4675
2000 55894.4 94 208.2 792.7 0.4658
# v2.6.26-rc1-166-gc0a1811 vanilla:
...................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 33230.6 83 350.3 784.5 0.2769
2000 31778.1 86 366.3 783.6 0.2648
# v2.6.26-rc1-166-gc0a1811 + semaphore-speedup:
...............................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 55707.1 92 209.0 795.6 0.4642
2000 55704.4 96 209.0 796.0 0.4642
i.e. a 67% speedup. We are now back to within 1% of the v2.6.25
performance levels and have zero idle time during the test, as expected.
Btw., interactivity also improved dramatically with the fix - for
example console-switching became almost instantaneous during this
workload (which after all is running 2000 tasks at once!), without the
patch it was stuck for a minute at times.
There's another nice side-effect of this speedup patch, the new generic
semaphore code got even smaller:
text data bss dec hex filename
1241 0 0 1241 4d9 semaphore.o.before
1207 0 0 1207 4b7 semaphore.o.after
(because the waiter.up complication got removed.)
Longer-term we should look into using the mutex code for the generic
semaphore code as well - but i's not easy due to legacies and it's
outside of the scope of v2.6.26 and outside the scope of this patch as
well.
Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 13:53:48 +04:00
sem - > count - - ;
2008-05-11 07:43:22 +04:00
else
result = __down_timeout ( sem , jiffies ) ;
2008-03-14 20:43:13 +03:00
spin_unlock_irqrestore ( & sem - > lock , flags ) ;
return result ;
}
EXPORT_SYMBOL ( down_timeout ) ;
2008-04-11 23:23:52 +04:00
/**
* up - release the semaphore
* @ sem : the semaphore to release
*
* Release the semaphore . Unlike mutexes , up ( ) may be called from any
* context and even by tasks which have never called down ( ) .
*/
2008-03-08 05:55:58 +03:00
void up ( struct semaphore * sem )
{
unsigned long flags ;
spin_lock_irqsave ( & sem - > lock , flags ) ;
2008-05-11 07:43:22 +04:00
if ( likely ( list_empty ( & sem - > wait_list ) ) )
sem - > count + + ;
else
2008-03-08 05:55:58 +03:00
__up ( sem ) ;
spin_unlock_irqrestore ( & sem - > lock , flags ) ;
}
EXPORT_SYMBOL ( up ) ;
/* Functions for the contended case */
/*
 * One queued waiter on a semaphore's wait_list.  __down_common() places an
 * instance on its own stack while the task sleeps.
 */
struct semaphore_waiter {
	struct list_head list;		/* link in sem->wait_list */
	struct task_struct *task;	/* task that __up() wakes */
	int up;				/* set to 1 by __up() on hand-over */
};
/*
2008-03-14 20:43:13 +03:00
* Because this function is inlined , the ' state ' parameter will be
* constant , and thus optimised away by the compiler . Likewise the
* ' timeout ' parameter for the cases without timeouts .
2008-03-08 05:55:58 +03:00
*/
2008-03-14 20:43:13 +03:00
static inline int __sched __down_common ( struct semaphore * sem , long state ,
long timeout )
2008-03-08 05:55:58 +03:00
{
struct task_struct * task = current ;
struct semaphore_waiter waiter ;
semaphore: fix
Yanmin Zhang reported:
| Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th
| regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton,
| and Itanium Montecito. Bisect located the patch below:
|
| 64ac24e738823161693bf791f87adc802cf529ff is first bad commit
| commit 64ac24e738823161693bf791f87adc802cf529ff
| Author: Matthew Wilcox <matthew@wil.cx>
| Date: Fri Mar 7 21:55:58 2008 -0500
|
| Generic semaphore implementation
|
| After I manually reverted the patch against 2.6.26-rc1 while fixing
| lots of conflicts/errors, aim7 regression became less than 2%.
i reproduced the AIM7 workload and can confirm Yanmin's findings that
-.26-rc1 regresses over .25 - by over 67% here.
Looking at the workload i found and fixed what i believe to be the real
bug causing the AIM7 regression: it was inefficient wakeup / scheduling
/ locking behavior of the new generic semaphore code, causing suboptimal
performance.
The problem comes from the following code. The new semaphore code does
this on down():
spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
spin_unlock_irqrestore(&sem->lock, flags);
and this on up():
spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
spin_unlock_irqrestore(&sem->lock, flags);
where __up() does:
list_del(&waiter->list);
waiter->up = 1;
wake_up_process(waiter->task);
and where __down() does this in essence:
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
waiter.up = 0;
for (;;) {
[...]
spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
the fastpath looks good and obvious, but note the following property of
the contended path: if there's a task on the ->wait_list, the up() of
the current owner will "pass over" ownership to that waiting task, in a
wake-one manner, via the waiter->up flag and by removing the waiter from
the wait list.
That is all and fine in principle, but as implemented in
kernel/semaphore.c it also creates a nasty, hidden source of contention!
The contention comes from the following property of the new semaphore
code: the new owner owns the semaphore exclusively, even if it is not
running yet.
So if the old owner, even if just a few instructions later, does a
down() [lock_kernel()] again, it will be blocked and will have to wait
on the new owner to eventually be scheduled (possibly on another CPU)!
Or if another task gets to lock_kernel() sooner than the "new owner"
scheduled, it will be blocked unnecessarily and for a very long time
when there are 2000 tasks running.
I.e. the implementation of the new semaphores code does wake-one and
lock ownership in a very restrictive way - it does not allow
opportunistic re-locking of the lock at all and keeps the scheduler from
picking task order intelligently.
This kind of scheduling, with 2000 AIM7 processes running, creates awful
cross-scheduling between those 2000 tasks, causes reduced parallelism, a
throttled runqueue length and a lot of idle time. With increasing number
of CPUs it causes an exponentially worse behavior in AIM7, as the chance
for a newly woken new-owner task to actually run anytime soon is less
and less likely.
Note that it takes just a tiny bit of contention for the 'new-semaphore
catastrophy' to happen: the wakeup latencies get added to whatever small
contention there is, and quickly snowball out of control!
I believe Yanmin's findings and numbers support this analysis too.
The best fix for this problem is to use the same scheduling logic that
the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and
wanted because we do not want to over-schedule), but also allow
opportunistic locking of the lock even if a wakee is already "in
flight".
The patch below implements this new logic. With this patch applied the
AIM7 regression is largely fixed on my quad testbox:
# v2.6.25 vanilla:
..................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 56096.4 91 207.5 789.7 0.4675
2000 55894.4 94 208.2 792.7 0.4658
# v2.6.26-rc1-166-gc0a1811 vanilla:
...................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 33230.6 83 350.3 784.5 0.2769
2000 31778.1 86 366.3 783.6 0.2648
# v2.6.26-rc1-166-gc0a1811 + semaphore-speedup:
...............................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 55707.1 92 209.0 795.6 0.4642
2000 55704.4 96 209.0 796.0 0.4642
i.e. a 67% speedup. We are now back to within 1% of the v2.6.25
performance levels and have zero idle time during the test, as expected.
Btw., interactivity also improved dramatically with the fix - for
example console-switching became almost instantaneous during this
workload (which after all is running 2000 tasks at once!), without the
patch it was stuck for a minute at times.
There's another nice side-effect of this speedup patch, the new generic
semaphore code got even smaller:
text data bss dec hex filename
1241 0 0 1241 4d9 semaphore.o.before
1207 0 0 1207 4b7 semaphore.o.after
(because the waiter.up complication got removed.)
Longer-term we should look into using the mutex code for the generic
semaphore code as well - but i's not easy due to legacies and it's
outside of the scope of v2.6.26 and outside the scope of this patch as
well.
Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 13:53:48 +04:00
list_add_tail ( & waiter . list , & sem - > wait_list ) ;
2008-05-11 07:43:22 +04:00
waiter . task = task ;
waiter . up = 0 ;
2008-03-08 05:55:58 +03:00
for ( ; ; ) {
2008-08-06 00:01:13 +04:00
if ( signal_pending_state ( state , task ) )
2008-05-11 07:43:22 +04:00
goto interrupted ;
if ( timeout < = 0 )
goto timed_out ;
2008-03-08 05:55:58 +03:00
__set_task_state ( task , state ) ;
spin_unlock_irq ( & sem - > lock ) ;
2008-03-14 20:43:13 +03:00
timeout = schedule_timeout ( timeout ) ;
2008-03-08 05:55:58 +03:00
spin_lock_irq ( & sem - > lock ) ;
2008-05-11 07:43:22 +04:00
if ( waiter . up )
return 0 ;
2008-03-08 05:55:58 +03:00
}
2008-05-11 07:43:22 +04:00
timed_out :
list_del ( & waiter . list ) ;
return - ETIME ;
interrupted :
2008-03-08 05:55:58 +03:00
list_del ( & waiter . list ) ;
2008-05-11 07:43:22 +04:00
return - EINTR ;
2008-03-08 05:55:58 +03:00
}
/*
 * Contended path of down(): wait in TASK_UNINTERRUPTIBLE with no timeout.
 * The result of __down_common() is deliberately discarded.
 */
static noinline void __sched __down(struct semaphore *sem)
{
	__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
/*
 * Contended path of down_interruptible(): wait in TASK_INTERRUPTIBLE with
 * no timeout and pass the __down_common() result back to the caller.
 */
static noinline int __sched __down_interruptible(struct semaphore *sem)
{
	return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
2008-03-14 20:19:33 +03:00
/*
 * Contended path of down_killable(): wait in TASK_KILLABLE with no timeout
 * and pass the __down_common() result back to the caller.
 */
static noinline int __sched __down_killable(struct semaphore *sem)
{
	return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
}
/*
 * Contended path of down_timeout(): wait in TASK_UNINTERRUPTIBLE for at
 * most @jiffies and pass the __down_common() result back to the caller.
 */
static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
{
	return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
}
2008-03-08 05:55:58 +03:00
/*
 * Contended path of up(): hand the semaphore to the first task on the
 * wait_list.  up() only calls this when the list is non-empty, and does so
 * with sem->lock held.
 */
static noinline void __sched __up(struct semaphore *sem)
{
	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
						struct semaphore_waiter, list);

	/* Unlink first so the woken task does not need to list_del() itself. */
	list_del(&waiter->list);
	waiter->up = 1;
	wake_up_process(waiter->task);
}