2017-04-14 11:00:02 +03:00
/*
* The Kyber I / O scheduler . Controls latency by throttling queue depths using
* scalable techniques .
*
* Copyright ( C ) 2017 Facebook
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program . If not , see < https : //www.gnu.org/licenses/>.
*/
# include <linux/kernel.h>
# include <linux/blkdev.h>
# include <linux/blk-mq.h>
# include <linux/elevator.h>
# include <linux/module.h>
# include <linux/sbitmap.h>
# include "blk.h"
# include "blk-mq.h"
2017-05-04 10:31:33 +03:00
# include "blk-mq-debugfs.h"
2017-04-14 11:00:02 +03:00
# include "blk-mq-sched.h"
# include "blk-mq-tag.h"
# include "blk-stat.h"
/* Scheduling domains. */
enum {
KYBER_READ ,
KYBER_SYNC_WRITE ,
KYBER_OTHER , /* Async writes, discard, etc. */
KYBER_NUM_DOMAINS ,
} ;
enum {
KYBER_MIN_DEPTH = 256 ,
/*
* In order to prevent starvation of synchronous requests by a flood of
* asynchronous requests , we reserve 25 % of requests for synchronous
* operations .
*/
KYBER_ASYNC_PERCENT = 75 ,
} ;
/*
* Initial device - wide depths for each scheduling domain .
*
* Even for fast devices with lots of tags like NVMe , you can saturate
* the device with only a fraction of the maximum possible queue depth .
* So , we cap these to a reasonable value .
*/
static const unsigned int kyber_depth [ ] = {
[ KYBER_READ ] = 256 ,
[ KYBER_SYNC_WRITE ] = 128 ,
[ KYBER_OTHER ] = 64 ,
} ;
/*
* Scheduling domain batch sizes . We favor reads .
*/
static const unsigned int kyber_batch_size [ ] = {
[ KYBER_READ ] = 16 ,
[ KYBER_SYNC_WRITE ] = 8 ,
[ KYBER_OTHER ] = 8 ,
} ;
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
/*
* There is a same mapping between ctx & hctx and kcq & khd ,
* we use request - > mq_ctx - > index_hw to index the kcq in khd .
*/
struct kyber_ctx_queue {
/*
* Used to ensure operations on rq_list and kcq_map to be an atmoic one .
* Also protect the rqs on rq_list when merge .
*/
spinlock_t lock ;
struct list_head rq_list [ KYBER_NUM_DOMAINS ] ;
} ____cacheline_aligned_in_smp ;
2017-04-14 11:00:02 +03:00
struct kyber_queue_data {
struct request_queue * q ;
struct blk_stat_callback * cb ;
/*
* The device is divided into multiple scheduling domains based on the
* request type . Each domain has a fixed number of in - flight requests of
* that type device - wide , limited by these tokens .
*/
struct sbitmap_queue domain_tokens [ KYBER_NUM_DOMAINS ] ;
/*
* Async request percentage , converted to per - word depth for
* sbitmap_get_shallow ( ) .
*/
unsigned int async_depth ;
/* Target latencies in nanoseconds. */
u64 read_lat_nsec , write_lat_nsec ;
} ;
struct kyber_hctx_data {
spinlock_t lock ;
struct list_head rqs [ KYBER_NUM_DOMAINS ] ;
unsigned int cur_domain ;
unsigned int batching ;
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
struct kyber_ctx_queue * kcqs ;
struct sbitmap kcq_map [ KYBER_NUM_DOMAINS ] ;
2017-06-20 13:06:13 +03:00
wait_queue_entry_t domain_wait [ KYBER_NUM_DOMAINS ] ;
2017-12-06 09:57:43 +03:00
struct sbq_wait_state * domain_ws [ KYBER_NUM_DOMAINS ] ;
2017-04-14 11:00:02 +03:00
atomic_t wait_index [ KYBER_NUM_DOMAINS ] ;
} ;
2017-12-06 09:57:43 +03:00
static int kyber_domain_wake ( wait_queue_entry_t * wait , unsigned mode , int flags ,
void * key ) ;
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
static unsigned int kyber_sched_domain ( unsigned int op )
2017-04-14 11:00:02 +03:00
{
if ( ( op & REQ_OP_MASK ) = = REQ_OP_READ )
return KYBER_READ ;
else if ( ( op & REQ_OP_MASK ) = = REQ_OP_WRITE & & op_is_sync ( op ) )
return KYBER_SYNC_WRITE ;
else
return KYBER_OTHER ;
}
enum {
NONE = 0 ,
GOOD = 1 ,
GREAT = 2 ,
BAD = - 1 ,
AWFUL = - 2 ,
} ;
# define IS_GOOD(status) ((status) > 0)
# define IS_BAD(status) ((status) < 0)
static int kyber_lat_status ( struct blk_stat_callback * cb ,
unsigned int sched_domain , u64 target )
{
u64 latency ;
if ( ! cb - > stat [ sched_domain ] . nr_samples )
return NONE ;
latency = cb - > stat [ sched_domain ] . mean ;
if ( latency > = 2 * target )
return AWFUL ;
else if ( latency > target )
return BAD ;
else if ( latency < = target / 2 )
return GREAT ;
else /* (latency <= target) */
return GOOD ;
}
/*
* Adjust the read or synchronous write depth given the status of reads and
* writes . The goal is that the latencies of the two domains are fair ( i . e . , if
* one is good , then the other is good ) .
*/
static void kyber_adjust_rw_depth ( struct kyber_queue_data * kqd ,
unsigned int sched_domain , int this_status ,
int other_status )
{
unsigned int orig_depth , depth ;
/*
* If this domain had no samples , or reads and writes are both good or
* both bad , don ' t adjust the depth .
*/
if ( this_status = = NONE | |
( IS_GOOD ( this_status ) & & IS_GOOD ( other_status ) ) | |
( IS_BAD ( this_status ) & & IS_BAD ( other_status ) ) )
return ;
orig_depth = depth = kqd - > domain_tokens [ sched_domain ] . sb . depth ;
if ( other_status = = NONE ) {
depth + + ;
} else {
switch ( this_status ) {
case GOOD :
if ( other_status = = AWFUL )
depth - = max ( depth / 4 , 1U ) ;
else
depth - = max ( depth / 8 , 1U ) ;
break ;
case GREAT :
if ( other_status = = AWFUL )
depth / = 2 ;
else
depth - = max ( depth / 4 , 1U ) ;
break ;
case BAD :
depth + + ;
break ;
case AWFUL :
if ( other_status = = GREAT )
depth + = 2 ;
else
depth + + ;
break ;
}
}
depth = clamp ( depth , 1U , kyber_depth [ sched_domain ] ) ;
if ( depth ! = orig_depth )
sbitmap_queue_resize ( & kqd - > domain_tokens [ sched_domain ] , depth ) ;
}
/*
* Adjust the depth of other requests given the status of reads and synchronous
* writes . As long as either domain is doing fine , we don ' t throttle , but if
* both domains are doing badly , we throttle heavily .
*/
static void kyber_adjust_other_depth ( struct kyber_queue_data * kqd ,
int read_status , int write_status ,
bool have_samples )
{
unsigned int orig_depth , depth ;
int status ;
orig_depth = depth = kqd - > domain_tokens [ KYBER_OTHER ] . sb . depth ;
if ( read_status = = NONE & & write_status = = NONE ) {
depth + = 2 ;
} else if ( have_samples ) {
if ( read_status = = NONE )
status = write_status ;
else if ( write_status = = NONE )
status = read_status ;
else
status = max ( read_status , write_status ) ;
switch ( status ) {
case GREAT :
depth + = 2 ;
break ;
case GOOD :
depth + + ;
break ;
case BAD :
depth - = max ( depth / 4 , 1U ) ;
break ;
case AWFUL :
depth / = 2 ;
break ;
}
}
depth = clamp ( depth , 1U , kyber_depth [ KYBER_OTHER ] ) ;
if ( depth ! = orig_depth )
sbitmap_queue_resize ( & kqd - > domain_tokens [ KYBER_OTHER ] , depth ) ;
}
/*
* Apply heuristics for limiting queue depths based on gathered latency
* statistics .
*/
static void kyber_stat_timer_fn ( struct blk_stat_callback * cb )
{
struct kyber_queue_data * kqd = cb - > data ;
int read_status , write_status ;
read_status = kyber_lat_status ( cb , KYBER_READ , kqd - > read_lat_nsec ) ;
write_status = kyber_lat_status ( cb , KYBER_SYNC_WRITE , kqd - > write_lat_nsec ) ;
kyber_adjust_rw_depth ( kqd , KYBER_READ , read_status , write_status ) ;
kyber_adjust_rw_depth ( kqd , KYBER_SYNC_WRITE , write_status , read_status ) ;
kyber_adjust_other_depth ( kqd , read_status , write_status ,
cb - > stat [ KYBER_OTHER ] . nr_samples ! = 0 ) ;
/*
* Continue monitoring latencies if we aren ' t hitting the targets or
* we ' re still throttling other requests .
*/
if ( ! blk_stat_is_active ( kqd - > cb ) & &
( ( IS_BAD ( read_status ) | | IS_BAD ( write_status ) | |
kqd - > domain_tokens [ KYBER_OTHER ] . sb . depth < kyber_depth [ KYBER_OTHER ] ) ) )
blk_stat_activate_msecs ( kqd - > cb , 100 ) ;
}
static unsigned int kyber_sched_tags_shift ( struct kyber_queue_data * kqd )
{
/*
* All of the hardware queues have the same depth , so we can just grab
* the shift of the first one .
*/
return kqd - > q - > queue_hw_ctx [ 0 ] - > sched_tags - > bitmap_tags . sb . shift ;
}
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
static int kyber_bucket_fn ( const struct request * rq )
{
return kyber_sched_domain ( rq - > cmd_flags ) ;
}
2017-04-14 11:00:02 +03:00
static struct kyber_queue_data * kyber_queue_data_alloc ( struct request_queue * q )
{
struct kyber_queue_data * kqd ;
unsigned int max_tokens ;
unsigned int shift ;
int ret = - ENOMEM ;
int i ;
kqd = kmalloc_node ( sizeof ( * kqd ) , GFP_KERNEL , q - > node ) ;
if ( ! kqd )
goto err ;
kqd - > q = q ;
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
kqd - > cb = blk_stat_alloc_callback ( kyber_stat_timer_fn , kyber_bucket_fn ,
2017-04-14 11:00:02 +03:00
KYBER_NUM_DOMAINS , kqd ) ;
if ( ! kqd - > cb )
goto err_kqd ;
/*
* The maximum number of tokens for any scheduling domain is at least
* the queue depth of a single hardware queue . If the hardware doesn ' t
* have many tags , still provide a reasonable number .
*/
max_tokens = max_t ( unsigned int , q - > tag_set - > queue_depth ,
KYBER_MIN_DEPTH ) ;
for ( i = 0 ; i < KYBER_NUM_DOMAINS ; i + + ) {
WARN_ON ( ! kyber_depth [ i ] ) ;
WARN_ON ( ! kyber_batch_size [ i ] ) ;
ret = sbitmap_queue_init_node ( & kqd - > domain_tokens [ i ] ,
max_tokens , - 1 , false , GFP_KERNEL ,
q - > node ) ;
if ( ret ) {
while ( - - i > = 0 )
sbitmap_queue_free ( & kqd - > domain_tokens [ i ] ) ;
goto err_cb ;
}
sbitmap_queue_resize ( & kqd - > domain_tokens [ i ] , kyber_depth [ i ] ) ;
}
shift = kyber_sched_tags_shift ( kqd ) ;
kqd - > async_depth = ( 1U < < shift ) * KYBER_ASYNC_PERCENT / 100U ;
kqd - > read_lat_nsec = 2000000ULL ;
kqd - > write_lat_nsec = 10000000ULL ;
return kqd ;
err_cb :
blk_stat_free_callback ( kqd - > cb ) ;
err_kqd :
kfree ( kqd ) ;
err :
return ERR_PTR ( ret ) ;
}
static int kyber_init_sched ( struct request_queue * q , struct elevator_type * e )
{
struct kyber_queue_data * kqd ;
struct elevator_queue * eq ;
eq = elevator_alloc ( q , e ) ;
if ( ! eq )
return - ENOMEM ;
kqd = kyber_queue_data_alloc ( q ) ;
if ( IS_ERR ( kqd ) ) {
kobject_put ( & eq - > kobj ) ;
return PTR_ERR ( kqd ) ;
}
eq - > elevator_data = kqd ;
q - > elevator = eq ;
blk_stat_add_callback ( q , kqd - > cb ) ;
return 0 ;
}
static void kyber_exit_sched ( struct elevator_queue * e )
{
struct kyber_queue_data * kqd = e - > elevator_data ;
struct request_queue * q = kqd - > q ;
int i ;
blk_stat_remove_callback ( q , kqd - > cb ) ;
for ( i = 0 ; i < KYBER_NUM_DOMAINS ; i + + )
sbitmap_queue_free ( & kqd - > domain_tokens [ i ] ) ;
blk_stat_free_callback ( kqd - > cb ) ;
kfree ( kqd ) ;
}
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
static void kyber_ctx_queue_init ( struct kyber_ctx_queue * kcq )
{
unsigned int i ;
spin_lock_init ( & kcq - > lock ) ;
for ( i = 0 ; i < KYBER_NUM_DOMAINS ; i + + )
INIT_LIST_HEAD ( & kcq - > rq_list [ i ] ) ;
}
2017-04-14 11:00:02 +03:00
static int kyber_init_hctx ( struct blk_mq_hw_ctx * hctx , unsigned int hctx_idx )
{
2018-05-09 22:55:14 +03:00
struct kyber_queue_data * kqd = hctx - > queue - > elevator - > elevator_data ;
2017-04-14 11:00:02 +03:00
struct kyber_hctx_data * khd ;
int i ;
khd = kmalloc_node ( sizeof ( * khd ) , GFP_KERNEL , hctx - > numa_node ) ;
if ( ! khd )
return - ENOMEM ;
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
khd - > kcqs = kmalloc_array_node ( hctx - > nr_ctx ,
sizeof ( struct kyber_ctx_queue ) ,
GFP_KERNEL , hctx - > numa_node ) ;
if ( ! khd - > kcqs )
goto err_khd ;
for ( i = 0 ; i < hctx - > nr_ctx ; i + + )
kyber_ctx_queue_init ( & khd - > kcqs [ i ] ) ;
for ( i = 0 ; i < KYBER_NUM_DOMAINS ; i + + ) {
if ( sbitmap_init_node ( & khd - > kcq_map [ i ] , hctx - > nr_ctx ,
ilog2 ( 8 ) , GFP_KERNEL , hctx - > numa_node ) ) {
while ( - - i > = 0 )
sbitmap_free ( & khd - > kcq_map [ i ] ) ;
goto err_kcqs ;
}
}
2017-04-14 11:00:02 +03:00
spin_lock_init ( & khd - > lock ) ;
for ( i = 0 ; i < KYBER_NUM_DOMAINS ; i + + ) {
INIT_LIST_HEAD ( & khd - > rqs [ i ] ) ;
2017-12-06 09:57:43 +03:00
init_waitqueue_func_entry ( & khd - > domain_wait [ i ] ,
kyber_domain_wake ) ;
khd - > domain_wait [ i ] . private = hctx ;
sched/wait: Disambiguate wq_entry->task_list and wq_head->task_list naming
So I've noticed a number of instances where it was not obvious from the
code whether ->task_list was for a wait-queue head or a wait-queue entry.
Furthermore, there's a number of wait-queue users where the lists are
not for 'tasks' but other entities (poll tables, etc.), in which case
the 'task_list' name is actively confusing.
To clear this all up, name the wait-queue head and entry list structure
fields unambiguously:
struct wait_queue_head::task_list => ::head
struct wait_queue_entry::task_list => ::entry
For example, this code:
rqw->wait.task_list.next != &wait->task_list
... is was pretty unclear (to me) what it's doing, while now it's written this way:
rqw->wait.head.next != &wait->entry
... which makes it pretty clear that we are iterating a list until we see the head.
Other examples are:
list_for_each_entry_safe(pos, next, &x->task_list, task_list) {
list_for_each_entry(wq, &fence->wait.task_list, task_list) {
... where it's unclear (to me) what we are iterating, and during review it's
hard to tell whether it's trying to walk a wait-queue entry (which would be
a bug), while now it's written as:
list_for_each_entry_safe(pos, next, &x->head, entry) {
list_for_each_entry(wq, &fence->wait.head, entry) {
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-20 13:06:46 +03:00
INIT_LIST_HEAD ( & khd - > domain_wait [ i ] . entry ) ;
2017-04-14 11:00:02 +03:00
atomic_set ( & khd - > wait_index [ i ] , 0 ) ;
}
khd - > cur_domain = 0 ;
khd - > batching = 0 ;
hctx - > sched_data = khd ;
2018-05-09 22:55:14 +03:00
sbitmap_queue_min_shallow_depth ( & hctx - > sched_tags - > bitmap_tags ,
kqd - > async_depth ) ;
2017-04-14 11:00:02 +03:00
return 0 ;
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
err_kcqs :
kfree ( khd - > kcqs ) ;
err_khd :
kfree ( khd ) ;
return - ENOMEM ;
2017-04-14 11:00:02 +03:00
}
static void kyber_exit_hctx ( struct blk_mq_hw_ctx * hctx , unsigned int hctx_idx )
{
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
struct kyber_hctx_data * khd = hctx - > sched_data ;
int i ;
for ( i = 0 ; i < KYBER_NUM_DOMAINS ; i + + )
sbitmap_free ( & khd - > kcq_map [ i ] ) ;
kfree ( khd - > kcqs ) ;
2017-04-14 11:00:02 +03:00
kfree ( hctx - > sched_data ) ;
}
static int rq_get_domain_token ( struct request * rq )
{
return ( long ) rq - > elv . priv [ 0 ] ;
}
static void rq_set_domain_token ( struct request * rq , int token )
{
rq - > elv . priv [ 0 ] = ( void * ) ( long ) token ;
}
static void rq_clear_domain_token ( struct kyber_queue_data * kqd ,
struct request * rq )
{
unsigned int sched_domain ;
int nr ;
nr = rq_get_domain_token ( rq ) ;
if ( nr ! = - 1 ) {
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
sched_domain = kyber_sched_domain ( rq - > cmd_flags ) ;
2017-04-14 11:00:02 +03:00
sbitmap_queue_clear ( & kqd - > domain_tokens [ sched_domain ] , nr ,
rq - > mq_ctx - > cpu ) ;
}
}
2017-06-16 19:15:26 +03:00
static void kyber_limit_depth ( unsigned int op , struct blk_mq_alloc_data * data )
2017-04-14 11:00:02 +03:00
{
/*
* We use the scheduler tags as per - hardware queue queueing tokens .
* Async requests can be limited at this stage .
*/
2017-06-16 19:15:26 +03:00
if ( ! op_is_sync ( op ) ) {
struct kyber_queue_data * kqd = data - > q - > elevator - > elevator_data ;
2017-04-14 11:00:02 +03:00
data - > shallow_depth = kqd - > async_depth ;
2017-06-16 19:15:26 +03:00
}
}
2017-04-14 11:00:02 +03:00
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
static bool kyber_bio_merge ( struct blk_mq_hw_ctx * hctx , struct bio * bio )
{
struct kyber_hctx_data * khd = hctx - > sched_data ;
struct blk_mq_ctx * ctx = blk_mq_get_ctx ( hctx - > queue ) ;
struct kyber_ctx_queue * kcq = & khd - > kcqs [ ctx - > index_hw ] ;
unsigned int sched_domain = kyber_sched_domain ( bio - > bi_opf ) ;
struct list_head * rq_list = & kcq - > rq_list [ sched_domain ] ;
bool merged ;
spin_lock ( & kcq - > lock ) ;
merged = blk_mq_bio_list_merge ( hctx - > queue , rq_list , bio ) ;
spin_unlock ( & kcq - > lock ) ;
blk_mq_put_ctx ( ctx ) ;
return merged ;
}
2017-06-16 19:15:26 +03:00
static void kyber_prepare_request ( struct request * rq , struct bio * bio )
{
rq_set_domain_token ( rq , - 1 ) ;
2017-04-14 11:00:02 +03:00
}
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
static void kyber_insert_requests ( struct blk_mq_hw_ctx * hctx ,
struct list_head * rq_list , bool at_head )
{
struct kyber_hctx_data * khd = hctx - > sched_data ;
struct request * rq , * next ;
list_for_each_entry_safe ( rq , next , rq_list , queuelist ) {
unsigned int sched_domain = kyber_sched_domain ( rq - > cmd_flags ) ;
struct kyber_ctx_queue * kcq = & khd - > kcqs [ rq - > mq_ctx - > index_hw ] ;
struct list_head * head = & kcq - > rq_list [ sched_domain ] ;
spin_lock ( & kcq - > lock ) ;
if ( at_head )
list_move ( & rq - > queuelist , head ) ;
else
list_move_tail ( & rq - > queuelist , head ) ;
sbitmap_set_bit ( & khd - > kcq_map [ sched_domain ] ,
rq - > mq_ctx - > index_hw ) ;
blk_mq_sched_request_inserted ( rq ) ;
spin_unlock ( & kcq - > lock ) ;
}
}
2017-06-16 19:15:21 +03:00
static void kyber_finish_request ( struct request * rq )
2017-04-14 11:00:02 +03:00
{
2017-06-16 19:15:21 +03:00
struct kyber_queue_data * kqd = rq - > q - > elevator - > elevator_data ;
2017-04-14 11:00:02 +03:00
rq_clear_domain_token ( kqd , rq ) ;
}
static void kyber_completed_request ( struct request * rq )
{
struct request_queue * q = rq - > q ;
struct kyber_queue_data * kqd = q - > elevator - > elevator_data ;
unsigned int sched_domain ;
u64 now , latency , target ;
/*
* Check if this request met our latency goal . If not , quickly gather
* some statistics and start throttling .
*/
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
sched_domain = kyber_sched_domain ( rq - > cmd_flags ) ;
2017-04-14 11:00:02 +03:00
switch ( sched_domain ) {
case KYBER_READ :
target = kqd - > read_lat_nsec ;
break ;
case KYBER_SYNC_WRITE :
target = kqd - > write_lat_nsec ;
break ;
default :
return ;
}
/* If we are already monitoring latencies, don't check again. */
if ( blk_stat_is_active ( kqd - > cb ) )
return ;
2018-05-09 12:08:50 +03:00
now = ktime_get_ns ( ) ;
if ( now < rq - > io_start_time_ns )
2017-04-14 11:00:02 +03:00
return ;
2018-05-09 12:08:50 +03:00
latency = now - rq - > io_start_time_ns ;
2017-04-14 11:00:02 +03:00
if ( latency > target )
blk_stat_activate_msecs ( kqd - > cb , 10 ) ;
}
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
struct flush_kcq_data {
struct kyber_hctx_data * khd ;
unsigned int sched_domain ;
struct list_head * list ;
} ;
static bool flush_busy_kcq ( struct sbitmap * sb , unsigned int bitnr , void * data )
2017-04-14 11:00:02 +03:00
{
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
struct flush_kcq_data * flush_data = data ;
struct kyber_ctx_queue * kcq = & flush_data - > khd - > kcqs [ bitnr ] ;
2017-04-14 11:00:02 +03:00
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
spin_lock ( & kcq - > lock ) ;
list_splice_tail_init ( & kcq - > rq_list [ flush_data - > sched_domain ] ,
flush_data - > list ) ;
sbitmap_clear_bit ( sb , bitnr ) ;
spin_unlock ( & kcq - > lock ) ;
2017-04-14 11:00:02 +03:00
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
return true ;
}
static void kyber_flush_busy_kcqs ( struct kyber_hctx_data * khd ,
unsigned int sched_domain ,
struct list_head * list )
{
struct flush_kcq_data data = {
. khd = khd ,
. sched_domain = sched_domain ,
. list = list ,
} ;
sbitmap_for_each_set ( & khd - > kcq_map [ sched_domain ] ,
flush_busy_kcq , & data ) ;
2017-04-14 11:00:02 +03:00
}
2017-06-20 13:06:13 +03:00
static int kyber_domain_wake ( wait_queue_entry_t * wait , unsigned mode , int flags ,
2017-04-14 11:00:02 +03:00
void * key )
{
struct blk_mq_hw_ctx * hctx = READ_ONCE ( wait - > private ) ;
sched/wait: Disambiguate wq_entry->task_list and wq_head->task_list naming
So I've noticed a number of instances where it was not obvious from the
code whether ->task_list was for a wait-queue head or a wait-queue entry.
Furthermore, there's a number of wait-queue users where the lists are
not for 'tasks' but other entities (poll tables, etc.), in which case
the 'task_list' name is actively confusing.
To clear this all up, name the wait-queue head and entry list structure
fields unambiguously:
struct wait_queue_head::task_list => ::head
struct wait_queue_entry::task_list => ::entry
For example, this code:
rqw->wait.task_list.next != &wait->task_list
... is was pretty unclear (to me) what it's doing, while now it's written this way:
rqw->wait.head.next != &wait->entry
... which makes it pretty clear that we are iterating a list until we see the head.
Other examples are:
list_for_each_entry_safe(pos, next, &x->task_list, task_list) {
list_for_each_entry(wq, &fence->wait.task_list, task_list) {
... where it's unclear (to me) what we are iterating, and during review it's
hard to tell whether it's trying to walk a wait-queue entry (which would be
a bug), while now it's written as:
list_for_each_entry_safe(pos, next, &x->head, entry) {
list_for_each_entry(wq, &fence->wait.head, entry) {
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-20 13:06:46 +03:00
list_del_init ( & wait - > entry ) ;
2017-04-14 11:00:02 +03:00
blk_mq_run_hw_queue ( hctx , true ) ;
return 1 ;
}
static int kyber_get_domain_token ( struct kyber_queue_data * kqd ,
struct kyber_hctx_data * khd ,
struct blk_mq_hw_ctx * hctx )
{
unsigned int sched_domain = khd - > cur_domain ;
struct sbitmap_queue * domain_tokens = & kqd - > domain_tokens [ sched_domain ] ;
2017-06-20 13:06:13 +03:00
wait_queue_entry_t * wait = & khd - > domain_wait [ sched_domain ] ;
2017-04-14 11:00:02 +03:00
struct sbq_wait_state * ws ;
int nr ;
nr = __sbitmap_queue_get ( domain_tokens ) ;
/*
* If we failed to get a domain token , make sure the hardware queue is
* run when one becomes available . Note that this is serialized on
* khd - > lock , but we still need to be careful about the waker .
*/
2017-12-06 09:57:43 +03:00
if ( nr < 0 & & list_empty_careful ( & wait - > entry ) ) {
2017-04-14 11:00:02 +03:00
ws = sbq_wait_ptr ( domain_tokens ,
& khd - > wait_index [ sched_domain ] ) ;
2017-12-06 09:57:43 +03:00
khd - > domain_ws [ sched_domain ] = ws ;
2017-04-14 11:00:02 +03:00
add_wait_queue ( & ws - > wait , wait ) ;
/*
* Try again in case a token was freed before we got on the wait
2017-12-06 09:57:43 +03:00
* queue .
2017-04-14 11:00:02 +03:00
*/
nr = __sbitmap_queue_get ( domain_tokens ) ;
2017-12-06 09:57:43 +03:00
}
2017-10-11 20:39:15 +03:00
2017-12-06 09:57:43 +03:00
/*
* If we got a token while we were on the wait queue , remove ourselves
* from the wait queue to ensure that all wake ups make forward
* progress . It ' s possible that the waker already deleted the entry
* between the ! list_empty_careful ( ) check and us grabbing the lock , but
* list_del_init ( ) is okay with that .
*/
if ( nr > = 0 & & ! list_empty_careful ( & wait - > entry ) ) {
ws = khd - > domain_ws [ sched_domain ] ;
spin_lock_irq ( & ws - > wait . lock ) ;
list_del_init ( & wait - > entry ) ;
spin_unlock_irq ( & ws - > wait . lock ) ;
2017-04-14 11:00:02 +03:00
}
2017-12-06 09:57:43 +03:00
2017-04-14 11:00:02 +03:00
return nr ;
}
static struct request *
kyber_dispatch_cur_domain ( struct kyber_queue_data * kqd ,
struct kyber_hctx_data * khd ,
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
struct blk_mq_hw_ctx * hctx )
2017-04-14 11:00:02 +03:00
{
struct list_head * rqs ;
struct request * rq ;
int nr ;
rqs = & khd - > rqs [ khd - > cur_domain ] ;
/*
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
* If we already have a flushed request , then we just need to get a
* token for it . Otherwise , if there are pending requests in the kcqs ,
* flush the kcqs , but only if we can get a token . If not , we should
* leave the requests in the kcqs so that they can be merged . Note that
* khd - > lock serializes the flushes , so if we observed any bit set in
* the kcq_map , we will always get a request .
2017-04-14 11:00:02 +03:00
*/
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
rq = list_first_entry_or_null ( rqs , struct request , queuelist ) ;
2017-04-14 11:00:02 +03:00
if ( rq ) {
nr = kyber_get_domain_token ( kqd , khd , hctx ) ;
if ( nr > = 0 ) {
khd - > batching + + ;
rq_set_domain_token ( rq , nr ) ;
list_del_init ( & rq - > queuelist ) ;
return rq ;
}
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
} else if ( sbitmap_any_bit_set ( & khd - > kcq_map [ khd - > cur_domain ] ) ) {
nr = kyber_get_domain_token ( kqd , khd , hctx ) ;
if ( nr > = 0 ) {
kyber_flush_busy_kcqs ( khd , khd - > cur_domain , rqs ) ;
rq = list_first_entry ( rqs , struct request , queuelist ) ;
khd - > batching + + ;
rq_set_domain_token ( rq , nr ) ;
list_del_init ( & rq - > queuelist ) ;
return rq ;
}
2017-04-14 11:00:02 +03:00
}
/* There were either no pending requests or no tokens. */
return NULL ;
}
static struct request * kyber_dispatch_request ( struct blk_mq_hw_ctx * hctx )
{
struct kyber_queue_data * kqd = hctx - > queue - > elevator - > elevator_data ;
struct kyber_hctx_data * khd = hctx - > sched_data ;
struct request * rq ;
int i ;
spin_lock ( & khd - > lock ) ;
/*
* First , if we are still entitled to batch , try to dispatch a request
* from the batch .
*/
if ( khd - > batching < kyber_batch_size [ khd - > cur_domain ] ) {
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
rq = kyber_dispatch_cur_domain ( kqd , khd , hctx ) ;
2017-04-14 11:00:02 +03:00
if ( rq )
goto out ;
}
/*
* Either ,
* 1. We were no longer entitled to a batch .
* 2. The domain we were batching didn ' t have any requests .
* 3. The domain we were batching was out of tokens .
*
* Start another batch . Note that this wraps back around to the original
* domain if no other domains have requests or tokens .
*/
khd - > batching = 0 ;
for ( i = 0 ; i < KYBER_NUM_DOMAINS ; i + + ) {
if ( khd - > cur_domain = = KYBER_NUM_DOMAINS - 1 )
khd - > cur_domain = 0 ;
else
khd - > cur_domain + + ;
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
rq = kyber_dispatch_cur_domain ( kqd , khd , hctx ) ;
2017-04-14 11:00:02 +03:00
if ( rq )
goto out ;
}
rq = NULL ;
out :
spin_unlock ( & khd - > lock ) ;
return rq ;
}
static bool kyber_has_work ( struct blk_mq_hw_ctx * hctx )
{
struct kyber_hctx_data * khd = hctx - > sched_data ;
int i ;
for ( i = 0 ; i < KYBER_NUM_DOMAINS ; i + + ) {
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
if ( ! list_empty_careful ( & khd - > rqs [ i ] ) | |
sbitmap_any_bit_set ( & khd - > kcq_map [ i ] ) )
2017-04-14 11:00:02 +03:00
return true ;
}
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
return false ;
2017-04-14 11:00:02 +03:00
}
# define KYBER_LAT_SHOW_STORE(op) \
static ssize_t kyber_ # # op # # _lat_show ( struct elevator_queue * e , \
char * page ) \
{ \
struct kyber_queue_data * kqd = e - > elevator_data ; \
\
return sprintf ( page , " %llu \n " , kqd - > op # # _lat_nsec ) ; \
} \
\
static ssize_t kyber_ # # op # # _lat_store ( struct elevator_queue * e , \
const char * page , size_t count ) \
{ \
struct kyber_queue_data * kqd = e - > elevator_data ; \
unsigned long long nsec ; \
int ret ; \
\
ret = kstrtoull ( page , 10 , & nsec ) ; \
if ( ret ) \
return ret ; \
\
kqd - > op # # _lat_nsec = nsec ; \
\
return count ; \
}
KYBER_LAT_SHOW_STORE ( read ) ;
KYBER_LAT_SHOW_STORE ( write ) ;
# undef KYBER_LAT_SHOW_STORE
# define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs [ ] = {
KYBER_LAT_ATTR ( read ) ,
KYBER_LAT_ATTR ( write ) ,
__ATTR_NULL
} ;
# undef KYBER_LAT_ATTR
2017-05-04 10:31:33 +03:00
# ifdef CONFIG_BLK_DEBUG_FS
# define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \
static int kyber_ # # name # # _tokens_show ( void * data , struct seq_file * m ) \
{ \
struct request_queue * q = data ; \
struct kyber_queue_data * kqd = q - > elevator - > elevator_data ; \
\
sbitmap_queue_show ( & kqd - > domain_tokens [ domain ] , m ) ; \
return 0 ; \
} \
\
static void * kyber_ # # name # # _rqs_start ( struct seq_file * m , loff_t * pos ) \
__acquires ( & khd - > lock ) \
{ \
struct blk_mq_hw_ctx * hctx = m - > private ; \
struct kyber_hctx_data * khd = hctx - > sched_data ; \
\
spin_lock ( & khd - > lock ) ; \
return seq_list_start ( & khd - > rqs [ domain ] , * pos ) ; \
} \
\
static void * kyber_ # # name # # _rqs_next ( struct seq_file * m , void * v , \
loff_t * pos ) \
{ \
struct blk_mq_hw_ctx * hctx = m - > private ; \
struct kyber_hctx_data * khd = hctx - > sched_data ; \
\
return seq_list_next ( v , & khd - > rqs [ domain ] , pos ) ; \
} \
\
static void kyber_ # # name # # _rqs_stop ( struct seq_file * m , void * v ) \
__releases ( & khd - > lock ) \
{ \
struct blk_mq_hw_ctx * hctx = m - > private ; \
struct kyber_hctx_data * khd = hctx - > sched_data ; \
\
spin_unlock ( & khd - > lock ) ; \
} \
\
static const struct seq_operations kyber_ # # name # # _rqs_seq_ops = { \
. start = kyber_ # # name # # _rqs_start , \
. next = kyber_ # # name # # _rqs_next , \
. stop = kyber_ # # name # # _rqs_stop , \
. show = blk_mq_debugfs_rq_show , \
} ; \
\
static int kyber_ # # name # # _waiting_show ( void * data , struct seq_file * m ) \
{ \
struct blk_mq_hw_ctx * hctx = data ; \
struct kyber_hctx_data * khd = hctx - > sched_data ; \
2017-06-20 13:06:13 +03:00
wait_queue_entry_t * wait = & khd - > domain_wait [ domain ] ; \
2017-05-04 10:31:33 +03:00
\
sched/wait: Disambiguate wq_entry->task_list and wq_head->task_list naming
So I've noticed a number of instances where it was not obvious from the
code whether ->task_list was for a wait-queue head or a wait-queue entry.
Furthermore, there's a number of wait-queue users where the lists are
not for 'tasks' but other entities (poll tables, etc.), in which case
the 'task_list' name is actively confusing.
To clear this all up, name the wait-queue head and entry list structure
fields unambiguously:
struct wait_queue_head::task_list => ::head
struct wait_queue_entry::task_list => ::entry
For example, this code:
rqw->wait.task_list.next != &wait->task_list
... is was pretty unclear (to me) what it's doing, while now it's written this way:
rqw->wait.head.next != &wait->entry
... which makes it pretty clear that we are iterating a list until we see the head.
Other examples are:
list_for_each_entry_safe(pos, next, &x->task_list, task_list) {
list_for_each_entry(wq, &fence->wait.task_list, task_list) {
... where it's unclear (to me) what we are iterating, and during review it's
hard to tell whether it's trying to walk a wait-queue entry (which would be
a bug), while now it's written as:
list_for_each_entry_safe(pos, next, &x->head, entry) {
list_for_each_entry(wq, &fence->wait.head, entry) {
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-20 13:06:46 +03:00
seq_printf ( m , " %d \n " , ! list_empty_careful ( & wait - > entry ) ) ; \
2017-05-04 10:31:33 +03:00
return 0 ; \
}
KYBER_DEBUGFS_DOMAIN_ATTRS ( KYBER_READ , read )
KYBER_DEBUGFS_DOMAIN_ATTRS ( KYBER_SYNC_WRITE , sync_write )
KYBER_DEBUGFS_DOMAIN_ATTRS ( KYBER_OTHER , other )
# undef KYBER_DEBUGFS_DOMAIN_ATTRS
static int kyber_async_depth_show ( void * data , struct seq_file * m )
{
struct request_queue * q = data ;
struct kyber_queue_data * kqd = q - > elevator - > elevator_data ;
seq_printf ( m , " %u \n " , kqd - > async_depth ) ;
return 0 ;
}
static int kyber_cur_domain_show ( void * data , struct seq_file * m )
{
struct blk_mq_hw_ctx * hctx = data ;
struct kyber_hctx_data * khd = hctx - > sched_data ;
switch ( khd - > cur_domain ) {
case KYBER_READ :
seq_puts ( m , " READ \n " ) ;
break ;
case KYBER_SYNC_WRITE :
seq_puts ( m , " SYNC_WRITE \n " ) ;
break ;
case KYBER_OTHER :
seq_puts ( m , " OTHER \n " ) ;
break ;
default :
seq_printf ( m , " %u \n " , khd - > cur_domain ) ;
break ;
}
return 0 ;
}
static int kyber_batching_show ( void * data , struct seq_file * m )
{
struct blk_mq_hw_ctx * hctx = data ;
struct kyber_hctx_data * khd = hctx - > sched_data ;
seq_printf ( m , " %u \n " , khd - > batching ) ;
return 0 ;
}
# define KYBER_QUEUE_DOMAIN_ATTRS(name) \
{ # name " _tokens " , 0400 , kyber_ # # name # # _tokens_show }
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs [ ] = {
KYBER_QUEUE_DOMAIN_ATTRS ( read ) ,
KYBER_QUEUE_DOMAIN_ATTRS ( sync_write ) ,
KYBER_QUEUE_DOMAIN_ATTRS ( other ) ,
{ " async_depth " , 0400 , kyber_async_depth_show } ,
{ } ,
} ;
# undef KYBER_QUEUE_DOMAIN_ATTRS
# define KYBER_HCTX_DOMAIN_ATTRS(name) \
{ # name " _rqs " , 0400 , . seq_ops = & kyber_ # # name # # _rqs_seq_ops } , \
{ # name " _waiting " , 0400 , kyber_ # # name # # _waiting_show }
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs [ ] = {
KYBER_HCTX_DOMAIN_ATTRS ( read ) ,
KYBER_HCTX_DOMAIN_ATTRS ( sync_write ) ,
KYBER_HCTX_DOMAIN_ATTRS ( other ) ,
{ " cur_domain " , 0400 , kyber_cur_domain_show } ,
{ " batching " , 0400 , kyber_batching_show } ,
{ } ,
} ;
# undef KYBER_HCTX_DOMAIN_ATTRS
# endif
2017-04-14 11:00:02 +03:00
static struct elevator_type kyber_sched = {
. ops . mq = {
. init_sched = kyber_init_sched ,
. exit_sched = kyber_exit_sched ,
. init_hctx = kyber_init_hctx ,
. exit_hctx = kyber_exit_hctx ,
2017-06-16 19:15:26 +03:00
. limit_depth = kyber_limit_depth ,
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
. bio_merge = kyber_bio_merge ,
2017-06-16 19:15:26 +03:00
. prepare_request = kyber_prepare_request ,
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly with merging. kyber depends
on ctx rq_list to do merging, however, most of time, it will not
leave any requests in ctx rq_list. This is because even if tokens
of one domain is used up, kyber will try to dispatch requests
from other domain and flush the rq_list there.
To improve this, we setup kyber_ctx_queue (kcq) which is similar
with ctx, but it has rq_lists for different domain and build same
mapping between kcq and khd as the ctx & hctx. Then we could merge,
insert and dispatch for different domains separately. At the same
time, only flush the rq_list of kcq when get domain token successfully.
Then if one domain token is used up, the requests could be left in
the rq_list of that domain and maybe merged with following io.
Following is my test result on machine with 8 cores and NVMe card
INTEL SSDPEKKR128G7
fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
seq/random
+------+---------------------------------------------------------------+
|patch?| bw(MB/s) | iops | slat(usec) | clat(usec) | merge |
+----------------------------------------------------------------------+
| w/o | 606/612 | 151k/153k | 6.89/7.03 | 3349.21/3305.40 | 0/0 |
+----------------------------------------------------------------------+
| w/ | 1083/616 | 277k/154k | 4.93/6.95 | 1830.62/3279.95 | 223k/3k |
+----------------------------------------------------------------------+
When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
on my platform.
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-30 19:47:40 +03:00
. insert_requests = kyber_insert_requests ,
2017-06-16 19:15:21 +03:00
. finish_request = kyber_finish_request ,
2018-02-23 18:36:57 +03:00
. requeue_request = kyber_finish_request ,
2017-04-14 11:00:02 +03:00
. completed_request = kyber_completed_request ,
. dispatch_request = kyber_dispatch_request ,
. has_work = kyber_has_work ,
} ,
. uses_mq = true ,
2017-05-04 10:31:33 +03:00
# ifdef CONFIG_BLK_DEBUG_FS
. queue_debugfs_attrs = kyber_queue_debugfs_attrs ,
. hctx_debugfs_attrs = kyber_hctx_debugfs_attrs ,
# endif
2017-04-14 11:00:02 +03:00
. elevator_attrs = kyber_sched_attrs ,
. elevator_name = " kyber " ,
. elevator_owner = THIS_MODULE ,
} ;
static int __init kyber_init ( void )
{
return elv_register ( & kyber_sched ) ;
}
static void __exit kyber_exit ( void )
{
elv_unregister ( & kyber_sched ) ;
}
module_init ( kyber_init ) ;
module_exit ( kyber_exit ) ;
MODULE_AUTHOR ( " Omar Sandoval " ) ;
MODULE_LICENSE ( " GPL " ) ;
MODULE_DESCRIPTION ( " Kyber I/O scheduler " ) ;