Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe: - fix for a memory leak on certain unplug events - a collection of bcache fixes from Kent and Nicolas - a few null_blk fixes and updates from Matias - a marking of functions as static in the stec pci-e driver * 'for-linus' of git://git.kernel.dk/linux-block: null_blk: support submit_queues on use_per_node_hctx null_blk: set use_per_node_hctx param to false null_blk: corrections to documentation null_blk: warning on ignored submit_queues param null_blk: refactor init and init errors code paths null_blk: documentation null_blk: mem garbage on NUMA systems during init drivers: block: Mark the functions as static in skd_main.c bcache: New writeback PD controller bcache: bugfix for race between moving_gc and bucket_invalidate bcache: fix for gc and writeback race bcache: bugfix - moving_gc now moves only correct buckets bcache: fix for gc crashing when no sectors are used bcache: Fix heap_peek() macro bcache: Fix for can_attach_cache() bcache: Fix dirty_data accounting bcache: Use uninterruptible sleep in writeback bcache: kthread don't set writeback task to INTERUPTIBLE block: fix memory leaks on unplugging block device bcache: fix sparse non static symbol warning
This commit is contained in:
commit
c5fdd531b5
72
Documentation/block/null_blk.txt
Normal file
72
Documentation/block/null_blk.txt
Normal file
@ -0,0 +1,72 @@
|
||||
Null block device driver
|
||||
================================================================================
|
||||
|
||||
I. Overview
|
||||
|
||||
The null block device (/dev/nullb*) is used for benchmarking the various
|
||||
block-layer implementations. It emulates a block device of X gigabytes in size.
|
||||
The following instances are possible:
|
||||
|
||||
Single-queue block-layer
|
||||
- Request-based.
|
||||
- Single submission queue per device.
|
||||
- Implements IO scheduling algorithms (CFQ, Deadline, noop).
|
||||
Multi-queue block-layer
|
||||
- Request-based.
|
||||
- Configurable submission queues per device.
|
||||
No block-layer (Known as bio-based)
|
||||
- Bio-based. IO requests are submitted directly to the device driver.
|
||||
- Directly accepts bio data structures and returns them.
|
||||
|
||||
All of them have a completion queue for each core in the system.
|
||||
|
||||
II. Module parameters applicable for all instances:
|
||||
|
||||
queue_mode=[0-2]: Default: 2-Multi-queue
|
||||
Selects which block-layer the module should instantiate with.
|
||||
|
||||
0: Bio-based.
|
||||
1: Single-queue.
|
||||
2: Multi-queue.
|
||||
|
||||
home_node=[0..nr_nodes]: Default: NUMA_NO_NODE
|
||||
Selects what CPU node the data structures are allocated from.
|
||||
|
||||
gb=[Size in GB]: Default: 250GB
|
||||
The size of the device reported to the system.
|
||||
|
||||
bs=[Block size (in bytes)]: Default: 512 bytes
|
||||
The block size reported to the system.
|
||||
|
||||
nr_devices=[Number of devices]: Default: 2
|
||||
Number of block devices instantiated. They are instantiated as /dev/nullb0,
|
||||
etc.
|
||||
|
||||
irq_mode=[0-2]: Default: 1-Soft-irq
|
||||
The completion mode used for completing IOs to the block-layer.
|
||||
|
||||
0: None.
|
||||
1: Soft-irq. Uses IPI to complete IOs across CPU nodes. Simulates the overhead
|
||||
when IOs are issued from a CPU node other than the home node the device is
|
||||
connected to.
|
||||
2: Timer: Waits a specific period (completion_nsec) for each IO before
|
||||
completion.
|
||||
|
||||
completion_nsec=[ns]: Default: 10,000ns
|
||||
Combined with irq_mode=2 (timer). The time each completion event must wait.
|
||||
|
||||
submit_queues=[0..nr_cpus]:
|
||||
The number of submission queues attached to the device driver. If unset, it
|
||||
defaults to 1 on single-queue and bio-based instances. For multi-queue,
|
||||
it is ignored when the use_per_node_hctx module parameter is 1.
|
||||
|
||||
hw_queue_depth=[0..qdepth]: Default: 64
|
||||
The hardware queue depth of the device.
|
||||
|
||||
III. Multi-queue specific parameters
|
||||
|
||||
use_per_node_hctx=[0/1]: Default: 0
|
||||
0: The number of submit queues is set to the value of the submit_queues
|
||||
parameter.
|
||||
1: The multi-queue block layer is instantiated with a hardware dispatch
|
||||
queue for each CPU node in the system.
|
@ -335,9 +335,22 @@ static struct kobj_type blk_mq_hw_ktype = {
|
||||
void blk_mq_unregister_disk(struct gendisk *disk)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
struct blk_mq_ctx *ctx;
|
||||
int i, j;
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
hctx_for_each_ctx(hctx, ctx, j) {
|
||||
kobject_del(&ctx->kobj);
|
||||
kobject_put(&ctx->kobj);
|
||||
}
|
||||
kobject_del(&hctx->kobj);
|
||||
kobject_put(&hctx->kobj);
|
||||
}
|
||||
|
||||
kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
|
||||
kobject_del(&q->mq_kobj);
|
||||
kobject_put(&q->mq_kobj);
|
||||
|
||||
kobject_put(&disk_to_dev(disk)->kobj);
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <linux/module.h>
|
||||
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/fs.h>
|
||||
@ -65,7 +66,7 @@ enum {
|
||||
NULL_Q_MQ = 2,
|
||||
};
|
||||
|
||||
static int submit_queues = 1;
|
||||
static int submit_queues;
|
||||
module_param(submit_queues, int, S_IRUGO);
|
||||
MODULE_PARM_DESC(submit_queues, "Number of submission queues");
|
||||
|
||||
@ -101,9 +102,9 @@ static int hw_queue_depth = 64;
|
||||
module_param(hw_queue_depth, int, S_IRUGO);
|
||||
MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
|
||||
|
||||
static bool use_per_node_hctx = true;
|
||||
static bool use_per_node_hctx = false;
|
||||
module_param(use_per_node_hctx, bool, S_IRUGO);
|
||||
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: true");
|
||||
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
|
||||
|
||||
static void put_tag(struct nullb_queue *nq, unsigned int tag)
|
||||
{
|
||||
@ -346,8 +347,37 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
|
||||
|
||||
static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index)
|
||||
{
|
||||
return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL,
|
||||
hctx_index);
|
||||
int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes);
|
||||
int tip = (reg->nr_hw_queues % nr_online_nodes);
|
||||
int node = 0, i, n;
|
||||
|
||||
/*
|
||||
* Split submit queues evenly wrt to the number of nodes. If uneven,
|
||||
* fill the first buckets with one extra, until the rest is filled with
|
||||
* no extra.
|
||||
*/
|
||||
for (i = 0, n = 1; i < hctx_index; i++, n++) {
|
||||
if (n % b_size == 0) {
|
||||
n = 0;
|
||||
node++;
|
||||
|
||||
tip--;
|
||||
if (!tip)
|
||||
b_size = reg->nr_hw_queues / nr_online_nodes;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* A node might not be online, therefore map the relative node id to the
|
||||
* real node id.
|
||||
*/
|
||||
for_each_online_node(n) {
|
||||
if (!node)
|
||||
break;
|
||||
node--;
|
||||
}
|
||||
|
||||
return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n);
|
||||
}
|
||||
|
||||
static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index)
|
||||
@ -355,16 +385,24 @@ static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index)
|
||||
kfree(hctx);
|
||||
}
|
||||
|
||||
static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
|
||||
{
|
||||
BUG_ON(!nullb);
|
||||
BUG_ON(!nq);
|
||||
|
||||
init_waitqueue_head(&nq->wait);
|
||||
nq->queue_depth = nullb->queue_depth;
|
||||
}
|
||||
|
||||
static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
|
||||
unsigned int index)
|
||||
{
|
||||
struct nullb *nullb = data;
|
||||
struct nullb_queue *nq = &nullb->queues[index];
|
||||
|
||||
init_waitqueue_head(&nq->wait);
|
||||
nq->queue_depth = nullb->queue_depth;
|
||||
nullb->nr_queues++;
|
||||
hctx->driver_data = nq;
|
||||
null_init_queue(nullb, nq);
|
||||
nullb->nr_queues++;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -417,13 +455,13 @@ static int setup_commands(struct nullb_queue *nq)
|
||||
|
||||
nq->cmds = kzalloc(nq->queue_depth * sizeof(*cmd), GFP_KERNEL);
|
||||
if (!nq->cmds)
|
||||
return 1;
|
||||
return -ENOMEM;
|
||||
|
||||
tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
|
||||
nq->tag_map = kzalloc(tag_size * sizeof(unsigned long), GFP_KERNEL);
|
||||
if (!nq->tag_map) {
|
||||
kfree(nq->cmds);
|
||||
return 1;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
for (i = 0; i < nq->queue_depth; i++) {
|
||||
@ -454,33 +492,37 @@ static void cleanup_queues(struct nullb *nullb)
|
||||
|
||||
static int setup_queues(struct nullb *nullb)
|
||||
{
|
||||
struct nullb_queue *nq;
|
||||
int i;
|
||||
|
||||
nullb->queues = kzalloc(submit_queues * sizeof(*nq), GFP_KERNEL);
|
||||
nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue),
|
||||
GFP_KERNEL);
|
||||
if (!nullb->queues)
|
||||
return 1;
|
||||
return -ENOMEM;
|
||||
|
||||
nullb->nr_queues = 0;
|
||||
nullb->queue_depth = hw_queue_depth;
|
||||
|
||||
if (queue_mode == NULL_Q_MQ)
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int init_driver_queues(struct nullb *nullb)
|
||||
{
|
||||
struct nullb_queue *nq;
|
||||
int i, ret = 0;
|
||||
|
||||
for (i = 0; i < submit_queues; i++) {
|
||||
nq = &nullb->queues[i];
|
||||
init_waitqueue_head(&nq->wait);
|
||||
nq->queue_depth = hw_queue_depth;
|
||||
if (setup_commands(nq))
|
||||
break;
|
||||
|
||||
null_init_queue(nullb, nq);
|
||||
|
||||
ret = setup_commands(nq);
|
||||
if (ret)
|
||||
goto err_queue;
|
||||
nullb->nr_queues++;
|
||||
}
|
||||
|
||||
if (i == submit_queues)
|
||||
return 0;
|
||||
|
||||
return 0;
|
||||
err_queue:
|
||||
cleanup_queues(nullb);
|
||||
return 1;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int null_add_dev(void)
|
||||
@ -518,11 +560,13 @@ static int null_add_dev(void)
|
||||
} else if (queue_mode == NULL_Q_BIO) {
|
||||
nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
|
||||
blk_queue_make_request(nullb->q, null_queue_bio);
|
||||
init_driver_queues(nullb);
|
||||
} else {
|
||||
nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
|
||||
blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
|
||||
if (nullb->q)
|
||||
blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
|
||||
init_driver_queues(nullb);
|
||||
}
|
||||
|
||||
if (!nullb->q)
|
||||
@ -579,7 +623,13 @@ static int __init null_init(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
if (submit_queues > nr_cpu_ids)
|
||||
if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
|
||||
if (submit_queues < nr_online_nodes) {
|
||||
pr_warn("null_blk: submit_queues param is set to %u.",
|
||||
nr_online_nodes);
|
||||
submit_queues = nr_online_nodes;
|
||||
}
|
||||
} else if (submit_queues > nr_cpu_ids)
|
||||
submit_queues = nr_cpu_ids;
|
||||
else if (!submit_queues)
|
||||
submit_queues = 1;
|
||||
|
@ -5269,7 +5269,7 @@ const char *skd_skdev_state_to_str(enum skd_drvr_state state)
|
||||
}
|
||||
}
|
||||
|
||||
const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state)
|
||||
static const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state)
|
||||
{
|
||||
switch (state) {
|
||||
case SKD_MSG_STATE_IDLE:
|
||||
@ -5281,7 +5281,7 @@ const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state)
|
||||
}
|
||||
}
|
||||
|
||||
const char *skd_skreq_state_to_str(enum skd_req_state state)
|
||||
static const char *skd_skreq_state_to_str(enum skd_req_state state)
|
||||
{
|
||||
switch (state) {
|
||||
case SKD_REQ_STATE_IDLE:
|
||||
|
@ -421,9 +421,11 @@ out:
|
||||
|
||||
if (watermark <= WATERMARK_METADATA) {
|
||||
SET_GC_MARK(b, GC_MARK_METADATA);
|
||||
SET_GC_MOVE(b, 0);
|
||||
b->prio = BTREE_PRIO;
|
||||
} else {
|
||||
SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
|
||||
SET_GC_MOVE(b, 0);
|
||||
b->prio = INITIAL_PRIO;
|
||||
}
|
||||
|
||||
|
@ -197,7 +197,7 @@ struct bucket {
|
||||
uint8_t disk_gen;
|
||||
uint8_t last_gc; /* Most out of date gen in the btree */
|
||||
uint8_t gc_gen;
|
||||
uint16_t gc_mark;
|
||||
uint16_t gc_mark; /* Bitfield used by GC. See below for field */
|
||||
};
|
||||
|
||||
/*
|
||||
@ -209,7 +209,8 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
|
||||
#define GC_MARK_RECLAIMABLE 0
|
||||
#define GC_MARK_DIRTY 1
|
||||
#define GC_MARK_METADATA 2
|
||||
BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
|
||||
BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13);
|
||||
BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
|
||||
|
||||
#include "journal.h"
|
||||
#include "stats.h"
|
||||
@ -372,14 +373,14 @@ struct cached_dev {
|
||||
unsigned char writeback_percent;
|
||||
unsigned writeback_delay;
|
||||
|
||||
int writeback_rate_change;
|
||||
int64_t writeback_rate_derivative;
|
||||
uint64_t writeback_rate_target;
|
||||
int64_t writeback_rate_proportional;
|
||||
int64_t writeback_rate_derivative;
|
||||
int64_t writeback_rate_change;
|
||||
|
||||
unsigned writeback_rate_update_seconds;
|
||||
unsigned writeback_rate_d_term;
|
||||
unsigned writeback_rate_p_term_inverse;
|
||||
unsigned writeback_rate_d_smooth;
|
||||
};
|
||||
|
||||
enum alloc_watermarks {
|
||||
@ -445,7 +446,6 @@ struct cache {
|
||||
* call prio_write() to keep gens from wrapping.
|
||||
*/
|
||||
uint8_t need_save_prio;
|
||||
unsigned gc_move_threshold;
|
||||
|
||||
/*
|
||||
* If nonzero, we know we aren't going to find any buckets to invalidate
|
||||
|
@ -1561,6 +1561,28 @@ size_t bch_btree_gc_finish(struct cache_set *c)
|
||||
SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
|
||||
GC_MARK_METADATA);
|
||||
|
||||
/* don't reclaim buckets to which writeback keys point */
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < c->nr_uuids; i++) {
|
||||
struct bcache_device *d = c->devices[i];
|
||||
struct cached_dev *dc;
|
||||
struct keybuf_key *w, *n;
|
||||
unsigned j;
|
||||
|
||||
if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
|
||||
continue;
|
||||
dc = container_of(d, struct cached_dev, disk);
|
||||
|
||||
spin_lock(&dc->writeback_keys.lock);
|
||||
rbtree_postorder_for_each_entry_safe(w, n,
|
||||
&dc->writeback_keys.keys, node)
|
||||
for (j = 0; j < KEY_PTRS(&w->key); j++)
|
||||
SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
|
||||
GC_MARK_DIRTY);
|
||||
spin_unlock(&dc->writeback_keys.lock);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
for_each_cache(ca, c, i) {
|
||||
uint64_t *i;
|
||||
|
||||
@ -1817,7 +1839,8 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
|
||||
if (KEY_START(k) > KEY_START(insert) + sectors_found)
|
||||
goto check_failed;
|
||||
|
||||
if (KEY_PTRS(replace_key) != KEY_PTRS(k))
|
||||
if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
|
||||
KEY_DIRTY(k) != KEY_DIRTY(replace_key))
|
||||
goto check_failed;
|
||||
|
||||
/* skip past gen */
|
||||
@ -2217,7 +2240,7 @@ struct btree_insert_op {
|
||||
struct bkey *replace_key;
|
||||
};
|
||||
|
||||
int btree_insert_fn(struct btree_op *b_op, struct btree *b)
|
||||
static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
|
||||
{
|
||||
struct btree_insert_op *op = container_of(b_op,
|
||||
struct btree_insert_op, op);
|
||||
|
@ -25,10 +25,9 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < KEY_PTRS(k); i++) {
|
||||
struct cache *ca = PTR_CACHE(c, k, i);
|
||||
struct bucket *g = PTR_BUCKET(c, k, i);
|
||||
|
||||
if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
|
||||
if (GC_MOVE(g))
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -65,11 +64,16 @@ static void write_moving_finish(struct closure *cl)
|
||||
|
||||
static void read_moving_endio(struct bio *bio, int error)
|
||||
{
|
||||
struct bbio *b = container_of(bio, struct bbio, bio);
|
||||
struct moving_io *io = container_of(bio->bi_private,
|
||||
struct moving_io, cl);
|
||||
|
||||
if (error)
|
||||
io->op.error = error;
|
||||
else if (!KEY_DIRTY(&b->key) &&
|
||||
ptr_stale(io->op.c, &b->key, 0)) {
|
||||
io->op.error = -EINTR;
|
||||
}
|
||||
|
||||
bch_bbio_endio(io->op.c, bio, error, "reading data to move");
|
||||
}
|
||||
@ -141,6 +145,11 @@ static void read_moving(struct cache_set *c)
|
||||
if (!w)
|
||||
break;
|
||||
|
||||
if (ptr_stale(c, &w->key, 0)) {
|
||||
bch_keybuf_del(&c->moving_gc_keys, w);
|
||||
continue;
|
||||
}
|
||||
|
||||
io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
|
||||
* DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
|
||||
GFP_KERNEL);
|
||||
@ -184,7 +193,8 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r)
|
||||
|
||||
static unsigned bucket_heap_top(struct cache *ca)
|
||||
{
|
||||
return GC_SECTORS_USED(heap_peek(&ca->heap));
|
||||
struct bucket *b;
|
||||
return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
|
||||
}
|
||||
|
||||
void bch_moving_gc(struct cache_set *c)
|
||||
@ -226,9 +236,8 @@ void bch_moving_gc(struct cache_set *c)
|
||||
sectors_to_move -= GC_SECTORS_USED(b);
|
||||
}
|
||||
|
||||
ca->gc_move_threshold = bucket_heap_top(ca);
|
||||
|
||||
pr_debug("threshold %u", ca->gc_move_threshold);
|
||||
while (heap_pop(&ca->heap, b, bucket_cmp))
|
||||
SET_GC_MOVE(b, 1);
|
||||
}
|
||||
|
||||
mutex_unlock(&c->bucket_lock);
|
||||
|
@ -1676,7 +1676,7 @@ err:
|
||||
static bool can_attach_cache(struct cache *ca, struct cache_set *c)
|
||||
{
|
||||
return ca->sb.block_size == c->sb.block_size &&
|
||||
ca->sb.bucket_size == c->sb.block_size &&
|
||||
ca->sb.bucket_size == c->sb.bucket_size &&
|
||||
ca->sb.nr_in_set == c->sb.nr_in_set;
|
||||
}
|
||||
|
||||
|
@ -83,7 +83,6 @@ rw_attribute(writeback_rate);
|
||||
rw_attribute(writeback_rate_update_seconds);
|
||||
rw_attribute(writeback_rate_d_term);
|
||||
rw_attribute(writeback_rate_p_term_inverse);
|
||||
rw_attribute(writeback_rate_d_smooth);
|
||||
read_attribute(writeback_rate_debug);
|
||||
|
||||
read_attribute(stripe_size);
|
||||
@ -129,31 +128,41 @@ SHOW(__bch_cached_dev)
|
||||
var_printf(writeback_running, "%i");
|
||||
var_print(writeback_delay);
|
||||
var_print(writeback_percent);
|
||||
sysfs_print(writeback_rate, dc->writeback_rate.rate);
|
||||
sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
|
||||
|
||||
var_print(writeback_rate_update_seconds);
|
||||
var_print(writeback_rate_d_term);
|
||||
var_print(writeback_rate_p_term_inverse);
|
||||
var_print(writeback_rate_d_smooth);
|
||||
|
||||
if (attr == &sysfs_writeback_rate_debug) {
|
||||
char rate[20];
|
||||
char dirty[20];
|
||||
char derivative[20];
|
||||
char target[20];
|
||||
bch_hprint(dirty,
|
||||
bcache_dev_sectors_dirty(&dc->disk) << 9);
|
||||
bch_hprint(derivative, dc->writeback_rate_derivative << 9);
|
||||
char proportional[20];
|
||||
char derivative[20];
|
||||
char change[20];
|
||||
s64 next_io;
|
||||
|
||||
bch_hprint(rate, dc->writeback_rate.rate << 9);
|
||||
bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
|
||||
bch_hprint(target, dc->writeback_rate_target << 9);
|
||||
bch_hprint(proportional,dc->writeback_rate_proportional << 9);
|
||||
bch_hprint(derivative, dc->writeback_rate_derivative << 9);
|
||||
bch_hprint(change, dc->writeback_rate_change << 9);
|
||||
|
||||
next_io = div64_s64(dc->writeback_rate.next - local_clock(),
|
||||
NSEC_PER_MSEC);
|
||||
|
||||
return sprintf(buf,
|
||||
"rate:\t\t%u\n"
|
||||
"change:\t\t%i\n"
|
||||
"rate:\t\t%s/sec\n"
|
||||
"dirty:\t\t%s\n"
|
||||
"target:\t\t%s\n"
|
||||
"proportional:\t%s\n"
|
||||
"derivative:\t%s\n"
|
||||
"target:\t\t%s\n",
|
||||
dc->writeback_rate.rate,
|
||||
dc->writeback_rate_change,
|
||||
dirty, derivative, target);
|
||||
"change:\t\t%s/sec\n"
|
||||
"next io:\t%llims\n",
|
||||
rate, dirty, target, proportional,
|
||||
derivative, change, next_io);
|
||||
}
|
||||
|
||||
sysfs_hprint(dirty_data,
|
||||
@ -189,6 +198,7 @@ STORE(__cached_dev)
|
||||
struct kobj_uevent_env *env;
|
||||
|
||||
#define d_strtoul(var) sysfs_strtoul(var, dc->var)
|
||||
#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
|
||||
#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
|
||||
|
||||
sysfs_strtoul(data_csum, dc->disk.data_csum);
|
||||
@ -197,16 +207,15 @@ STORE(__cached_dev)
|
||||
d_strtoul(writeback_metadata);
|
||||
d_strtoul(writeback_running);
|
||||
d_strtoul(writeback_delay);
|
||||
sysfs_strtoul_clamp(writeback_rate,
|
||||
dc->writeback_rate.rate, 1, 1000000);
|
||||
|
||||
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
|
||||
|
||||
d_strtoul(writeback_rate_update_seconds);
|
||||
sysfs_strtoul_clamp(writeback_rate,
|
||||
dc->writeback_rate.rate, 1, INT_MAX);
|
||||
|
||||
d_strtoul_nonzero(writeback_rate_update_seconds);
|
||||
d_strtoul(writeback_rate_d_term);
|
||||
d_strtoul(writeback_rate_p_term_inverse);
|
||||
sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
|
||||
dc->writeback_rate_p_term_inverse, 1, INT_MAX);
|
||||
d_strtoul(writeback_rate_d_smooth);
|
||||
d_strtoul_nonzero(writeback_rate_p_term_inverse);
|
||||
|
||||
d_strtoi_h(sequential_cutoff);
|
||||
d_strtoi_h(readahead);
|
||||
@ -313,7 +322,6 @@ static struct attribute *bch_cached_dev_files[] = {
|
||||
&sysfs_writeback_rate_update_seconds,
|
||||
&sysfs_writeback_rate_d_term,
|
||||
&sysfs_writeback_rate_p_term_inverse,
|
||||
&sysfs_writeback_rate_d_smooth,
|
||||
&sysfs_writeback_rate_debug,
|
||||
&sysfs_dirty_data,
|
||||
&sysfs_stripe_size,
|
||||
|
@ -209,7 +209,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
|
||||
{
|
||||
uint64_t now = local_clock();
|
||||
|
||||
d->next += div_u64(done, d->rate);
|
||||
d->next += div_u64(done * NSEC_PER_SEC, d->rate);
|
||||
|
||||
if (time_before64(now + NSEC_PER_SEC, d->next))
|
||||
d->next = now + NSEC_PER_SEC;
|
||||
|
||||
if (time_after64(now - NSEC_PER_SEC * 2, d->next))
|
||||
d->next = now - NSEC_PER_SEC * 2;
|
||||
|
||||
return time_after64(d->next, now)
|
||||
? div_u64(d->next - now, NSEC_PER_SEC / HZ)
|
||||
|
@ -110,7 +110,7 @@ do { \
|
||||
_r; \
|
||||
})
|
||||
|
||||
#define heap_peek(h) ((h)->size ? (h)->data[0] : NULL)
|
||||
#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL)
|
||||
|
||||
#define heap_full(h) ((h)->used == (h)->size)
|
||||
|
||||
|
@ -30,38 +30,40 @@ static void __update_writeback_rate(struct cached_dev *dc)
|
||||
|
||||
/* PD controller */
|
||||
|
||||
int change = 0;
|
||||
int64_t error;
|
||||
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
|
||||
int64_t derivative = dirty - dc->disk.sectors_dirty_last;
|
||||
int64_t proportional = dirty - target;
|
||||
int64_t change;
|
||||
|
||||
dc->disk.sectors_dirty_last = dirty;
|
||||
|
||||
derivative *= dc->writeback_rate_d_term;
|
||||
derivative = clamp(derivative, -dirty, dirty);
|
||||
/* Scale to sectors per second */
|
||||
|
||||
proportional *= dc->writeback_rate_update_seconds;
|
||||
proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
|
||||
|
||||
derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
|
||||
|
||||
derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
|
||||
dc->writeback_rate_d_smooth, 0);
|
||||
(dc->writeback_rate_d_term /
|
||||
dc->writeback_rate_update_seconds) ?: 1, 0);
|
||||
|
||||
/* Avoid divide by zero */
|
||||
if (!target)
|
||||
goto out;
|
||||
derivative *= dc->writeback_rate_d_term;
|
||||
derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
|
||||
|
||||
error = div64_s64((dirty + derivative - target) << 8, target);
|
||||
|
||||
change = div_s64((dc->writeback_rate.rate * error) >> 8,
|
||||
dc->writeback_rate_p_term_inverse);
|
||||
change = proportional + derivative;
|
||||
|
||||
/* Don't increase writeback rate if the device isn't keeping up */
|
||||
if (change > 0 &&
|
||||
time_after64(local_clock(),
|
||||
dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
|
||||
dc->writeback_rate.next + NSEC_PER_MSEC))
|
||||
change = 0;
|
||||
|
||||
dc->writeback_rate.rate =
|
||||
clamp_t(int64_t, dc->writeback_rate.rate + change,
|
||||
clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
|
||||
1, NSEC_PER_MSEC);
|
||||
out:
|
||||
|
||||
dc->writeback_rate_proportional = proportional;
|
||||
dc->writeback_rate_derivative = derivative;
|
||||
dc->writeback_rate_change = change;
|
||||
dc->writeback_rate_target = target;
|
||||
@ -87,15 +89,11 @@ static void update_writeback_rate(struct work_struct *work)
|
||||
|
||||
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
|
||||
{
|
||||
uint64_t ret;
|
||||
|
||||
if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
|
||||
!dc->writeback_percent)
|
||||
return 0;
|
||||
|
||||
ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
|
||||
|
||||
return min_t(uint64_t, ret, HZ);
|
||||
return bch_next_delay(&dc->writeback_rate, sectors);
|
||||
}
|
||||
|
||||
struct dirty_io {
|
||||
@ -241,7 +239,7 @@ static void read_dirty(struct cached_dev *dc)
|
||||
if (KEY_START(&w->key) != dc->last_read ||
|
||||
jiffies_to_msecs(delay) > 50)
|
||||
while (!kthread_should_stop() && delay)
|
||||
delay = schedule_timeout_interruptible(delay);
|
||||
delay = schedule_timeout_uninterruptible(delay);
|
||||
|
||||
dc->last_read = KEY_OFFSET(&w->key);
|
||||
|
||||
@ -438,7 +436,7 @@ static int bch_writeback_thread(void *arg)
|
||||
while (delay &&
|
||||
!kthread_should_stop() &&
|
||||
!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
|
||||
delay = schedule_timeout_interruptible(delay);
|
||||
delay = schedule_timeout_uninterruptible(delay);
|
||||
}
|
||||
}
|
||||
|
||||
@ -476,6 +474,8 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
|
||||
|
||||
bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
|
||||
sectors_dirty_init_fn, 0);
|
||||
|
||||
dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
|
||||
}
|
||||
|
||||
int bch_cached_dev_writeback_init(struct cached_dev *dc)
|
||||
@ -490,18 +490,15 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc)
|
||||
dc->writeback_delay = 30;
|
||||
dc->writeback_rate.rate = 1024;
|
||||
|
||||
dc->writeback_rate_update_seconds = 30;
|
||||
dc->writeback_rate_d_term = 16;
|
||||
dc->writeback_rate_p_term_inverse = 64;
|
||||
dc->writeback_rate_d_smooth = 8;
|
||||
dc->writeback_rate_update_seconds = 5;
|
||||
dc->writeback_rate_d_term = 30;
|
||||
dc->writeback_rate_p_term_inverse = 6000;
|
||||
|
||||
dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
|
||||
"bcache_writeback");
|
||||
if (IS_ERR(dc->writeback_thread))
|
||||
return PTR_ERR(dc->writeback_thread);
|
||||
|
||||
set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE);
|
||||
|
||||
INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
|
||||
schedule_delayed_work(&dc->writeback_rate_update,
|
||||
dc->writeback_rate_update_seconds * HZ);
|
||||
|
Loading…
x
Reference in New Issue
Block a user