05fd095d53
After a struct sbitmap_queue is resized smaller, the allocation hints may still be set to bits beyond the new depth of the bitmap. This means that, for example, if the number of blk-mq tags is reduced through sysfs, more requests than the nominal queue depth may be in flight. It's tempting to fix this at resize time by doing a one-time reinitialization of the hints, but this can race with __sbitmap_queue_get() updating the hint. Instead, check the hint before we use it. This caused no measurable performance difference in my synthetic benchmarks. Signed-off-by: Omar Sandoval <osandov@fb.com> Signed-off-by: Jens Axboe <axboe@fb.com>
348 lines
7.9 KiB
C
348 lines
7.9 KiB
C
/*
|
|
* Copyright (C) 2016 Facebook
|
|
* Copyright (C) 2013-2014 Jens Axboe
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public
|
|
* License v2 as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include <linux/random.h>
|
|
#include <linux/sbitmap.h>
|
|
|
|
int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
|
|
gfp_t flags, int node)
|
|
{
|
|
unsigned int bits_per_word;
|
|
unsigned int i;
|
|
|
|
if (shift < 0) {
|
|
shift = ilog2(BITS_PER_LONG);
|
|
/*
|
|
* If the bitmap is small, shrink the number of bits per word so
|
|
* we spread over a few cachelines, at least. If less than 4
|
|
* bits, just forget about it, it's not going to work optimally
|
|
* anyway.
|
|
*/
|
|
if (depth >= 4) {
|
|
while ((4U << shift) > depth)
|
|
shift--;
|
|
}
|
|
}
|
|
bits_per_word = 1U << shift;
|
|
if (bits_per_word > BITS_PER_LONG)
|
|
return -EINVAL;
|
|
|
|
sb->shift = shift;
|
|
sb->depth = depth;
|
|
sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
|
|
|
|
if (depth == 0) {
|
|
sb->map = NULL;
|
|
return 0;
|
|
}
|
|
|
|
sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node);
|
|
if (!sb->map)
|
|
return -ENOMEM;
|
|
|
|
for (i = 0; i < sb->map_nr; i++) {
|
|
sb->map[i].depth = min(depth, bits_per_word);
|
|
depth -= sb->map[i].depth;
|
|
}
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sbitmap_init_node);
|
|
|
|
void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
|
|
{
|
|
unsigned int bits_per_word = 1U << sb->shift;
|
|
unsigned int i;
|
|
|
|
sb->depth = depth;
|
|
sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
|
|
|
|
for (i = 0; i < sb->map_nr; i++) {
|
|
sb->map[i].depth = min(depth, bits_per_word);
|
|
depth -= sb->map[i].depth;
|
|
}
|
|
}
|
|
EXPORT_SYMBOL_GPL(sbitmap_resize);
|
|
|
|
static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
|
|
bool wrap)
|
|
{
|
|
unsigned int orig_hint = hint;
|
|
int nr;
|
|
|
|
while (1) {
|
|
nr = find_next_zero_bit(&word->word, word->depth, hint);
|
|
if (unlikely(nr >= word->depth)) {
|
|
/*
|
|
* We started with an offset, and we didn't reset the
|
|
* offset to 0 in a failure case, so start from 0 to
|
|
* exhaust the map.
|
|
*/
|
|
if (orig_hint && hint && wrap) {
|
|
hint = orig_hint = 0;
|
|
continue;
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
if (!test_and_set_bit(nr, &word->word))
|
|
break;
|
|
|
|
hint = nr + 1;
|
|
if (hint >= word->depth - 1)
|
|
hint = 0;
|
|
}
|
|
|
|
return nr;
|
|
}
|
|
|
|
int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
|
|
{
|
|
unsigned int i, index;
|
|
int nr = -1;
|
|
|
|
index = SB_NR_TO_INDEX(sb, alloc_hint);
|
|
|
|
for (i = 0; i < sb->map_nr; i++) {
|
|
nr = __sbitmap_get_word(&sb->map[index],
|
|
SB_NR_TO_BIT(sb, alloc_hint),
|
|
!round_robin);
|
|
if (nr != -1) {
|
|
nr += index << sb->shift;
|
|
break;
|
|
}
|
|
|
|
/* Jump to next index. */
|
|
index++;
|
|
alloc_hint = index << sb->shift;
|
|
|
|
if (index >= sb->map_nr) {
|
|
index = 0;
|
|
alloc_hint = 0;
|
|
}
|
|
}
|
|
|
|
return nr;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sbitmap_get);
|
|
|
|
bool sbitmap_any_bit_set(const struct sbitmap *sb)
|
|
{
|
|
unsigned int i;
|
|
|
|
for (i = 0; i < sb->map_nr; i++) {
|
|
if (sb->map[i].word)
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);
|
|
|
|
bool sbitmap_any_bit_clear(const struct sbitmap *sb)
|
|
{
|
|
unsigned int i;
|
|
|
|
for (i = 0; i < sb->map_nr; i++) {
|
|
const struct sbitmap_word *word = &sb->map[i];
|
|
unsigned long ret;
|
|
|
|
ret = find_first_zero_bit(&word->word, word->depth);
|
|
if (ret < word->depth)
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
|
|
|
|
unsigned int sbitmap_weight(const struct sbitmap *sb)
|
|
{
|
|
unsigned int i, weight;
|
|
|
|
for (i = 0; i < sb->map_nr; i++) {
|
|
const struct sbitmap_word *word = &sb->map[i];
|
|
|
|
weight += bitmap_weight(&word->word, word->depth);
|
|
}
|
|
return weight;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sbitmap_weight);
|
|
|
|
static unsigned int sbq_calc_wake_batch(unsigned int depth)
|
|
{
|
|
unsigned int wake_batch;
|
|
|
|
/*
|
|
* For each batch, we wake up one queue. We need to make sure that our
|
|
* batch size is small enough that the full depth of the bitmap is
|
|
* enough to wake up all of the queues.
|
|
*/
|
|
wake_batch = SBQ_WAKE_BATCH;
|
|
if (wake_batch > depth / SBQ_WAIT_QUEUES)
|
|
wake_batch = max(1U, depth / SBQ_WAIT_QUEUES);
|
|
|
|
return wake_batch;
|
|
}
|
|
|
|
int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
|
|
int shift, bool round_robin, gfp_t flags, int node)
|
|
{
|
|
int ret;
|
|
int i;
|
|
|
|
ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node);
|
|
if (ret)
|
|
return ret;
|
|
|
|
sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
|
|
if (!sbq->alloc_hint) {
|
|
sbitmap_free(&sbq->sb);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
if (depth && !round_robin) {
|
|
for_each_possible_cpu(i)
|
|
*per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth;
|
|
}
|
|
|
|
sbq->wake_batch = sbq_calc_wake_batch(depth);
|
|
atomic_set(&sbq->wake_index, 0);
|
|
|
|
sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
|
|
if (!sbq->ws) {
|
|
free_percpu(sbq->alloc_hint);
|
|
sbitmap_free(&sbq->sb);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
|
|
init_waitqueue_head(&sbq->ws[i].wait);
|
|
atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch);
|
|
}
|
|
|
|
sbq->round_robin = round_robin;
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
|
|
|
|
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
|
|
{
|
|
sbq->wake_batch = sbq_calc_wake_batch(depth);
|
|
sbitmap_resize(&sbq->sb, depth);
|
|
}
|
|
EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
|
|
|
|
int __sbitmap_queue_get(struct sbitmap_queue *sbq)
|
|
{
|
|
unsigned int hint, depth;
|
|
int nr;
|
|
|
|
hint = this_cpu_read(*sbq->alloc_hint);
|
|
depth = READ_ONCE(sbq->sb.depth);
|
|
if (unlikely(hint >= depth)) {
|
|
hint = depth ? prandom_u32() % depth : 0;
|
|
this_cpu_write(*sbq->alloc_hint, hint);
|
|
}
|
|
nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin);
|
|
|
|
if (nr == -1) {
|
|
/* If the map is full, a hint won't do us much good. */
|
|
this_cpu_write(*sbq->alloc_hint, 0);
|
|
} else if (nr == hint || unlikely(sbq->round_robin)) {
|
|
/* Only update the hint if we used it. */
|
|
hint = nr + 1;
|
|
if (hint >= depth - 1)
|
|
hint = 0;
|
|
this_cpu_write(*sbq->alloc_hint, hint);
|
|
}
|
|
|
|
return nr;
|
|
}
|
|
EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
|
|
|
|
static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
|
|
{
|
|
int i, wake_index;
|
|
|
|
wake_index = atomic_read(&sbq->wake_index);
|
|
for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
|
|
struct sbq_wait_state *ws = &sbq->ws[wake_index];
|
|
|
|
if (waitqueue_active(&ws->wait)) {
|
|
int o = atomic_read(&sbq->wake_index);
|
|
|
|
if (wake_index != o)
|
|
atomic_cmpxchg(&sbq->wake_index, o, wake_index);
|
|
return ws;
|
|
}
|
|
|
|
wake_index = sbq_index_inc(wake_index);
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static void sbq_wake_up(struct sbitmap_queue *sbq)
|
|
{
|
|
struct sbq_wait_state *ws;
|
|
int wait_cnt;
|
|
|
|
/* Ensure that the wait list checks occur after clear_bit(). */
|
|
smp_mb();
|
|
|
|
ws = sbq_wake_ptr(sbq);
|
|
if (!ws)
|
|
return;
|
|
|
|
wait_cnt = atomic_dec_return(&ws->wait_cnt);
|
|
if (unlikely(wait_cnt < 0))
|
|
wait_cnt = atomic_inc_return(&ws->wait_cnt);
|
|
if (wait_cnt == 0) {
|
|
atomic_add(sbq->wake_batch, &ws->wait_cnt);
|
|
sbq_index_atomic_inc(&sbq->wake_index);
|
|
wake_up(&ws->wait);
|
|
}
|
|
}
|
|
|
|
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
|
|
unsigned int cpu)
|
|
{
|
|
sbitmap_clear_bit(&sbq->sb, nr);
|
|
sbq_wake_up(sbq);
|
|
if (likely(!sbq->round_robin))
|
|
*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
|
|
|
|
void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
|
|
{
|
|
int i, wake_index;
|
|
|
|
/*
|
|
* Make sure all changes prior to this are visible from other CPUs.
|
|
*/
|
|
smp_mb();
|
|
wake_index = atomic_read(&sbq->wake_index);
|
|
for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
|
|
struct sbq_wait_state *ws = &sbq->ws[wake_index];
|
|
|
|
if (waitqueue_active(&ws->wait))
|
|
wake_up(&ws->wait);
|
|
|
|
wake_index = sbq_index_inc(wake_index);
|
|
}
|
|
}
|
|
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);
|