1e4ab7b4c8
When using the cleaner policy to decommission the cache, no writeback is
ever started from the cache because it is constantly delayed by normal
I/O keeping the device busy. As a result, @idle=false was always being
passed to clean_target_met().
Fix this by adding a specific 'cleaner' flag that is set when the
cleaner policy is configured. This flag serves to always allow the
cleaner's writeback work to be queued until the cache is
decommissioned (even if the cache isn't idle).
Reported-by: David Jeffery <djeffery@redhat.com>
Fixes: b29d4986d0 ("dm cache: significant rework to leverage dm-bio-prison-v2")
Cc: stable@vger.kernel.org
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
1957 lines
44 KiB
C
1957 lines
44 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Copyright (C) 2015 Red Hat. All rights reserved.
|
|
*
|
|
* This file is released under the GPL.
|
|
*/
|
|
|
|
#include "dm-cache-background-tracker.h"
|
|
#include "dm-cache-policy-internal.h"
|
|
#include "dm-cache-policy.h"
|
|
#include "dm.h"
|
|
|
|
#include <linux/hash.h>
|
|
#include <linux/jiffies.h>
|
|
#include <linux/module.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/math64.h>
|
|
|
|
#define DM_MSG_PREFIX "cache-policy-smq"
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/*
|
|
* Safe division functions that return zero on divide by zero.
|
|
*/
|
|
/* Divide n by d; a zero divisor yields zero instead of trapping. */
static unsigned int safe_div(unsigned int n, unsigned int d)
{
	if (!d)
		return 0u;

	return n / d;
}
|
|
|
|
/* Remainder of n / d; a zero divisor yields zero instead of trapping. */
static unsigned int safe_mod(unsigned int n, unsigned int d)
{
	if (!d)
		return 0u;

	return n % d;
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
struct entry {
|
|
unsigned int hash_next:28;
|
|
unsigned int prev:28;
|
|
unsigned int next:28;
|
|
unsigned int level:6;
|
|
bool dirty:1;
|
|
bool allocated:1;
|
|
bool sentinel:1;
|
|
bool pending_work:1;
|
|
|
|
dm_oblock_t oblock;
|
|
};
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/* Nil index: the largest value that fits in a 28-bit index field. */
#define INDEXER_NULL ((1u << 28u) - 1u)
|
|
|
|
/*
 * An entry_space is the backing array that queue entries live in.  It is
 * an object in its own right, rather than part of a queue, because the
 * clean and dirty queues share the same entries.
 */
struct entry_space {
	struct entry *begin;	/* first entry of the array */
	struct entry *end;	/* one past the last entry */
};
|
|
|
|
static int space_init(struct entry_space *es, unsigned int nr_entries)
|
|
{
|
|
if (!nr_entries) {
|
|
es->begin = es->end = NULL;
|
|
return 0;
|
|
}
|
|
|
|
es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
|
|
if (!es->begin)
|
|
return -ENOMEM;
|
|
|
|
es->end = es->begin + nr_entries;
|
|
return 0;
|
|
}
|
|
|
|
static void space_exit(struct entry_space *es)
|
|
{
|
|
vfree(es->begin);
|
|
}
|
|
|
|
static struct entry *__get_entry(struct entry_space *es, unsigned int block)
|
|
{
|
|
struct entry *e;
|
|
|
|
e = es->begin + block;
|
|
BUG_ON(e >= es->end);
|
|
|
|
return e;
|
|
}
|
|
|
|
static unsigned int to_index(struct entry_space *es, struct entry *e)
|
|
{
|
|
BUG_ON(e < es->begin || e >= es->end);
|
|
return e - es->begin;
|
|
}
|
|
|
|
static struct entry *to_entry(struct entry_space *es, unsigned int block)
|
|
{
|
|
if (block == INDEXER_NULL)
|
|
return NULL;
|
|
|
|
return __get_entry(es, block);
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/*
 * A doubly linked list whose links are entry indexes.  An empty list has
 * head and tail set to INDEXER_NULL.
 */
struct ilist {
	unsigned int nr_elts;	/* excluding sentinel entries */
	unsigned int head, tail;
};
|
|
|
|
static void l_init(struct ilist *l)
|
|
{
|
|
l->nr_elts = 0;
|
|
l->head = l->tail = INDEXER_NULL;
|
|
}
|
|
|
|
static struct entry *l_head(struct entry_space *es, struct ilist *l)
|
|
{
|
|
return to_entry(es, l->head);
|
|
}
|
|
|
|
static struct entry *l_tail(struct entry_space *es, struct ilist *l)
|
|
{
|
|
return to_entry(es, l->tail);
|
|
}
|
|
|
|
static struct entry *l_next(struct entry_space *es, struct entry *e)
|
|
{
|
|
return to_entry(es, e->next);
|
|
}
|
|
|
|
static struct entry *l_prev(struct entry_space *es, struct entry *e)
|
|
{
|
|
return to_entry(es, e->prev);
|
|
}
|
|
|
|
static bool l_empty(struct ilist *l)
|
|
{
|
|
return l->head == INDEXER_NULL;
|
|
}
|
|
|
|
static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
|
|
{
|
|
struct entry *head = l_head(es, l);
|
|
|
|
e->next = l->head;
|
|
e->prev = INDEXER_NULL;
|
|
|
|
if (head)
|
|
head->prev = l->head = to_index(es, e);
|
|
else
|
|
l->head = l->tail = to_index(es, e);
|
|
|
|
if (!e->sentinel)
|
|
l->nr_elts++;
|
|
}
|
|
|
|
static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
|
|
{
|
|
struct entry *tail = l_tail(es, l);
|
|
|
|
e->next = INDEXER_NULL;
|
|
e->prev = l->tail;
|
|
|
|
if (tail)
|
|
tail->next = l->tail = to_index(es, e);
|
|
else
|
|
l->head = l->tail = to_index(es, e);
|
|
|
|
if (!e->sentinel)
|
|
l->nr_elts++;
|
|
}
|
|
|
|
static void l_add_before(struct entry_space *es, struct ilist *l,
|
|
struct entry *old, struct entry *e)
|
|
{
|
|
struct entry *prev = l_prev(es, old);
|
|
|
|
if (!prev)
|
|
l_add_head(es, l, e);
|
|
|
|
else {
|
|
e->prev = old->prev;
|
|
e->next = to_index(es, old);
|
|
prev->next = old->prev = to_index(es, e);
|
|
|
|
if (!e->sentinel)
|
|
l->nr_elts++;
|
|
}
|
|
}
|
|
|
|
static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
|
|
{
|
|
struct entry *prev = l_prev(es, e);
|
|
struct entry *next = l_next(es, e);
|
|
|
|
if (prev)
|
|
prev->next = e->next;
|
|
else
|
|
l->head = e->next;
|
|
|
|
if (next)
|
|
next->prev = e->prev;
|
|
else
|
|
l->tail = e->prev;
|
|
|
|
if (!e->sentinel)
|
|
l->nr_elts--;
|
|
}
|
|
|
|
static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
|
|
{
|
|
struct entry *e;
|
|
|
|
for (e = l_head(es, l); e; e = l_next(es, e))
|
|
if (!e->sentinel) {
|
|
l_del(es, l, e);
|
|
return e;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
|
|
{
|
|
struct entry *e;
|
|
|
|
for (e = l_tail(es, l); e; e = l_prev(es, e))
|
|
if (!e->sentinel) {
|
|
l_del(es, l, e);
|
|
return e;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/*
|
|
* The stochastic-multi-queue is a set of lru lists stacked into levels.
|
|
* Entries are moved up levels when they are used, which loosely orders the
|
|
* most accessed entries in the top levels and least in the bottom. This
|
|
* structure is *much* better than a single lru list.
|
|
*/
|
|
/* Upper bound on levels; sizes the per-level arrays in struct queue. */
#define MAX_LEVELS 64u
|
|
|
|
struct queue {
|
|
struct entry_space *es;
|
|
|
|
unsigned int nr_elts;
|
|
unsigned int nr_levels;
|
|
struct ilist qs[MAX_LEVELS];
|
|
|
|
/*
|
|
* We maintain a count of the number of entries we would like in each
|
|
* level.
|
|
*/
|
|
unsigned int last_target_nr_elts;
|
|
unsigned int nr_top_levels;
|
|
unsigned int nr_in_top_levels;
|
|
unsigned int target_count[MAX_LEVELS];
|
|
};
|
|
|
|
static void q_init(struct queue *q, struct entry_space *es, unsigned int nr_levels)
|
|
{
|
|
unsigned int i;
|
|
|
|
q->es = es;
|
|
q->nr_elts = 0;
|
|
q->nr_levels = nr_levels;
|
|
|
|
for (i = 0; i < q->nr_levels; i++) {
|
|
l_init(q->qs + i);
|
|
q->target_count[i] = 0u;
|
|
}
|
|
|
|
q->last_target_nr_elts = 0u;
|
|
q->nr_top_levels = 0u;
|
|
q->nr_in_top_levels = 0u;
|
|
}
|
|
|
|
static unsigned int q_size(struct queue *q)
|
|
{
|
|
return q->nr_elts;
|
|
}
|
|
|
|
/*
|
|
* Insert an entry to the back of the given level.
|
|
*/
|
|
static void q_push(struct queue *q, struct entry *e)
|
|
{
|
|
BUG_ON(e->pending_work);
|
|
|
|
if (!e->sentinel)
|
|
q->nr_elts++;
|
|
|
|
l_add_tail(q->es, q->qs + e->level, e);
|
|
}
|
|
|
|
static void q_push_front(struct queue *q, struct entry *e)
|
|
{
|
|
BUG_ON(e->pending_work);
|
|
|
|
if (!e->sentinel)
|
|
q->nr_elts++;
|
|
|
|
l_add_head(q->es, q->qs + e->level, e);
|
|
}
|
|
|
|
static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
|
|
{
|
|
BUG_ON(e->pending_work);
|
|
|
|
if (!e->sentinel)
|
|
q->nr_elts++;
|
|
|
|
l_add_before(q->es, q->qs + e->level, old, e);
|
|
}
|
|
|
|
static void q_del(struct queue *q, struct entry *e)
|
|
{
|
|
l_del(q->es, q->qs + e->level, e);
|
|
if (!e->sentinel)
|
|
q->nr_elts--;
|
|
}
|
|
|
|
/*
|
|
* Return the oldest entry of the lowest populated level.
|
|
*/
|
|
static struct entry *q_peek(struct queue *q, unsigned int max_level, bool can_cross_sentinel)
|
|
{
|
|
unsigned int level;
|
|
struct entry *e;
|
|
|
|
max_level = min(max_level, q->nr_levels);
|
|
|
|
for (level = 0; level < max_level; level++)
|
|
for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
|
|
if (e->sentinel) {
|
|
if (can_cross_sentinel)
|
|
continue;
|
|
else
|
|
break;
|
|
}
|
|
|
|
return e;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static struct entry *q_pop(struct queue *q)
|
|
{
|
|
struct entry *e = q_peek(q, q->nr_levels, true);
|
|
|
|
if (e)
|
|
q_del(q, e);
|
|
|
|
return e;
|
|
}
|
|
|
|
/*
|
|
* This function assumes there is a non-sentinel entry to pop. It's only
|
|
* used by redistribute, so we know this is true. It also doesn't adjust
|
|
* the q->nr_elts count.
|
|
*/
|
|
static struct entry *__redist_pop_from(struct queue *q, unsigned int level)
|
|
{
|
|
struct entry *e;
|
|
|
|
for (; level < q->nr_levels; level++)
|
|
for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
|
|
if (!e->sentinel) {
|
|
l_del(q->es, q->qs + e->level, e);
|
|
return e;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static void q_set_targets_subrange_(struct queue *q, unsigned int nr_elts,
|
|
unsigned int lbegin, unsigned int lend)
|
|
{
|
|
unsigned int level, nr_levels, entries_per_level, remainder;
|
|
|
|
BUG_ON(lbegin > lend);
|
|
BUG_ON(lend > q->nr_levels);
|
|
nr_levels = lend - lbegin;
|
|
entries_per_level = safe_div(nr_elts, nr_levels);
|
|
remainder = safe_mod(nr_elts, nr_levels);
|
|
|
|
for (level = lbegin; level < lend; level++)
|
|
q->target_count[level] =
|
|
(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
|
|
}
|
|
|
|
/*
|
|
* Typically we have fewer elements in the top few levels which allows us
|
|
* to adjust the promote threshold nicely.
|
|
*/
|
|
static void q_set_targets(struct queue *q)
|
|
{
|
|
if (q->last_target_nr_elts == q->nr_elts)
|
|
return;
|
|
|
|
q->last_target_nr_elts = q->nr_elts;
|
|
|
|
if (q->nr_top_levels > q->nr_levels)
|
|
q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
|
|
|
|
else {
|
|
q_set_targets_subrange_(q, q->nr_in_top_levels,
|
|
q->nr_levels - q->nr_top_levels, q->nr_levels);
|
|
|
|
if (q->nr_in_top_levels < q->nr_elts)
|
|
q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
|
|
0, q->nr_levels - q->nr_top_levels);
|
|
else
|
|
q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
|
|
}
|
|
}
|
|
|
|
static void q_redistribute(struct queue *q)
|
|
{
|
|
unsigned int target, level;
|
|
struct ilist *l, *l_above;
|
|
struct entry *e;
|
|
|
|
q_set_targets(q);
|
|
|
|
for (level = 0u; level < q->nr_levels - 1u; level++) {
|
|
l = q->qs + level;
|
|
target = q->target_count[level];
|
|
|
|
/*
|
|
* Pull down some entries from the level above.
|
|
*/
|
|
while (l->nr_elts < target) {
|
|
e = __redist_pop_from(q, level + 1u);
|
|
if (!e) {
|
|
/* bug in nr_elts */
|
|
break;
|
|
}
|
|
|
|
e->level = level;
|
|
l_add_tail(q->es, l, e);
|
|
}
|
|
|
|
/*
|
|
* Push some entries up.
|
|
*/
|
|
l_above = q->qs + level + 1u;
|
|
while (l->nr_elts > target) {
|
|
e = l_pop_tail(q->es, l);
|
|
|
|
if (!e)
|
|
/* bug in nr_elts */
|
|
break;
|
|
|
|
e->level = level + 1u;
|
|
l_add_tail(q->es, l_above, e);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void q_requeue(struct queue *q, struct entry *e, unsigned int extra_levels,
|
|
struct entry *s1, struct entry *s2)
|
|
{
|
|
struct entry *de;
|
|
unsigned int sentinels_passed = 0;
|
|
unsigned int new_level = min(q->nr_levels - 1u, e->level + extra_levels);
|
|
|
|
/* try and find an entry to swap with */
|
|
if (extra_levels && (e->level < q->nr_levels - 1u)) {
|
|
for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
|
|
sentinels_passed++;
|
|
|
|
if (de) {
|
|
q_del(q, de);
|
|
de->level = e->level;
|
|
if (s1) {
|
|
switch (sentinels_passed) {
|
|
case 0:
|
|
q_push_before(q, s1, de);
|
|
break;
|
|
|
|
case 1:
|
|
q_push_before(q, s2, de);
|
|
break;
|
|
|
|
default:
|
|
q_push(q, de);
|
|
}
|
|
} else
|
|
q_push(q, de);
|
|
}
|
|
}
|
|
|
|
q_del(q, e);
|
|
e->level = new_level;
|
|
q_push(q, e);
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/*
 * Hit ratios are held as fixed-point fractions with FP_SHIFT fractional
 * bits, so 1.0 is (1u << FP_SHIFT).
 */
#define FP_SHIFT 8
#define SIXTEENTH (1u << (FP_SHIFT - 4u))	/* 1/16 in that representation */
#define EIGHTH (1u << (FP_SHIFT - 3u))		/* 1/8 in that representation */
|
|
|
|
/* Hit/miss counters for one queue. */
struct stats {
	unsigned int hit_threshold;	/* lowest level that counts as a hit */
	unsigned int hits;
	unsigned int misses;
};
|
|
|
|
/* Verdict returned by stats_assess(), from worst to best. */
enum performance {
	Q_POOR,
	Q_FAIR,
	Q_WELL
};
|
|
|
|
static void stats_init(struct stats *s, unsigned int nr_levels)
|
|
{
|
|
s->hit_threshold = (nr_levels * 3u) / 4u;
|
|
s->hits = 0u;
|
|
s->misses = 0u;
|
|
}
|
|
|
|
static void stats_reset(struct stats *s)
|
|
{
|
|
s->hits = s->misses = 0u;
|
|
}
|
|
|
|
static void stats_level_accessed(struct stats *s, unsigned int level)
|
|
{
|
|
if (level >= s->hit_threshold)
|
|
s->hits++;
|
|
else
|
|
s->misses++;
|
|
}
|
|
|
|
static void stats_miss(struct stats *s)
|
|
{
|
|
s->misses++;
|
|
}
|
|
|
|
/*
|
|
* There are times when we don't have any confidence in the hotspot queue.
|
|
* Such as when a fresh cache is created and the blocks have been spread
|
|
* out across the levels, or if an io load changes. We detect this by
|
|
* seeing how often a lookup is in the top levels of the hotspot queue.
|
|
*/
|
|
static enum performance stats_assess(struct stats *s)
|
|
{
|
|
unsigned int confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
|
|
|
|
if (confidence < SIXTEENTH)
|
|
return Q_POOR;
|
|
|
|
else if (confidence < EIGHTH)
|
|
return Q_FAIR;
|
|
|
|
else
|
|
return Q_WELL;
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/* Chained hash table whose buckets and chains hold entry indexes. */
struct smq_hash_table {
	struct entry_space *es;		/* array the chained entries live in */
	unsigned long long hash_bits;	/* log2 of the number of buckets */
	unsigned int *buckets;		/* index of first entry, or INDEXER_NULL */
};
|
|
|
|
/*
|
|
* All cache entries are stored in a chained hash table. To save space we
|
|
* use indexing again, and only store indexes to the next entry.
|
|
*/
|
|
static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned int nr_entries)
|
|
{
|
|
unsigned int i, nr_buckets;
|
|
|
|
ht->es = es;
|
|
nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
|
|
ht->hash_bits = __ffs(nr_buckets);
|
|
|
|
ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
|
|
if (!ht->buckets)
|
|
return -ENOMEM;
|
|
|
|
for (i = 0; i < nr_buckets; i++)
|
|
ht->buckets[i] = INDEXER_NULL;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void h_exit(struct smq_hash_table *ht)
|
|
{
|
|
vfree(ht->buckets);
|
|
}
|
|
|
|
static struct entry *h_head(struct smq_hash_table *ht, unsigned int bucket)
|
|
{
|
|
return to_entry(ht->es, ht->buckets[bucket]);
|
|
}
|
|
|
|
static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
|
|
{
|
|
return to_entry(ht->es, e->hash_next);
|
|
}
|
|
|
|
static void __h_insert(struct smq_hash_table *ht, unsigned int bucket, struct entry *e)
|
|
{
|
|
e->hash_next = ht->buckets[bucket];
|
|
ht->buckets[bucket] = to_index(ht->es, e);
|
|
}
|
|
|
|
static void h_insert(struct smq_hash_table *ht, struct entry *e)
|
|
{
|
|
unsigned int h = hash_64(from_oblock(e->oblock), ht->hash_bits);
|
|
|
|
__h_insert(ht, h, e);
|
|
}
|
|
|
|
/*
 * Walk bucket h looking for an entry whose oblock matches.
 *
 * On a match the entry is returned and *prev is the entry chained just
 * before it (NULL if it heads the chain).  On a miss NULL is returned
 * and *prev is the last entry visited, or NULL for an empty bucket.
 */
static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned int h, dm_oblock_t oblock,
				struct entry **prev)
{
	struct entry *e;

	*prev = NULL;
	e = h_head(ht, h);

	while (e && e->oblock != oblock) {
		*prev = e;
		e = h_next(ht, e);
	}

	return e;
}
|
|
|
|
static void __h_unlink(struct smq_hash_table *ht, unsigned int h,
|
|
struct entry *e, struct entry *prev)
|
|
{
|
|
if (prev)
|
|
prev->hash_next = e->hash_next;
|
|
else
|
|
ht->buckets[h] = e->hash_next;
|
|
}
|
|
|
|
/*
|
|
* Also moves each entry to the front of the bucket.
|
|
*/
|
|
static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
|
|
{
|
|
struct entry *e, *prev;
|
|
unsigned int h = hash_64(from_oblock(oblock), ht->hash_bits);
|
|
|
|
e = __h_lookup(ht, h, oblock, &prev);
|
|
if (e && prev) {
|
|
/*
|
|
* Move to the front because this entry is likely
|
|
* to be hit again.
|
|
*/
|
|
__h_unlink(ht, h, e, prev);
|
|
__h_insert(ht, h, e);
|
|
}
|
|
|
|
return e;
|
|
}
|
|
|
|
static void h_remove(struct smq_hash_table *ht, struct entry *e)
|
|
{
|
|
unsigned int h = hash_64(from_oblock(e->oblock), ht->hash_bits);
|
|
struct entry *prev;
|
|
|
|
/*
|
|
* The down side of using a singly linked list is we have to
|
|
* iterate the bucket to remove an item.
|
|
*/
|
|
e = __h_lookup(ht, h, e->oblock, &prev);
|
|
if (e)
|
|
__h_unlink(ht, h, e, prev);
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
struct entry_alloc {
|
|
struct entry_space *es;
|
|
unsigned int begin;
|
|
|
|
unsigned int nr_allocated;
|
|
struct ilist free;
|
|
};
|
|
|
|
static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
|
|
unsigned int begin, unsigned int end)
|
|
{
|
|
unsigned int i;
|
|
|
|
ea->es = es;
|
|
ea->nr_allocated = 0u;
|
|
ea->begin = begin;
|
|
|
|
l_init(&ea->free);
|
|
for (i = begin; i != end; i++)
|
|
l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
|
|
}
|
|
|
|
static void init_entry(struct entry *e)
|
|
{
|
|
/*
|
|
* We can't memset because that would clear the hotspot and
|
|
* sentinel bits which remain constant.
|
|
*/
|
|
e->hash_next = INDEXER_NULL;
|
|
e->next = INDEXER_NULL;
|
|
e->prev = INDEXER_NULL;
|
|
e->level = 0u;
|
|
e->dirty = true; /* FIXME: audit */
|
|
e->allocated = true;
|
|
e->sentinel = false;
|
|
e->pending_work = false;
|
|
}
|
|
|
|
static struct entry *alloc_entry(struct entry_alloc *ea)
|
|
{
|
|
struct entry *e;
|
|
|
|
if (l_empty(&ea->free))
|
|
return NULL;
|
|
|
|
e = l_pop_head(ea->es, &ea->free);
|
|
init_entry(e);
|
|
ea->nr_allocated++;
|
|
|
|
return e;
|
|
}
|
|
|
|
/*
|
|
* This assumes the cblock hasn't already been allocated.
|
|
*/
|
|
static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned int i)
|
|
{
|
|
struct entry *e = __get_entry(ea->es, ea->begin + i);
|
|
|
|
BUG_ON(e->allocated);
|
|
|
|
l_del(ea->es, &ea->free, e);
|
|
init_entry(e);
|
|
ea->nr_allocated++;
|
|
|
|
return e;
|
|
}
|
|
|
|
static void free_entry(struct entry_alloc *ea, struct entry *e)
|
|
{
|
|
BUG_ON(!ea->nr_allocated);
|
|
BUG_ON(!e->allocated);
|
|
|
|
ea->nr_allocated--;
|
|
e->allocated = false;
|
|
l_add_tail(ea->es, &ea->free, e);
|
|
}
|
|
|
|
static bool allocator_empty(struct entry_alloc *ea)
|
|
{
|
|
return l_empty(&ea->free);
|
|
}
|
|
|
|
static unsigned int get_index(struct entry_alloc *ea, struct entry *e)
|
|
{
|
|
return to_index(ea->es, e) - ea->begin;
|
|
}
|
|
|
|
static struct entry *get_entry(struct entry_alloc *ea, unsigned int index)
|
|
{
|
|
return __get_entry(ea->es, ea->begin + index);
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/* Number of levels in the hotspot queue and in the clean/dirty queues. */
#define NR_HOTSPOT_LEVELS 64u
#define NR_CACHE_LEVELS 64u

/* Intervals, in jiffies, at which each sentinel set is flipped. */
#define WRITEBACK_PERIOD (10ul * HZ)
#define DEMOTE_PERIOD (60ul * HZ)

/* Minimum intervals, in jiffies, between redistributions of the queues. */
#define HOTSPOT_UPDATE_PERIOD (HZ)
#define CACHE_UPDATE_PERIOD (60ul * HZ)
|
|
|
|
struct smq_policy {
|
|
struct dm_cache_policy policy;
|
|
|
|
/* protects everything */
|
|
spinlock_t lock;
|
|
dm_cblock_t cache_size;
|
|
sector_t cache_block_size;
|
|
|
|
sector_t hotspot_block_size;
|
|
unsigned int nr_hotspot_blocks;
|
|
unsigned int cache_blocks_per_hotspot_block;
|
|
unsigned int hotspot_level_jump;
|
|
|
|
struct entry_space es;
|
|
struct entry_alloc writeback_sentinel_alloc;
|
|
struct entry_alloc demote_sentinel_alloc;
|
|
struct entry_alloc hotspot_alloc;
|
|
struct entry_alloc cache_alloc;
|
|
|
|
unsigned long *hotspot_hit_bits;
|
|
unsigned long *cache_hit_bits;
|
|
|
|
/*
|
|
* We maintain three queues of entries. The cache proper,
|
|
* consisting of a clean and dirty queue, containing the currently
|
|
* active mappings. The hotspot queue uses a larger block size to
|
|
* track blocks that are being hit frequently and potential
|
|
* candidates for promotion to the cache.
|
|
*/
|
|
struct queue hotspot;
|
|
struct queue clean;
|
|
struct queue dirty;
|
|
|
|
struct stats hotspot_stats;
|
|
struct stats cache_stats;
|
|
|
|
/*
|
|
* Keeps track of time, incremented by the core. We use this to
|
|
* avoid attributing multiple hits within the same tick.
|
|
*/
|
|
unsigned int tick;
|
|
|
|
/*
|
|
* The hash tables allows us to quickly find an entry by origin
|
|
* block.
|
|
*/
|
|
struct smq_hash_table table;
|
|
struct smq_hash_table hotspot_table;
|
|
|
|
bool current_writeback_sentinels;
|
|
unsigned long next_writeback_period;
|
|
|
|
bool current_demote_sentinels;
|
|
unsigned long next_demote_period;
|
|
|
|
unsigned int write_promote_level;
|
|
unsigned int read_promote_level;
|
|
|
|
unsigned long next_hotspot_period;
|
|
unsigned long next_cache_period;
|
|
|
|
struct background_tracker *bg_work;
|
|
|
|
bool migrations_allowed:1;
|
|
|
|
/*
|
|
* If this is set the policy will try and clean the whole cache
|
|
* even if the device is not idle.
|
|
*/
|
|
bool cleaner:1;
|
|
};
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/*
 * Fetch the sentinel for a level from one of the two sets held by ea.
 * The set chosen by which == true occupies the first NR_CACHE_LEVELS
 * slots; the other set follows it.
 */
static struct entry *get_sentinel(struct entry_alloc *ea, unsigned int level, bool which)
{
	unsigned int slot = level;

	if (!which)
		slot = NR_CACHE_LEVELS + level;

	return get_entry(ea, slot);
}
|
|
|
|
static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned int level)
|
|
{
|
|
return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
|
|
}
|
|
|
|
static struct entry *demote_sentinel(struct smq_policy *mq, unsigned int level)
|
|
{
|
|
return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
|
|
}
|
|
|
|
static void __update_writeback_sentinels(struct smq_policy *mq)
|
|
{
|
|
unsigned int level;
|
|
struct queue *q = &mq->dirty;
|
|
struct entry *sentinel;
|
|
|
|
for (level = 0; level < q->nr_levels; level++) {
|
|
sentinel = writeback_sentinel(mq, level);
|
|
q_del(q, sentinel);
|
|
q_push(q, sentinel);
|
|
}
|
|
}
|
|
|
|
static void __update_demote_sentinels(struct smq_policy *mq)
|
|
{
|
|
unsigned int level;
|
|
struct queue *q = &mq->clean;
|
|
struct entry *sentinel;
|
|
|
|
for (level = 0; level < q->nr_levels; level++) {
|
|
sentinel = demote_sentinel(mq, level);
|
|
q_del(q, sentinel);
|
|
q_push(q, sentinel);
|
|
}
|
|
}
|
|
|
|
static void update_sentinels(struct smq_policy *mq)
|
|
{
|
|
if (time_after(jiffies, mq->next_writeback_period)) {
|
|
mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
|
|
mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
|
|
__update_writeback_sentinels(mq);
|
|
}
|
|
|
|
if (time_after(jiffies, mq->next_demote_period)) {
|
|
mq->next_demote_period = jiffies + DEMOTE_PERIOD;
|
|
mq->current_demote_sentinels = !mq->current_demote_sentinels;
|
|
__update_demote_sentinels(mq);
|
|
}
|
|
}
|
|
|
|
static void __sentinels_init(struct smq_policy *mq)
|
|
{
|
|
unsigned int level;
|
|
struct entry *sentinel;
|
|
|
|
for (level = 0; level < NR_CACHE_LEVELS; level++) {
|
|
sentinel = writeback_sentinel(mq, level);
|
|
sentinel->level = level;
|
|
q_push(&mq->dirty, sentinel);
|
|
|
|
sentinel = demote_sentinel(mq, level);
|
|
sentinel->level = level;
|
|
q_push(&mq->clean, sentinel);
|
|
}
|
|
}
|
|
|
|
static void sentinels_init(struct smq_policy *mq)
|
|
{
|
|
mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
|
|
mq->next_demote_period = jiffies + DEMOTE_PERIOD;
|
|
|
|
mq->current_writeback_sentinels = false;
|
|
mq->current_demote_sentinels = false;
|
|
__sentinels_init(mq);
|
|
|
|
mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
|
|
mq->current_demote_sentinels = !mq->current_demote_sentinels;
|
|
__sentinels_init(mq);
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
static void del_queue(struct smq_policy *mq, struct entry *e)
|
|
{
|
|
q_del(e->dirty ? &mq->dirty : &mq->clean, e);
|
|
}
|
|
|
|
static void push_queue(struct smq_policy *mq, struct entry *e)
|
|
{
|
|
if (e->dirty)
|
|
q_push(&mq->dirty, e);
|
|
else
|
|
q_push(&mq->clean, e);
|
|
}
|
|
|
|
// !h, !q, a -> h, q, a
|
|
static void push(struct smq_policy *mq, struct entry *e)
|
|
{
|
|
h_insert(&mq->table, e);
|
|
if (!e->pending_work)
|
|
push_queue(mq, e);
|
|
}
|
|
|
|
static void push_queue_front(struct smq_policy *mq, struct entry *e)
|
|
{
|
|
if (e->dirty)
|
|
q_push_front(&mq->dirty, e);
|
|
else
|
|
q_push_front(&mq->clean, e);
|
|
}
|
|
|
|
static void push_front(struct smq_policy *mq, struct entry *e)
|
|
{
|
|
h_insert(&mq->table, e);
|
|
if (!e->pending_work)
|
|
push_queue_front(mq, e);
|
|
}
|
|
|
|
static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
|
|
{
|
|
return to_cblock(get_index(&mq->cache_alloc, e));
|
|
}
|
|
|
|
static void requeue(struct smq_policy *mq, struct entry *e)
|
|
{
|
|
/*
|
|
* Pending work has temporarily been taken out of the queues.
|
|
*/
|
|
if (e->pending_work)
|
|
return;
|
|
|
|
if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
|
|
if (!e->dirty) {
|
|
q_requeue(&mq->clean, e, 1u, NULL, NULL);
|
|
return;
|
|
}
|
|
|
|
q_requeue(&mq->dirty, e, 1u,
|
|
get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
|
|
get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
|
|
}
|
|
}
|
|
|
|
static unsigned int default_promote_level(struct smq_policy *mq)
|
|
{
|
|
/*
|
|
* The promote level depends on the current performance of the
|
|
* cache.
|
|
*
|
|
* If the cache is performing badly, then we can't afford
|
|
* to promote much without causing performance to drop below that
|
|
* of the origin device.
|
|
*
|
|
* If the cache is performing well, then we don't need to promote
|
|
* much. If it isn't broken, don't fix it.
|
|
*
|
|
* If the cache is middling then we promote more.
|
|
*
|
|
* This scheme reminds me of a graph of entropy vs probability of a
|
|
* binary variable.
|
|
*/
|
|
static const unsigned int table[] = {
|
|
1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1
|
|
};
|
|
|
|
unsigned int hits = mq->cache_stats.hits;
|
|
unsigned int misses = mq->cache_stats.misses;
|
|
unsigned int index = safe_div(hits << 4u, hits + misses);
|
|
return table[index];
|
|
}
|
|
|
|
static void update_promote_levels(struct smq_policy *mq)
|
|
{
|
|
/*
|
|
* If there are unused cache entries then we want to be really
|
|
* eager to promote.
|
|
*/
|
|
unsigned int threshold_level = allocator_empty(&mq->cache_alloc) ?
|
|
default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
|
|
|
|
threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
|
|
|
|
/*
|
|
* If the hotspot queue is performing badly then we have little
|
|
* confidence that we know which blocks to promote. So we cut down
|
|
* the amount of promotions.
|
|
*/
|
|
switch (stats_assess(&mq->hotspot_stats)) {
|
|
case Q_POOR:
|
|
threshold_level /= 4u;
|
|
break;
|
|
|
|
case Q_FAIR:
|
|
threshold_level /= 2u;
|
|
break;
|
|
|
|
case Q_WELL:
|
|
break;
|
|
}
|
|
|
|
mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
|
|
mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
|
|
}
|
|
|
|
/*
|
|
* If the hotspot queue is performing badly, then we try and move entries
|
|
* around more quickly.
|
|
*/
|
|
static void update_level_jump(struct smq_policy *mq)
|
|
{
|
|
switch (stats_assess(&mq->hotspot_stats)) {
|
|
case Q_POOR:
|
|
mq->hotspot_level_jump = 4u;
|
|
break;
|
|
|
|
case Q_FAIR:
|
|
mq->hotspot_level_jump = 2u;
|
|
break;
|
|
|
|
case Q_WELL:
|
|
mq->hotspot_level_jump = 1u;
|
|
break;
|
|
}
|
|
}
|
|
|
|
static void end_hotspot_period(struct smq_policy *mq)
|
|
{
|
|
clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
|
|
update_promote_levels(mq);
|
|
|
|
if (time_after(jiffies, mq->next_hotspot_period)) {
|
|
update_level_jump(mq);
|
|
q_redistribute(&mq->hotspot);
|
|
stats_reset(&mq->hotspot_stats);
|
|
mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
|
|
}
|
|
}
|
|
|
|
static void end_cache_period(struct smq_policy *mq)
|
|
{
|
|
if (time_after(jiffies, mq->next_cache_period)) {
|
|
clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
|
|
|
|
q_redistribute(&mq->dirty);
|
|
q_redistribute(&mq->clean);
|
|
stats_reset(&mq->cache_stats);
|
|
|
|
mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
|
|
}
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/*
 * Targets, each expressed as a percentage of the cache size.
 */
#define CLEAN_TARGET 25u
#define FREE_TARGET 25u
|
|
|
|
static unsigned int percent_to_target(struct smq_policy *mq, unsigned int p)
|
|
{
|
|
return from_cblock(mq->cache_size) * p / 100u;
|
|
}
|
|
|
|
/*
 * Decide whether enough of the cache is clean.
 *
 * While busy, and not running as the cleaner, cleaning is not a concern
 * and the target always counts as met.  When idle, or when the cleaner
 * flag is set, the aim is to clean everything, so the target is met only
 * once the dirty queue is empty.  The clean queue's size is not used
 * because cache entries may not be populated.
 */
static bool clean_target_met(struct smq_policy *mq, bool idle)
{
	if (!idle && !mq->cleaner)
		return true;

	return q_size(&mq->dirty) == 0u;
}
|
|
|
|
static bool free_target_met(struct smq_policy *mq)
|
|
{
|
|
unsigned int nr_free;
|
|
|
|
nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
|
|
return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
|
|
percent_to_target(mq, FREE_TARGET);
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
static void mark_pending(struct smq_policy *mq, struct entry *e)
|
|
{
|
|
BUG_ON(e->sentinel);
|
|
BUG_ON(!e->allocated);
|
|
BUG_ON(e->pending_work);
|
|
e->pending_work = true;
|
|
}
|
|
|
|
static void clear_pending(struct smq_policy *mq, struct entry *e)
|
|
{
|
|
BUG_ON(!e->pending_work);
|
|
e->pending_work = false;
|
|
}
|
|
|
|
/*
 * Pick a dirty entry and queue a writeback for it with the background
 * tracker.  Sentinels in the dirty queue are crossed only when idle is
 * true.  If the tracker refuses the work, the entry is put back at the
 * front of its level.
 */
static void queue_writeback(struct smq_policy *mq, bool idle)
{
	struct policy_work work;
	struct entry *e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);

	if (!e)
		return;

	mark_pending(mq, e);
	q_del(&mq->dirty, e);

	work.op = POLICY_WRITEBACK;
	work.oblock = e->oblock;
	work.cblock = infer_cblock(mq, e);

	if (btracker_queue(mq->bg_work, &work, NULL)) {
		/* not accepted: undo */
		clear_pending(mq, e);
		q_push_front(&mq->dirty, e);
	}
}
|
|
|
|
static void queue_demotion(struct smq_policy *mq)
|
|
{
|
|
int r;
|
|
struct policy_work work;
|
|
struct entry *e;
|
|
|
|
if (WARN_ON_ONCE(!mq->migrations_allowed))
|
|
return;
|
|
|
|
e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
|
|
if (!e) {
|
|
if (!clean_target_met(mq, true))
|
|
queue_writeback(mq, false);
|
|
return;
|
|
}
|
|
|
|
mark_pending(mq, e);
|
|
q_del(&mq->clean, e);
|
|
|
|
work.op = POLICY_DEMOTE;
|
|
work.oblock = e->oblock;
|
|
work.cblock = infer_cblock(mq, e);
|
|
r = btracker_queue(mq->bg_work, &work, NULL);
|
|
if (r) {
|
|
clear_pending(mq, e);
|
|
q_push_front(&mq->clean, e);
|
|
}
|
|
}
|
|
|
|
/*
 * Try to queue the promotion of oblock into the cache.
 *
 * Nothing happens if migrations are not allowed or a promotion of this
 * block is already queued.  If the cache has no free entry, a demotion
 * is queued instead (provided the free target is not met) and the
 * promotion is dropped.  On success *workp is filled in by the tracker.
 */
static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
			    struct policy_work **workp)
{
	struct policy_work work;
	struct entry *e;

	if (!mq->migrations_allowed)
		return;

	if (allocator_empty(&mq->cache_alloc)) {
		/*
		 * We always claim to be 'idle' here so that some demotions
		 * still happen under a continuous load.
		 */
		if (!free_target_met(mq))
			queue_demotion(mq);
		return;
	}

	if (btracker_promotion_already_present(mq->bg_work, oblock))
		return;

	/*
	 * Allocating the entry up front reserves the cblock.  Should the
	 * background work be aborted, the entry has to be freed again.
	 */
	e = alloc_entry(&mq->cache_alloc);
	BUG_ON(!e);
	e->pending_work = true;

	work.op = POLICY_PROMOTE;
	work.oblock = oblock;
	work.cblock = infer_cblock(mq, e);

	if (btracker_queue(mq->bg_work, &work, workp))
		free_entry(&mq->cache_alloc, e);
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/* Verdict produced by should_promote(). */
enum promote_result {
	/* Do not promote the block. */
	PROMOTE_NOT,
	/* Returned for writes when fast_promote is set and a cache entry is free. */
	PROMOTE_TEMPORARY,
	/* Returned when the hotspot level reaches the relevant promote level. */
	PROMOTE_PERMANENT,
};
|
|
|
|
/*
|
|
* Converts a boolean into a promote result.
|
|
*/
|
|
static enum promote_result maybe_promote(bool promote)
|
|
{
|
|
return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
|
|
}
|
|
|
|
/*
 * Decides whether a missed block should be promoted.
 *
 * @hs_e:         hotspot entry covering the block; its level is compared
 *                against the read or write promote level.
 * @data_dir:     WRITE selects the write rules, anything else the read rules.
 * @fast_promote: for writes only, yields PROMOTE_TEMPORARY as long as the
 *                cache allocator still has a free entry.
 */
static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
					  int data_dir, bool fast_promote)
{
	if (data_dir != WRITE)
		return maybe_promote(hs_e->level >= mq->read_promote_level);

	if (!allocator_empty(&mq->cache_alloc) && fast_promote)
		return PROMOTE_TEMPORARY;

	return maybe_promote(hs_e->level >= mq->write_promote_level);
}
|
|
|
|
/*
 * Converts origin block @b into the hotspot block that contains it by
 * dividing by the number of cache blocks per hotspot block.
 */
static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
{
	sector_t hblock = from_oblock(b);

	/* sector_div() leaves the quotient in hblock; its return value is dropped. */
	(void) sector_div(hblock, mq->cache_blocks_per_hotspot_block);

	return to_oblock(hblock);
}
|
|
|
|
/*
 * Records an access to the hotspot block containing origin block @b.
 *
 * If the hotspot block is already tracked, its entry is requeued; the
 * level jump is mq->hotspot_level_jump the first time the entry's hit bit
 * is set and 0 while the bit is already set.  Otherwise an entry is
 * allocated (or, failing that, recycled by popping one from the hotspot
 * queue) and inserted for this hotspot block.
 *
 * Returns the entry for the hotspot block, or NULL if no entry could be
 * allocated or recycled.
 */
static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
{
	unsigned int hi;
	unsigned int jump;
	dm_oblock_t hb = to_hblock(mq, b);
	struct entry *e = h_lookup(&mq->hotspot_table, hb);

	if (e) {
		/* Already tracked: account for the hit and requeue. */
		stats_level_accessed(&mq->hotspot_stats, e->level);

		hi = get_index(&mq->hotspot_alloc, e);
		jump = test_and_set_bit(hi, mq->hotspot_hit_bits) ?
			0u : mq->hotspot_level_jump;
		q_requeue(&mq->hotspot, e, jump, NULL, NULL);

		return e;
	}

	stats_miss(&mq->hotspot_stats);

	e = alloc_entry(&mq->hotspot_alloc);
	if (!e) {
		/* No free entry: recycle one popped from the hotspot queue. */
		e = q_pop(&mq->hotspot);
		if (!e)
			return NULL;

		h_remove(&mq->hotspot_table, e);
		hi = get_index(&mq->hotspot_alloc, e);
		clear_bit(hi, mq->hotspot_hit_bits);
	}

	e->oblock = hb;
	q_push(&mq->hotspot, e);
	h_insert(&mq->hotspot_table, e);

	return e;
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/*
|
|
* Public interface, via the policy struct. See dm-cache-policy.h for a
|
|
* description of these.
|
|
*/
|
|
|
|
static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
|
|
{
|
|
return container_of(p, struct smq_policy, policy);
|
|
}
|
|
|
|
static void smq_destroy(struct dm_cache_policy *p)
|
|
{
|
|
struct smq_policy *mq = to_smq_policy(p);
|
|
|
|
btracker_destroy(mq->bg_work);
|
|
h_exit(&mq->hotspot_table);
|
|
h_exit(&mq->table);
|
|
free_bitset(mq->hotspot_hit_bits);
|
|
free_bitset(mq->cache_hit_bits);
|
|
space_exit(&mq->es);
|
|
kfree(mq);
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
/*
 * Looks up @oblock in the cache table.
 *
 * On a hit the entry is requeued, *@cblock is set and 0 is returned.
 * On a miss the hotspot queue is updated, a promotion may be requested,
 * and -ENOENT is returned.
 *
 * *@background_work is set to true whenever should_promote() asks for a
 * promotion, and to false otherwise; queue_promotion() may still decline
 * to queue anything.  @work is forwarded to queue_promotion().
 */
static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
		    int data_dir, bool fast_copy,
		    struct policy_work **work, bool *background_work)
{
	struct entry *hit, *hs_e;

	*background_work = false;

	hit = h_lookup(&mq->table, oblock);
	if (hit) {
		stats_level_accessed(&mq->cache_stats, hit->level);

		requeue(mq, hit);
		*cblock = infer_cblock(mq, hit);
		return 0;
	}

	stats_miss(&mq->cache_stats);

	/* The hotspot queue only gets updated with misses. */
	hs_e = update_hotspot_queue(mq, oblock);

	if (should_promote(mq, hs_e, data_dir, fast_copy) != PROMOTE_NOT) {
		queue_promotion(mq, oblock, work);
		*background_work = true;
	}

	return -ENOENT;
}
|
|
|
|
/*
 * Policy 'lookup' hook: runs __lookup() under mq->lock without asking for
 * the queued work item back (NULL is passed for the work pointer).
 *
 * Returns whatever __lookup() returns.
 */
static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
		      int data_dir, bool fast_copy,
		      bool *background_work)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;
	int r;

	spin_lock_irqsave(&mq->lock, flags);
	r = __lookup(mq, oblock, cblock, data_dir, fast_copy,
		     NULL, background_work);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}
|
|
|
|
static int smq_lookup_with_work(struct dm_cache_policy *p,
|
|
dm_oblock_t oblock, dm_cblock_t *cblock,
|
|
int data_dir, bool fast_copy,
|
|
struct policy_work **work)
|
|
{
|
|
int r;
|
|
bool background_queued;
|
|
unsigned long flags;
|
|
struct smq_policy *mq = to_smq_policy(p);
|
|
|
|
spin_lock_irqsave(&mq->lock, flags);
|
|
r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
|
|
spin_unlock_irqrestore(&mq->lock, flags);
|
|
|
|
return r;
|
|
}
|
|
|
|
/*
 * Policy 'get_background_work' hook.
 *
 * Asks the background tracker for a work item.  If the tracker reports
 * -ENODATA and the clean target is not met, one writeback is queued and
 * the tracker is asked once more.  Everything runs under mq->lock.
 *
 * Returns the result of the last btracker_issue() call.
 */
static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
				   struct policy_work **result)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;
	int r;

	spin_lock_irqsave(&mq->lock, flags);

	r = btracker_issue(mq->bg_work, result);
	if (r == -ENODATA && !clean_target_met(mq, idle)) {
		queue_writeback(mq, idle);
		r = btracker_issue(mq->bg_work, result);
	}

	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}
|
|
|
|
/*
|
|
* We need to clear any pending work flags that have been set, and in the
|
|
* case of promotion free the entry for the destination cblock.
|
|
*/
|
|
static void __complete_background_work(struct smq_policy *mq,
|
|
struct policy_work *work,
|
|
bool success)
|
|
{
|
|
struct entry *e = get_entry(&mq->cache_alloc,
|
|
from_cblock(work->cblock));
|
|
|
|
switch (work->op) {
|
|
case POLICY_PROMOTE:
|
|
// !h, !q, a
|
|
clear_pending(mq, e);
|
|
if (success) {
|
|
e->oblock = work->oblock;
|
|
e->level = NR_CACHE_LEVELS - 1;
|
|
push(mq, e);
|
|
// h, q, a
|
|
} else {
|
|
free_entry(&mq->cache_alloc, e);
|
|
// !h, !q, !a
|
|
}
|
|
break;
|
|
|
|
case POLICY_DEMOTE:
|
|
// h, !q, a
|
|
if (success) {
|
|
h_remove(&mq->table, e);
|
|
free_entry(&mq->cache_alloc, e);
|
|
// !h, !q, !a
|
|
} else {
|
|
clear_pending(mq, e);
|
|
push_queue(mq, e);
|
|
// h, q, a
|
|
}
|
|
break;
|
|
|
|
case POLICY_WRITEBACK:
|
|
// h, !q, a
|
|
clear_pending(mq, e);
|
|
push_queue(mq, e);
|
|
// h, q, a
|
|
break;
|
|
}
|
|
|
|
btracker_complete(mq->bg_work, work);
|
|
}
|
|
|
|
static void smq_complete_background_work(struct dm_cache_policy *p,
|
|
struct policy_work *work,
|
|
bool success)
|
|
{
|
|
unsigned long flags;
|
|
struct smq_policy *mq = to_smq_policy(p);
|
|
|
|
spin_lock_irqsave(&mq->lock, flags);
|
|
__complete_background_work(mq, work, success);
|
|
spin_unlock_irqrestore(&mq->lock, flags);
|
|
}
|
|
|
|
// in_hash(oblock) -> in_hash(oblock)
|
|
static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
|
|
{
|
|
struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
|
|
|
|
if (e->pending_work)
|
|
e->dirty = set;
|
|
else {
|
|
del_queue(mq, e);
|
|
e->dirty = set;
|
|
push_queue(mq, e);
|
|
}
|
|
}
|
|
|
|
/* Policy 'set_dirty' hook: marks the entry at @cblock dirty under mq->lock. */
static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->lock, flags);
	__smq_set_clear_dirty(mq, cblock, true);
	spin_unlock_irqrestore(&mq->lock, flags);
}
|
|
|
|
/* Policy 'clear_dirty' hook: marks the entry at @cblock clean under mq->lock. */
static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	__smq_set_clear_dirty(mq, cblock, false);
	spin_unlock_irqrestore(&mq->lock, flags);
}
|
|
|
|
/*
 * Derives a level for @cblock from a hash of its block number, masked
 * with NR_CACHE_LEVELS - 1.  The same cblock always gets the same level.
 */
static unsigned int random_level(dm_cblock_t cblock)
{
	unsigned int hashed = hash_32(from_cblock(cblock), 9);

	return hashed & (NR_CACHE_LEVELS - 1);
}
|
|
|
|
static int smq_load_mapping(struct dm_cache_policy *p,
|
|
dm_oblock_t oblock, dm_cblock_t cblock,
|
|
bool dirty, uint32_t hint, bool hint_valid)
|
|
{
|
|
struct smq_policy *mq = to_smq_policy(p);
|
|
struct entry *e;
|
|
|
|
e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
|
|
e->oblock = oblock;
|
|
e->dirty = dirty;
|
|
e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
|
|
e->pending_work = false;
|
|
|
|
/*
|
|
* When we load mappings we push ahead of both sentinels in order to
|
|
* allow demotions and cleaning to occur immediately.
|
|
*/
|
|
push_front(mq, e);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Policy 'invalidate_mapping' hook: removes the mapping held in @cblock.
 *
 * Returns -ENODATA if the entry is not allocated, otherwise removes it
 * from its queue and from the hash table, frees it and returns 0.
 */
static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e;

	e = get_entry(&mq->cache_alloc, from_cblock(cblock));
	if (!e->allocated)
		return -ENODATA;

	/* FIXME: what if this block has pending background work? */
	del_queue(mq, e);
	h_remove(&mq->table, e);
	free_entry(&mq->cache_alloc, e);

	return 0;
}
|
|
|
|
/*
 * Policy 'get_hint' hook: the hint for @cblock is the level of its entry,
 * or 0 when the entry is not allocated.
 */
static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e;

	e = get_entry(&mq->cache_alloc, from_cblock(cblock));
	if (e->allocated)
		return e->level;

	return 0;
}
|
|
|
|
static dm_cblock_t smq_residency(struct dm_cache_policy *p)
|
|
{
|
|
dm_cblock_t r;
|
|
unsigned long flags;
|
|
struct smq_policy *mq = to_smq_policy(p);
|
|
|
|
spin_lock_irqsave(&mq->lock, flags);
|
|
r = to_cblock(mq->cache_alloc.nr_allocated);
|
|
spin_unlock_irqrestore(&mq->lock, flags);
|
|
|
|
return r;
|
|
}
|
|
|
|
/*
 * Policy 'tick' hook: advances the tick counter, then updates the
 * sentinels and runs the end-of-period handlers for the hotspot queue and
 * the cache, all under mq->lock.  @can_block is not used.
 */
static void smq_tick(struct dm_cache_policy *p, bool can_block)
{
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);

	mq->tick++;
	update_sentinels(mq);
	end_hotspot_period(mq);
	end_cache_period(mq);

	spin_unlock_irqrestore(&mq->lock, flags);
}
|
|
|
|
/*
 * Policy 'allow_migrations' hook: records whether promotions and
 * demotions may be queued.  The flag is written without taking mq->lock.
 */
static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
{
	to_smq_policy(p)->migrations_allowed = allow;
}
|
|
|
|
/*
|
|
* smq has no config values, but the old mq policy did. To avoid breaking
|
|
* software we continue to accept these configurables for the mq policy,
|
|
* but they have no effect.
|
|
*/
|
|
static int mq_set_config_value(struct dm_cache_policy *p,
|
|
const char *key, const char *value)
|
|
{
|
|
unsigned long tmp;
|
|
|
|
if (kstrtoul(value, 10, &tmp))
|
|
return -EINVAL;
|
|
|
|
if (!strcasecmp(key, "random_threshold") ||
|
|
!strcasecmp(key, "sequential_threshold") ||
|
|
!strcasecmp(key, "discard_promote_adjustment") ||
|
|
!strcasecmp(key, "read_promote_adjustment") ||
|
|
!strcasecmp(key, "write_promote_adjustment")) {
|
|
DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
|
|
return 0;
|
|
}
|
|
|
|
return -EINVAL;
|
|
}
|
|
|
|
/*
 * Emits the legacy mq tunables, each with the value 0, into @result.
 *
 * The leading "10" equals the number of words that follow (five key/value
 * pairs).  *@sz_ptr is the running output offset: it is read on entry and
 * updated on exit.  Always returns 0.
 *
 * Note: DMEMIT() is used with the local 'sz' and the parameters 'result'
 * and 'maxlen', so those names must stay as they are.
 */
static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
				 unsigned int maxlen, ssize_t *sz_ptr)
{
	ssize_t sz = *sz_ptr;

	DMEMIT("10 random_threshold 0 "
	       "sequential_threshold 0 "
	       "discard_promote_adjustment 0 "
	       "read_promote_adjustment 0 "
	       "write_promote_adjustment 0 ");

	*sz_ptr = sz;

	return 0;
}
|
|
|
|
/* Init the policy plugin interface function pointers. */
|
|
static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
|
|
{
|
|
mq->policy.destroy = smq_destroy;
|
|
mq->policy.lookup = smq_lookup;
|
|
mq->policy.lookup_with_work = smq_lookup_with_work;
|
|
mq->policy.get_background_work = smq_get_background_work;
|
|
mq->policy.complete_background_work = smq_complete_background_work;
|
|
mq->policy.set_dirty = smq_set_dirty;
|
|
mq->policy.clear_dirty = smq_clear_dirty;
|
|
mq->policy.load_mapping = smq_load_mapping;
|
|
mq->policy.invalidate_mapping = smq_invalidate_mapping;
|
|
mq->policy.get_hint = smq_get_hint;
|
|
mq->policy.residency = smq_residency;
|
|
mq->policy.tick = smq_tick;
|
|
mq->policy.allow_migrations = smq_allow_migrations;
|
|
|
|
if (mimic_mq) {
|
|
mq->policy.set_config_value = mq_set_config_value;
|
|
mq->policy.emit_config_values = mq_emit_config_values;
|
|
}
|
|
}
|
|
|
|
/*
 * Returns true when @nr_hotspot_blocks hotspot blocks of
 * @hotspot_block_size would cover more than @origin_size.
 */
static bool too_many_hotspot_blocks(sector_t origin_size,
				    sector_t hotspot_block_size,
				    unsigned int nr_hotspot_blocks)
{
	sector_t covered = hotspot_block_size * nr_hotspot_blocks;

	return covered > origin_size;
}
|
|
|
|
/*
 * Chooses the hotspot block size and the number of hotspot blocks.
 *
 * The number of hotspot blocks is a quarter of the cache blocks, but at
 * least 1024.  The hotspot block size starts at 16 cache blocks and is
 * halved while it is still larger than one cache block and the hotspot
 * blocks would cover more than the origin.
 */
static void calc_hotspot_params(sector_t origin_size,
				sector_t cache_block_size,
				unsigned int nr_cache_blocks,
				sector_t *hotspot_block_size,
				unsigned int *nr_hotspot_blocks)
{
	sector_t hs_size = cache_block_size * 16u;
	unsigned int nr_hs = max(nr_cache_blocks / 4u, 1024u);

	while ((hs_size > cache_block_size) &&
	       too_many_hotspot_blocks(origin_size, hs_size, nr_hs))
		hs_size /= 2u;

	*hotspot_block_size = hs_size;
	*nr_hotspot_blocks = nr_hs;
}
|
|
|
|
static struct dm_cache_policy *
|
|
__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size,
|
|
bool mimic_mq, bool migrations_allowed, bool cleaner)
|
|
{
|
|
unsigned int i;
|
|
unsigned int nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
|
|
unsigned int total_sentinels = 2u * nr_sentinels_per_queue;
|
|
struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
|
|
|
|
if (!mq)
|
|
return NULL;
|
|
|
|
init_policy_functions(mq, mimic_mq);
|
|
mq->cache_size = cache_size;
|
|
mq->cache_block_size = cache_block_size;
|
|
|
|
calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
|
|
&mq->hotspot_block_size, &mq->nr_hotspot_blocks);
|
|
|
|
mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
|
|
mq->hotspot_level_jump = 1u;
|
|
if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
|
|
DMERR("couldn't initialize entry space");
|
|
goto bad_pool_init;
|
|
}
|
|
|
|
init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
|
|
for (i = 0; i < nr_sentinels_per_queue; i++)
|
|
get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
|
|
|
|
init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
|
|
for (i = 0; i < nr_sentinels_per_queue; i++)
|
|
get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
|
|
|
|
init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
|
|
total_sentinels + mq->nr_hotspot_blocks);
|
|
|
|
init_allocator(&mq->cache_alloc, &mq->es,
|
|
total_sentinels + mq->nr_hotspot_blocks,
|
|
total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
|
|
|
|
mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
|
|
if (!mq->hotspot_hit_bits) {
|
|
DMERR("couldn't allocate hotspot hit bitset");
|
|
goto bad_hotspot_hit_bits;
|
|
}
|
|
clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
|
|
|
|
if (from_cblock(cache_size)) {
|
|
mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
|
|
if (!mq->cache_hit_bits) {
|
|
DMERR("couldn't allocate cache hit bitset");
|
|
goto bad_cache_hit_bits;
|
|
}
|
|
clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
|
|
} else
|
|
mq->cache_hit_bits = NULL;
|
|
|
|
mq->tick = 0;
|
|
spin_lock_init(&mq->lock);
|
|
|
|
q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
|
|
mq->hotspot.nr_top_levels = 8;
|
|
mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
|
|
from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
|
|
|
|
q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
|
|
q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
|
|
|
|
stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
|
|
stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
|
|
|
|
if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
|
|
goto bad_alloc_table;
|
|
|
|
if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
|
|
goto bad_alloc_hotspot_table;
|
|
|
|
sentinels_init(mq);
|
|
mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
|
|
|
|
mq->next_hotspot_period = jiffies;
|
|
mq->next_cache_period = jiffies;
|
|
|
|
mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
|
|
if (!mq->bg_work)
|
|
goto bad_btracker;
|
|
|
|
mq->migrations_allowed = migrations_allowed;
|
|
mq->cleaner = cleaner;
|
|
|
|
return &mq->policy;
|
|
|
|
bad_btracker:
|
|
h_exit(&mq->hotspot_table);
|
|
bad_alloc_hotspot_table:
|
|
h_exit(&mq->table);
|
|
bad_alloc_table:
|
|
free_bitset(mq->cache_hit_bits);
|
|
bad_cache_hit_bits:
|
|
free_bitset(mq->hotspot_hit_bits);
|
|
bad_hotspot_hit_bits:
|
|
space_exit(&mq->es);
|
|
bad_pool_init:
|
|
kfree(mq);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
|
|
sector_t origin_size,
|
|
sector_t cache_block_size)
|
|
{
|
|
return __smq_create(cache_size, origin_size, cache_block_size,
|
|
false, true, false);
|
|
}
|
|
|
|
static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
|
|
sector_t origin_size,
|
|
sector_t cache_block_size)
|
|
{
|
|
return __smq_create(cache_size, origin_size, cache_block_size,
|
|
true, true, false);
|
|
}
|
|
|
|
static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
|
|
sector_t origin_size,
|
|
sector_t cache_block_size)
|
|
{
|
|
return __smq_create(cache_size, origin_size, cache_block_size,
|
|
false, false, true);
|
|
}
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
static struct dm_cache_policy_type smq_policy_type = {
|
|
.name = "smq",
|
|
.version = {2, 0, 0},
|
|
.hint_size = 4,
|
|
.owner = THIS_MODULE,
|
|
.create = smq_create
|
|
};
|
|
|
|
static struct dm_cache_policy_type mq_policy_type = {
|
|
.name = "mq",
|
|
.version = {2, 0, 0},
|
|
.hint_size = 4,
|
|
.owner = THIS_MODULE,
|
|
.create = mq_create,
|
|
};
|
|
|
|
static struct dm_cache_policy_type cleaner_policy_type = {
|
|
.name = "cleaner",
|
|
.version = {2, 0, 0},
|
|
.hint_size = 4,
|
|
.owner = THIS_MODULE,
|
|
.create = cleaner_create,
|
|
};
|
|
|
|
static struct dm_cache_policy_type default_policy_type = {
|
|
.name = "default",
|
|
.version = {2, 0, 0},
|
|
.hint_size = 4,
|
|
.owner = THIS_MODULE,
|
|
.create = smq_create,
|
|
.real = &smq_policy_type
|
|
};
|
|
|
|
/*
 * Module init: registers the "smq", "mq", "cleaner" and "default" policy
 * types, in that order.
 *
 * If a registration fails, the types registered so far are unregistered
 * again in reverse order and the error code reported by
 * dm_cache_policy_register() is returned (previously every failure was
 * reported as -ENOMEM, hiding the real cause that had just been logged).
 *
 * Returns 0 on success or a negative errno on failure.
 */
static int __init smq_init(void)
{
	int r;

	r = dm_cache_policy_register(&smq_policy_type);
	if (r) {
		DMERR("register failed %d", r);
		return r;
	}

	r = dm_cache_policy_register(&mq_policy_type);
	if (r) {
		DMERR("register failed (as mq) %d", r);
		goto out_mq;
	}

	r = dm_cache_policy_register(&cleaner_policy_type);
	if (r) {
		DMERR("register failed (as cleaner) %d", r);
		goto out_cleaner;
	}

	r = dm_cache_policy_register(&default_policy_type);
	if (r) {
		DMERR("register failed (as default) %d", r);
		goto out_default;
	}

	return 0;

	/* Each label is named after the step that failed. */
out_default:
	dm_cache_policy_unregister(&cleaner_policy_type);
out_cleaner:
	dm_cache_policy_unregister(&mq_policy_type);
out_mq:
	dm_cache_policy_unregister(&smq_policy_type);

	return r;
}
|
|
|
|
/* Module exit: unregisters all four policy types registered by smq_init(). */
static void __exit smq_exit(void)
{
	dm_cache_policy_unregister(&cleaner_policy_type);
	dm_cache_policy_unregister(&smq_policy_type);
	dm_cache_policy_unregister(&mq_policy_type);
	dm_cache_policy_unregister(&default_policy_type);
}
|
|
|
|
module_init(smq_init);
|
|
module_exit(smq_exit);
|
|
|
|
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_DESCRIPTION("smq cache policy");
|
|
|
|
MODULE_ALIAS("dm-cache-default");
|
|
MODULE_ALIAS("dm-cache-mq");
|
|
MODULE_ALIAS("dm-cache-cleaner");
|