bcachefs: KEY_TYPE_alloc_v2

This introduces a new version of KEY_TYPE_alloc, which uses the new
varint encoding introduced for inodes. This means we'll eventually be
able to support much larger bucket sizes (for SMR devices), and the
read/write time fields are expanded to 64 bits - which will be used in
the next patch to get rid of the periodic rescaling of those fields.

Also, for buckets that are members of erasure coded stripes, this adds
persistent fields for the index of the stripe they're members of and the
stripe redundancy. This is part of work to get rid of having to scan and
read into memory the alloc and stripes btrees at mount time.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2021-01-22 18:01:07 -05:00 committed by Kent Overstreet
parent 26452d1dcd
commit 7f4e1d5d0f
9 changed files with 407 additions and 303 deletions

View File

@ -15,6 +15,7 @@
#include "error.h"
#include "recovery.h"
#include "trace.h"
#include "varint.h"
#include <linux/kthread.h>
#include <linux/math64.h>
@ -24,11 +25,10 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
static const char * const bch2_alloc_field_names[] = {
#define x(name, bytes) #name,
BCH_ALLOC_FIELDS()
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
BCH_ALLOC_FIELDS_V1()
#undef x
NULL
};
static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
@ -67,10 +67,10 @@ static void pd_controllers_update(struct work_struct *work)
/* Persistent alloc info: */
static inline u64 get_alloc_field(const struct bch_alloc *a,
const void **p, unsigned field)
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
const void **p, unsigned field)
{
unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
u64 v;
if (!(a->fields & (1 << field)))
@ -97,10 +97,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a,
return v;
}
static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
unsigned field, u64 v)
static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
unsigned field, u64 v)
{
unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
if (!v)
return;
@ -127,55 +127,149 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
*p += bytes;
}
/*
 * Decode a version 1 (KEY_TYPE_alloc) value into @out.
 *
 * Only the generation number and the fields listed in BCH_ALLOC_FIELDS_V1()
 * are written; the caller is responsible for everything else in @out
 * (e.g. dev/bucket, which come from the key position).
 */
static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
				 struct bkey_s_c k)
{
	const struct bch_alloc *val = bkey_s_c_to_alloc(k).v;
	const void *cursor = val->data;

	out->gen = val->gen;

	/*
	 * Fields are stored back to back in declaration order, so they must
	 * be read in that same order: each call advances @cursor past the
	 * field it consumed. The enum constants are generated from the same
	 * x-macro list, so they number the fields 0, 1, 2, ... in order.
	 */
#define x(_name, _bits)							\
	out->_name = alloc_field_v1_get(val, &cursor,			\
					BCH_ALLOC_FIELD_V1_##_name);
	BCH_ALLOC_FIELDS_V1()
#undef x
}
static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
struct bkey_i_alloc *a = bkey_alloc_init(&dst->k);
void *d = a->v.data;
unsigned bytes, idx = 0;
a->k.p = POS(src.dev, src.bucket);
a->v.fields = 0;
a->v.gen = src.gen;
#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name);
BCH_ALLOC_FIELDS_V1()
#undef x
bytes = (void *) d - (void *) &a->v;
set_bkey_val_bytes(&a->k, bytes);
memset_u64s_tail(&a->v, 0, bytes);
}
/*
 * Decode a version 2 (KEY_TYPE_alloc_v2) value into @out.
 *
 * Writes gen, oldest_gen, data_type and every field listed in
 * BCH_ALLOC_FIELDS_V2(); out->dev and out->bucket are not touched here.
 *
 * Returns 0 on success, the negative value returned by
 * bch2_varint_decode() if a field fails to decode, or -1 if a decoded
 * value does not fit in the corresponding member of @out.
 */
static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
struct bkey_s_c k)
{
struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
const u8 *in = a.v->data;
const u8 *end = bkey_val_end(a);
unsigned fieldnr = 0;
int ret;
u64 v;
/* Fixed-width header fields are copied straight out of the value. */
out->gen = a.v->gen;
out->oldest_gen = a.v->oldest_gen;
out->data_type = a.v->data_type;
/*
 * Per field, in declaration order:
 *  - if the key carries this field (fieldnr < nr_fields), decode one
 *    varint and advance @in by the number of bytes it reported;
 *  - otherwise the field was not stored and reads as 0;
 *  - store into the unpacked member, then compare back against the
 *    full 64-bit value so that truncation by a narrower member is
 *    reported as an error instead of being silently accepted.
 */
#define x(_name, _bits) \
if (fieldnr < a.v->nr_fields) { \
ret = bch2_varint_decode(in, end, &v); \
if (ret < 0) \
return ret; \
in += ret; \
} else { \
v = 0; \
} \
out->_name = v; \
if (v != out->_name) \
return -1; \
fieldnr++;
BCH_ALLOC_FIELDS_V2()
#undef x
return 0;
}
/*
 * Encode @src as a version 2 (KEY_TYPE_alloc_v2) key in @dst.
 *
 * The key is (re)initialised here and positioned at (src.dev, src.bucket).
 * Fields are written as varints in BCH_ALLOC_FIELDS_V2() order; trailing
 * zero fields are dropped and nr_fields records how many fields remain.
 */
static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
u8 *out = a->v.data;
/* One past the end of the caller's buffer. */
u8 *end = (void *) &dst[1];
u8 *last_nonzero_field = out;
unsigned bytes;
a->k.p = POS(src.dev, src.bucket);
a->v.gen = src.gen;
a->v.oldest_gen = src.oldest_gen;
a->v.data_type = src.data_type;
/*
 * Per field: a nonzero value is varint-encoded and remembered as the
 * latest nonzero field (both its 1-based index and the output position
 * just past it); a zero value is written as a single 0 byte so that
 * any later nonzero field stays at the right index.
 */
#define x(_name, _bits) \
nr_fields++; \
\
if (src._name) { \
out += bch2_varint_encode(out, src._name); \
\
last_nonzero_field = out; \
last_nonzero_fieldnr = nr_fields; \
} else { \
*out++ = 0; \
}
BCH_ALLOC_FIELDS_V2()
#undef x
/*
 * NOTE(review): this check runs after all fields have been written, so
 * it detects an overrun rather than preventing one; presumably
 * struct bkey_alloc_buf is sized for the worst case - confirm.
 */
BUG_ON(out > end);
/* Rewind past any trailing zero placeholders: they are not stored. */
out = last_nonzero_field;
a->v.nr_fields = last_nonzero_fieldnr;
bytes = (u8 *) out - (u8 *) &a->v;
set_bkey_val_bytes(&a->k, bytes);
memset_u64s_tail(&a->v, 0, bytes);
}
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
struct bkey_alloc_unpacked ret = { .gen = 0 };
struct bkey_alloc_unpacked ret = {
.dev = k.k->p.inode,
.bucket = k.k->p.offset,
.gen = 0,
};
if (k.k->type == KEY_TYPE_alloc) {
const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
const void *d = a->data;
unsigned idx = 0;
if (k.k->type == KEY_TYPE_alloc_v2)
bch2_alloc_unpack_v2(&ret, k);
else if (k.k->type == KEY_TYPE_alloc)
bch2_alloc_unpack_v1(&ret, k);
ret.gen = a->gen;
#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
BCH_ALLOC_FIELDS()
#undef x
}
return ret;
}
void bch2_alloc_pack(struct bkey_i_alloc *dst,
void bch2_alloc_pack(struct bch_fs *c,
struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
unsigned idx = 0;
void *d = dst->v.data;
unsigned bytes;
dst->v.fields = 0;
dst->v.gen = src.gen;
#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name);
BCH_ALLOC_FIELDS()
#undef x
bytes = (void *) d - (void *) &dst->v;
set_bkey_val_bytes(&dst->k, bytes);
memset_u64s_tail(&dst->v, 0, bytes);
if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))
bch2_alloc_pack_v2(dst, src);
else
bch2_alloc_pack_v1(dst, src);
}
static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
if (a->fields & (1 << i))
bytes += BCH_ALLOC_FIELD_BYTES[i];
bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
return DIV_ROUND_UP(bytes, sizeof(u64));
}
const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
@ -190,20 +284,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
return NULL;
}
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
const void *d = a.v->data;
unsigned i;
struct bkey_alloc_unpacked u;
pr_buf(out, "gen %u", a.v->gen);
if (k.k->p.inode >= c->sb.nr_devices ||
!c->devs[k.k->p.inode])
return "invalid device";
for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
if (a.v->fields & (1 << i))
pr_buf(out, " %s %llu",
bch2_alloc_field_names[i],
get_alloc_field(a.v, &d, i));
if (bch2_alloc_unpack_v2(&u, k))
return "unpack error";
return NULL;
}
/*
 * Print a human-readable description of an alloc key to @out.
 *
 * The key is unpacked with bch2_alloc_unpack() first, so the same printer
 * is used for every alloc key version that function understands. Output is
 * a single line of space-separated "name value" pairs: the fixed header
 * fields first, then every field in BCH_ALLOC_FIELDS_V2().
 *
 * @c is unused here; it is part of the val_to_text callback signature.
 */
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
			struct bkey_s_c k)
{
	struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

	pr_buf(out, "gen %u oldest_gen %u data_type %u",
	       u.gen, u.oldest_gen, u.data_type);

	/*
	 * Each field supplies its own leading separator: the header above
	 * does not end in a space, so a trailing separator would glue the
	 * first field to the data_type value ("data_type 0read_time ...")
	 * and leave a stray space at the end of the line.
	 */
#define x(_name, ...)	pr_buf(out, " " #_name " %llu", (u64) u._name);
	BCH_ALLOC_FIELDS_V2()
#undef x
}
static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
@ -213,7 +317,9 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
struct bucket *g;
struct bkey_alloc_unpacked u;
if (level || k.k->type != KEY_TYPE_alloc)
if (level ||
(k.k->type != KEY_TYPE_alloc &&
k.k->type != KEY_TYPE_alloc_v2))
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
@ -281,8 +387,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
struct bucket *g;
struct bucket_mark m;
struct bkey_alloc_unpacked old_u, new_u;
__BKEY_PADDED(k, 8) alloc_key; /* hack: */
struct bkey_i_alloc *a;
struct bkey_alloc_buf a;
int ret;
retry:
bch2_trans_begin(trans);
@ -303,17 +408,14 @@ retry:
ca = bch_dev_bkey_exists(c, iter->pos.inode);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
new_u = alloc_mem_to_key(g, m);
new_u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
if (!bkey_alloc_unpacked_cmp(old_u, new_u))
return 0;
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
bch2_alloc_pack(a, new_u);
bch2_trans_update(trans, iter, &a->k_i,
bch2_alloc_pack(c, &a, new_u);
bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_NORUN);
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|flags);
@ -473,9 +575,9 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
struct btree_iter *iter;
struct bucket *g;
struct bkey_i_alloc *a;
struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
u16 *time;
u64 *time;
int ret = 0;
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
@ -486,28 +588,24 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
if (ret)
goto out;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
percpu_down_read(&c->mark_lock);
g = bucket(ca, bucket_nr);
u = alloc_mem_to_key(g, READ_ONCE(g->mark));
u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
time = rw == READ ? &u.read_time : &u.write_time;
if (*time == c->bucket_clock[rw].hand)
goto out;
*time = c->bucket_clock[rw].hand;
bch2_alloc_pack(a, u);
ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?:
bch2_alloc_pack(c, a, u);
ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
out:
bch2_trans_iter_put(trans, iter);
@ -863,14 +961,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
struct btree_iter *iter,
u64 *journal_seq, unsigned flags)
{
#if 0
__BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
#else
/* hack: */
__BKEY_PADDED(k, 8) alloc_key;
#endif
struct bch_fs *c = trans->c;
struct bkey_i_alloc *a;
struct bkey_alloc_buf a;
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
@ -920,8 +1012,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
goto out;
}
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
retry:
ret = bch2_btree_iter_traverse(iter);
@ -931,7 +1021,7 @@ retry:
percpu_down_read(&c->mark_lock);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
u = alloc_mem_to_key(g, m);
u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
@ -944,11 +1034,8 @@ retry:
u.read_time = c->bucket_clock[READ].hand;
u.write_time = c->bucket_clock[WRITE].hand;
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i,
bch2_alloc_pack(c, &a, u);
bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_BUCKET_INVALIDATE);
/*

View File

@ -7,12 +7,33 @@
#include "debug.h"
struct bkey_alloc_unpacked {
u64 bucket;
u8 dev;
u8 gen;
u8 oldest_gen;
u8 data_type;
#define x(_name, _bits) u##_bits _name;
BCH_ALLOC_FIELDS()
BCH_ALLOC_FIELDS_V2()
#undef x
};
/*
 * Buffer for building an alloc key of either version: a struct bkey_i
 * followed by padding for the value. The union takes the size of the
 * larger of the two per-version paddings, so one buffer type serves both
 * bch2_alloc_pack_v1() and bch2_alloc_pack_v2().
 */
struct bkey_alloc_buf {
struct bkey_i k;
union {
struct {
/* v1: 8 bytes plus the summed byte width (_bits / 8) of every v1 field. */
#define x(_name, _bits) + _bits / 8
u8 _pad[8 + BCH_ALLOC_FIELDS_V1()];
#undef x
} _v1;
struct {
/*
 * v2: 8 bytes plus, per field, 8 + its byte width.
 * NOTE(review): the extra per-field 8 is presumably headroom for varint
 * encoding overhead - confirm against bch2_varint_encode().
 */
#define x(_name, _bits) + 8 + _bits / 8
u8 _pad[8 + BCH_ALLOC_FIELDS_V2()];
#undef x
} _v2;
};
} __attribute__((packed, aligned(8)));
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
@ -20,23 +41,28 @@ struct bkey_alloc_unpacked {
static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
struct bkey_alloc_unpacked r)
{
return l.gen != r.gen
#define x(_name, _bits) || l._name != r._name
BCH_ALLOC_FIELDS()
return l.gen != r.gen ||
l.oldest_gen != r.oldest_gen ||
l.data_type != r.data_type
#define x(_name, ...) || l._name != r._name
BCH_ALLOC_FIELDS_V2()
#undef x
;
}
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
void bch2_alloc_pack(struct bkey_i_alloc *,
void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
const struct bkey_alloc_unpacked);
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
static inline struct bkey_alloc_unpacked
alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
alloc_mem_to_key(struct btree_iter *iter,
struct bucket *g, struct bucket_mark m)
{
return (struct bkey_alloc_unpacked) {
.dev = iter->pos.inode,
.bucket = iter->pos.offset,
.gen = m.gen,
.oldest_gen = g->oldest_gen,
.data_type = m.data_type,
@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
.key_invalid = bch2_alloc_invalid, \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
/*
 * Key operations for KEY_TYPE_alloc_v2: validated by
 * bch2_alloc_v2_invalid(), printed by bch2_alloc_to_text() (the same
 * printer bch2_bkey_ops_alloc uses).
 */
#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
}

View File

@ -345,7 +345,8 @@ static inline void bkey_init(struct bkey *k)
x(reflink_v, 16) \
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
x(indirect_inline_data, 19)
x(indirect_inline_data, 19) \
x(alloc_v2, 20)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@ -555,9 +556,11 @@ struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:5,
block:8,
idx:51;
redundancy:4,
idx:47;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 idx:51,
__u64 idx:47,
redundancy:4,
block:8,
type:5;
#endif
@ -803,35 +806,40 @@ struct bch_alloc {
__u8 data[];
} __attribute__((packed, aligned(8)));
#define BCH_ALLOC_FIELDS() \
#define BCH_ALLOC_FIELDS_V1() \
x(read_time, 16) \
x(write_time, 16) \
x(data_type, 8) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
x(oldest_gen, 8)
x(oldest_gen, 8) \
x(stripe, 32) \
x(stripe_redundancy, 8)
/*
 * Value of a KEY_TYPE_alloc_v2 key: a small fixed header followed by
 * variable-length data holding the BCH_ALLOC_FIELDS_V2() fields as varints.
 */
struct bch_alloc_v2 {
struct bch_val v;
/*
 * Number of fields encoded in data[]. Fields beyond this count are not
 * stored and are read back as 0 by the unpack code.
 */
__u8 nr_fields;
__u8 gen;
__u8 oldest_gen;
__u8 data_type;
/* Varint-encoded fields, in BCH_ALLOC_FIELDS_V2() order. */
__u8 data[];
} __attribute__((packed, aligned(8)));
/*
 * X-macro list of the variable-length fields of an alloc_v2 key, in their
 * encoding order: x(name, bits). Users define x() before expanding the
 * list; the second argument is used as the width of the corresponding
 * member of struct bkey_alloc_unpacked (u##bits) and for buffer sizing.
 */
#define BCH_ALLOC_FIELDS_V2() \
x(read_time, 64) \
x(write_time, 64) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
x(stripe, 32) \
x(stripe_redundancy, 8)
enum {
#define x(name, bytes) BCH_ALLOC_FIELD_##name,
BCH_ALLOC_FIELDS()
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
BCH_ALLOC_FIELDS_V1()
#undef x
BCH_ALLOC_FIELD_NR
};
static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
BCH_ALLOC_FIELDS()
#undef x
};
#define x(name, bits) + (bits / 8)
static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
DIV_ROUND_UP(offsetof(struct bch_alloc, data)
BCH_ALLOC_FIELDS(), sizeof(u64));
#undef x
#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX)
/* Quotas: */
enum quota_types {
@ -1337,7 +1345,8 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
x(btree_updates_journalled, 13) \
x(reflink_inline_data, 14) \
x(new_varint, 15) \
x(journal_no_flush, 16)
x(journal_no_flush, 16) \
x(alloc_v2, 17)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
@ -1345,7 +1354,8 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_new_varint)| \
(1ULL << BCH_FEATURE_journal_no_flush))
(1ULL << BCH_FEATURE_journal_no_flush)| \
(1ULL << BCH_FEATURE_alloc_v2))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,

View File

@ -538,6 +538,7 @@ BKEY_VAL_ACCESSORS(reflink_v);
BKEY_VAL_ACCESSORS(inline_data);
BKEY_VAL_ACCESSORS(btree_ptr_v2);
BKEY_VAL_ACCESSORS(indirect_inline_data);
BKEY_VAL_ACCESSORS(alloc_v2);
/* byte order helpers */

View File

@ -688,7 +688,8 @@ static int bch2_mark_alloc(struct bch_fs *c,
struct bucket_mark old_m, m;
/* We don't do anything for deletions - do we?: */
if (new.k->type != KEY_TYPE_alloc)
if (new.k->type != KEY_TYPE_alloc &&
new.k->type != KEY_TYPE_alloc_v2)
return 0;
/*
@ -711,6 +712,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
m.data_type = u.data_type;
m.dirty_sectors = u.dirty_sectors;
m.cached_sectors = u.cached_sectors;
m.stripe = u.stripe != 0;
if (journal_seq) {
m.journal_seq_valid = 1;
@ -724,6 +726,8 @@ static int bch2_mark_alloc(struct bch_fs *c,
g->io_time[WRITE] = u.write_time;
g->oldest_gen = u.oldest_gen;
g->gen_valid = 1;
g->stripe = u.stripe;
g->stripe_redundancy = u.stripe_redundancy;
/*
* need to know if we're getting called from the invalidate path or
@ -918,11 +922,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
unsigned ptr_idx,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags,
bool enabled)
u64 journal_seq, unsigned flags)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
@ -935,8 +938,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
char buf[200];
int ret;
if (enabled)
g->ec_redundancy = s->nr_redundant;
if (g->stripe && g->stripe != k.k->p.offset) {
bch2_fs_inconsistent(c,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EINVAL;
}
old = bucket_cmpxchg(g, new, ({
ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
@ -944,23 +952,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
if (ret)
return ret;
if (new.stripe && enabled)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
if (!new.stripe && !enabled)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
new.stripe = enabled;
if ((flags & BTREE_TRIGGER_GC) && parity) {
new.data_type = enabled ? BCH_DATA_parity : 0;
new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0;
if (parity) {
new.data_type = BCH_DATA_parity;
new.dirty_sectors = le16_to_cpu(s->sectors);
}
if (journal_seq) {
@ -969,8 +963,8 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
}
}));
if (!enabled)
g->ec_redundancy = 0;
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
return 0;
@ -1166,6 +1160,8 @@ static int bch2_mark_stripe(struct bch_fs *c,
unsigned i;
int ret;
BUG_ON(gc && old_s);
if (!m || (old_s && !m->alive)) {
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
@ -1173,48 +1169,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
}
if (!new_s) {
/* Deleting: */
for (i = 0; i < old_s->nr_blocks; i++) {
ret = bucket_set_stripe(c, old, i, fs_usage,
journal_seq, flags, false);
if (ret)
return ret;
}
if (!gc && m->on_heap) {
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_del(c, m, idx);
spin_unlock(&c->ec_stripes_heap_lock);
}
if (gc)
update_replicas(c, fs_usage, &m->r.e,
-((s64) m->sectors * m->nr_redundant));
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_del(c, m, idx);
spin_unlock(&c->ec_stripes_heap_lock);
memset(m, 0, sizeof(*m));
} else {
BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant);
for (i = 0; i < new_s->nr_blocks; i++) {
if (!old_s ||
memcmp(new_s->ptrs + i,
old_s->ptrs + i,
sizeof(struct bch_extent_ptr))) {
if (old_s) {
bucket_set_stripe(c, old, i, fs_usage,
journal_seq, flags, false);
if (ret)
return ret;
}
ret = bucket_set_stripe(c, new, i, fs_usage,
journal_seq, flags, true);
if (ret)
return ret;
}
}
m->alive = true;
m->sectors = le16_to_cpu(new_s->sectors);
m->algorithm = new_s->algorithm;
@ -1223,27 +1183,13 @@ static int bch2_mark_stripe(struct bch_fs *c,
m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
unsigned s = stripe_blockcount_get(new_s, i);
/*
* gc recalculates this field from stripe ptr
* references:
*/
if (!gc)
m->block_sectors[i] = s;
m->blocks_nonempty += !!s;
m->block_sectors[i] =
stripe_blockcount_get(new_s, i);
m->blocks_nonempty += !!m->block_sectors[i];
}
if (gc && old_s)
update_replicas(c, fs_usage, &m->r.e,
-((s64) m->sectors * m->nr_redundant));
bch2_bkey_to_replicas(&m->r.e, new);
if (gc)
update_replicas(c, fs_usage, &m->r.e,
((s64) m->sectors * m->nr_redundant));
if (!gc) {
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_update(c, m, idx);
@ -1251,6 +1197,25 @@ static int bch2_mark_stripe(struct bch_fs *c,
}
}
if (gc) {
/*
* gc recalculates this field from stripe ptr
* references:
*/
memset(m->block_sectors, 0, sizeof(m->block_sectors));
m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
ret = mark_stripe_bucket(c, new, i, fs_usage,
journal_seq, flags);
if (ret)
return ret;
}
update_replicas(c, fs_usage, &m->r.e,
((s64) m->sectors * m->nr_redundant));
}
return 0;
}
@ -1274,6 +1239,7 @@ static int bch2_mark_key_locked(struct bch_fs *c,
switch (k.k->type) {
case KEY_TYPE_alloc:
case KEY_TYPE_alloc_v2:
ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_btree_ptr:
@ -1542,9 +1508,10 @@ static int trans_get_key(struct btree_trans *trans,
return ret;
}
static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
const struct bch_extent_ptr *ptr,
struct bkey_alloc_unpacked *u)
static struct bkey_alloc_buf *
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
const struct bch_extent_ptr *ptr,
struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
@ -1552,8 +1519,13 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
struct bucket *g;
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_alloc_buf *a;
int ret;
a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
if (IS_ERR(a))
return a;
iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
if (iter) {
*u = bch2_alloc_unpack(k);
@ -1565,17 +1537,17 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
ret = bch2_btree_iter_traverse(iter);
if (ret) {
bch2_trans_iter_put(trans, iter);
return ret;
return ERR_PTR(ret);
}
percpu_down_read(&c->mark_lock);
g = bucket(ca, pos.offset);
*u = alloc_mem_to_key(g, READ_ONCE(g->mark));
*u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
}
*_iter = iter;
return 0;
return a;
}
static int bch2_trans_mark_pointer(struct btree_trans *trans,
@ -1585,27 +1557,20 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
struct bkey_alloc_buf *a;
int ret;
ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
if (ret)
return ret;
a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
if (IS_ERR(a))
return PTR_ERR(a);
ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
&u.dirty_sectors, &u.cached_sectors);
if (ret)
goto out;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, iter, &a->k, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@ -1716,34 +1681,51 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
}
static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
const struct bch_extent_ptr *ptr,
s64 sectors, bool parity)
struct bkey_s_c_stripe s,
unsigned idx, bool deleting)
{
struct bkey_i_alloc *a;
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
struct bkey_alloc_buf *a;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
int ret;
bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
int ret = 0;
ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
if (ret)
return ret;
a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
if (IS_ERR(a))
return PTR_ERR(a);
if (parity) {
s64 sectors = le16_to_cpu(s.v->sectors);
if (deleting)
sectors = -sectors;
u.dirty_sectors += sectors;
u.data_type = u.dirty_sectors
? BCH_DATA_parity
: 0;
}
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto err;
if (!deleting) {
if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
"bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
iter->pos.inode, iter->pos.offset, u.gen,
u.stripe, s.k->p.offset)) {
ret = -EIO;
goto err;
}
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
u.stripe = s.k->p.offset;
u.stripe_redundancy = s.v->nr_redundant;
} else {
u.stripe = 0;
u.stripe_redundancy = 0;
}
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, iter, &a->k, 0);
err:
bch2_trans_iter_put(trans, iter);
return ret;
@ -1753,51 +1735,50 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(old).v : NULL;
const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(new).v : NULL;
struct bkey_s_c_stripe old_s = { NULL };
struct bkey_s_c_stripe new_s = { NULL };
struct bch_replicas_padded r;
unsigned i;
int ret = 0;
if (old.k->type == KEY_TYPE_stripe)
old_s = bkey_s_c_to_stripe(old);
if (new.k->type == KEY_TYPE_stripe)
new_s = bkey_s_c_to_stripe(new);
/*
* If the pointers aren't changing, we don't need to do anything:
*/
if (new_s && old_s &&
!memcmp(old_s->ptrs, new_s->ptrs,
new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
if (new_s.k && old_s.k &&
new_s.v->nr_blocks == old_s.v->nr_blocks &&
new_s.v->nr_redundant == old_s.v->nr_redundant &&
!memcmp(old_s.v->ptrs, new_s.v->ptrs,
new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
if (new_s) {
unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant;
s64 sectors = le16_to_cpu(new_s->sectors);
if (new_s.k) {
s64 sectors = le16_to_cpu(new_s.v->sectors);
bch2_bkey_to_replicas(&r.e, new);
update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
for (i = 0; i < new_s->nr_blocks; i++) {
bool parity = i >= nr_data;
ret = bch2_trans_mark_stripe_alloc_ref(trans,
&new_s->ptrs[i], sectors, parity);
for (i = 0; i < new_s.v->nr_blocks; i++) {
ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
i, false);
if (ret)
return ret;
}
}
if (old_s) {
unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant;
s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
if (old_s.k) {
s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
bch2_bkey_to_replicas(&r.e, old);
update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
for (i = 0; i < old_s->nr_blocks; i++) {
bool parity = i >= nr_data;
ret = bch2_trans_mark_stripe_alloc_ref(trans,
&old_s->ptrs[i], sectors, parity);
for (i = 0; i < old_s.v->nr_blocks; i++) {
ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
i, true);
if (ret)
return ret;
}
@ -2068,21 +2049,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
struct bkey_alloc_buf *a;
struct bch_extent_ptr ptr = {
.dev = ca->dev_idx,
.offset = bucket_to_sector(ca, b),
};
int ret = 0;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
return ret;
ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
if (ret)
return ret;
a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
if (IS_ERR(a))
return PTR_ERR(a);
if (u.data_type && u.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
@ -2115,10 +2091,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
u.data_type = type;
u.dirty_sectors = sectors;
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, iter, &a->k, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;

View File

@ -41,7 +41,8 @@ struct bucket {
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
u8 ec_redundancy;
u8 stripe_redundancy;
u32 stripe;
};
struct bucket_array {

View File

@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
if (!bkey_cmp(k.k->p, POS_MIN))
return "stripe at pos 0";
if (k.k->p.inode)
return "invalid stripe key";
@ -279,10 +282,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
char buf2[200];
bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
bch_err_ratelimited(c,
"stripe checksum error at %u:%u: csum type %u, expected %llx got %llx",
i, j, v->csum_type,
want.lo, got.lo);
"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
(void *) _RET_IP_, i, j, v->csum_type,
want.lo, got.lo, buf2);
clear_bit(i, buf->valid);
break;
}
@ -335,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
static void ec_block_endio(struct bio *bio)
{
struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
struct bch_stripe *v = &ec_bio->buf->key.v;
struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
@ -343,6 +352,13 @@ static void ec_block_endio(struct bio *bio)
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
if (ptr_stale(ca, ptr)) {
bch_err_ratelimited(ca->fs,
"error %s stripe: stale pointer after io",
bio_data_dir(bio) == READ ? "reading from" : "writing to");
clear_bit(ec_bio->idx, ec_bio->buf->valid);
}
bio_put(&ec_bio->bio);
percpu_ref_put(&ca->io_ref);
closure_put(cl);
@ -652,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c,
static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
//pr_info("deleting stripe %zu", idx);
return bch2_btree_delete_range(c, BTREE_ID_EC,
POS(0, idx),
POS(0, idx + 1),
@ -795,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e,
*dst = (struct bch_extent_stripe_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
.block = block,
.redundancy = s->key.v.nr_redundant,
.idx = s->key.k.p.offset,
};
}
@ -1054,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
if (!ob)
return;
//pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);
ec = ob->ec;
mutex_lock(&ec->lock);
@ -1348,12 +1362,14 @@ static s64 get_existing_stripe(struct bch_fs *c,
struct stripe *m;
size_t heap_idx;
u64 stripe_idx;
s64 ret = -1;
if (may_create_new_stripe(c))
return -1;
spin_lock(&c->ec_stripes_heap_lock);
for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
/* No blocks worth reusing, stripe will just be deleted: */
if (!h->data[heap_idx].blocks_nonempty)
continue;
@ -1365,13 +1381,12 @@ static s64 get_existing_stripe(struct bch_fs *c,
m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant) {
bch2_stripes_heap_del(c, m, stripe_idx);
spin_unlock(&c->ec_stripes_heap_lock);
return stripe_idx;
ret = stripe_idx;
break;
}
}
spin_unlock(&c->ec_stripes_heap_lock);
return -1;
return ret;
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,

View File

@ -703,14 +703,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
if (p.ptr.cached)
continue;
if (p.has_ec) {
struct stripe *s =
genradix_ptr(&c->stripes[0], p.ec.idx);
WARN_ON(!s);
if (s)
replicas += s->nr_redundant;
}
if (p.has_ec)
replicas += p.ec.redundancy;
replicas++;
@ -733,16 +727,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
durability = max_t(unsigned, durability, ca->mi.durability);
if (p.has_ec) {
struct stripe *s =
genradix_ptr(&c->stripes[0], p.ec.idx);
if (p.has_ec)
durability += p.ec.redundancy;
if (WARN_ON(!s))
goto out;
durability += s->nr_redundant;
}
out:
return durability;
}

View File

@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
data_opts->rewrite_dev = p.ptr.dev;
if (p.has_ec) {
struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx);
data_opts->nr_replicas += m->nr_redundant;
}
if (p.has_ec)
data_opts->nr_replicas += p.ec.redundancy;
return DATA_REWRITE;
}
@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c)
bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
WARN_ON(m.stripe && !g->ec_redundancy);
WARN_ON(m.stripe && !g->stripe_redundancy);
e = (struct copygc_heap_entry) {
.dev = dev_idx,
.gen = m.gen,
.replicas = 1 + g->ec_redundancy,
.replicas = 1 + g->stripe_redundancy,
.fragmentation = bucket_sectors_used(m) * (1U << 15)
/ ca->mi.bucket_size,
.sectors = bucket_sectors_used(m),