diff --git a/lib/Makefile.in b/lib/Makefile.in index 1d422354c..4fb4fcf28 100644 --- a/lib/Makefile.in +++ b/lib/Makefile.in @@ -21,6 +21,7 @@ ifeq ("@CLUSTER@", "shared") endif SOURCES =\ + ../base/data-struct/radix-tree.c \ activate/activate.c \ cache/lvmcache.c \ cache_segtype/cache.c \ diff --git a/lib/device/bcache.c b/lib/device/bcache.c index 9e94fccf5..d9bacf8bb 100644 --- a/lib/device/bcache.c +++ b/lib/device/bcache.c @@ -15,6 +15,8 @@ #define _GNU_SOURCE #include "bcache.h" + +#include "base/data-struct/radix-tree.h" #include "lvm-logging.h" #include "log.h" @@ -453,12 +455,7 @@ struct bcache { struct dm_list clean; struct dm_list io_pending; - /* - * Hash table. - */ - unsigned nr_buckets; - unsigned hash_mask; - struct dm_list *buckets; + struct radix_tree *rtree; /* * Statistics @@ -473,75 +470,50 @@ struct bcache { //---------------------------------------------------------------- -/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ -#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL +struct key_parts { + uint32_t fd; + uint64_t b; +} __attribute__ ((packed)); -static unsigned _hash(struct bcache *cache, int fd, uint64_t i) +union key { + struct key_parts parts; + uint8_t bytes[12]; +}; + +static struct block *_block_lookup(struct bcache *cache, int fd, uint64_t i) { - uint64_t h = (i << 10) & fd; - h *= GOLDEN_RATIO_PRIME_64; - return h & cache->hash_mask; -} + union key k; + union radix_value v; -static struct block *_hash_lookup(struct bcache *cache, int fd, uint64_t i) -{ - struct block *b; - unsigned h = _hash(cache, fd, i); + k.parts.fd = fd; + k.parts.b = i; - dm_list_iterate_items_gen (b, cache->buckets + h, hash) - if (b->fd == fd && b->index == i) - return b; + if (radix_tree_lookup(cache->rtree, k.bytes, k.bytes + sizeof(k.bytes), &v)) + return v.ptr; return NULL; } -static void _hash_insert(struct block *b) +static bool _block_insert(struct block *b) { - unsigned h = _hash(b->cache, b->fd, b->index); - dm_list_add_h(b->cache->buckets + h, &b->hash); + union key k; + union radix_value v; + + k.parts.fd = b->fd; + k.parts.b = b->index; + v.ptr = b; + + return radix_tree_insert(b->cache->rtree, k.bytes, k.bytes + sizeof(k.bytes), v); } -static inline void _hash_remove(struct block *b) +static void _block_remove(struct block *b) { - dm_list_del(&b->hash); -} + union key k; -/* - * Must return a power of 2. - */ -static unsigned _calc_nr_buckets(unsigned nr_blocks) -{ - unsigned r = 8; - unsigned n = nr_blocks / 4; + k.parts.fd = b->fd; + k.parts.b = b->index; - if (n < 8) - n = 8; - - while (r < n) - r <<= 1; - - return r; -} - -static bool _hash_table_init(struct bcache *cache, unsigned nr_entries) -{ - unsigned i; - - cache->nr_buckets = _calc_nr_buckets(nr_entries); - cache->hash_mask = cache->nr_buckets - 1; - cache->buckets = dm_malloc(cache->nr_buckets * sizeof(*cache->buckets)); - if (!cache->buckets) - return false; - - for (i = 0; i < cache->nr_buckets; i++) - dm_list_init(cache->buckets + i); - - return true; -} - -static void _hash_table_exit(struct bcache *cache) -{ - dm_free(cache->buckets); + radix_tree_remove(b->cache->rtree, k.bytes, k.bytes + sizeof(k.bytes)); } //---------------------------------------------------------------- @@ -587,6 +559,11 @@ static struct block *_alloc_block(struct bcache *cache) return dm_list_struct_base(_list_pop(&cache->free), struct block, list); } +static void _free_block(struct block *b) +{ + dm_list_add(&b->cache->free, &b->list); +} + /*---------------------------------------------------------------- * Clean/dirty list management. * Always use these methods to ensure nr_dirty_ is correct. @@ -742,7 +719,7 @@ static struct block *_find_unused_clean_block(struct bcache *cache) dm_list_iterate_items (b, &cache->clean) { if (!b->ref_count) { _unlink_block(b); - _hash_remove(b); + _block_remove(b); return b; } } @@ -779,23 +756,13 @@ static struct block *_new_block(struct bcache *cache, int fd, block_address i, b b->ref_count = 0; b->error = 0; - _hash_insert(b); + if (!_block_insert(b)) { + log_error("bcache unable to insert block in radix tree (OOM?)"); + _free_block(b); + return NULL; + } } -#if 0 - if (!b) { - log_error("bcache no new blocks for fd %d index %u " - "clean %u free %u dirty %u pending %u nr_data_blocks %u nr_cache_blocks %u", - fd, (uint32_t) i, - dm_list_size(&cache->clean), - dm_list_size(&cache->free), - dm_list_size(&cache->dirty), - dm_list_size(&cache->io_pending), - (uint32_t)cache->nr_data_blocks, - (uint32_t)cache->nr_cache_blocks); - } -#endif - return b; } @@ -833,7 +800,7 @@ static struct block *_lookup_or_read_block(struct bcache *cache, int fd, block_address i, unsigned flags) { - struct block *b = _hash_lookup(cache, fd, i); + struct block *b = _block_lookup(cache, fd, i); if (b) { // FIXME: this is insufficient. We need to also catch a read @@ -937,7 +904,8 @@ struct bcache *bcache_create(sector_t block_sectors, unsigned nr_cache_blocks, dm_list_init(&cache->clean); dm_list_init(&cache->io_pending); - if (!_hash_table_init(cache, nr_cache_blocks)) { + cache->rtree = radix_tree_create(NULL, NULL); + if (!cache->rtree) { cache->engine->destroy(cache->engine); dm_free(cache); return NULL; @@ -952,7 +920,7 @@ struct bcache *bcache_create(sector_t block_sectors, unsigned nr_cache_blocks, if (!_init_free_list(cache, nr_cache_blocks, pgsize)) { cache->engine->destroy(cache->engine); - _hash_table_exit(cache); + radix_tree_destroy(cache->rtree); dm_free(cache); return NULL; } @@ -968,7 +936,7 @@ void bcache_destroy(struct bcache *cache) bcache_flush(cache); _wait_all(cache); _exit_free_list(cache); - _hash_table_exit(cache); + radix_tree_destroy(cache->rtree); cache->engine->destroy(cache->engine); dm_free(cache); } @@ -990,7 +958,7 @@ unsigned bcache_max_prefetches(struct bcache *cache) void bcache_prefetch(struct bcache *cache, int fd, block_address i) { - struct block *b = _hash_lookup(cache, fd, i); + struct block *b = _block_lookup(cache, fd, i); if (!b) { if (cache->nr_io_pending < cache->max_io) { @@ -1003,11 +971,13 @@ void bcache_prefetch(struct bcache *cache, int fd, block_address i) } } +//---------------------------------------------------------------- + static void _recycle_block(struct bcache *cache, struct block *b) { _unlink_block(b); - _hash_remove(b); - dm_list_add(&cache->free, &b->list); + _block_remove(b); + _free_block(b); } bool bcache_get(struct bcache *cache, int fd, block_address i, @@ -1041,6 +1011,8 @@ bool bcache_get(struct bcache *cache, int fd, block_address i, return false; } +//---------------------------------------------------------------- + static void _put_ref(struct block *b) { if (!b->ref_count) { @@ -1061,6 +1033,8 @@ void bcache_put(struct block *b) _preemptive_writeback(b->cache); } +//---------------------------------------------------------------- + bool bcache_flush(struct bcache *cache) { // Only dirty data is on the errored list, since bad read blocks get @@ -1083,6 +1057,7 @@ bool bcache_flush(struct bcache *cache) return dm_list_empty(&cache->errored); } +//---------------------------------------------------------------- /* * You can safely call this with a NULL block. */ @@ -1115,29 +1090,72 @@ static bool _invalidate_block(struct bcache *cache, struct block *b) bool bcache_invalidate(struct bcache *cache, int fd, block_address i) { - return _invalidate_block(cache, _hash_lookup(cache, fd, i)); -} - -// FIXME: switch to a trie, or maybe 1 hash table per fd? To save iterating -// through the whole cache. -bool bcache_invalidate_fd(struct bcache *cache, int fd) -{ - struct block *b, *tmp; - bool r = true; - - // Start writing back any dirty blocks on this fd. - dm_list_iterate_items_safe (b, tmp, &cache->dirty) - if (b->fd == fd) - _issue_write(b); - - _wait_all(cache); - - // Everything should be in the clean list now. - dm_list_iterate_items_safe (b, tmp, &cache->clean) - if (b->fd == fd) - r = _invalidate_block(cache, b) && r; - - return r; + return _invalidate_block(cache, _block_lookup(cache, fd, i)); +} + +//---------------------------------------------------------------- + +struct invalidate_iterator { + bool success; + struct radix_tree_iterator it; +}; + +static bool _writeback_v(struct radix_tree_iterator *it, + uint8_t *kb, uint8_t *ke, union radix_value v) +{ + struct block *b = v.ptr; + + if (_test_flags(b, BF_DIRTY)) + _issue_write(b); + + return true; +} + +static bool _invalidate_v(struct radix_tree_iterator *it, + uint8_t *kb, uint8_t *ke, union radix_value v) +{ + struct block *b = v.ptr; + struct invalidate_iterator *iit = container_of(it, struct invalidate_iterator, it); + + if (b->error || _test_flags(b, BF_DIRTY)) { + log_warn("bcache_invalidate: block (%d, %llu) still dirty", + b->fd, (unsigned long long) b->index); + iit->success = false; + return true; + } + + if (b->ref_count) { + log_warn("bcache_invalidate: block (%d, %llu) still held", + b->fd, (unsigned long long) b->index); + iit->success = false; + return true; + } + + _unlink_block(b); + _free_block(b); + + // We can't remove the block from the radix tree yet because + // we're in the middle of an iteration. + return true; +} + +bool bcache_invalidate_fd(struct bcache *cache, int fd) +{ + union key k; + struct invalidate_iterator it; + + k.parts.fd = fd; + + it.it.visit = _writeback_v; + radix_tree_iterate(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.fd), &it.it); + + _wait_all(cache); + + it.success = true; + it.it.visit = _invalidate_v; + radix_tree_iterate(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.fd), &it.it); + radix_tree_remove_prefix(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.fd)); + return it.success; } //---------------------------------------------------------------- diff --git a/make.tmpl.in b/make.tmpl.in index 9d5d36757..1f999ec69 100644 --- a/make.tmpl.in +++ b/make.tmpl.in @@ -306,7 +306,7 @@ LIB_VERSION_DM := $(shell $(AWK) -F '.' '{printf "%s.%s",$$1,$$2}' $(top_srcdir) LIB_VERSION_APP := $(shell $(AWK) -F '[(). ]' '{printf "%s.%s",$$1,$$4}' $(top_srcdir)/VERSION) -INCLUDES += -I$(srcdir) -I$(top_builddir)/include +INCLUDES += -I$(top_srcdir) -I$(srcdir) -I$(top_builddir)/include INC_LNS = $(top_builddir)/include/.symlinks_created