2017-03-17 09:18:50 +03:00
/* SPDX-License-Identifier: GPL-2.0 */
# ifndef _BCACHEFS_H
# define _BCACHEFS_H
/*
* SOME HIGH LEVEL CODE DOCUMENTATION :
*
* Bcache mostly works with cache sets , cache devices , and backing devices .
*
* Support for multiple cache devices hasn't quite been finished off yet, but
* it's about 95% plumbed through. A cache set and its cache devices are sort of
* like a md raid array and its component devices. Most of the code doesn't care
* about individual cache devices, the main abstraction is the cache set.
*
* Multiple cache devices is intended to give us the ability to mirror dirty
* cached data and metadata , without mirroring clean cached data .
*
* Backing devices are different , in that they have a lifetime independent of a
* cache set . When you register a newly formatted backing device it ' ll come up
* in passthrough mode , and then you can attach and detach a backing device from
* a cache set at runtime - while it ' s mounted and in use . Detaching implicitly
* invalidates any cached data for that backing device .
*
* A cache set can have multiple ( many ) backing devices attached to it .
*
* There ' s also flash only volumes - this is the reason for the distinction
* between struct cached_dev and struct bcache_device . A flash only volume
* works much like a bcache device that has a backing device , except the
* " cached " data is always dirty . The end result is that we get thin
* provisioning with very little additional code .
*
* Flash only volumes work but they ' re not production ready because the moving
* garbage collector needs more work . More on that later .
*
* BUCKETS / ALLOCATION :
*
* Bcache is primarily designed for caching , which means that in normal
* operation all of our available space will be allocated . Thus , we need an
* efficient way of deleting things from the cache so we can write new things to
* it .
*
* To do this, we first divide the cache device up into buckets. A bucket is the
* unit of allocation; they're typically around 1 MB - anywhere from 128k to 2M+
* works efficiently.
*
* Each bucket has a 16 bit priority , and an 8 bit generation associated with
* it . The gens and priorities for all the buckets are stored contiguously and
* packed on disk ( in a linked list of buckets - aside from the superblock , all
* of bcache ' s metadata is stored in buckets ) .
*
* The priority is used to implement an LRU. We reset a bucket's priority when
* we allocate it or on a cache hit, and every so often we decrement the priority
* of each bucket. It could be used to implement something more sophisticated,
* if anyone ever gets around to it.
*
* The generation is used for invalidating buckets . Each pointer also has an 8
* bit generation embedded in it ; for a pointer to be considered valid , its gen
* must match the gen of the bucket it points into . Thus , to reuse a bucket all
* we have to do is increment its gen ( and write its new gen to disk ; we batch
* this up ) .
*
* Bcache is entirely COW - we never write twice to a bucket , even buckets that
* contain metadata ( including btree nodes ) .
*
* THE BTREE :
*
* Bcache is in large part designed around the btree.
*
* At a high level , the btree is just an index of key - > ptr tuples .
*
* Keys represent extents , and thus have a size field . Keys also have a variable
* number of pointers attached to them ( potentially zero , which is handy for
* invalidating the cache ) .
*
* The key itself is an inode : offset pair . The inode number corresponds to a
* backing device or a flash only volume . The offset is the ending offset of the
* extent within the inode - not the starting offset ; this makes lookups
* slightly more convenient .
*
* Pointers contain the cache device id , the offset on that device , and an 8 bit
* generation number . More on the gen later .
*
* Index lookups are not fully abstracted - cache lookups in particular are
* still somewhat mixed in with the btree code , but things are headed in that
* direction .
*
* Updates are fairly well abstracted , though . There are two different ways of
* updating the btree ; insert and replace .
*
* BTREE_INSERT will just take a list of keys and insert them into the btree -
* overwriting ( possibly only partially ) any extents they overlap with . This is
* used to update the index after a write .
*
* BTREE_REPLACE is really cmpxchg ( ) ; it inserts a key into the btree iff it is
* overwriting a key that matches another given key . This is used for inserting
* data into the cache after a cache miss , and for background writeback , and for
* the moving garbage collector .
*
* There is no "delete" operation; deleting things from the index is
* accomplished either by invalidating pointers (by incrementing a bucket's
* gen) or by inserting a key with 0 pointers - which will overwrite anything
* previously present at that location in the index.
*
* This means that there are always stale / invalid keys in the btree . They ' re
* filtered out by the code that iterates through a btree node , and removed when
* a btree node is rewritten .
*
* BTREE NODES :
*
* Our unit of allocation is a bucket, and we can't arbitrarily allocate and
* free smaller than a bucket - so, that's how big our btree nodes are.
*
* ( If buckets are really big we ' ll only use part of the bucket for a btree node
* - no less than 1 / 4 th - but a bucket still contains no more than a single
* btree node . I ' d actually like to change this , but for now we rely on the
* bucket ' s gen for deleting btree nodes when we rewrite / split a node . )
*
* Anyways , btree nodes are big - big enough to be inefficient with a textbook
* btree implementation .
*
* The way this is solved is that btree nodes are internally log structured ; we
* can append new keys to an existing btree node without rewriting it . This
* means each set of keys we write is sorted , but the node is not .
*
* We maintain this log structure in memory - keeping 1 Mb of keys sorted would
* be expensive , and we have to distinguish between the keys we have written and
* the keys we haven ' t . So to do a lookup in a btree node , we have to search
* each sorted set . But we do merge written sets together lazily , so the cost of
* these extra searches is quite low ( normally most of the keys in a btree node
* will be in one big set , and then there ' ll be one or two sets that are much
* smaller ) .
*
* This log structure makes bcache ' s btree more of a hybrid between a
* conventional btree and a compacting data structure , with some of the
* advantages of both .
*
* GARBAGE COLLECTION :
*
* We can ' t just invalidate any bucket - it might contain dirty data or
* metadata . If it once contained dirty data , other writes might overwrite it
* later , leaving no valid pointers into that bucket in the index .
*
* Thus, the primary purpose of garbage collection is to find buckets to reuse.
* It also counts how much valid data each bucket currently contains, so that
* allocation can reuse buckets sooner when they've been mostly overwritten.
*
* It also does some things that are really internal to the btree
* implementation . If a btree node contains pointers that are stale by more than
* some threshold , it rewrites the btree node to avoid the bucket ' s generation
* wrapping around . It also merges adjacent btree nodes if they ' re empty enough .
*
* THE JOURNAL :
*
* Bcache ' s journal is not necessary for consistency ; we always strictly
* order metadata writes so that the btree and everything else is consistent on
* disk in the event of an unclean shutdown , and in fact bcache had writeback
* caching ( with recovery from unclean shutdown ) before journalling was
* implemented .
*
* Rather , the journal is purely a performance optimization ; we can ' t complete a
* write until we ' ve updated the index on disk , otherwise the cache would be
* inconsistent in the event of an unclean shutdown . This means that without the
* journal , on random write workloads we constantly have to update all the leaf
* nodes in the btree , and those writes will be mostly empty ( appending at most
* a few keys each ) - highly inefficient in terms of amount of metadata writes ,
* and it puts more strain on the various btree resorting / compacting code .
*
* The journal is just a log of keys we ' ve inserted ; on startup we just reinsert
* all the keys in the open journal entries . That means that when we ' re updating
* a node in the btree , we can wait until a 4 k block of keys fills up before
* writing them out .
*
* For simplicity , we only journal updates to leaf nodes ; updates to parent
* nodes are rare enough ( since our leaf nodes are huge ) that it wasn ' t worth
* the complexity to deal with journalling them ( in particular , journal replay )
* - updates to non leaf nodes just happen synchronously ( see btree_split ( ) ) .
*/
# undef pr_fmt
# define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
# include <linux/backing-dev-defs.h>
# include <linux/bug.h>
# include <linux/bio.h>
# include <linux/closure.h>
# include <linux/kobject.h>
# include <linux/list.h>
2019-04-05 04:53:12 +03:00
# include <linux/math64.h>
2017-03-17 09:18:50 +03:00
# include <linux/mutex.h>
# include <linux/percpu-refcount.h>
# include <linux/percpu-rwsem.h>
# include <linux/rhashtable.h>
# include <linux/rwsem.h>
2021-05-19 06:53:43 +03:00
# include <linux/semaphore.h>
2017-03-17 09:18:50 +03:00
# include <linux/seqlock.h>
# include <linux/shrinker.h>
2020-11-16 00:30:22 +03:00
# include <linux/srcu.h>
2017-03-17 09:18:50 +03:00
# include <linux/types.h>
# include <linux/workqueue.h>
# include <linux/zstd.h>
# include "bcachefs_format.h"
2021-11-28 21:42:05 +03:00
# include "errcode.h"
2017-03-17 09:18:50 +03:00
# include "fifo.h"
# include "opts.h"
# include "util.h"
/*
 * Fault-injection hooks.
 *
 * Both primitives are stubbed out to the constant 0 in this header, so every
 * bch2_*_fault() site below evaluates to 0 and its argument is discarded.
 */
#define dynamic_fault(...)		0
#define race_fault(...)			0

/* Fault site for filesystem initialization; @name is a string literal suffix. */
#define bch2_fs_init_fault(name)					\
	dynamic_fault(" bcachefs:bch_fs_init: " name)
/* Fault site for metadata reads; @name is a string literal suffix. */
#define bch2_meta_read_fault(name)					\
	dynamic_fault(" bcachefs:meta:read: " name)
/* Fault site for metadata writes; @name is a string literal suffix. */
#define bch2_meta_write_fault(name)					\
	dynamic_fault(" bcachefs:meta:write: " name)
/*
 * bch2_fmt() / bch2_fmt_inum() build the leading part of a printf-style
 * argument list for a message about filesystem @_c: the caller's format
 * string is wrapped with a prefix naming the filesystem (and, for the _inum
 * variant, the inode number) plus a trailing newline.
 *
 * They expand to "format, arg[, arg]" - a comma-separated list, not a single
 * expression - so they may only be used inside a function-call argument list.
 * @fmt must be a string literal (it is concatenated with its neighbours) and
 * @_inum is printed with %llu.
 *
 * The kernel build adds a "bcachefs (...)" wrapper around the name; other
 * builds print the bare name.
 */
#ifdef __KERNEL__
#define bch2_fmt(_c, fmt)		"bcachefs (%s): " fmt "\n", ((_c)->name)
#define bch2_fmt_inum(_c, _inum, fmt)	"bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
#else
#define bch2_fmt(_c, fmt)		"%s: " fmt "\n", ((_c)->name)
#define bch2_fmt_inum(_c, _inum, fmt)	"%s inum %llu: " fmt "\n", ((_c)->name), (_inum)
#endif
/*
 * Per-filesystem logging helpers.
 *
 * Each bch_<level>() macro forwards to printk() (or printk_ratelimited())
 * at the matching KERN_* level, with the message wrapped by bch2_fmt() so it
 * is prefixed with the filesystem name and terminated with a newline.
 * @c is the filesystem, @fmt a printf-style string literal, and the remaining
 * arguments are its parameters; ##__VA_ARGS__ drops the preceding comma when
 * no parameters are given.
 */
#define bch_info(c, fmt, ...)						\
	printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...)						\
	printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...)						\
	printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn_ratelimited(c, fmt, ...)				\
	printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...)						\
	printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)

#define bch_err_ratelimited(c, fmt, ...)				\
	printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
/* As bch_err_ratelimited(), but the prefix also carries inode number @_inum. */
#define bch_err_inum_ratelimited(c, _inum, fmt, ...)			\
	printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)

/* bch_info(), but only when the filesystem's "verbose" option is set. */
#define bch_verbose(c, fmt, ...)					\
do {									\
	if ((c)->opts.verbose)						\
		bch_info(c, fmt, ##__VA_ARGS__);			\
} while (0)

/*
 * pr_info(), but only when the "verbose" option is set in @opts; takes an
 * options object rather than a filesystem, and adds no filesystem prefix.
 */
#define pr_verbose_init(opts, fmt, ...)					\
do {									\
	if (opt_get(opts, verbose))					\
		pr_info(fmt, ##__VA_ARGS__);				\
} while (0)
/* Parameters that are useful for debugging, but should always be compiled in: */
/*
 * X-macro list of debug parameters that are compiled in unconditionally.
 * The user defines BCH_DEBUG_PARAM(name, description) before expanding this
 * list; each entry supplies an identifier and a description made of adjacent
 * string literals.
 */
#define BCH_DEBUG_PARAMS_ALWAYS()					\
	BCH_DEBUG_PARAM(key_merging_disabled,				\
		" Disables merging of extents ")			\
	BCH_DEBUG_PARAM(btree_gc_always_rewrite,			\
		" Causes mark and sweep to compact and rewrite every "	\
		" btree node it traverses ")				\
	BCH_DEBUG_PARAM(btree_gc_rewrite_disabled,			\
		" Disables rewriting of btree nodes during mark and sweep ") \
	BCH_DEBUG_PARAM(btree_shrinker_disabled,			\
		" Disables the shrinker callback for the btree node cache ") \
	BCH_DEBUG_PARAM(verify_btree_ondisk,				\
		" Reread btree nodes at various points to verify the "	\
		" mergesort in the read path against modifications "	\
		" done in memory ")					\
	BCH_DEBUG_PARAM(verify_all_btree_replicas,			\
		" When reading btree nodes, read all replicas and "	\
		" compare them ")
2017-03-17 09:18:50 +03:00
/* Parameters that should only be compiled in in debug mode: */
/*
 * X-macro list of debug parameters that only exist as real variables in
 * debug builds. The user defines BCH_DEBUG_PARAM(name, description) before
 * expanding this list.
 *
 * NOTE(review): the description given for inject_invalid_keys is word for
 * word the one used for journal_seq_verify - this looks like a copy-paste
 * leftover; confirm the intended text with the code that reads
 * bch2_inject_invalid_keys before changing the string.
 */
#define BCH_DEBUG_PARAMS_DEBUG()					\
	BCH_DEBUG_PARAM(expensive_debug_checks,				\
		" Enables various runtime debugging checks that "	\
		" significantly affect performance ")			\
	BCH_DEBUG_PARAM(debug_check_iterators,				\
		" Enables extra verification for btree iterators ")	\
	BCH_DEBUG_PARAM(debug_check_bkeys,				\
		" Run bkey_debugcheck (primarily checking GC/allocation " \
		" information) when iterating over keys ")		\
	BCH_DEBUG_PARAM(debug_check_btree_accounting,			\
		" Verify btree accounting for keys within a node ")	\
	BCH_DEBUG_PARAM(journal_seq_verify,				\
		" Store the journal sequence number in the version "	\
		" number of every btree key, and verify that btree "	\
		" update ordering is preserved during recovery ")	\
	BCH_DEBUG_PARAM(inject_invalid_keys,				\
		" Store the journal sequence number in the version "	\
		" number of every btree key, and verify that btree "	\
		" update ordering is preserved during recovery ")	\
	BCH_DEBUG_PARAM(test_alloc_startup,				\
		" Force allocator startup to use the slowpath where it " \
		" can't find enough free buckets without invalidating "	\
		" cached data ")					\
	BCH_DEBUG_PARAM(force_reconstruct_read,				\
		" Force reads to use the reconstruct path, when reading " \
		" from erasure coded extents ")				\
	BCH_DEBUG_PARAM(test_restart_gc,				\
		" Test restarting mark and sweep gc when bucket gens change ")
2017-03-17 09:18:50 +03:00
/* Every debug parameter: the always-compiled list followed by the debug-only list. */
# define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
/*
 * BCH_DEBUG_PARAMS() is the list selected for this build: all parameters
 * when CONFIG_BCACHEFS_DEBUG is defined, otherwise only the "always" list.
 */
# ifdef CONFIG_BCACHEFS_DEBUG
# define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
# else
# define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
# endif
2020-11-03 02:20:44 +03:00
/* Declare "extern bool bch2_<name>" for every parameter selected for this build. */
# define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
BCH_DEBUG_PARAMS ( )
# undef BCH_DEBUG_PARAM
/*
 * Without CONFIG_BCACHEFS_DEBUG the debug-only parameters are instead
 * declared as "static const bool bch2_<name>" with no initializer (so they
 * read as false), which keeps code that tests them compiling.
 */
# ifndef CONFIG_BCACHEFS_DEBUG
# define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name;
BCH_DEBUG_PARAMS_DEBUG ( )
# undef BCH_DEBUG_PARAM
# endif
2017-03-17 09:18:50 +03:00
/*
 * X-macro list of the time statistics that are tracked. Define x(name)
 * before expanding BCH_TIME_STATS() to generate one item per statistic.
 */
#define BCH_TIME_STATS()			\
	x(btree_node_mem_alloc)			\
	x(btree_node_split)			\
	x(btree_node_compact)			\
	x(btree_node_merge)			\
	x(btree_node_sort)			\
	x(btree_node_read)			\
	x(btree_interior_update_foreground)	\
	x(btree_interior_update_total)		\
	x(btree_gc)				\
	x(btree_lock_contended_read)		\
	x(btree_lock_contended_intent)		\
	x(btree_lock_contended_write)		\
	x(data_write)				\
	x(data_read)				\
	x(data_promote)				\
	x(journal_flush_write)			\
	x(journal_noflush_write)		\
	x(journal_flush_seq)			\
	x(blocked_journal)			\
	x(blocked_allocate)			\
	x(blocked_allocate_open_bucket)

/*
 * One BCH_TIME_<name> enumerator per entry of BCH_TIME_STATS(), numbered from
 * 0 in list order; BCH_TIME_STAT_NR is the number of statistics.
 */
enum bch_time_stats {
#define x(name) BCH_TIME_##name,
	BCH_TIME_STATS()
#undef x
	BCH_TIME_STAT_NR
};
# include "alloc_types.h"
# include "btree_types.h"
# include "buckets_types.h"
# include "clock_types.h"
2018-11-01 22:13:19 +03:00
# include "ec_types.h"
2017-03-17 09:18:50 +03:00
# include "journal_types.h"
# include "keylist_types.h"
# include "quota_types.h"
# include "rebalance_types.h"
2018-10-30 21:14:19 +03:00
# include "replicas_types.h"
2021-10-11 19:03:19 +03:00
# include "subvolume_types.h"
2017-03-17 09:18:50 +03:00
# include "super_types.h"
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES		4U

/*
 * Maximum number of nodes we might need to allocate atomically
 * (BTREE_MAX_DEPTH plus BTREE_MAX_DEPTH - 1, i.e. 2 * BTREE_MAX_DEPTH - 1):
 */
#define BTREE_RESERVE_MAX	(BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))

/* Size of the freelist we allocate btree nodes from (four full reserves): */
#define BTREE_NODE_RESERVE	(BTREE_RESERVE_MAX * 4)

/*
 * One full reserve of nodes times the maximum replica count.
 * NOTE(review): presumably the number of open buckets held back for btree
 * node allocation - confirm against the allocator.
 */
#define BTREE_NODE_OPEN_BUCKET_RESERVE	(BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
2017-03-17 09:18:50 +03:00
struct btree ;
/*
 * Phases a garbage-collection pass moves through, in increasing order.
 * There is one GC_PHASE_BTREE_<name> value per btree listed here.
 * NOTE(review): the relative order of the GC_PHASE_BTREE_* values looks
 * significant (positions are compared) - confirm before reordering.
 */
enum gc_phase {
	GC_PHASE_NOT_RUNNING,
	GC_PHASE_START,
	GC_PHASE_SB,

	GC_PHASE_BTREE_stripes,
	GC_PHASE_BTREE_extents,
	GC_PHASE_BTREE_inodes,
	GC_PHASE_BTREE_dirents,
	GC_PHASE_BTREE_xattrs,
	GC_PHASE_BTREE_alloc,
	GC_PHASE_BTREE_quotas,
	GC_PHASE_BTREE_reflink,
	GC_PHASE_BTREE_subvolumes,
	GC_PHASE_BTREE_snapshots,

	GC_PHASE_PENDING_DELETE,
};
/*
 * A position within a garbage-collection pass: the phase GC is in, plus a
 * key position and a level within that phase.
 * NOTE(review): level is presumably the btree level being walked - confirm.
 */
struct gc_pos {
enum gc_phase phase ;
struct bpos pos ;
unsigned level ;
} ;
2021-05-23 09:31:33 +03:00
/*
 * One entry of the reflink GC table: an offset, a size and a reference count.
 * NOTE(review): presumably one entry per indirect extent seen during GC,
 * with offset/size in sectors - confirm against the reflink GC code.
 */
struct reflink_gc {
u64 offset ;
u32 size ;
u32 refcount ;
} ;
/* Generic radix tree holding struct reflink_gc entries. */
typedef GENRADIX ( struct reflink_gc ) reflink_gc_table ;
2017-03-17 09:18:50 +03:00
/*
 * Sector counters, indexed by a two-valued first index and then by data type
 * (BCH_DATA_NR entries).
 * NOTE(review): the first index is presumably the I/O direction (read/write)
 * - confirm with the code that updates io_done.
 */
struct io_count {
u64 sectors [ 2 ] [ BCH_DATA_NR ] ;
} ;
struct bch_dev {
struct kobject kobj ;
struct percpu_ref ref ;
struct completion ref_completion ;
struct percpu_ref io_ref ;
struct completion io_ref_completion ;
struct bch_fs * fs ;
u8 dev_idx ;
/*
* Cached version of this device ' s member info from superblock
* Committed by bch2_write_super ( ) - > bch_fs_mi_update ( )
*/
struct bch_member_cpu mi ;
__uuid_t uuid ;
char name [ BDEVNAME_SIZE ] ;
struct bch_sb_handle disk_sb ;
2019-03-22 06:13:46 +03:00
struct bch_sb * sb_read_scratch ;
2017-03-17 09:18:50 +03:00
int sb_write_error ;
struct bch_devs_mask self ;
/* biosets used in cloned bios for writing multiple replicas */
struct bio_set replica_set ;
/*
* Buckets :
2018-11-26 08:13:33 +03:00
* Per - bucket arrays are protected by c - > mark_lock , bucket_lock and
2017-03-17 09:18:50 +03:00
* gc_lock , for device resize - holding any is sufficient for access :
* Or rcu_read_lock ( ) , but only for ptr_stale ( ) :
*/
2018-07-23 12:32:01 +03:00
struct bucket_array __rcu * buckets [ 2 ] ;
2021-12-26 03:55:34 +03:00
struct bucket_gens * bucket_gens ;
2018-11-19 09:16:07 +03:00
unsigned long * buckets_nouse ;
2017-03-17 09:18:50 +03:00
struct rw_semaphore bucket_lock ;
2021-01-22 05:52:06 +03:00
struct bch_dev_usage * usage_base ;
struct bch_dev_usage __percpu * usage [ JOURNAL_BUF_NR ] ;
struct bch_dev_usage __percpu * usage_gc ;
2017-03-17 09:18:50 +03:00
/* Allocator: */
2021-12-24 12:22:20 +03:00
u64 new_fs_bucket_idx ;
2017-03-17 09:18:50 +03:00
struct task_struct __rcu * alloc_thread ;
/*
* free : Buckets that are ready to be used
*
* free_inc : Incoming buckets - these are buckets that currently have
* cached data in them , and we can ' t reuse them until after we write
* their new gen to disk . After prio_write ( ) finishes writing the new
* gens / prios , they ' ll be moved to the free list ( and possibly discarded
* in the process )
*/
alloc_fifo free [ RESERVE_NR ] ;
alloc_fifo free_inc ;
2021-04-13 16:49:23 +03:00
unsigned nr_open_buckets ;
2017-03-17 09:18:50 +03:00
2020-06-09 22:44:03 +03:00
open_bucket_idx_t open_buckets_partial [ OPEN_BUCKETS_COUNT ] ;
open_bucket_idx_t open_buckets_partial_nr ;
2017-03-17 09:18:50 +03:00
size_t fifo_last_bucket ;
size_t inc_gen_needs_gc ;
size_t inc_gen_really_needs_gc ;
2018-11-19 09:31:41 +03:00
2021-04-19 00:54:56 +03:00
enum allocator_states allocator_state ;
2017-03-17 09:18:50 +03:00
alloc_heap alloc_heap ;
atomic64_t rebalance_work ;
struct journal_device journal ;
2021-01-29 21:58:10 +03:00
u64 prev_journal_sector ;
2017-03-17 09:18:50 +03:00
struct work_struct io_error_work ;
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency [ 2 ] ;
struct bch2_time_stats io_latency [ 2 ] ;
# define CONGESTED_MAX 1024
atomic_t congested ;
u64 congested_last ;
struct io_count __percpu * io_done ;
} ;
/*
 * Filesystem state flags, numbered consecutively from 0.
 * NOTE(review): presumably bit numbers for the "flags" word in struct bch_fs
 * (used with set_bit()/test_bit()) - confirm with callers.
 */
enum {
	/* startup: */
	BCH_FS_INITIALIZED,
	BCH_FS_ALLOC_READ_DONE,
	BCH_FS_ALLOC_CLEAN,
	BCH_FS_ALLOCATOR_RUNNING,
	BCH_FS_ALLOCATOR_STOPPING,
	BCH_FS_INITIAL_GC_DONE,
	BCH_FS_INITIAL_GC_UNFIXED,
	BCH_FS_TOPOLOGY_REPAIR_DONE,
	BCH_FS_FSCK_DONE,
	BCH_FS_STARTED,
	BCH_FS_RW,
	BCH_FS_WAS_RW,

	/* shutdown: */
	BCH_FS_STOPPING,
	BCH_FS_EMERGENCY_RO,
	BCH_FS_WRITE_DISABLE_COMPLETE,

	/* errors: */
	BCH_FS_ERROR,
	BCH_FS_TOPOLOGY_ERROR,
	BCH_FS_ERRORS_FIXED,
	BCH_FS_ERRORS_NOT_FIXED,

	/* misc: */
	BCH_FS_NEED_ANOTHER_GC,
	BCH_FS_DELETED_NODES,
	BCH_FS_NEED_ALLOC_WRITE,
	BCH_FS_REBUILD_REPLICAS,
	BCH_FS_HOLD_BTREE_WRITES,
};
/*
 * Debug state kept per btree: an id plus three dentry pointers.
 * NOTE(review): the dentries are presumably debugfs files for this btree
 * and id is presumably the btree id - confirm against the debug code.
 */
struct btree_debug {
unsigned id ;
struct dentry * btree ;
struct dentry * btree_format ;
struct dentry * failed ;
} ;
2018-11-27 16:23:22 +03:00
/*
 * Per-CPU filesystem data (struct bch_fs holds a __percpu pointer to this).
 * NOTE(review): sectors_available is presumably a per-CPU cache of free
 * sectors taken from the filesystem-wide counter - confirm.
 */
struct bch_fs_pcpu {
u64 sectors_available ;
} ;
2019-04-05 04:53:12 +03:00
/*
 * Table of journal sequence number blacklist entries, allocated as one
 * object: the header is followed directly by the entries.
 *
 * entries is a C99 flexible array member rather than a zero-length array, so
 * sizeof() of the struct still covers only the header while the compiler can
 * diagnose misuse of the trailing array.
 *
 * NOTE(review): nr is presumably the number of valid elements in entries and
 * each entry presumably a [start, end) range of sequence numbers - confirm.
 */
struct journal_seq_blacklist_table {
	size_t			nr;
	struct journal_seq_blacklist_table_entry {
		u64		start;
		u64		end;
		bool		dirty;
	}			entries[];
};
2020-03-25 23:12:33 +03:00
struct journal_keys {
struct journal_key {
enum btree_id btree_id : 8 ;
unsigned level : 8 ;
2021-01-27 04:15:46 +03:00
bool allocated ;
2022-01-01 01:54:13 +03:00
bool overwritten ;
2020-03-25 23:12:33 +03:00
struct bkey_i * k ;
u32 journal_seq ;
u32 journal_offset ;
} * d ;
size_t nr ;
2021-01-27 04:15:46 +03:00
size_t size ;
2020-03-25 23:12:33 +03:00
u64 journal_seq_base ;
} ;
2021-08-30 22:18:31 +03:00
/*
 * Holder for a single struct btree_path pointer (struct bch_fs keeps a
 * __percpu pointer to these).
 */
struct btree_path_buf {
	struct btree_path	*path;
};
2021-04-24 07:24:25 +03:00
/*
 * Upper bound of 1 << 16 (65536).
 * NOTE(review): presumably the maximum size of a replicas delta list -
 * confirm the units with the code that uses it.
 */
# define REPLICAS_DELTA_LIST_MAX (1U << 16)
2021-03-16 07:42:25 +03:00
/*
 * In-memory record for one snapshot node: ids of its parent and of up to two
 * children, plus a subvolume id and an "equiv" id.
 * NOTE(review): equiv is presumably the id of an equivalent snapshot node -
 * confirm against the snapshot code.
 */
struct snapshot_t {
u32 parent ;
u32 children [ 2 ] ;
u32 subvol ; /* Nonzero only if a subvolume points to this node: */
u32 equiv ;
} ;
/* An inode number qualified by the subvolume it belongs to. */
typedef struct {
u32 subvol ;
u64 inum ;
} subvol_inum ;
/*
 * subvol_inum compound literal built from BCACHEFS_ROOT_SUBVOL and
 * BCACHEFS_ROOT_INO, in that (subvol, inum) order.
 */
# define BCACHEFS_ROOT_SUBVOL_INUM \
( ( subvol_inum ) { BCACHEFS_ROOT_SUBVOL , BCACHEFS_ROOT_INO } )
2017-03-17 09:18:50 +03:00
struct bch_fs {
struct closure cl ;
struct list_head list ;
struct kobject kobj ;
struct kobject internal ;
struct kobject opts_dir ;
struct kobject time_stats ;
unsigned long flags ;
int minor ;
struct device * chardev ;
struct super_block * vfs_sb ;
2021-05-28 02:15:44 +03:00
dev_t dev ;
2017-03-17 09:18:50 +03:00
char name [ 40 ] ;
2020-06-15 21:58:47 +03:00
/* ro/rw, add/remove/resize devices: */
struct rw_semaphore state_lock ;
2017-03-17 09:18:50 +03:00
/* Counts outstanding writes, for clean transition to read-only */
struct percpu_ref writes ;
struct work_struct read_only_work ;
struct bch_dev __rcu * devs [ BCH_SB_MEMBERS_MAX ] ;
2018-12-01 18:32:48 +03:00
struct bch_replicas_cpu replicas ;
struct bch_replicas_cpu replicas_gc ;
2017-03-17 09:18:50 +03:00
struct mutex replicas_gc_lock ;
2021-04-24 07:24:25 +03:00
mempool_t replicas_delta_pool ;
2017-03-17 09:18:50 +03:00
2021-02-03 21:10:55 +03:00
struct journal_entry_res btree_root_journal_res ;
2019-01-25 01:12:00 +03:00
struct journal_entry_res replicas_journal_res ;
2021-02-03 21:10:55 +03:00
struct journal_entry_res clock_journal_res ;
2021-01-22 05:52:06 +03:00
struct journal_entry_res dev_usage_journal_res ;
2017-03-17 09:18:50 +03:00
struct bch_disk_groups_cpu __rcu * disk_groups ;
struct bch_opts opts ;
/* Updated by bch2_sb_update():*/
struct {
__uuid_t uuid ;
__uuid_t user_uuid ;
2018-11-01 22:10:01 +03:00
u16 version ;
2021-03-21 23:03:23 +03:00
u16 version_min ;
2017-03-17 09:18:50 +03:00
u8 nr_devices ;
u8 clean ;
u8 encryption_type ;
u64 time_base_lo ;
u32 time_base_hi ;
2021-04-29 05:51:42 +03:00
unsigned time_units_per_sec ;
unsigned nsec_per_time_unit ;
2017-03-17 09:18:50 +03:00
u64 features ;
2019-02-06 19:56:51 +03:00
u64 compat ;
2017-03-17 09:18:50 +03:00
} sb ;
2021-04-29 05:51:42 +03:00
2017-03-17 09:18:50 +03:00
struct bch_sb_handle disk_sb ;
unsigned short block_bits ; /* ilog2(block_size) */
u16 btree_foreground_merge_threshold ;
struct closure sb_write ;
struct mutex sb_lock ;
2021-03-16 07:42:25 +03:00
/* snapshot.c: */
GENRADIX ( struct snapshot_t ) snapshots ;
struct bch_snapshot_table __rcu * snapshot_table ;
struct mutex snapshot_table_lock ;
struct work_struct snapshot_delete_work ;
2021-10-11 19:03:19 +03:00
struct work_struct snapshot_wait_for_pagecache_and_delete_work ;
struct snapshot_id_list snapshots_unlinked ;
struct mutex snapshots_unlinked_lock ;
2021-03-16 07:42:25 +03:00
2017-03-17 09:18:50 +03:00
/* BTREE CACHE */
struct bio_set btree_bio ;
2021-05-23 00:37:25 +03:00
struct workqueue_struct * io_complete_wq ;
2017-03-17 09:18:50 +03:00
struct btree_root btree_roots [ BTREE_ID_NR ] ;
struct mutex btree_root_lock ;
struct btree_cache btree_cache ;
/*
* Cache of allocated btree nodes - if we allocate a btree node and
* don ' t use it , if we free it that space can ' t be reused until going
* _all_ the way through the allocator ( which exposes us to a livelock
* when allocating btree reserves fail halfway through ) - instead , we
* can stick them here :
*/
struct btree_alloc btree_reserve_cache [ BTREE_NODE_RESERVE * 2 ] ;
unsigned btree_reserve_cache_nr ;
struct mutex btree_reserve_cache_lock ;
mempool_t btree_interior_update_pool ;
struct list_head btree_interior_update_list ;
2020-02-09 00:39:37 +03:00
struct list_head btree_interior_updates_unwritten ;
2017-03-17 09:18:50 +03:00
struct mutex btree_interior_update_lock ;
struct closure_waitlist btree_interior_update_wait ;
2020-05-25 21:57:06 +03:00
struct workqueue_struct * btree_interior_update_worker ;
struct work_struct btree_interior_update_work ;
2020-06-02 23:36:11 +03:00
/* btree_iter.c: */
struct mutex btree_trans_lock ;
struct list_head btree_trans_list ;
2021-08-30 22:18:31 +03:00
mempool_t btree_paths_pool ;
2021-04-24 07:09:06 +03:00
mempool_t btree_trans_mem_pool ;
2021-08-30 22:18:31 +03:00
struct btree_path_buf __percpu * btree_paths_bufs ;
2018-08-09 04:22:46 +03:00
2020-11-16 00:30:22 +03:00
struct srcu_struct btree_trans_barrier ;
2021-12-21 02:18:35 +03:00
bool btree_trans_barrier_initialized ;
2020-11-16 00:30:22 +03:00
2019-03-08 03:46:10 +03:00
struct btree_key_cache btree_key_cache ;
2021-05-23 00:37:25 +03:00
struct workqueue_struct * btree_update_wq ;
2021-07-10 20:44:42 +03:00
struct workqueue_struct * btree_io_complete_wq ;
2017-03-17 09:18:50 +03:00
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct * copygc_wq ;
/* ALLOCATION */
struct bch_devs_mask rw_devs [ BCH_DATA_NR ] ;
u64 capacity ; /* sectors */
/*
* When capacity _decreases_ ( due to a disk being removed ) , we
* increment capacity_gen - this invalidates outstanding reservations
* and forces them to be revalidated
*/
u32 capacity_gen ;
2018-11-05 05:55:35 +03:00
unsigned bucket_size_max ;
2017-03-17 09:18:50 +03:00
atomic64_t sectors_available ;
2020-12-03 22:17:33 +03:00
struct mutex sectors_available_lock ;
2017-03-17 09:18:50 +03:00
2018-11-27 16:23:22 +03:00
struct bch_fs_pcpu __percpu * pcpu ;
struct percpu_rw_semaphore mark_lock ;
2017-03-17 09:18:50 +03:00
2019-02-11 03:34:47 +03:00
seqcount_t usage_lock ;
struct bch_fs_usage * usage_base ;
2020-11-14 02:36:33 +03:00
struct bch_fs_usage __percpu * usage [ JOURNAL_BUF_NR ] ;
2019-02-11 03:34:47 +03:00
struct bch_fs_usage __percpu * usage_gc ;
u64 __percpu * online_reserved ;
2019-03-16 01:20:46 +03:00
/* single element mempool: */
struct mutex usage_scratch_lock ;
2019-02-11 03:34:47 +03:00
struct bch_fs_usage_online * usage_scratch ;
2019-01-21 23:32:13 +03:00
2017-03-17 09:18:50 +03:00
struct io_clock io_clock [ 2 ] ;
2019-04-05 04:53:12 +03:00
/* JOURNAL SEQ BLACKLIST */
struct journal_seq_blacklist_table *
journal_seq_blacklist_table ;
2017-03-17 09:18:50 +03:00
/* ALLOCATOR */
spinlock_t freelist_lock ;
2018-07-22 06:36:11 +03:00
struct closure_waitlist freelist_wait ;
2019-03-18 20:42:10 +03:00
u64 blocked_allocate ;
u64 blocked_allocate_open_bucket ;
2021-12-26 05:43:29 +03:00
2020-06-09 22:44:03 +03:00
open_bucket_idx_t open_buckets_freelist ;
open_bucket_idx_t open_buckets_nr_free ;
2017-03-17 09:18:50 +03:00
struct closure_waitlist open_buckets_wait ;
struct open_bucket open_buckets [ OPEN_BUCKETS_COUNT ] ;
2021-12-26 05:43:29 +03:00
open_bucket_idx_t open_buckets_hash [ OPEN_BUCKETS_COUNT ] ;
2017-03-17 09:18:50 +03:00
struct write_point btree_write_point ;
struct write_point rebalance_write_point ;
2018-11-05 05:55:35 +03:00
struct write_point write_points [ WRITE_POINT_MAX ] ;
struct hlist_head write_points_hash [ WRITE_POINT_HASH_NR ] ;
2017-03-17 09:18:50 +03:00
struct mutex write_points_hash_lock ;
2018-11-05 05:55:35 +03:00
unsigned write_points_nr ;
2017-03-17 09:18:50 +03:00
/* GARBAGE COLLECTION */
struct task_struct * gc_thread ;
atomic_t kick_gc ;
unsigned long gc_count ;
2021-04-13 22:00:40 +03:00
enum btree_id gc_gens_btree ;
struct bpos gc_gens_pos ;
2017-03-17 09:18:50 +03:00
/*
* Tracks GC ' s progress - everything in the range [ ZERO_KEY . . gc_cur_pos ]
* has been marked by GC .
*
2021-02-21 03:27:37 +03:00
* gc_cur_phase is a superset of btree_ids ( BTREE_ID_extents etc . )
2017-03-17 09:18:50 +03:00
*
* Protected by gc_pos_lock . Only written to by GC thread , so GC thread
* can read without a lock .
*/
seqcount_t gc_pos_lock ;
struct gc_pos gc_pos ;
/*
* The allocation code needs gc_mark in struct bucket to be correct , but
* it ' s not while a gc is in progress .
*/
struct rw_semaphore gc_lock ;
/* IO PATH */
2021-05-19 06:53:43 +03:00
struct semaphore io_in_flight ;
2017-03-17 09:18:50 +03:00
struct bio_set bio_read ;
struct bio_set bio_read_split ;
struct bio_set bio_write ;
struct mutex bio_bounce_pages_lock ;
2020-11-03 02:20:44 +03:00
mempool_t bio_bounce_pages ;
2017-03-17 09:18:50 +03:00
struct rhashtable promote_table ;
mempool_t compression_bounce [ 2 ] ;
2019-12-29 04:17:06 +03:00
mempool_t compress_workspace [ BCH_COMPRESSION_TYPE_NR ] ;
2017-03-17 09:18:50 +03:00
mempool_t decompress_workspace ;
ZSTD_parameters zstd_params ;
struct crypto_shash * sha256 ;
struct crypto_sync_skcipher * chacha20 ;
struct crypto_shash * poly1305 ;
atomic64_t key_version ;
2019-11-10 00:01:15 +03:00
mempool_t large_bkey_pool ;
2017-03-17 09:18:50 +03:00
/* REBALANCE */
struct bch_fs_rebalance rebalance ;
2020-07-11 23:28:54 +03:00
/* COPYGC */
struct task_struct * copygc_thread ;
copygc_heap copygc_heap ;
struct write_point copygc_write_point ;
2021-04-13 21:45:55 +03:00
s64 copygc_wait ;
2020-07-11 23:28:54 +03:00
2021-07-23 22:57:19 +03:00
/* DATA PROGRESS STATS */
struct list_head data_progress_list ;
struct mutex data_progress_lock ;
2018-11-25 01:09:44 +03:00
/* STRIPES: */
2021-12-05 07:07:33 +03:00
GENRADIX ( struct stripe ) stripes ;
GENRADIX ( struct gc_stripe ) gc_stripes ;
2018-11-01 22:13:19 +03:00
ec_stripes_heap ec_stripes_heap ;
spinlock_t ec_stripes_heap_lock ;
2018-11-25 01:09:44 +03:00
/* ERASURE CODING */
2020-07-07 03:59:46 +03:00
struct list_head ec_stripe_head_list ;
struct mutex ec_stripe_head_lock ;
struct list_head ec_stripe_new_list ;
struct mutex ec_stripe_new_lock ;
struct work_struct ec_stripe_create_work ;
2019-08-23 00:09:16 +03:00
u64 ec_stripe_hint ;
2018-11-25 01:09:44 +03:00
2018-11-01 22:13:19 +03:00
struct bio_set ec_bioset ;
struct work_struct ec_stripe_delete_work ;
struct llist_head ec_stripe_delete_list ;
2019-08-16 16:59:56 +03:00
/* REFLINK */
u64 reflink_hint ;
2021-05-23 09:31:33 +03:00
reflink_gc_table reflink_gc_table ;
size_t reflink_gc_nr ;
2019-08-16 16:59:56 +03:00
2017-03-17 09:18:50 +03:00
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset ;
struct bio_set dio_write_bioset ;
struct bio_set dio_read_bioset ;
2021-04-01 04:07:37 +03:00
atomic64_t btree_writes_nr ;
atomic64_t btree_writes_sectors ;
2017-03-17 09:18:50 +03:00
spinlock_t btree_write_error_lock ;
/* ERRORS */
struct list_head fsck_errors ;
struct mutex fsck_error_lock ;
bool fsck_alloc_err ;
/* QUOTAS */
struct bch_memquota_type quotas [ QTYP_NR ] ;
/* DEBUG JUNK */
struct dentry * debug ;
struct btree_debug btree_debug [ BTREE_ID_NR ] ;
struct btree * verify_data ;
struct btree_node * verify_ondisk ;
struct mutex verify_lock ;
2020-11-03 07:51:33 +03:00
u64 * unused_inode_hints ;
unsigned inode_shard_bits ;
2017-03-17 09:18:50 +03:00
/*
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - have to dynamically allocate them
*/
mempool_t fill_iter ;
mempool_t btree_bounce_pool ;
struct journal journal ;
2020-03-25 23:12:33 +03:00
struct list_head journal_entries ;
struct journal_keys journal_keys ;
2021-01-27 04:15:46 +03:00
struct list_head journal_iters ;
2017-03-17 09:18:50 +03:00
2018-07-22 05:57:20 +03:00
u64 last_bucket_seq_cleanup ;
2017-03-17 09:18:50 +03:00
/* The rest of this all shows up in sysfs */
atomic_long_t read_realloc_races ;
atomic_long_t extent_migrate_done ;
atomic_long_t extent_migrate_raced ;
unsigned btree_gc_periodic : 1 ;
unsigned copy_gc_enabled : 1 ;
bool promote_whole_extents ;
struct bch2_time_stats times [ BCH_TIME_STAT_NR ] ;
} ;
static inline void bch2_set_ra_pages ( struct bch_fs * c , unsigned ra_pages )
{
# ifndef NO_BCACHEFS_FS
if ( c - > vfs_sb )
c - > vfs_sb - > s_bdi - > ra_pages = ra_pages ;
# endif
}
static inline unsigned bucket_bytes ( const struct bch_dev * ca )
{
return ca - > mi . bucket_size < < 9 ;
}
static inline unsigned block_bytes ( const struct bch_fs * c )
{
2021-12-14 22:24:41 +03:00
return c - > opts . block_size ;
2017-03-17 09:18:50 +03:00
}
2021-12-14 22:24:41 +03:00
static inline unsigned block_sectors ( const struct bch_fs * c )
{
return c - > opts . block_size > > 9 ;
}
static inline size_t btree_sectors ( const struct bch_fs * c )
{
return c - > opts . btree_node_size > > 9 ;
}
/*
 * Convert a filesystem timestamp to a struct timespec64.
 *
 * @time is first offset by c->sb.time_base_lo, then split into whole seconds
 * and a sub-second remainder using c->sb.time_units_per_sec; the remainder is
 * scaled to nanoseconds with c->sb.nsec_per_time_unit.
 *
 * div_s64_rem() truncates toward zero, so a negative value comes back with a
 * negative remainder. That remainder is folded into the previous second here
 * (floor division) so the returned tv_nsec is not negative.
 */
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
	struct timespec64 t;
	s64 sec;
	s32 rem;

	time += c->sb.time_base_lo;

	sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
	if (rem < 0) {
		/* Floor rather than truncate: borrow one second. */
		sec--;
		rem += c->sb.time_units_per_sec;
	}

	t.tv_sec = sec;
	/* Widen before scaling so the multiply is done in 64 bits. */
	t.tv_nsec = (s64) rem * c->sb.nsec_per_time_unit;
	return t;
}
2021-12-14 22:24:41 +03:00
static inline s64 timespec_to_bch2_time ( const struct bch_fs * c , struct timespec64 ts )
2017-03-17 09:18:50 +03:00
{
2021-04-29 05:51:42 +03:00
return ( ts . tv_sec * c - > sb . time_units_per_sec +
( int ) ts . tv_nsec / c - > sb . nsec_per_time_unit ) - c - > sb . time_base_lo ;
2017-03-17 09:18:50 +03:00
}
2021-12-14 22:24:41 +03:00
static inline s64 bch2_current_time ( const struct bch_fs * c )
2017-03-17 09:18:50 +03:00
{
struct timespec64 now ;
2019-04-16 23:03:31 +03:00
ktime_get_coarse_real_ts64 ( & now ) ;
2017-03-17 09:18:50 +03:00
return timespec_to_bch2_time ( c , now ) ;
}
2019-05-12 00:32:07 +03:00
static inline bool bch2_dev_exists2 ( const struct bch_fs * c , unsigned dev )
{
return dev < c - > sb . nr_devices & & c - > devs [ dev ] ;
}
2017-03-17 09:18:50 +03:00
# endif /* _BCACHEFS_H */