// SPDX-License-Identifier: GPL-2.0
/*
 * Moving/copying garbage collector
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "eytzinger.h"
#include "io.h"
#include "keylist.h"
#include "move.h"
#include "movinggc.h"
#include "super-io.h"
#include "trace.h"

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <linux/wait.h>
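
/*
 * Comparison function used to sort and search the copygc heap: entries are
 * ordered by device index, then by bucket offset within the device.
 */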
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
{
	const struct copygc_heap_entry *l = _l;
	const struct copygc_heap_entry *r = _r;

	return cmp_int(l->dev, r->dev) ?:
	       cmp_int(l->offset, r->offset);
}
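
/*
 * Per-extent move predicate: rewrite an extent if any of its non-cached
 * pointers points into a bucket currently in the copygc heap (same device,
 * within the bucket, and matching generation).
 */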
static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
				 struct bkey_s_c k,
				 struct bch_io_opts *io_opts,
				 struct data_opts *data_opts)
{
	copygc_heap *h = &c->copygc_heap;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p = { 0 };

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
		struct copygc_heap_entry search = {
			.dev	= p.ptr.dev,
			.offset	= p.ptr.offset,
		};
		ssize_t i;

		if (p.ptr.cached)
			continue;

		i = eytzinger0_find_le(h->data, h->used,
				       sizeof(h->data[0]),
				       bucket_offset_cmp, &search);
#if 0
		/* eytzinger search verify code: */
		ssize_t j = -1, k;

		for (k = 0; k < h->used; k++)
			if (h->data[k].offset <= ptr->offset &&
			    (j < 0 || h->data[k].offset > h->data[j].offset))
				j = k;

		BUG_ON(i != j);
#endif
		if (i >= 0 &&
		    p.ptr.dev == h->data[i].dev &&
		    p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
		    p.ptr.gen == h->data[i].gen) {
			/*
			 * We need to use the journal reserve here, because
			 *  - journal reclaim depends on btree key cache
			 *    flushing to make forward progress,
			 *  - which has to make forward progress when the
			 *    journal is pre-reservation full,
			 *  - and depends on allocation - meaning allocator and
			 *    copygc
			 */

			data_opts->target		= io_opts->background_target;
			data_opts->nr_replicas		= 1;
			data_opts->btree_insert_flags	= BTREE_INSERT_USE_RESERVE|
							  JOURNAL_WATERMARK_copygc;
			data_opts->rewrite_dev		= p.ptr.dev;

			if (p.has_ec)
				data_opts->nr_replicas += p.ec.redundancy;

			return DATA_REWRITE;
		}
	}

	return DATA_SKIP;
}
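
/*
 * The copygc reserve on a device is considered available once its movinggc
 * freelist is full, or once the allocator has stopped running.
 */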
static bool have_copygc_reserve(struct bch_dev *ca)
{
	bool ret;

	spin_lock(&ca->fs->freelist_lock);
	ret = fifo_full(&ca->free[RESERVE_movinggc]) ||
		ca->allocator_state != ALLOCATOR_running;
	spin_unlock(&ca->fs->freelist_lock);

	return ret;
}
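
/* Heap comparison function: order buckets by fragmentation: */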
static inline int fragmentation_cmp(copygc_heap *heap,
				    struct copygc_heap_entry l,
				    struct copygc_heap_entry r)
{
	return cmp_int(l.fragmentation, r.fragmentation);
}
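
/*
 * Walk the alloc btree and fill the copygc heap: candidates are buckets that
 * hold user data, aren't completely full, and aren't currently open for
 * writes.
 */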
static int walk_buckets_to_copygc(struct bch_fs *c)
{
	copygc_heap *h = &c->copygc_heap;
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_alloc_v4 a;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
		struct copygc_heap_entry e;

		bch2_alloc_to_v4(k, &a);

		if (a.data_type != BCH_DATA_user ||
		    a.dirty_sectors >= ca->mi.bucket_size ||
		    bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
			continue;

		e = (struct copygc_heap_entry) {
			.dev		= iter.pos.inode,
			.gen		= a.gen,
			.replicas	= 1 + a.stripe_redundancy,
			.fragmentation	= div_u64((u64) a.dirty_sectors * (1ULL << 31),
						  ca->mi.bucket_size),
			.sectors	= a.dirty_sectors,
			.offset		= bucket_to_sector(ca, iter.pos.offset),
		};
		heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
	}
	bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);
	return ret;
}
static int bucket_inorder_cmp(const void *_l, const void *_r)
{
	const struct copygc_heap_entry *l = _l;
	const struct copygc_heap_entry *r = _r;

	return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset);
}
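
/*
 * After moving data, re-read the alloc keys for the buckets we tried to
 * evacuate: a bucket that still has dirty sectors at the same generation
 * wasn't fully moved.
 */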
static int check_copygc_was_done(struct bch_fs *c,
				 u64 *sectors_not_moved,
				 u64 *buckets_not_moved)
{
	copygc_heap *h = &c->copygc_heap;
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_alloc_v4 a;
	struct copygc_heap_entry *i;
	int ret = 0;

	sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL);

	bch2_trans_init(&trans, c, 0, 0);
	bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0);

	for (i = h->data; i < h->data + h->used; i++) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);

		bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset)));

		ret = lockrestart_do(&trans,
				bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
		if (ret)
			break;

		bch2_alloc_to_v4(k, &a);

		if (a.gen == i->gen && a.dirty_sectors) {
			*sectors_not_moved += a.dirty_sectors;
			*buckets_not_moved += 1;
		}
	}
	bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);
	return ret;
}
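
/*
 * Run a single copygc pass: build the heap of fragmented buckets, trim it so
 * the data to be rewritten fits in the movinggc reserve, evacuate those
 * buckets with bch2_move_data(), then verify how much actually moved.
 */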
static int bch2_copygc(struct bch_fs *c)
{
	copygc_heap *h = &c->copygc_heap;
	struct copygc_heap_entry e, *i;
	struct bch_move_stats move_stats;
	u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0;
	u64 sectors_reserved = 0;
	u64 buckets_to_move, buckets_not_moved = 0;
	struct bch_dev *ca;
	unsigned dev_idx;
	size_t heap_size = 0;
	int ret;

	bch_move_stats_init(&move_stats, "copygc");

	/*
	 * Find buckets with lowest sector counts, skipping completely
	 * empty buckets, by building a maxheap sorted by sector count,
	 * and repeatedly replacing the maximum element until all
	 * buckets have been visited.
	 */
	h->used = 0;

	for_each_rw_member(ca, c, dev_idx)
		heap_size += ca->mi.nbuckets >> 7;

	if (h->size < heap_size) {
		free_heap(&c->copygc_heap);
		if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) {
			bch_err(c, "error allocating copygc heap");
			return 0;
		}
	}

	for_each_rw_member(ca, c, dev_idx) {
		closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));

		spin_lock(&ca->fs->freelist_lock);
		sectors_reserved += fifo_used(&ca->free[RESERVE_movinggc]) * ca->mi.bucket_size;
		spin_unlock(&ca->fs->freelist_lock);
	}

	ret = walk_buckets_to_copygc(c);
	if (ret) {
		bch2_fs_fatal_error(c, "error walking buckets to copygc!");
		return ret;
	}

	if (!h->used) {
		bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!");
		return 0;
	}

	/*
	 * Our btree node allocations also come out of RESERVE_movinggc:
	 */
	sectors_reserved = (sectors_reserved * 3) / 4;
	if (!sectors_reserved) {
		bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
		return -1;
	}

	for (i = h->data; i < h->data + h->used; i++) {
		sectors_to_move += i->sectors;
		sectors_to_write += i->sectors * i->replicas;
	}

	while (sectors_to_write > sectors_reserved) {
		BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
		sectors_to_write -= e.sectors * e.replicas;
	}

	buckets_to_move = h->used;

	if (!buckets_to_move) {
		bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!",
				    sectors_reserved);
		return 0;
	}

	eytzinger0_sort(h->data, h->used,
			sizeof(h->data[0]),
			bucket_offset_cmp, NULL);

	ret = bch2_move_data(c,
			     0,			POS_MIN,
			     BTREE_ID_NR,	POS_MAX,
			     NULL,
			     writepoint_ptr(&c->copygc_write_point),
			     copygc_pred, NULL,
			     &move_stats);
	if (ret) {
		bch_err(c, "error %i from bch2_move_data() in copygc", ret);
		return ret;
	}

	ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved);
	if (ret) {
		bch_err(c, "error %i from check_copygc_was_done()", ret);
		return ret;
	}

	if (sectors_not_moved)
		bch_warn_ratelimited(c,
			"copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
			 sectors_not_moved, sectors_to_move,
			 buckets_not_moved, buckets_to_move,
			 atomic64_read(&move_stats.sectors_moved),
			 atomic64_read(&move_stats.keys_raced),
			 atomic64_read(&move_stats.sectors_raced));

	trace_copygc(c,
		     atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
		     buckets_to_move, buckets_not_moved);
	return 0;
}

/*
 * Copygc runs when the amount of fragmented data is above some arbitrary
 * threshold:
 *
 * The threshold at the limit - when the device is full - is the amount of space
 * we reserved in bch2_recalc_capacity; we can't have more than that amount of
 * disk space stranded due to fragmentation and store everything we have
 * promised to store.
 *
 * But we don't want to be running copygc unnecessarily when the device still
 * has plenty of free space - rather, we want copygc to smoothly run every so
 * often and continually reduce the amount of fragmented space as the device
 * fills up. So, we increase the threshold by half the current free space.
 */
unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned dev_idx;
	s64 wait = S64_MAX, fragmented_allowed, fragmented;

	for_each_rw_member(ca, c, dev_idx) {
		struct bch_dev_usage usage = bch2_dev_usage_read(ca);

		fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) *
				       ca->mi.bucket_size) >> 1);
		fragmented = usage.d[BCH_DATA_user].fragmented;

		wait = min(wait, max(0LL, fragmented_allowed - fragmented));
	}

	return wait;
}
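
/*
 * Background thread: wait on the write io clock until enough has been written
 * that fragmentation could exceed the threshold, then run a copygc pass.
 */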
static int bch2_copygc_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct io_clock *clock = &c->io_clock[WRITE];
	u64 last, wait;

	set_freezable();

	while (!kthread_should_stop()) {
		cond_resched();

		if (kthread_wait_freezable(c->copy_gc_enabled))
			break;

		last = atomic64_read(&clock->now);
		wait = bch2_copygc_wait_amount(c);

		if (wait > clock->max_slop) {
			trace_copygc_wait(c, wait, last + wait);
			c->copygc_wait = last + wait;
			bch2_kthread_io_clock_wait(clock, last + wait,
					MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		c->copygc_wait = 0;

		if (bch2_copygc(c))
			break;
	}

	return 0;
}
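
/* Stop the copygc thread, if it's running: */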
void bch2_copygc_stop(struct bch_fs *c)
{
	if (c->copygc_thread) {
		kthread_stop(c->copygc_thread);
		put_task_struct(c->copygc_thread);
	}
	c->copygc_thread = NULL;
}
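
/*
 * Start the copygc thread; a no-op if it's already running or if the
 * nochanges option is set.
 */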
int bch2_copygc_start(struct bch_fs *c)
{
	struct task_struct *t;

	if (c->copygc_thread)
		return 0;

	if (c->opts.nochanges)
		return 0;

	if (bch2_fs_init_fault("copygc_start"))
		return -ENOMEM;

	t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
	if (IS_ERR(t)) {
		bch_err(c, "error creating copygc thread: %li", PTR_ERR(t));
		return PTR_ERR(t);
	}

	get_task_struct(t);

	c->copygc_thread = t;
	wake_up_process(c->copygc_thread);

	return 0;
}
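
/* Currently a no-op: */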
void bch2_fs_copygc_init(struct bch_fs *c)
{
}