2018-04-03 20:23:33 +03:00
// SPDX-License-Identifier: GPL-2.0
2012-06-28 20:03:02 +04:00
/*
* Copyright ( C ) 2011 STRATO . All rights reserved .
*/
# include <linux/sched.h>
# include <linux/pagemap.h>
# include <linux/writeback.h>
# include <linux/blkdev.h>
# include <linux/rbtree.h>
# include <linux/slab.h>
# include <linux/workqueue.h>
2013-01-29 10:04:50 +04:00
# include <linux/btrfs.h>
2017-12-22 11:06:39 +03:00
# include <linux/sizes.h>
2012-06-28 20:03:02 +04:00
# include "ctree.h"
# include "transaction.h"
# include "disk-io.h"
# include "locking.h"
# include "ulist.h"
# include "backref.h"
2013-04-25 20:04:51 +04:00
# include "extent_io.h"
2014-05-14 04:30:47 +04:00
# include "qgroup.h"
2019-06-20 22:37:44 +03:00
# include "block-group.h"
2015-04-17 05:23:16 +03:00
2012-06-28 20:03:02 +04:00
/* TODO XXX FIXME
 *  - subvol delete -> delete when ref goes to 0? delete limits also?
 *  - reorganize keys
 *  - compressed
 *  - sync
 *  - copy also limits on subvol creation
 *  - limit
 *  - caches for ulists
 *  - performance benchmarks
 *  - check all ioctl parameters
 */
2017-12-12 10:34:24 +03:00
/*
* Helpers to access qgroup reservation
*
* Callers should ensure the lock context and type are valid
*/
static u64 qgroup_rsv_total ( const struct btrfs_qgroup * qgroup )
{
u64 ret = 0 ;
int i ;
for ( i = 0 ; i < BTRFS_QGROUP_RSV_LAST ; i + + )
ret + = qgroup - > rsv . values [ i ] ;
return ret ;
}
# ifdef CONFIG_BTRFS_DEBUG
static const char * qgroup_rsv_type_str ( enum btrfs_qgroup_rsv_type type )
{
if ( type = = BTRFS_QGROUP_RSV_DATA )
return " data " ;
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reseve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 10:34:29 +03:00
if ( type = = BTRFS_QGROUP_RSV_META_PERTRANS )
return " meta_pertrans " ;
if ( type = = BTRFS_QGROUP_RSV_META_PREALLOC )
return " meta_prealloc " ;
2017-12-12 10:34:24 +03:00
return NULL ;
}
# endif
2017-12-12 10:34:27 +03:00
/*
 * Add @num_bytes to the @type reservation counter of @qgroup.
 *
 * The qgroup_update_reserve tracepoint is emitted with the positive delta
 * before the counter is changed.
 */
static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
			   struct btrfs_qgroup *qgroup, u64 num_bytes,
			   enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);

	qgroup->rsv.values[type] = qgroup->rsv.values[type] + num_bytes;
}
2017-12-12 10:34:27 +03:00
/*
 * Subtract @num_bytes from the @type reservation counter of @qgroup.
 *
 * The qgroup_update_reserve tracepoint is emitted with the negative delta
 * first. If the counter holds fewer than @num_bytes the release would
 * underflow; in that case the counter is clamped to zero, and with
 * CONFIG_BTRFS_DEBUG a rate-limited warning is printed.
 */
static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup *qgroup, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);

	if (qgroup->rsv.values[type] >= num_bytes) {
		qgroup->rsv.values[type] -= num_bytes;
		return;
	}
#ifdef CONFIG_BTRFS_DEBUG
	WARN_RATELIMIT(1,
		"qgroup %llu %s reserved space underflow, have %llu to free %llu",
		qgroup->qgroupid, qgroup_rsv_type_str(type),
		qgroup->rsv.values[type], num_bytes);
#endif
	/* Underflow: clamp to zero instead of wrapping around. */
	qgroup->rsv.values[type] = 0;
}
2017-12-12 10:34:27 +03:00
static void qgroup_rsv_add_by_qgroup ( struct btrfs_fs_info * fs_info ,
struct btrfs_qgroup * dest ,
struct btrfs_qgroup * src )
2017-12-12 10:34:24 +03:00
{
int i ;
for ( i = 0 ; i < BTRFS_QGROUP_RSV_LAST ; i + + )
2017-12-12 10:34:27 +03:00
qgroup_rsv_add ( fs_info , dest , src - > rsv . values [ i ] , i ) ;
2017-12-12 10:34:24 +03:00
}
2017-12-12 10:34:27 +03:00
static void qgroup_rsv_release_by_qgroup ( struct btrfs_fs_info * fs_info ,
struct btrfs_qgroup * dest ,
2017-12-12 10:34:24 +03:00
struct btrfs_qgroup * src )
{
int i ;
for ( i = 0 ; i < BTRFS_QGROUP_RSV_LAST ; i + + )
2017-12-12 10:34:27 +03:00
qgroup_rsv_release ( fs_info , dest , src - > rsv . values [ i ] , i ) ;
2017-12-12 10:34:24 +03:00
}
2015-03-12 11:10:13 +03:00
/*
 * Raise @qg->old_refcnt to @seq if it is currently below it, then apply @mod.
 */
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	qg->old_refcnt = (qg->old_refcnt < seq) ? seq : qg->old_refcnt;
	qg->old_refcnt += mod;
}
/*
 * Raise @qg->new_refcnt to @seq if it is currently below it, then apply @mod.
 */
static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	qg->new_refcnt = (qg->new_refcnt < seq) ? seq : qg->new_refcnt;
	qg->new_refcnt += mod;
}
/*
 * Return @qg->old_refcnt relative to @seq, or 0 when the counter is below
 * @seq.
 */
static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	return (qg->old_refcnt < seq) ? 0 : qg->old_refcnt - seq;
}
/*
 * Return @qg->new_refcnt relative to @seq, or 0 when the counter is below
 * @seq.
 */
static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	return (qg->new_refcnt < seq) ? 0 : qg->new_refcnt - seq;
}
2012-06-28 20:03:02 +04:00
/*
* glue structure to represent the relations between qgroups .
*/
/* One member <-> parent link; allocated and wired up in add_relation_rb(). */
struct btrfs_qgroup_list {
/* list entry on the member's "groups" list */
struct list_head next_group ;
/* list entry on the parent's "members" list */
struct list_head next_member ;
/* the parent qgroup of this relation */
struct btrfs_qgroup * group ;
/* the member qgroup of this relation */
struct btrfs_qgroup * member ;
} ;
2016-10-26 17:23:50 +03:00
static inline u64 qgroup_to_aux ( struct btrfs_qgroup * qg )
{
return ( u64 ) ( uintptr_t ) qg ;
}
static inline struct btrfs_qgroup * unode_aux_to_qgroup ( struct ulist_node * n )
{
return ( struct btrfs_qgroup * ) ( uintptr_t ) n - > aux ;
}
2014-05-14 04:30:47 +04:00
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restructures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronization problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
static int
qgroup_rescan_init ( struct btrfs_fs_info * fs_info , u64 progress_objectid ,
int init_flags ) ;
static void qgroup_rescan_zero_tracking ( struct btrfs_fs_info * fs_info ) ;
2013-04-25 20:04:51 +04:00
2013-04-07 14:50:17 +04:00
/* must be called with qgroup_ioctl_lock held */
2012-06-28 20:03:02 +04:00
/*
 * Look up @qgroupid in fs_info->qgroup_tree.
 *
 * Returns the matching qgroup, or NULL when the tree has no such entry.
 * The descent direction mirrors add_qgroup_rb(): go left when the visited
 * node's id is smaller than the wanted one, right when it is larger.
 */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
					   u64 qgroupid)
{
	struct rb_node *pos = fs_info->qgroup_tree.rb_node;

	while (pos) {
		struct btrfs_qgroup *entry;

		entry = rb_entry(pos, struct btrfs_qgroup, node);
		if (entry->qgroupid == qgroupid)
			return entry;

		pos = (entry->qgroupid < qgroupid) ? pos->rb_left
						   : pos->rb_right;
	}

	return NULL;
}
/* must be called with qgroup_lock held */
/*
 * Find the qgroup with @qgroupid in fs_info->qgroup_tree, creating and
 * inserting a zeroed one if it does not exist yet.
 *
 * Returns the existing or the new qgroup, or ERR_PTR(-ENOMEM) when the
 * GFP_ATOMIC allocation fails.
 */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
					  u64 qgroupid)
{
	struct rb_node **link = &fs_info->qgroup_tree.rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_qgroup *qgroup;

	/* Walk down to either the existing entry or the insertion slot. */
	while (*link) {
		parent = *link;
		qgroup = rb_entry(parent, struct btrfs_qgroup, node);

		if (qgroup->qgroupid == qgroupid)
			return qgroup;

		if (qgroup->qgroupid < qgroupid)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
	if (!qgroup)
		return ERR_PTR(-ENOMEM);

	qgroup->qgroupid = qgroupid;
	INIT_LIST_HEAD(&qgroup->groups);
	INIT_LIST_HEAD(&qgroup->members);
	INIT_LIST_HEAD(&qgroup->dirty);

	rb_link_node(&qgroup->node, parent, link);
	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);

	return qgroup;
}
2013-08-14 05:13:36 +04:00
static void __del_qgroup_rb ( struct btrfs_qgroup * qgroup )
2012-06-28 20:03:02 +04:00
{
struct btrfs_qgroup_list * list ;
list_del ( & qgroup - > dirty ) ;
while ( ! list_empty ( & qgroup - > groups ) ) {
list = list_first_entry ( & qgroup - > groups ,
struct btrfs_qgroup_list , next_group ) ;
list_del ( & list - > next_group ) ;
list_del ( & list - > next_member ) ;
kfree ( list ) ;
}
while ( ! list_empty ( & qgroup - > members ) ) {
list = list_first_entry ( & qgroup - > members ,
struct btrfs_qgroup_list , next_member ) ;
list_del ( & list - > next_group ) ;
list_del ( & list - > next_member ) ;
kfree ( list ) ;
}
kfree ( qgroup ) ;
2013-08-14 05:13:36 +04:00
}
2012-06-28 20:03:02 +04:00
2013-08-14 05:13:36 +04:00
/* must be called with qgroup_lock held */
/*
 * Remove the qgroup with @qgroupid from fs_info->qgroup_tree and free it.
 *
 * Returns 0 on success or -ENOENT if no such qgroup is in the tree.
 */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
	struct btrfs_qgroup *victim;

	victim = find_qgroup_rb(fs_info, qgroupid);
	if (victim == NULL)
		return -ENOENT;

	rb_erase(&victim->node, &fs_info->qgroup_tree);
	__del_qgroup_rb(victim);

	return 0;
}
/* must be called with qgroup_lock held */
/*
 * Record in memory that qgroup @memberid is a member of qgroup @parentid.
 *
 * Returns 0 on success, -ENOENT if either qgroup is not in the tree, or
 * -ENOMEM if the relation entry cannot be allocated.
 */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member = find_qgroup_rb(fs_info, memberid);
	struct btrfs_qgroup *parent = find_qgroup_rb(fs_info, parentid);
	struct btrfs_qgroup_list *rel;

	if (!(member && parent))
		return -ENOENT;

	rel = kzalloc(sizeof(*rel), GFP_ATOMIC);
	if (rel == NULL)
		return -ENOMEM;

	rel->group = parent;
	rel->member = member;

	/* The entry is reachable from both ends of the relation. */
	list_add_tail(&rel->next_group, &member->groups);
	list_add_tail(&rel->next_member, &parent->members);

	return 0;
}
/* must be called with qgroup_lock held */
/*
 * Drop the in-memory relation between member @memberid and parent @parentid.
 *
 * Returns 0 when the relation was found and freed, -ENOENT when either
 * qgroup is missing or no such relation exists.
 */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member = find_qgroup_rb(fs_info, memberid);
	struct btrfs_qgroup *parent = find_qgroup_rb(fs_info, parentid);
	struct btrfs_qgroup_list *rel;

	if (!(member && parent))
		return -ENOENT;

	list_for_each_entry(rel, &member->groups, next_group) {
		if (rel->group != parent)
			continue;

		/* Found it; returning right away makes the unlink safe. */
		list_del(&rel->next_group);
		list_del(&rel->next_member);
		kfree(rel);
		return 0;
	}

	return -ENOENT;
}
2014-05-08 01:06:09 +04:00
# ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test helper: check that qgroup @qgroupid exists and that its
 * referenced/exclusive counters equal @rfer and @excl.
 *
 * Returns 0 on a match, -EINVAL if the qgroup is missing or a counter differs.
 */
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl)
{
	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

	if (!qgroup || qgroup->rfer != rfer || qgroup->excl != excl)
		return -EINVAL;

	return 0;
}
# endif
2012-06-28 20:03:02 +04:00
/*
* The full config is read in one go , only called from open_ctree ( )
* It doesn ' t use any locking , as at this point we ' re still single - threaded
*/
int btrfs_read_qgroup_config ( struct btrfs_fs_info * fs_info )
{
struct btrfs_key key ;
struct btrfs_key found_key ;
struct btrfs_root * quota_root = fs_info - > quota_root ;
struct btrfs_path * path = NULL ;
struct extent_buffer * l ;
int slot ;
int ret = 0 ;
u64 flags = 0 ;
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
u64 rescan_progress = 0 ;
2012-06-28 20:03:02 +04:00
2016-09-02 22:40:02 +03:00
if ( ! test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) )
2012-06-28 20:03:02 +04:00
return 0 ;
2017-02-13 14:10:20 +03:00
fs_info - > qgroup_ulist = ulist_alloc ( GFP_KERNEL ) ;
2013-05-06 15:03:27 +04:00
if ( ! fs_info - > qgroup_ulist ) {
ret = - ENOMEM ;
goto out ;
}
2012-06-28 20:03:02 +04:00
path = btrfs_alloc_path ( ) ;
if ( ! path ) {
ret = - ENOMEM ;
goto out ;
}
/* default this to quota off, in case no status key is found */
fs_info - > qgroup_flags = 0 ;
/*
* pass 1 : read status , all qgroup infos and limits
*/
key . objectid = 0 ;
key . type = 0 ;
key . offset = 0 ;
ret = btrfs_search_slot_for_read ( quota_root , & key , path , 1 , 1 ) ;
if ( ret )
goto out ;
while ( 1 ) {
struct btrfs_qgroup * qgroup ;
slot = path - > slots [ 0 ] ;
l = path - > nodes [ 0 ] ;
btrfs_item_key_to_cpu ( l , & found_key , slot ) ;
if ( found_key . type = = BTRFS_QGROUP_STATUS_KEY ) {
struct btrfs_qgroup_status_item * ptr ;
ptr = btrfs_item_ptr ( l , slot ,
struct btrfs_qgroup_status_item ) ;
if ( btrfs_qgroup_status_version ( l , ptr ) ! =
BTRFS_QGROUP_STATUS_VERSION ) {
2013-12-20 20:37:06 +04:00
btrfs_err ( fs_info ,
" old qgroup version, quota disabled " ) ;
2012-06-28 20:03:02 +04:00
goto out ;
}
if ( btrfs_qgroup_status_generation ( l , ptr ) ! =
fs_info - > generation ) {
flags | = BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
2013-12-20 20:37:06 +04:00
btrfs_err ( fs_info ,
2016-09-20 17:05:00 +03:00
" qgroup generation mismatch, marked as inconsistent " ) ;
2012-06-28 20:03:02 +04:00
}
fs_info - > qgroup_flags = btrfs_qgroup_status_flags ( l ,
ptr ) ;
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
rescan_progress = btrfs_qgroup_status_rescan ( l , ptr ) ;
2012-06-28 20:03:02 +04:00
goto next1 ;
}
if ( found_key . type ! = BTRFS_QGROUP_INFO_KEY & &
found_key . type ! = BTRFS_QGROUP_LIMIT_KEY )
goto next1 ;
qgroup = find_qgroup_rb ( fs_info , found_key . offset ) ;
if ( ( qgroup & & found_key . type = = BTRFS_QGROUP_INFO_KEY ) | |
( ! qgroup & & found_key . type = = BTRFS_QGROUP_LIMIT_KEY ) ) {
2015-07-06 16:38:11 +03:00
btrfs_err ( fs_info , " inconsistent qgroup config " ) ;
2012-06-28 20:03:02 +04:00
flags | = BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
}
if ( ! qgroup ) {
qgroup = add_qgroup_rb ( fs_info , found_key . offset ) ;
if ( IS_ERR ( qgroup ) ) {
ret = PTR_ERR ( qgroup ) ;
goto out ;
}
}
switch ( found_key . type ) {
case BTRFS_QGROUP_INFO_KEY : {
struct btrfs_qgroup_info_item * ptr ;
ptr = btrfs_item_ptr ( l , slot ,
struct btrfs_qgroup_info_item ) ;
qgroup - > rfer = btrfs_qgroup_info_rfer ( l , ptr ) ;
qgroup - > rfer_cmpr = btrfs_qgroup_info_rfer_cmpr ( l , ptr ) ;
qgroup - > excl = btrfs_qgroup_info_excl ( l , ptr ) ;
qgroup - > excl_cmpr = btrfs_qgroup_info_excl_cmpr ( l , ptr ) ;
/* generation currently unused */
break ;
}
case BTRFS_QGROUP_LIMIT_KEY : {
struct btrfs_qgroup_limit_item * ptr ;
ptr = btrfs_item_ptr ( l , slot ,
struct btrfs_qgroup_limit_item ) ;
qgroup - > lim_flags = btrfs_qgroup_limit_flags ( l , ptr ) ;
qgroup - > max_rfer = btrfs_qgroup_limit_max_rfer ( l , ptr ) ;
qgroup - > max_excl = btrfs_qgroup_limit_max_excl ( l , ptr ) ;
qgroup - > rsv_rfer = btrfs_qgroup_limit_rsv_rfer ( l , ptr ) ;
qgroup - > rsv_excl = btrfs_qgroup_limit_rsv_excl ( l , ptr ) ;
break ;
}
}
next1 :
ret = btrfs_next_item ( quota_root , path ) ;
if ( ret < 0 )
goto out ;
if ( ret )
break ;
}
btrfs_release_path ( path ) ;
/*
* pass 2 : read all qgroup relations
*/
key . objectid = 0 ;
key . type = BTRFS_QGROUP_RELATION_KEY ;
key . offset = 0 ;
ret = btrfs_search_slot_for_read ( quota_root , & key , path , 1 , 0 ) ;
if ( ret )
goto out ;
while ( 1 ) {
slot = path - > slots [ 0 ] ;
l = path - > nodes [ 0 ] ;
btrfs_item_key_to_cpu ( l , & found_key , slot ) ;
if ( found_key . type ! = BTRFS_QGROUP_RELATION_KEY )
goto next2 ;
if ( found_key . objectid > found_key . offset ) {
/* parent <- member, not needed to build config */
/* FIXME should we omit the key completely? */
goto next2 ;
}
ret = add_relation_rb ( fs_info , found_key . objectid ,
found_key . offset ) ;
2013-01-17 12:22:08 +04:00
if ( ret = = - ENOENT ) {
2013-12-20 20:37:06 +04:00
btrfs_warn ( fs_info ,
" orphan qgroup relation 0x%llx->0x%llx " ,
2013-08-20 15:20:07 +04:00
found_key . objectid , found_key . offset ) ;
2013-01-17 12:22:08 +04:00
ret = 0 ; /* ignore the error */
}
2012-06-28 20:03:02 +04:00
if ( ret )
goto out ;
next2 :
ret = btrfs_next_item ( quota_root , path ) ;
if ( ret < 0 )
goto out ;
if ( ret )
break ;
}
out :
fs_info - > qgroup_flags | = flags ;
2016-09-02 22:40:02 +03:00
if ( ! ( fs_info - > qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON ) )
clear_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) ;
else if ( fs_info - > qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN & &
ret > = 0 )
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
ret = qgroup_rescan_init ( fs_info , rescan_progress , 0 ) ;
2012-06-28 20:03:02 +04:00
btrfs_free_path ( path ) ;
2013-05-28 19:47:23 +04:00
if ( ret < 0 ) {
2013-05-06 15:03:27 +04:00
ulist_free ( fs_info - > qgroup_ulist ) ;
2013-05-28 19:47:23 +04:00
fs_info - > qgroup_ulist = NULL ;
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
fs_info - > qgroup_flags & = ~ BTRFS_QGROUP_STATUS_FLAG_RESCAN ;
2013-05-28 19:47:23 +04:00
}
2013-05-06 15:03:27 +04:00
2012-06-28 20:03:02 +04:00
return ret < 0 ? ret : 0 ;
}
/*
2013-08-14 05:13:37 +04:00
* This is called from close_ctree ( ) or open_ctree ( ) or btrfs_quota_disable ( ) ,
* first two are in single - threaded paths . And for the third one , we have set
* quota_root to be null with qgroup_lock held before , so it is safe to clean
* up the in - memory structures without qgroup_lock held .
2012-06-28 20:03:02 +04:00
*/
void btrfs_free_qgroup_config ( struct btrfs_fs_info * fs_info )
{
struct rb_node * n ;
struct btrfs_qgroup * qgroup ;
while ( ( n = rb_first ( & fs_info - > qgroup_tree ) ) ) {
qgroup = rb_entry ( n , struct btrfs_qgroup , node ) ;
rb_erase ( n , & fs_info - > qgroup_tree ) ;
2013-08-14 05:13:36 +04:00
__del_qgroup_rb ( qgroup ) ;
2012-06-28 20:03:02 +04:00
}
2013-07-13 17:02:54 +04:00
/*
2018-11-28 14:05:13 +03:00
* We call btrfs_free_qgroup_config ( ) when unmounting
2016-05-20 04:18:45 +03:00
* filesystem and disabling quota , so we set qgroup_ulist
2013-07-13 17:02:54 +04:00
* to be null here to avoid double free .
*/
2013-05-06 15:03:27 +04:00
ulist_free ( fs_info - > qgroup_ulist ) ;
2013-07-13 17:02:54 +04:00
fs_info - > qgroup_ulist = NULL ;
2012-06-28 20:03:02 +04:00
}
2018-07-18 09:45:24 +03:00
/*
 * Insert an empty (zero-sized) BTRFS_QGROUP_RELATION_KEY item with key
 * (@src, RELATION, @dst) into the quota tree.
 *
 * Returns 0 on success, -ENOMEM if no path can be allocated, or the error
 * returned by btrfs_insert_empty_item().
 */
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
	/*
	 * Only touch the leaf when the insertion succeeded: after a failure
	 * path->nodes[0] is not guaranteed to reference a valid leaf, and
	 * nothing was modified that would need to be marked dirty.
	 */
	if (!ret)
		btrfs_mark_buffer_dirty(path->nodes[0]);

	btrfs_free_path(path);
	return ret;
}
2018-07-18 09:45:25 +03:00
/*
 * Delete the relation item with key (@src, RELATION, @dst) from the quota
 * tree.
 *
 * Returns 0 on success, -ENOMEM if no path can be allocated, -ENOENT if the
 * item does not exist, or the error from the search/delete helpers.
 */
static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_key key = {
		.objectid = src,
		.type = BTRFS_QGROUP_RELATION_KEY,
		.offset = dst,
	};
	struct btrfs_path *path = btrfs_alloc_path();
	int ret;

	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;	/* no exact match */
	if (ret == 0)
		ret = btrfs_del_item(trans, quota_root, path);

	btrfs_free_path(path);
	return ret;
}
/*
 * Create the on-disk info and limit items for qgroup @qgroupid in
 * @quota_root, with all counters and limits set to zero.
 *
 * Does nothing and returns 0 when btrfs_is_testing() reports a test fs_info.
 * Returns 0 on success, -ENOMEM if no path can be allocated, or the error
 * from btrfs_insert_empty_item() (other than -EEXIST, see below).
 */
static int add_qgroup_item(struct btrfs_trans_handle *trans,
			   struct btrfs_root *quota_root, u64 qgroupid)
{
	struct btrfs_qgroup_limit_item *limit_item;
	struct btrfs_qgroup_info_item *info_item;
	struct extent_buffer *eb;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	if (btrfs_is_testing(quota_root->fs_info))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;

	/*
	 * Avoid a transaction abort by catching -EEXIST here. In that
	 * case, we proceed by re-initializing the existing structure
	 * on disk.
	 */
	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*info_item));
	if (ret != 0 && ret != -EEXIST)
		goto out;

	eb = path->nodes[0];
	info_item = btrfs_item_ptr(eb, path->slots[0],
				   struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(eb, info_item, trans->transid);
	btrfs_set_qgroup_info_rfer(eb, info_item, 0);
	btrfs_set_qgroup_info_rfer_cmpr(eb, info_item, 0);
	btrfs_set_qgroup_info_excl(eb, info_item, 0);
	btrfs_set_qgroup_info_excl_cmpr(eb, info_item, 0);
	btrfs_mark_buffer_dirty(eb);

	btrfs_release_path(path);

	/* Same objectid/offset, now the limit item. */
	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*limit_item));
	if (ret != 0 && ret != -EEXIST)
		goto out;

	eb = path->nodes[0];
	limit_item = btrfs_item_ptr(eb, path->slots[0],
				    struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(eb, limit_item, 0);
	btrfs_set_qgroup_limit_max_rfer(eb, limit_item, 0);
	btrfs_set_qgroup_limit_max_excl(eb, limit_item, 0);
	btrfs_set_qgroup_limit_rsv_rfer(eb, limit_item, 0);
	btrfs_set_qgroup_limit_rsv_excl(eb, limit_item, 0);
	btrfs_mark_buffer_dirty(eb);

	/* -EEXIST on the limit item is not an error for the caller. */
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
2018-07-18 09:45:26 +03:00
/*
 * Delete the on-disk info item and then the limit item of qgroup @qgroupid.
 *
 * Returns 0 on success, -ENOMEM if no path can be allocated, -ENOENT if
 * either item is missing, or the error from the search/delete helpers.
 * Processing stops at the first failure.
 */
static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_key key = {
		.objectid = 0,
		.type = BTRFS_QGROUP_INFO_KEY,
		.offset = qgroupid,
	};
	struct btrfs_path *path = btrfs_alloc_path();
	int ret;

	if (!path)
		return -ENOMEM;

	/* Step 1: the info item. */
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	ret = btrfs_del_item(trans, quota_root, path);
	if (ret)
		goto out;

	btrfs_release_path(path);

	/* Step 2: the limit item, same objectid and offset. */
	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	ret = btrfs_del_item(trans, quota_root, path);
out:
	btrfs_free_path(path);
	return ret;
}
static int update_qgroup_limit_item ( struct btrfs_trans_handle * trans ,
2014-11-21 05:01:41 +03:00
struct btrfs_qgroup * qgroup )
2012-06-28 20:03:02 +04:00
{
2018-07-18 09:45:27 +03:00
struct btrfs_root * quota_root = trans - > fs_info - > quota_root ;
2012-06-28 20:03:02 +04:00
struct btrfs_path * path ;
struct btrfs_key key ;
struct extent_buffer * l ;
struct btrfs_qgroup_limit_item * qgroup_limit ;
int ret ;
int slot ;
key . objectid = 0 ;
key . type = BTRFS_QGROUP_LIMIT_KEY ;
2014-11-21 05:01:41 +03:00
key . offset = qgroup - > qgroupid ;
2012-06-28 20:03:02 +04:00
path = btrfs_alloc_path ( ) ;
2013-02-27 15:20:56 +04:00
if ( ! path )
return - ENOMEM ;
2018-07-18 09:45:27 +03:00
ret = btrfs_search_slot ( trans , quota_root , & key , path , 0 , 1 ) ;
2012-06-28 20:03:02 +04:00
if ( ret > 0 )
ret = - ENOENT ;
if ( ret )
goto out ;
l = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
2013-11-05 01:34:29 +04:00
qgroup_limit = btrfs_item_ptr ( l , slot , struct btrfs_qgroup_limit_item ) ;
2014-11-21 05:01:41 +03:00
btrfs_set_qgroup_limit_flags ( l , qgroup_limit , qgroup - > lim_flags ) ;
btrfs_set_qgroup_limit_max_rfer ( l , qgroup_limit , qgroup - > max_rfer ) ;
btrfs_set_qgroup_limit_max_excl ( l , qgroup_limit , qgroup - > max_excl ) ;
btrfs_set_qgroup_limit_rsv_rfer ( l , qgroup_limit , qgroup - > rsv_rfer ) ;
btrfs_set_qgroup_limit_rsv_excl ( l , qgroup_limit , qgroup - > rsv_excl ) ;
2012-06-28 20:03:02 +04:00
btrfs_mark_buffer_dirty ( l ) ;
out :
btrfs_free_path ( path ) ;
return ret ;
}
static int update_qgroup_info_item ( struct btrfs_trans_handle * trans ,
struct btrfs_qgroup * qgroup )
{
2018-07-18 09:45:28 +03:00
struct btrfs_fs_info * fs_info = trans - > fs_info ;
struct btrfs_root * quota_root = fs_info - > quota_root ;
2012-06-28 20:03:02 +04:00
struct btrfs_path * path ;
struct btrfs_key key ;
struct extent_buffer * l ;
struct btrfs_qgroup_info_item * qgroup_info ;
int ret ;
int slot ;
2018-07-18 09:45:28 +03:00
if ( btrfs_is_testing ( fs_info ) )
2014-05-08 01:06:09 +04:00
return 0 ;
2014-09-30 01:53:21 +04:00
2012-06-28 20:03:02 +04:00
key . objectid = 0 ;
key . type = BTRFS_QGROUP_INFO_KEY ;
key . offset = qgroup - > qgroupid ;
path = btrfs_alloc_path ( ) ;
2013-02-27 15:20:56 +04:00
if ( ! path )
return - ENOMEM ;
2018-07-18 09:45:28 +03:00
ret = btrfs_search_slot ( trans , quota_root , & key , path , 0 , 1 ) ;
2012-06-28 20:03:02 +04:00
if ( ret > 0 )
ret = - ENOENT ;
if ( ret )
goto out ;
l = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
2013-11-05 01:34:29 +04:00
qgroup_info = btrfs_item_ptr ( l , slot , struct btrfs_qgroup_info_item ) ;
2012-06-28 20:03:02 +04:00
btrfs_set_qgroup_info_generation ( l , qgroup_info , trans - > transid ) ;
btrfs_set_qgroup_info_rfer ( l , qgroup_info , qgroup - > rfer ) ;
btrfs_set_qgroup_info_rfer_cmpr ( l , qgroup_info , qgroup - > rfer_cmpr ) ;
btrfs_set_qgroup_info_excl ( l , qgroup_info , qgroup - > excl ) ;
btrfs_set_qgroup_info_excl_cmpr ( l , qgroup_info , qgroup - > excl_cmpr ) ;
btrfs_mark_buffer_dirty ( l ) ;
out :
btrfs_free_path ( path ) ;
return ret ;
}
2018-07-18 09:45:29 +03:00
static int update_qgroup_status_item ( struct btrfs_trans_handle * trans )
2012-06-28 20:03:02 +04:00
{
2018-07-18 09:45:29 +03:00
struct btrfs_fs_info * fs_info = trans - > fs_info ;
struct btrfs_root * quota_root = fs_info - > quota_root ;
2012-06-28 20:03:02 +04:00
struct btrfs_path * path ;
struct btrfs_key key ;
struct extent_buffer * l ;
struct btrfs_qgroup_status_item * ptr ;
int ret ;
int slot ;
key . objectid = 0 ;
key . type = BTRFS_QGROUP_STATUS_KEY ;
key . offset = 0 ;
path = btrfs_alloc_path ( ) ;
2013-02-27 15:20:56 +04:00
if ( ! path )
return - ENOMEM ;
2018-07-18 09:45:29 +03:00
ret = btrfs_search_slot ( trans , quota_root , & key , path , 0 , 1 ) ;
2012-06-28 20:03:02 +04:00
if ( ret > 0 )
ret = - ENOENT ;
if ( ret )
goto out ;
l = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
ptr = btrfs_item_ptr ( l , slot , struct btrfs_qgroup_status_item ) ;
btrfs_set_qgroup_status_flags ( l , ptr , fs_info - > qgroup_flags ) ;
btrfs_set_qgroup_status_generation ( l , ptr , trans - > transid ) ;
2013-04-25 20:04:51 +04:00
btrfs_set_qgroup_status_rescan ( l , ptr ,
fs_info - > qgroup_rescan_progress . objectid ) ;
2012-06-28 20:03:02 +04:00
btrfs_mark_buffer_dirty ( l ) ;
out :
btrfs_free_path ( path ) ;
return ret ;
}
/*
* called with qgroup_lock held
*/
static int btrfs_clean_quota_tree ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root )
{
struct btrfs_path * path ;
struct btrfs_key key ;
2013-02-27 15:16:57 +04:00
struct extent_buffer * leaf = NULL ;
2012-06-28 20:03:02 +04:00
int ret ;
2013-02-27 15:16:57 +04:00
int nr = 0 ;
2012-06-28 20:03:02 +04:00
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
2013-02-27 15:16:57 +04:00
path - > leave_spinning = 1 ;
key . objectid = 0 ;
key . offset = 0 ;
key . type = 0 ;
2012-06-28 20:03:02 +04:00
2013-02-27 15:16:57 +04:00
while ( 1 ) {
2012-06-28 20:03:02 +04:00
ret = btrfs_search_slot ( trans , root , & key , path , - 1 , 1 ) ;
2013-02-27 15:16:57 +04:00
if ( ret < 0 )
goto out ;
leaf = path - > nodes [ 0 ] ;
nr = btrfs_header_nritems ( leaf ) ;
if ( ! nr )
2012-06-28 20:03:02 +04:00
break ;
2013-02-27 15:16:57 +04:00
/*
* delete the leaf one by one
* since the whole tree is going
* to be deleted .
*/
path - > slots [ 0 ] = 0 ;
ret = btrfs_del_items ( trans , root , path , 0 , nr ) ;
2012-06-28 20:03:02 +04:00
if ( ret )
goto out ;
2013-02-27 15:16:57 +04:00
2012-06-28 20:03:02 +04:00
btrfs_release_path ( path ) ;
}
ret = 0 ;
out :
btrfs_free_path ( path ) ;
return ret ;
}
2018-07-05 14:50:48 +03:00
int btrfs_quota_enable ( struct btrfs_fs_info * fs_info )
2012-06-28 20:03:02 +04:00
{
struct btrfs_root * quota_root ;
2013-04-07 14:24:57 +04:00
struct btrfs_root * tree_root = fs_info - > tree_root ;
2012-06-28 20:03:02 +04:00
struct btrfs_path * path = NULL ;
struct btrfs_qgroup_status_item * ptr ;
struct extent_buffer * leaf ;
struct btrfs_key key ;
2013-04-07 14:24:57 +04:00
struct btrfs_key found_key ;
struct btrfs_qgroup * qgroup = NULL ;
2018-07-05 14:50:48 +03:00
struct btrfs_trans_handle * trans = NULL ;
2012-06-28 20:03:02 +04:00
int ret = 0 ;
2013-04-07 14:24:57 +04:00
int slot ;
2012-06-28 20:03:02 +04:00
2013-04-07 14:50:16 +04:00
mutex_lock ( & fs_info - > qgroup_ioctl_lock ) ;
2018-01-31 11:52:04 +03:00
if ( fs_info - > quota_root )
2012-06-28 20:03:02 +04:00
goto out ;
2018-12-19 21:47:37 +03:00
fs_info - > qgroup_ulist = ulist_alloc ( GFP_KERNEL ) ;
if ( ! fs_info - > qgroup_ulist ) {
ret = - ENOMEM ;
goto out ;
}
2018-07-05 14:50:48 +03:00
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
*
* Yet we also need 2 * n items for a QGROUP_INFO / QGROUP_LIMIT items
* per subvolume . However those are not currently reserved since it
* would be a lot of overkill .
*/
trans = btrfs_start_transaction ( tree_root , 2 ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
trans = NULL ;
goto out ;
}
2012-06-28 20:03:02 +04:00
/*
* initially create the quota tree
*/
2019-03-20 15:20:49 +03:00
quota_root = btrfs_create_tree ( trans , BTRFS_QUOTA_TREE_OBJECTID ) ;
2012-06-28 20:03:02 +04:00
if ( IS_ERR ( quota_root ) ) {
ret = PTR_ERR ( quota_root ) ;
2018-07-05 14:50:48 +03:00
btrfs_abort_transaction ( trans , ret ) ;
2012-06-28 20:03:02 +04:00
goto out ;
}
path = btrfs_alloc_path ( ) ;
2012-10-16 09:44:21 +04:00
if ( ! path ) {
ret = - ENOMEM ;
2018-07-05 14:50:48 +03:00
btrfs_abort_transaction ( trans , ret ) ;
2012-10-16 09:44:21 +04:00
goto out_free_root ;
}
2012-06-28 20:03:02 +04:00
key . objectid = 0 ;
key . type = BTRFS_QGROUP_STATUS_KEY ;
key . offset = 0 ;
ret = btrfs_insert_empty_item ( trans , quota_root , path , & key ,
sizeof ( * ptr ) ) ;
2018-07-05 14:50:48 +03:00
if ( ret ) {
btrfs_abort_transaction ( trans , ret ) ;
2012-10-16 09:44:21 +04:00
goto out_free_path ;
2018-07-05 14:50:48 +03:00
}
2012-06-28 20:03:02 +04:00
leaf = path - > nodes [ 0 ] ;
ptr = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_qgroup_status_item ) ;
btrfs_set_qgroup_status_generation ( leaf , ptr , trans - > transid ) ;
btrfs_set_qgroup_status_version ( leaf , ptr , BTRFS_QGROUP_STATUS_VERSION ) ;
fs_info - > qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
btrfs_set_qgroup_status_flags ( leaf , ptr , fs_info - > qgroup_flags ) ;
2013-04-25 20:04:51 +04:00
btrfs_set_qgroup_status_rescan ( leaf , ptr , 0 ) ;
2012-06-28 20:03:02 +04:00
btrfs_mark_buffer_dirty ( leaf ) ;
2013-04-07 14:24:57 +04:00
key . objectid = 0 ;
key . type = BTRFS_ROOT_REF_KEY ;
key . offset = 0 ;
btrfs_release_path ( path ) ;
ret = btrfs_search_slot_for_read ( tree_root , & key , path , 1 , 0 ) ;
if ( ret > 0 )
goto out_add_root ;
2018-07-05 14:50:48 +03:00
if ( ret < 0 ) {
btrfs_abort_transaction ( trans , ret ) ;
2013-04-07 14:24:57 +04:00
goto out_free_path ;
2018-07-05 14:50:48 +03:00
}
2013-04-07 14:24:57 +04:00
while ( 1 ) {
slot = path - > slots [ 0 ] ;
leaf = path - > nodes [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & found_key , slot ) ;
if ( found_key . type = = BTRFS_ROOT_REF_KEY ) {
ret = add_qgroup_item ( trans , quota_root ,
found_key . offset ) ;
2018-07-05 14:50:48 +03:00
if ( ret ) {
btrfs_abort_transaction ( trans , ret ) ;
2013-04-07 14:24:57 +04:00
goto out_free_path ;
2018-07-05 14:50:48 +03:00
}
2013-04-07 14:24:57 +04:00
qgroup = add_qgroup_rb ( fs_info , found_key . offset ) ;
if ( IS_ERR ( qgroup ) ) {
ret = PTR_ERR ( qgroup ) ;
2018-07-05 14:50:48 +03:00
btrfs_abort_transaction ( trans , ret ) ;
2013-04-07 14:24:57 +04:00
goto out_free_path ;
}
}
ret = btrfs_next_item ( tree_root , path ) ;
2018-07-05 14:50:48 +03:00
if ( ret < 0 ) {
btrfs_abort_transaction ( trans , ret ) ;
2013-04-07 14:24:57 +04:00
goto out_free_path ;
2018-07-05 14:50:48 +03:00
}
2013-04-07 14:24:57 +04:00
if ( ret )
break ;
}
out_add_root :
btrfs_release_path ( path ) ;
ret = add_qgroup_item ( trans , quota_root , BTRFS_FS_TREE_OBJECTID ) ;
2018-07-05 14:50:48 +03:00
if ( ret ) {
btrfs_abort_transaction ( trans , ret ) ;
2013-04-07 14:24:57 +04:00
goto out_free_path ;
2018-07-05 14:50:48 +03:00
}
2013-04-07 14:24:57 +04:00
qgroup = add_qgroup_rb ( fs_info , BTRFS_FS_TREE_OBJECTID ) ;
if ( IS_ERR ( qgroup ) ) {
ret = PTR_ERR ( qgroup ) ;
2018-07-05 14:50:48 +03:00
btrfs_abort_transaction ( trans , ret ) ;
2013-04-07 14:24:57 +04:00
goto out_free_path ;
}
2018-07-05 14:50:48 +03:00
ret = btrfs_commit_transaction ( trans ) ;
2018-08-20 11:25:33 +03:00
trans = NULL ;
if ( ret )
2018-07-05 14:50:48 +03:00
goto out_free_path ;
2018-11-19 17:15:36 +03:00
/*
* Set quota enabled flag after committing the transaction , to avoid
* deadlocks on fs_info - > qgroup_ioctl_lock with concurrent snapshot
* creation .
*/
spin_lock ( & fs_info - > qgroup_lock ) ;
fs_info - > quota_root = quota_root ;
set_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) ;
spin_unlock ( & fs_info - > qgroup_lock ) ;
2018-01-31 11:52:04 +03:00
ret = qgroup_rescan_init ( fs_info , 0 , 1 ) ;
if ( ! ret ) {
qgroup_rescan_zero_tracking ( fs_info ) ;
btrfs_queue_work ( fs_info - > qgroup_rescan_workers ,
& fs_info - > qgroup_rescan_work ) ;
}
2012-10-16 09:44:21 +04:00
out_free_path :
2012-06-28 20:03:02 +04:00
btrfs_free_path ( path ) ;
2012-10-16 09:44:21 +04:00
out_free_root :
if ( ret ) {
free_extent_buffer ( quota_root - > node ) ;
free_extent_buffer ( quota_root - > commit_root ) ;
kfree ( quota_root ) ;
}
out :
2013-05-28 19:47:23 +04:00
if ( ret ) {
2013-05-06 15:03:27 +04:00
ulist_free ( fs_info - > qgroup_ulist ) ;
2013-05-28 19:47:23 +04:00
fs_info - > qgroup_ulist = NULL ;
2018-07-05 14:50:48 +03:00
if ( trans )
btrfs_end_transaction ( trans ) ;
2013-05-28 19:47:23 +04:00
}
2013-04-07 14:50:16 +04:00
mutex_unlock ( & fs_info - > qgroup_ioctl_lock ) ;
2012-06-28 20:03:02 +04:00
return ret ;
}
2018-07-05 14:50:48 +03:00
int btrfs_quota_disable ( struct btrfs_fs_info * fs_info )
2012-06-28 20:03:02 +04:00
{
struct btrfs_root * quota_root ;
2018-07-05 14:50:48 +03:00
struct btrfs_trans_handle * trans = NULL ;
2012-06-28 20:03:02 +04:00
int ret = 0 ;
2013-04-07 14:50:16 +04:00
mutex_lock ( & fs_info - > qgroup_ioctl_lock ) ;
2013-04-07 14:50:17 +04:00
if ( ! fs_info - > quota_root )
2013-04-07 14:50:16 +04:00
goto out ;
2018-07-05 14:50:48 +03:00
/*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
* btrfs_clean_quota_tree but this is not done .
*/
trans = btrfs_start_transaction ( fs_info - > tree_root , 1 ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
goto out ;
}
2016-09-02 22:40:02 +03:00
clear_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) ;
2016-08-09 05:08:06 +03:00
btrfs_qgroup_wait_for_completion ( fs_info , false ) ;
2015-11-06 21:36:42 +03:00
spin_lock ( & fs_info - > qgroup_lock ) ;
2012-06-28 20:03:02 +04:00
quota_root = fs_info - > quota_root ;
fs_info - > quota_root = NULL ;
2015-02-27 11:24:26 +03:00
fs_info - > qgroup_flags & = ~ BTRFS_QGROUP_STATUS_FLAG_ON ;
2012-06-28 20:03:02 +04:00
spin_unlock ( & fs_info - > qgroup_lock ) ;
2013-08-14 05:13:37 +04:00
btrfs_free_qgroup_config ( fs_info ) ;
2012-06-28 20:03:02 +04:00
ret = btrfs_clean_quota_tree ( trans , quota_root ) ;
2018-07-05 14:50:48 +03:00
if ( ret ) {
btrfs_abort_transaction ( trans , ret ) ;
goto end_trans ;
}
2012-06-28 20:03:02 +04:00
2018-08-01 06:32:27 +03:00
ret = btrfs_del_root ( trans , & quota_root - > root_key ) ;
2018-07-05 14:50:48 +03:00
if ( ret ) {
btrfs_abort_transaction ( trans , ret ) ;
goto end_trans ;
}
2012-06-28 20:03:02 +04:00
list_del ( & quota_root - > dirty_list ) ;
btrfs_tree_lock ( quota_root - > node ) ;
2019-03-20 16:30:02 +03:00
btrfs_clean_tree_block ( quota_root - > node ) ;
2012-06-28 20:03:02 +04:00
btrfs_tree_unlock ( quota_root - > node ) ;
btrfs_free_tree_block ( trans , quota_root , quota_root - > node , 0 , 1 ) ;
free_extent_buffer ( quota_root - > node ) ;
free_extent_buffer ( quota_root - > commit_root ) ;
kfree ( quota_root ) ;
2018-07-05 14:50:48 +03:00
end_trans :
ret = btrfs_end_transaction ( trans ) ;
2012-06-28 20:03:02 +04:00
out :
2013-04-07 14:50:16 +04:00
mutex_unlock ( & fs_info - > qgroup_ioctl_lock ) ;
2012-06-28 20:03:02 +04:00
return ret ;
}
2013-04-25 20:04:51 +04:00
static void qgroup_dirty ( struct btrfs_fs_info * fs_info ,
struct btrfs_qgroup * qgroup )
2012-06-28 20:03:02 +04:00
{
2013-04-25 20:04:51 +04:00
if ( list_empty ( & qgroup - > dirty ) )
list_add ( & qgroup - > dirty , & fs_info - > dirty_qgroups ) ;
2012-06-28 20:03:02 +04:00
}
2015-02-27 11:24:27 +03:00
/*
 * The easy accounting, we're updating qgroup relationship whose child qgroup
 * only has exclusive extents.
 *
 * In this case, all exclusive extents will also be exclusive for parent, so
 * excl/rfer just get added/removed.
 *
 * So is qgroup reservation space, which should also be added/removed to
 * parent.
 * Or when child tries to release reservation space, parent will underflow its
 * reservation (for relationship adding case).
 *
 * @tmp is used as the work list of ancestor qgroups; @ref_root is the id of
 * the qgroup to start from, @src the child whose excl bytes and reservations
 * are applied, and @sign selects adding (> 0) or removing (otherwise).
 *
 * Returns 0 on success (also when @ref_root has no qgroup) or the negative
 * error from ulist_add().
 *
 * Caller should hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
				    struct ulist *tmp, u64 ref_root,
				    struct btrfs_qgroup *src, int sign)
{
	const u64 num_bytes = src->excl;
	/* Unsigned wrap-around makes "+= delta" a subtraction for sign < 0. */
	const u64 delta = sign * num_bytes;
	struct btrfs_qgroup_list *link;
	struct ulist_iterator uiter;
	struct ulist_node *unode;
	struct btrfs_qgroup *cur;
	int ret;

	cur = find_qgroup_rb(fs_info, ref_root);
	if (!cur)
		return 0;

	/* First adjust the qgroup identified by ref_root itself. */
	cur->rfer += delta;
	cur->rfer_cmpr += delta;

	WARN_ON(sign < 0 && cur->excl < num_bytes);
	cur->excl += delta;
	cur->excl_cmpr += delta;

	if (sign > 0)
		qgroup_rsv_add_by_qgroup(fs_info, cur, src);
	else
		qgroup_rsv_release_by_qgroup(fs_info, cur, src);

	qgroup_dirty(fs_info, cur);

	/* Get all of the parent groups that contain this qgroup */
	list_for_each_entry(link, &cur->groups, next_group) {
		ret = ulist_add(tmp, link->group->qgroupid,
				qgroup_to_aux(link->group), GFP_ATOMIC);
		if (ret < 0)
			return ret;
	}

	/*
	 * Iterate all of the parents and adjust their reference counts.
	 * Entries appended to the ulist inside the loop are visited too.
	 */
	ULIST_ITER_INIT(&uiter);
	for (unode = ulist_next(tmp, &uiter); unode;
	     unode = ulist_next(tmp, &uiter)) {
		cur = unode_aux_to_qgroup(unode);

		cur->rfer += delta;
		cur->rfer_cmpr += delta;

		WARN_ON(sign < 0 && cur->excl < num_bytes);
		cur->excl += delta;

		if (sign > 0)
			qgroup_rsv_add_by_qgroup(fs_info, cur, src);
		else
			qgroup_rsv_release_by_qgroup(fs_info, cur, src);

		cur->excl_cmpr += delta;
		qgroup_dirty(fs_info, cur);

		/* Add any parents of the parents */
		list_for_each_entry(link, &cur->groups, next_group) {
			ret = ulist_add(tmp, link->group->qgroupid,
					qgroup_to_aux(link->group),
					GFP_ATOMIC);
			if (ret < 0)
				return ret;
		}
	}

	return 0;
}
/*
 * Quick path for updating qgroup with only excl refs.
 *
 * When the @src qgroup has excl == rfer, updating @dst and all of its
 * parents is enough. Otherwise (or when @src has no qgroup) a full rescan
 * is needed.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for quick update.
 * Return >0 when a full rescan is needed.
 * Return <0 for other error.
 * Any non-zero return also sets BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT in
 * fs_info->qgroup_flags.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   struct ulist *tmp, u64 src, u64 dst,
				   int sign)
{
	struct btrfs_qgroup *child = find_qgroup_rb(fs_info, src);
	int ret = 1;

	if (child && child->excl == child->rfer) {
		int err = __qgroup_excl_accounting(fs_info, tmp, dst, child,
						   sign);

		ret = (err < 0) ? err : 0;
	}

	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}
2018-07-18 09:45:30 +03:00
/*
 * Make qgroup @dst a parent of qgroup @src.
 *
 * Inserts both relation items (src->dst and dst->src) into the quota tree,
 * links the two qgroups in memory and runs the quick accounting update.
 * fs_info->qgroup_ioctl_lock is taken for the whole operation and
 * fs_info->qgroup_lock around the in-memory update.
 *
 * Returns:
 *   0        relation added and accounting updated via the quick path
 *   > 0      relation added, as returned by quick_update_accounting()
 *   -EINVAL  level of @src is not below level of @dst, quotas are not
 *            enabled, or one of the qgroups does not exist
 *   -ENOMEM  the temporary ulist could not be allocated
 *   -EEXIST  the relation already exists
 *   other negative errors from the helpers called
 */
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup_list *entry;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;
	struct ulist *tmp;
	int ret = 0;

	/* Check the level of src and dst first */
	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
		return -EINVAL;

	tmp = ulist_alloc(GFP_KERNEL);
	if (!tmp)
		return -ENOMEM;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -EINVAL;
		goto unlock;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!(member && parent)) {
		ret = -EINVAL;
		goto unlock;
	}

	/* check if such qgroup relation exist firstly */
	list_for_each_entry(entry, &member->groups, next_group) {
		if (entry->group == parent) {
			ret = -EEXIST;
			goto unlock;
		}
	}

	ret = add_qgroup_relation_item(trans, src, dst);
	if (ret)
		goto unlock;

	ret = add_qgroup_relation_item(trans, dst, src);
	if (ret) {
		/* Undo the first direction so no half relation is left. */
		del_qgroup_relation_item(trans, src, dst);
		goto unlock;
	}

	spin_lock(&fs_info->qgroup_lock);
	ret = add_relation_rb(fs_info, src, dst);
	if (ret >= 0)
		ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
	spin_unlock(&fs_info->qgroup_lock);

unlock:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	ulist_free(tmp);
	return ret;
}
2018-07-18 09:45:31 +03:00
/*
 * Remove the relation that makes qgroup @src a member of qgroup @dst.
 *
 * Both relation items (src -> dst and dst -> src) are deleted through
 * del_qgroup_relation_item(), the in-memory relation is dropped with
 * del_relation_rb() and quick_update_accounting() is invoked with a sign
 * of -1, the latter two under fs_info->qgroup_lock.
 *
 * This helper does not take fs_info->qgroup_ioctl_lock itself; both
 * btrfs_del_qgroup_relation() and btrfs_remove_qgroup() call it with that
 * mutex already held.
 *
 * Return: 0 on success, -ENOMEM if the temporary ulist cannot be allocated,
 * -EINVAL if there is no quota root or one of the qgroups is unknown,
 * -ENOENT if @src is not a member of @dst.  Otherwise the first non-zero
 * result among the two item deletions and quick_update_accounting() is
 * returned, so a failed item deletion is no longer hidden by a successful
 * accounting update.
 */
static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	bool found = false;
	int ret = 0;
	int err;

	tmp = ulist_alloc(GFP_KERNEL);
	if (!tmp)
		return -ENOMEM;

	if (!fs_info->quota_root) {
		ret = -EINVAL;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!member || !parent) {
		ret = -EINVAL;
		goto out;
	}

	/* Make sure the relation exists before touching anything. */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			found = true;
			break;
		}
	}
	if (!found) {
		ret = -ENOENT;
		goto out;
	}

	/* Always try to delete both items, remembering the first failure. */
	ret = del_qgroup_relation_item(trans, src, dst);
	err = del_qgroup_relation_item(trans, dst, src);
	if (err && !ret)
		ret = err;

	spin_lock(&fs_info->qgroup_lock);
	del_relation_rb(fs_info, src, dst);
	err = quick_update_accounting(fs_info, tmp, src, dst, -1);
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * Do not let the accounting result overwrite an earlier item
	 * deletion error; only report it when nothing failed before.
	 */
	if (err && !ret)
		ret = err;
out:
	ulist_free(tmp);
	return ret;
}
2018-07-18 09:45:32 +03:00
/*
 * Locked entry point for removing the relation between qgroups @src and
 * @dst: takes fs_info->qgroup_ioctl_lock around __del_qgroup_relation()
 * and hands its return value back unchanged.
 */
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *info = trans->fs_info;
	int result;

	mutex_lock(&info->qgroup_ioctl_lock);
	result = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(&info->qgroup_ioctl_lock);

	return result;
}
2018-07-18 09:45:33 +03:00
/*
 * Create the qgroup identified by @qgroupid.
 *
 * Runs entirely under fs_info->qgroup_ioctl_lock.  The qgroup item is
 * added via add_qgroup_item() first; only if that succeeds is the
 * in-memory qgroup inserted with add_qgroup_rb() under
 * fs_info->qgroup_lock.
 *
 * Return: 0 on success, -EINVAL if there is no quota root, -EEXIST if a
 * qgroup with this id is already known, the error from add_qgroup_item(),
 * or the error encoded in the pointer returned by add_qgroup_rb().
 */
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *new_qgroup;
	int err;

	mutex_lock(&fs_info->qgroup_ioctl_lock);

	quota_root = fs_info->quota_root;
	if (!quota_root) {
		err = -EINVAL;
		goto unlock;
	}

	if (find_qgroup_rb(fs_info, qgroupid)) {
		err = -EEXIST;
		goto unlock;
	}

	err = add_qgroup_item(trans, quota_root, qgroupid);
	if (err)
		goto unlock;

	spin_lock(&fs_info->qgroup_lock);
	new_qgroup = add_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	if (IS_ERR(new_qgroup))
		err = PTR_ERR(new_qgroup);

unlock:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return err;
}
2018-07-18 09:45:34 +03:00
/*
 * Remove the qgroup identified by @qgroupid.
 *
 * Runs under fs_info->qgroup_ioctl_lock.  A qgroup that still has members
 * is refused with -EBUSY.  Otherwise its item is deleted, every relation
 * to a parent qgroup is torn down with __del_qgroup_relation(), and the
 * in-memory qgroup is finally removed with del_qgroup_rb() under
 * fs_info->qgroup_lock.
 *
 * Return: -EINVAL if there is no quota root, -ENOENT if the qgroup is not
 * known, -EBUSY if it has members, an error from del_qgroup_item() or
 * __del_qgroup_relation(); otherwise the last value stored in the result
 * variable.
 *
 * NOTE(review): a -ENOENT from del_qgroup_item() does not abort the
 * removal, but if the qgroup has no parent relations that -ENOENT is still
 * what gets returned after the in-memory qgroup was removed - confirm
 * whether callers rely on this.
 */
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *victim;
	struct btrfs_qgroup_list *rel;
	int err = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);

	quota_root = fs_info->quota_root;
	if (!quota_root) {
		err = -EINVAL;
		goto unlock;
	}

	victim = find_qgroup_rb(fs_info, qgroupid);
	if (!victim) {
		err = -ENOENT;
		goto unlock;
	}

	/* A qgroup that still has children must not be removed. */
	if (!list_empty(&victim->members)) {
		err = -EBUSY;
		goto unlock;
	}

	err = del_qgroup_item(trans, qgroupid);
	if (err && err != -ENOENT)
		goto unlock;

	/* Detach from every parent; each call unlinks the first entry. */
	for (;;) {
		if (list_empty(&victim->groups))
			break;

		rel = list_first_entry(&victim->groups,
				       struct btrfs_qgroup_list, next_group);
		err = __del_qgroup_relation(trans, qgroupid,
					    rel->group->qgroupid);
		if (err)
			goto unlock;
	}

	spin_lock(&fs_info->qgroup_lock);
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

unlock:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return err;
}
2018-07-18 09:45:35 +03:00
/*
 * Apply the limits described by @limit to the qgroup @qgroupid.
 *
 * For every limit kind selected in @limit->flags (max_rfer, max_excl,
 * rsv_rfer, rsv_excl) the matching value is copied into the qgroup.  A
 * value of (u64)-1 is special: it clears that limit, i.e. the flag is
 * dropped from both the qgroup and @limit->flags and the stored value is
 * reset to 0.  Note that @limit->flags is therefore modified in place.
 *
 * The in-memory update happens under fs_info->qgroup_lock; afterwards the
 * limit item is written with update_qgroup_limit_item().  If that fails
 * the quota status is flagged inconsistent and a message is logged.
 *
 * Return: 0 on success, -EINVAL if there is no quota root, -ENOENT if the
 * qgroup is not known, or the error from update_qgroup_limit_item().
 */
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qg;
	int err = 0;
	/* Limit value that requests "remove this limit" instead of a size. */
	const u64 CLEAR_VALUE = -1;

	mutex_lock(&fs_info->qgroup_ioctl_lock);

	quota_root = fs_info->quota_root;
	if (!quota_root) {
		err = -EINVAL;
		goto unlock;
	}

	qg = find_qgroup_rb(fs_info, qgroupid);
	if (!qg) {
		err = -ENOENT;
		goto unlock;
	}

	spin_lock(&fs_info->qgroup_lock);

	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
		if (limit->max_rfer != CLEAR_VALUE) {
			qg->max_rfer = limit->max_rfer;
		} else {
			qg->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			qg->max_rfer = 0;
		}
	}

	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
		if (limit->max_excl != CLEAR_VALUE) {
			qg->max_excl = limit->max_excl;
		} else {
			qg->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			qg->max_excl = 0;
		}
	}

	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
		if (limit->rsv_rfer != CLEAR_VALUE) {
			qg->rsv_rfer = limit->rsv_rfer;
		} else {
			qg->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			qg->rsv_rfer = 0;
		}
	}

	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
		if (limit->rsv_excl != CLEAR_VALUE) {
			qg->rsv_excl = limit->rsv_excl;
		} else {
			qg->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			qg->rsv_excl = 0;
		}
	}

	/* Whatever flags survived the clearing above are now in effect. */
	qg->lim_flags |= limit->flags;

	spin_unlock(&fs_info->qgroup_lock);

	err = update_qgroup_limit_item(trans, qg);
	if (err) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		btrfs_info(fs_info, " unable to update quota limit for %llu ",
			   qgroupid);
	}

unlock:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return err;
}
2014-07-17 23:39:01 +04:00
2016-10-18 04:31:27 +03:00
int btrfs_qgroup_trace_extent_nolock ( struct btrfs_fs_info * fs_info ,
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 05:36:50 +03:00
struct btrfs_delayed_ref_root * delayed_refs ,
struct btrfs_qgroup_extent_record * record )
2015-04-16 09:34:17 +03:00
{
struct rb_node * * p = & delayed_refs - > dirty_extent_root . rb_node ;
struct rb_node * parent_node = NULL ;
struct btrfs_qgroup_extent_record * entry ;
u64 bytenr = record - > bytenr ;
2018-03-16 04:21:22 +03:00
lockdep_assert_held ( & delayed_refs - > lock ) ;
2016-10-18 04:31:27 +03:00
trace_btrfs_qgroup_trace_extent ( fs_info , record ) ;
2015-11-06 01:38:00 +03:00
2015-04-16 09:34:17 +03:00
while ( * p ) {
parent_node = * p ;
entry = rb_entry ( parent_node , struct btrfs_qgroup_extent_record ,
node ) ;
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 10:15:12 +03:00
if ( bytenr < entry - > bytenr ) {
2015-04-16 09:34:17 +03:00
p = & ( * p ) - > rb_left ;
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 10:15:12 +03:00
} else if ( bytenr > entry - > bytenr ) {
2015-04-16 09:34:17 +03:00
p = & ( * p ) - > rb_right ;
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 10:15:12 +03:00
} else {
if ( record - > data_rsv & & ! entry - > data_rsv ) {
entry - > data_rsv = record - > data_rsv ;
entry - > data_rsv_refroot =
record - > data_rsv_refroot ;
}
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 05:36:50 +03:00
return 1 ;
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 10:15:12 +03:00
}
2015-04-16 09:34:17 +03:00
}
rb_link_node ( & record - > node , parent_node , p ) ;
rb_insert_color ( & record - > node , & delayed_refs - > dirty_extent_root ) ;
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 05:36:50 +03:00
return 0 ;
}
2017-02-15 05:43:03 +03:00
int btrfs_qgroup_trace_extent_post ( struct btrfs_fs_info * fs_info ,
struct btrfs_qgroup_extent_record * qrecord )
{
struct ulist * old_root ;
u64 bytenr = qrecord - > bytenr ;
int ret ;
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:45 +03:00
ret = btrfs_find_all_roots ( NULL , fs_info , bytenr , 0 , & old_root , false ) ;
2018-01-29 16:53:01 +03:00
if ( ret < 0 ) {
fs_info - > qgroup_flags | = BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
btrfs_warn ( fs_info ,
" error accounting new delayed refs extent (err code: %d), quota inconsistent " ,
ret ) ;
return 0 ;
}
2017-02-15 05:43:03 +03:00
/*
* Here we don ' t need to get the lock of
* trans - > transaction - > delayed_refs , since inserted qrecord won ' t
* be deleted , only qrecord - > node may be modified ( new qrecord insert )
*
* So modifying qrecord - > old_roots is safe here
*/
qrecord - > old_roots = old_root ;
return 0 ;
}
2018-07-18 11:28:03 +03:00
int btrfs_qgroup_trace_extent ( struct btrfs_trans_handle * trans , u64 bytenr ,
u64 num_bytes , gfp_t gfp_flag )
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comments for both functions, to inform developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 05:36:50 +03:00
{
2018-07-18 11:28:03 +03:00
struct btrfs_fs_info * fs_info = trans - > fs_info ;
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comments for both functions, to inform developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 05:36:50 +03:00
struct btrfs_qgroup_extent_record * record ;
struct btrfs_delayed_ref_root * delayed_refs ;
int ret ;
2016-09-02 22:40:02 +03:00
if ( ! test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags )
| | bytenr = = 0 | | num_bytes = = 0 )
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comments for both functions, to inform developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 05:36:50 +03:00
return 0 ;
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
This results in the final write succeeding when it should fail with EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than the limit of 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 10:15:12 +03:00
record = kzalloc ( sizeof ( * record ) , gfp_flag ) ;
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comments for both functions, to inform developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 05:36:50 +03:00
if ( ! record )
return - ENOMEM ;
delayed_refs = & trans - > transaction - > delayed_refs ;
record - > bytenr = bytenr ;
record - > num_bytes = num_bytes ;
record - > old_roots = NULL ;
spin_lock ( & delayed_refs - > lock ) ;
2016-06-23 01:54:24 +03:00
ret = btrfs_qgroup_trace_extent_nolock ( fs_info , delayed_refs , record ) ;
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comments for both functions, to inform developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 05:36:50 +03:00
spin_unlock ( & delayed_refs - > lock ) ;
2017-02-15 05:43:03 +03:00
if ( ret > 0 ) {
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comments for both functions, to inform developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 05:36:50 +03:00
kfree ( record ) ;
2017-02-15 05:43:03 +03:00
return 0 ;
}
return btrfs_qgroup_trace_extent_post ( fs_info , record ) ;
2015-04-16 09:34:17 +03:00
}
2016-10-18 04:31:28 +03:00
int btrfs_qgroup_trace_leaf_items ( struct btrfs_trans_handle * trans ,
struct extent_buffer * eb )
{
2018-07-18 09:45:37 +03:00
struct btrfs_fs_info * fs_info = trans - > fs_info ;
2016-10-18 04:31:28 +03:00
int nr = btrfs_header_nritems ( eb ) ;
int i , extent_type , ret ;
struct btrfs_key key ;
struct btrfs_file_extent_item * fi ;
u64 bytenr , num_bytes ;
/* We can be called directly from walk_up_proc() */
2016-06-23 01:54:23 +03:00
if ( ! test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) )
2016-10-18 04:31:28 +03:00
return 0 ;
for ( i = 0 ; i < nr ; i + + ) {
btrfs_item_key_to_cpu ( eb , & key , i ) ;
if ( key . type ! = BTRFS_EXTENT_DATA_KEY )
continue ;
fi = btrfs_item_ptr ( eb , i , struct btrfs_file_extent_item ) ;
/* filter out non qgroup-accountable extents */
extent_type = btrfs_file_extent_type ( eb , fi ) ;
if ( extent_type = = BTRFS_FILE_EXTENT_INLINE )
continue ;
bytenr = btrfs_file_extent_disk_bytenr ( eb , fi ) ;
if ( ! bytenr )
continue ;
num_bytes = btrfs_file_extent_disk_num_bytes ( eb , fi ) ;
2018-07-18 11:28:03 +03:00
ret = btrfs_qgroup_trace_extent ( trans , bytenr , num_bytes ,
GFP_NOFS ) ;
2016-10-18 04:31:28 +03:00
if ( ret )
return ret ;
}
2017-06-20 15:15:26 +03:00
cond_resched ( ) ;
2016-10-18 04:31:28 +03:00
return 0 ;
}
/*
* Walk up the tree from the bottom , freeing leaves and any interior
* nodes which have had all slots visited . If a node ( leaf or
* interior ) is freed , the node above it will have it ' s slot
* incremented . The root node will never be freed .
*
* At the end of this function , we should have a path which has all
* slots incremented to the next position for a search . If we need to
* read a new node it will be NULL and the node above it will have the
* correct slot selected for a later read .
*
* If we increment the root nodes slot counter past the number of
* elements , 1 is returned to signal completion of the search .
*/
2017-02-10 22:30:23 +03:00
static int adjust_slots_upwards ( struct btrfs_path * path , int root_level )
2016-10-18 04:31:28 +03:00
{
int level = 0 ;
int nr , slot ;
struct extent_buffer * eb ;
if ( root_level = = 0 )
return 1 ;
while ( level < = root_level ) {
eb = path - > nodes [ level ] ;
nr = btrfs_header_nritems ( eb ) ;
path - > slots [ level ] + + ;
slot = path - > slots [ level ] ;
if ( slot > = nr | | level = = 0 ) {
/*
* Don ' t free the root - we will detect this
* condition after our loop and return a
* positive value for caller to stop walking the tree .
*/
if ( level ! = root_level ) {
btrfs_tree_unlock_rw ( eb , path - > locks [ level ] ) ;
path - > locks [ level ] = 0 ;
free_extent_buffer ( eb ) ;
path - > nodes [ level ] = NULL ;
path - > slots [ level ] = 0 ;
}
} else {
/*
* We have a valid slot to walk back down
* from . Stop here so caller can process these
* new nodes .
*/
break ;
}
level + + ;
}
eb = path - > nodes [ root_level ] ;
if ( path - > slots [ root_level ] > = btrfs_header_nritems ( eb ) )
return 1 ;
return 0 ;
}
2018-09-27 09:42:30 +03:00
/*
 * Trace one pair of tree blocks affected by a subtree swap.
 *
 * The swap itself happens at the highest tree block, but many tree blocks
 * below it may be involved.
 *
 * Example (OO = old tree blocks, NN = new tree blocks allocated during
 * balance):
 *
 *           File tree (257)                  Reloc tree for 257
 * L2              OO                                NN
 *               /    \                            /    \
 * L1          OO      OO (a)                    OO      NN (a)
 *            / \     / \                       / \     / \
 * L0       OO   OO OO   OO                   OO   OO NN   NN
 *                  (b)  (c)                          (b)  (c)
 *
 * For this layout the function is called with:
 * @src_eb     = OO(a)
 * @dst_path   = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
 * @dst_level  = 0
 * @root_level = 1
 *
 * and it walks down from OO(a) to OO(c), then traces both OO(c) and NN(c).
 *
 * The work is done in three steps:
 *
 * 1) Build @src_path from @src_eb, one level at a time, reusing the slots
 *    recorded in @dst_path.  At every level the key at that slot must be
 *    identical in both paths, otherwise -ENOENT is returned.
 *
 * 2) Trace the tree blocks found at @dst_level in both paths.
 *    In the example OO(a) and NN(a) are not traced by this call; they are
 *    expected to have been traced by the previous (@dst_level = 1) call.
 *
 * 3) When @dst_level is 0 and @trace_leaf is set, trace the file extents of
 *    both leaves.  There is no cheap way to pick out only the new file
 *    extents, so all file extents in the leaves are scanned.
 *
 * Thanks to this helper the caller only has to iterate the new tree blocks
 * of the reloc tree instead of maintaining two paths.
 *
 * Returns -EINVAL if the level of @src_eb is not @root_level, -ENOMEM if no
 * path could be allocated, -ENOENT on a key mismatch, or the error from
 * reading a tree block; otherwise the result of the last trace call.
 */
static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
				    struct extent_buffer *src_eb,
				    struct btrfs_path *dst_path,
				    int dst_level, int root_level,
				    bool trace_leaf)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u32 block_size = fs_info->nodesize;
	struct btrfs_path *src_path;
	struct btrfs_key key;
	int level;
	int ret;

	BUG_ON(dst_level > root_level);

	/* @src_eb must sit exactly at @root_level. */
	if (btrfs_header_level(src_eb) != root_level)
		return -EINVAL;

	src_path = btrfs_alloc_path();
	if (!src_path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * First key of the destination block.
	 * NOTE(review): @key is not read again anywhere below.
	 */
	if (dst_level)
		btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
	else
		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);

	/* The path takes its own reference on @src_eb; it is not locked. */
	extent_buffer_get(src_eb);
	src_path->nodes[root_level] = src_eb;
	src_path->slots[root_level] = dst_path->slots[root_level];
	src_path->locks[root_level] = 0;

	/* A simplified btrfs_search_slot(), mirroring the slots of @dst_path */
	for (level = root_level; level >= dst_level; level--) {
		struct btrfs_key src_key;
		struct btrfs_key dst_key;

		if (!src_path->nodes[level]) {
			struct extent_buffer *parent = src_path->nodes[level + 1];
			int parent_slot = src_path->slots[level + 1];
			u64 child_bytenr = btrfs_node_blockptr(parent,
							       parent_slot);
			u64 child_gen = btrfs_node_ptr_generation(parent,
								  parent_slot);
			struct btrfs_key first_key;
			struct extent_buffer *child;

			btrfs_node_key_to_cpu(parent, &first_key, parent_slot);

			child = read_tree_block(fs_info, child_bytenr,
						child_gen, level, &first_key);
			if (IS_ERR(child)) {
				ret = PTR_ERR(child);
				goto out;
			}
			if (!extent_buffer_uptodate(child)) {
				free_extent_buffer(child);
				ret = -EIO;
				goto out;
			}

			src_path->nodes[level] = child;

			btrfs_tree_read_lock(child);
			btrfs_set_lock_blocking_read(child);
			src_path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
		}

		src_path->slots[level] = dst_path->slots[level];

		if (level) {
			btrfs_node_key_to_cpu(dst_path->nodes[level], &dst_key,
					      dst_path->slots[level]);
			btrfs_node_key_to_cpu(src_path->nodes[level], &src_key,
					      src_path->slots[level]);
		} else {
			btrfs_item_key_to_cpu(dst_path->nodes[level], &dst_key,
					      dst_path->slots[level]);
			btrfs_item_key_to_cpu(src_path->nodes[level], &src_key,
					      src_path->slots[level]);
		}

		/* The two trees disagree on content: something went wrong. */
		if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
			ret = -ENOENT;
			goto out;
		}
	}

	/*
	 * Both paths are populated down to @dst_level; record the two tree
	 * blocks for qgroup accounting.
	 */
	ret = btrfs_qgroup_trace_extent(trans,
					src_path->nodes[dst_level]->start,
					block_size, GFP_NOFS);
	if (ret < 0)
		goto out;

	ret = btrfs_qgroup_trace_extent(trans,
					dst_path->nodes[dst_level]->start,
					block_size, GFP_NOFS);
	if (ret < 0)
		goto out;

	/* Record the file extents of both leaves when asked to. */
	if (dst_level == 0 && trace_leaf) {
		ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
		if (ret < 0)
			goto out;
		ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
	}

out:
	btrfs_free_path(src_path);
	return ret;
}
2018-09-27 09:42:31 +03:00
/*
* Helper function to do recursive generation - aware depth - first search , to
* locate all new tree blocks in a subtree of reloc tree .
*
* E . g . ( OO = Old tree blocks , NN = New tree blocks , whose gen = = last_snapshot )
* reloc tree
* L2 NN ( a )
* / \
* L1 OO NN ( b )
* / \ / \
* L0 OO OO OO NN
* ( c ) ( d )
* If we pass :
* @ dst_path = [ nodes [ 1 ] = NN ( b ) , nodes [ 0 ] = NULL ] ,
* @ cur_level = 1
* @ root_level = 1
*
* We will iterate through tree blocks NN ( b ) , NN ( d ) and info qgroup to trace
* above tree blocks along with their counter parts in file tree .
2018-11-28 14:05:13 +03:00
* While during search , old tree blocks OO ( c ) will be skipped as tree block swap
2018-09-27 09:42:31 +03:00
* won ' t affect OO ( c ) .
*/
static int qgroup_trace_new_subtree_blocks ( struct btrfs_trans_handle * trans ,
struct extent_buffer * src_eb ,
struct btrfs_path * dst_path ,
int cur_level , int root_level ,
btrfs: qgroup: Only trace data extents in leaves if we're relocating data block group
For qgroup_trace_extent_swap(), if we find one leaf that needs to be
traced, we will also iterate all file extents and trace them.
This is OK if we're relocating data block groups, but if we're
relocating metadata block groups, balance code itself has ensured that
both subtree of file tree and reloc tree contain the same contents.
That's to say, if we're relocating metadata block groups, all file
extents in reloc and file tree should match, thus no need to trace them.
This should reduce the total number of dirty extents processed in metadata
block group balance.
[[Benchmark]] (with all previous enhancement)
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.
Mkfs parameter:
--nodesize 4K (To bump up tree size)
Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)
Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.
balance parameter:
-m
So the content should be pretty similar to a real world root fs layout.
| v4.19-rc1 | w/ patchset | diff (*)
---------------------------------------------------------------
relocated extents | 22929 | 22851 | -0.3%
qgroup dirty extents | 227757 | 140886 | -38.1%
time (sys) | 65.253s | 37.464s | -42.6%
time (real) | 74.032s | 44.722s | -39.6%
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-09-27 09:42:35 +03:00
u64 last_snapshot , bool trace_leaf )
2018-09-27 09:42:31 +03:00
{
struct btrfs_fs_info * fs_info = trans - > fs_info ;
struct extent_buffer * eb ;
bool need_cleanup = false ;
int ret = 0 ;
int i ;
/* Level sanity check */
2019-03-18 18:45:19 +03:00
if ( cur_level < 0 | | cur_level > = BTRFS_MAX_LEVEL - 1 | |
root_level < 0 | | root_level > = BTRFS_MAX_LEVEL - 1 | |
2018-09-27 09:42:31 +03:00
root_level < cur_level ) {
btrfs_err_rl ( fs_info ,
" %s: bad levels, cur_level=%d root_level=%d " ,
__func__ , cur_level , root_level ) ;
return - EUCLEAN ;
}
/* Read the tree block if needed */
if ( dst_path - > nodes [ cur_level ] = = NULL ) {
struct btrfs_key first_key ;
int parent_slot ;
u64 child_gen ;
u64 child_bytenr ;
/*
* dst_path - > nodes [ root_level ] must be initialized before
* calling this function .
*/
if ( cur_level = = root_level ) {
btrfs_err_rl ( fs_info ,
" %s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d " ,
__func__ , root_level , root_level , cur_level ) ;
return - EUCLEAN ;
}
/*
* We need to get child blockptr / gen from parent before we can
* read it .
*/
eb = dst_path - > nodes [ cur_level + 1 ] ;
parent_slot = dst_path - > slots [ cur_level + 1 ] ;
child_bytenr = btrfs_node_blockptr ( eb , parent_slot ) ;
child_gen = btrfs_node_ptr_generation ( eb , parent_slot ) ;
btrfs_node_key_to_cpu ( eb , & first_key , parent_slot ) ;
/* This node is old, no need to trace */
if ( child_gen < last_snapshot )
goto out ;
eb = read_tree_block ( fs_info , child_bytenr , child_gen ,
cur_level , & first_key ) ;
if ( IS_ERR ( eb ) ) {
ret = PTR_ERR ( eb ) ;
goto out ;
} else if ( ! extent_buffer_uptodate ( eb ) ) {
free_extent_buffer ( eb ) ;
ret = - EIO ;
goto out ;
}
dst_path - > nodes [ cur_level ] = eb ;
dst_path - > slots [ cur_level ] = 0 ;
btrfs_tree_read_lock ( eb ) ;
2018-04-04 03:00:17 +03:00
btrfs_set_lock_blocking_read ( eb ) ;
2018-09-27 09:42:31 +03:00
dst_path - > locks [ cur_level ] = BTRFS_READ_LOCK_BLOCKING ;
need_cleanup = true ;
}
/* Now record this tree block and its counter part for qgroups */
ret = qgroup_trace_extent_swap ( trans , src_eb , dst_path , cur_level ,
btrfs: qgroup: Only trace data extents in leaves if we're relocating data block group
For qgroup_trace_extent_swap(), if we find one leaf that needs to be
traced, we will also iterate all file extents and trace them.
This is OK if we're relocating data block groups, but if we're
relocating metadata block groups, balance code itself has ensured that
both subtree of file tree and reloc tree contain the same contents.
That's to say, if we're relocating metadata block groups, all file
extents in reloc and file tree should match, thus no need to trace them.
This should reduce the total number of dirty extents processed in metadata
block group balance.
[[Benchmark]] (with all previous enhancement)
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.
Mkfs parameter:
--nodesize 4K (To bump up tree size)
Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)
Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.
balance parameter:
-m
So the content should be pretty similar to a real world root fs layout.
| v4.19-rc1 | w/ patchset | diff (*)
---------------------------------------------------------------
relocated extents | 22929 | 22851 | -0.3%
qgroup dirty extents | 227757 | 140886 | -38.1%
time (sys) | 65.253s | 37.464s | -42.6%
time (real) | 74.032s | 44.722s | -39.6%
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-09-27 09:42:35 +03:00
root_level , trace_leaf ) ;
2018-09-27 09:42:31 +03:00
if ( ret < 0 )
goto cleanup ;
eb = dst_path - > nodes [ cur_level ] ;
if ( cur_level > 0 ) {
/* Iterate all child tree blocks */
for ( i = 0 ; i < btrfs_header_nritems ( eb ) ; i + + ) {
/* Skip old tree blocks as they won't be swapped */
if ( btrfs_node_ptr_generation ( eb , i ) < last_snapshot )
continue ;
dst_path - > slots [ cur_level ] = i ;
/* Recursive call (at most 7 times) */
ret = qgroup_trace_new_subtree_blocks ( trans , src_eb ,
dst_path , cur_level - 1 , root_level ,
btrfs: qgroup: Only trace data extents in leaves if we're relocating data block group
For qgroup_trace_extent_swap(), if we find one leaf that needs to be
traced, we will also iterate all file extents and trace them.
This is OK if we're relocating data block groups, but if we're
relocating metadata block groups, balance code itself has ensured that
both subtree of file tree and reloc tree contain the same contents.
That's to say, if we're relocating metadata block groups, all file
extents in reloc and file tree should match, thus no need to trace them.
This should reduce the total number of dirty extents processed in metadata
block group balance.
[[Benchmark]] (with all previous enhancement)
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.
Mkfs parameter:
--nodesize 4K (To bump up tree size)
Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)
Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.
balance parameter:
-m
So the content should be pretty similar to a real world root fs layout.
| v4.19-rc1 | w/ patchset | diff (*)
---------------------------------------------------------------
relocated extents | 22929 | 22851 | -0.3%
qgroup dirty extents | 227757 | 140886 | -38.1%
time (sys) | 65.253s | 37.464s | -42.6%
time (real) | 74.032s | 44.722s | -39.6%
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-09-27 09:42:35 +03:00
last_snapshot , trace_leaf ) ;
2018-09-27 09:42:31 +03:00
if ( ret < 0 )
goto cleanup ;
}
}
cleanup :
if ( need_cleanup ) {
/* Clean up */
btrfs_tree_unlock_rw ( dst_path - > nodes [ cur_level ] ,
dst_path - > locks [ cur_level ] ) ;
free_extent_buffer ( dst_path - > nodes [ cur_level ] ) ;
dst_path - > nodes [ cur_level ] = NULL ;
dst_path - > slots [ cur_level ] = 0 ;
dst_path - > locks [ cur_level ] = 0 ;
}
out :
return ret ;
}
2019-01-23 10:15:15 +03:00
/*
 * Trace all new tree blocks of the subtree rooted at @dst_eb together with
 * their counterparts below @src_eb.
 *
 * @src_eb must not be newer than @dst_eb (compared by header generation),
 * and both buffers must be uptodate.
 *
 * Returns 0 when quotas are disabled or on success, -EUCLEAN when the two
 * buffers are passed in the wrong order, -EIO when either buffer is not
 * uptodate, -ENOMEM when no path can be allocated, or the negative error of
 * the subtree walk.  Every failure that goes through the common exit also
 * sets BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; the -EUCLEAN return does not.
 */
static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
				struct extent_buffer *src_eb,
				struct extent_buffer *dst_eb,
				u64 last_snapshot, bool trace_leaf)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *dst_path = NULL;
	int dst_level;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	/*
	 * Reject swapped arguments.
	 * NOTE(review): the padding inside the message literal is reproduced
	 * exactly as found in this copy of the file.
	 */
	if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
		btrfs_err_rl(fs_info,
		" %s: bad parameter order, src_gen=%llu dst_gen=%llu ", __func__,
			     btrfs_header_generation(src_eb),
			     btrfs_header_generation(dst_eb));
		return -EUCLEAN;
	}

	if (!(extent_buffer_uptodate(src_eb) &&
	      extent_buffer_uptodate(dst_eb))) {
		ret = -EIO;
		goto out;
	}

	dst_level = btrfs_header_level(dst_eb);

	dst_path = btrfs_alloc_path();
	if (!dst_path) {
		ret = -ENOMEM;
		goto out;
	}

	/* The path holds its own reference on @dst_eb; it is not locked. */
	extent_buffer_get(dst_eb);
	dst_path->nodes[dst_level] = dst_eb;
	dst_path->slots[dst_level] = 0;
	dst_path->locks[dst_level] = 0;

	/* Generation-aware depth-first walk, starting at the subtree root. */
	ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path,
					      dst_level, dst_level,
					      last_snapshot, trace_leaf);
	/* Any non-negative result is reported as plain success. */
	if (ret > 0)
		ret = 0;

out:
	btrfs_free_path(dst_path);
	if (ret < 0)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}
2016-10-18 04:31:28 +03:00
/*
 * Trace every tree block and every file extent of the subtree rooted at
 * @root_eb.
 *
 * @root_eb is read with btrfs_read_buffer() using @root_gen and @root_level
 * if it is not uptodate yet.  A leaf root only has its file extent items
 * traced.  Otherwise the subtree is walked: every tree block read on the
 * way down is traced as a nodesize extent, and every leaf reached has its
 * file extent items traced.  @root_eb itself is not traced here.
 *
 * Returns 0 when quotas are disabled or when the walk completed, -ENOMEM if
 * no path could be allocated, -EIO / the read error when a block cannot be
 * read, or the first non-zero value returned by a trace call.
 */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
			       struct extent_buffer *root_eb,
			       u64 root_gen, int root_level)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *path = NULL;
	int ret = 0;
	int level;

	BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
	BUG_ON(root_eb == NULL);

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	if (!extent_buffer_uptodate(root_eb)) {
		ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
		if (ret)
			goto out;
	}

	/* A single leaf needs no tree walk. */
	if (root_level == 0) {
		ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * The path takes its own reference on @root_eb.  The lock slot stays
	 * 0 so that releasing the path does not try to unlock the root.
	 */
	extent_buffer_get(root_eb);
	path->nodes[root_level] = root_eb;
	path->slots[root_level] = 0;
	path->locks[root_level] = 0;

	/*
	 * Each round walks down from the root, reading the blocks missing
	 * from the path (and accounting them as metadata), accounts the file
	 * extents of the leaf that was reached, then moves the slots upwards
	 * to the next position and starts over.
	 */
	for (;;) {
		for (level = root_level; level >= 0; level--) {
			struct extent_buffer *parent;
			struct extent_buffer *child;
			struct btrfs_key first_key;
			int parent_slot;
			u64 child_bytenr;
			u64 child_gen;

			if (path->nodes[level])
				continue;

			/*
			 * Block pointer, generation and key of the missing
			 * block come from its parent.
			 */
			parent = path->nodes[level + 1];
			parent_slot = path->slots[level + 1];
			child_bytenr = btrfs_node_blockptr(parent, parent_slot);
			child_gen = btrfs_node_ptr_generation(parent,
							      parent_slot);
			btrfs_node_key_to_cpu(parent, &first_key, parent_slot);

			child = read_tree_block(fs_info, child_bytenr,
						child_gen, level, &first_key);
			if (IS_ERR(child)) {
				ret = PTR_ERR(child);
				goto out;
			}
			if (!extent_buffer_uptodate(child)) {
				free_extent_buffer(child);
				ret = -EIO;
				goto out;
			}

			path->nodes[level] = child;
			path->slots[level] = 0;

			btrfs_tree_read_lock(child);
			btrfs_set_lock_blocking_read(child);
			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;

			ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
							fs_info->nodesize,
							GFP_NOFS);
			if (ret)
				goto out;
		}

		/* The walk down ended at a leaf, held in path->nodes[0]. */
		ret = btrfs_qgroup_trace_leaf_items(trans, path->nodes[0]);
		if (ret)
			goto out;

		/* A non-zero result means the whole subtree was visited. */
		if (adjust_slots_upwards(path, root_level))
			break;
	}

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
2015-04-12 11:52:34 +03:00
# define UPDATE_NEW 0
# define UPDATE_OLD 1
/*
* Walk all of the roots that points to the bytenr and adjust their refcnts .
*/
static int qgroup_update_refcnt ( struct btrfs_fs_info * fs_info ,
struct ulist * roots , struct ulist * tmp ,
struct ulist * qgroups , u64 seq , int update_old )
{
struct ulist_node * unode ;
struct ulist_iterator uiter ;
struct ulist_node * tmp_unode ;
struct ulist_iterator tmp_uiter ;
struct btrfs_qgroup * qg ;
int ret = 0 ;
if ( ! roots )
return 0 ;
ULIST_ITER_INIT ( & uiter ) ;
while ( ( unode = ulist_next ( roots , & uiter ) ) ) {
qg = find_qgroup_rb ( fs_info , unode - > val ) ;
if ( ! qg )
continue ;
ulist_reinit ( tmp ) ;
2016-10-26 17:23:50 +03:00
ret = ulist_add ( qgroups , qg - > qgroupid , qgroup_to_aux ( qg ) ,
2015-04-12 11:52:34 +03:00
GFP_ATOMIC ) ;
if ( ret < 0 )
return ret ;
2016-10-26 17:23:50 +03:00
ret = ulist_add ( tmp , qg - > qgroupid , qgroup_to_aux ( qg ) , GFP_ATOMIC ) ;
2015-04-12 11:52:34 +03:00
if ( ret < 0 )
return ret ;
ULIST_ITER_INIT ( & tmp_uiter ) ;
while ( ( tmp_unode = ulist_next ( tmp , & tmp_uiter ) ) ) {
struct btrfs_qgroup_list * glist ;
2016-10-26 17:23:50 +03:00
qg = unode_aux_to_qgroup ( tmp_unode ) ;
2015-04-12 11:52:34 +03:00
if ( update_old )
btrfs_qgroup_update_old_refcnt ( qg , seq , 1 ) ;
else
btrfs_qgroup_update_new_refcnt ( qg , seq , 1 ) ;
list_for_each_entry ( glist , & qg - > groups , next_group ) {
ret = ulist_add ( qgroups , glist - > group - > qgroupid ,
2016-10-26 17:23:50 +03:00
qgroup_to_aux ( glist - > group ) ,
2015-04-12 11:52:34 +03:00
GFP_ATOMIC ) ;
if ( ret < 0 )
return ret ;
ret = ulist_add ( tmp , glist - > group - > qgroupid ,
2016-10-26 17:23:50 +03:00
qgroup_to_aux ( glist - > group ) ,
2015-04-12 11:52:34 +03:00
GFP_ATOMIC ) ;
if ( ret < 0 )
return ret ;
}
}
}
return 0 ;
}
2015-04-12 11:59:57 +03:00
/*
 * Update qgroup rfer/excl counters.
 *
 * The rfer update is simple: a qgroup that had no old reference but has a new
 * one gains @num_bytes, one that had an old reference but has no new one
 * loses @num_bytes.
 *
 * The excl update is trickier and is split into two parts.
 * Part 1: detect a possible exclusive <-> shared transition:
 *
 *	     |	A	|	!A	|
 *	-------------------------------------
 *	B    |	*	|	-	|
 *	-------------------------------------
 *	!B   |	+	|	**	|
 *	-------------------------------------
 *
 * Conditions:
 * A:	cur_old_roots < nr_old_roots	(not exclusive before)
 * !A:	cur_old_roots == nr_old_roots	(possibly exclusive before)
 * B:	cur_new_roots < nr_new_roots	(not exclusive now)
 * !B:	cur_new_roots == nr_new_roots	(possibly exclusive now)
 *
 * Results:
 * +: Possible shared -> exclusive	-: Possible exclusive -> shared
 * *: Definitely not changed.		**: Possibly unchanged.
 *
 * For the !A and !B conditions the exception is the cur_old/new_roots == 0
 * case.
 *
 * To keep the logic clear, conditions A and B are first used to split the
 * combinations into the 4 results above.
 *
 * Then, for results "+" and "-", the old/new_roots == 0 case is checked, as
 * only one of the two counts may be 0 there.
 *
 * Lastly, for result "**" both counts may be 0, so it is split once more
 * (2x2).
 *
 * Every qgroup whose counters changed is marked dirty.  Always returns 0.
 */
static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
				  struct ulist *qgroups,
				  u64 nr_old_roots,
				  u64 nr_new_roots,
				  u64 num_bytes, u64 seq)
{
	struct ulist_iterator uiter;
	struct ulist_node *unode;

	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(qgroups, &uiter))) {
		struct btrfs_qgroup *qg = unode_aux_to_qgroup(unode);
		u64 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
		u64 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
		/* Conditions from the table above. */
		bool old_shared = cur_old_count < nr_old_roots;	/* A  */
		bool old_all = cur_old_count == nr_old_roots;	/* !A */
		bool new_shared = cur_new_count < nr_new_roots;	/* B  */
		bool new_all = cur_new_count == nr_new_roots;	/* !B */
		bool dirty = false;

		trace_qgroup_update_counters(fs_info, qg, cur_old_count,
					     cur_new_count);

		/* Rfer update part */
		if (cur_old_count == 0 && cur_new_count > 0) {
			qg->rfer += num_bytes;
			qg->rfer_cmpr += num_bytes;
			dirty = true;
		}
		if (cur_old_count > 0 && cur_new_count == 0) {
			qg->rfer -= num_bytes;
			qg->rfer_cmpr -= num_bytes;
			dirty = true;
		}

		/* Excl update part, the three cases are mutually exclusive */
		if (old_all && new_shared) {
			/* Exclusive/none -> shared: only exclusive loses */
			if (cur_old_count != 0) {
				qg->excl -= num_bytes;
				qg->excl_cmpr -= num_bytes;
				dirty = true;
			}
		} else if (old_shared && new_all) {
			/* Shared -> exclusive/none: only exclusive gains */
			if (cur_new_count != 0) {
				qg->excl += num_bytes;
				qg->excl_cmpr += num_bytes;
				dirty = true;
			}
		} else if (old_all && new_all) {
			/* Exclusive/none -> exclusive/none */
			if (cur_old_count == 0 && cur_new_count != 0) {
				/* None -> exclusive */
				qg->excl += num_bytes;
				qg->excl_cmpr += num_bytes;
				dirty = true;
			} else if (cur_old_count != 0 && cur_new_count == 0) {
				/* Exclusive -> none */
				qg->excl -= num_bytes;
				qg->excl_cmpr -= num_bytes;
				dirty = true;
			}
			/* None -> none and exclusive -> exclusive: no change */
		}

		if (dirty)
			qgroup_dirty(fs_info, qg);
	}

	return 0;
}
2017-02-27 10:10:34 +03:00
/*
* Check if the @ roots potentially is a list of fs tree roots
*
* Return 0 for definitely not a fs / subvol tree roots ulist
* Return 1 for possible fs / subvol tree roots in the list ( considering an empty
* one as well )
*/
static int maybe_fs_roots ( struct ulist * roots )
{
struct ulist_node * unode ;
struct ulist_iterator uiter ;
/* Empty one, still possible for fs roots */
if ( ! roots | | roots - > nnodes = = 0 )
return 1 ;
ULIST_ITER_INIT ( & uiter ) ;
unode = ulist_next ( roots , & uiter ) ;
if ( ! unode )
return 1 ;
/*
* If it contains fs tree roots , then it must belong to fs / subvol
* trees .
* If it contains a non - fs tree , it won ' t be shared with fs / subvol trees .
*/
return is_fstree ( unode - > val ) ;
}
2018-07-18 09:45:39 +03:00
/*
 * Account one extent change to every qgroup affected by it.
 *
 * @trans:     transaction handle, also provides the fs_info
 * @bytenr:    start of the extent
 * @num_bytes: size of the extent
 * @old_roots: roots that referenced the extent before the change, may be NULL
 * @new_roots: roots that reference the extent after the change, may be NULL
 *
 * This function takes ownership of @old_roots and @new_roots: both ulists
 * are freed on every exit path, so the caller must not touch them again.
 *
 * Returns 0 on success (including all the "nothing to do" cases) or a
 * negative errno.
 */
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
				u64 num_bytes, struct ulist *old_roots,
				struct ulist *new_roots)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct ulist *qgroups = NULL;
	struct ulist *tmp = NULL;
	u64 seq;
	u64 nr_new_roots = 0;
	u64 nr_old_roots = 0;
	int ret = 0;

	/*
	 * Even with quota disabled we own the two root lists, so they have
	 * to be freed; a plain return here would leak them.
	 */
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		goto out_free;

	if (new_roots) {
		if (!maybe_fs_roots(new_roots))
			goto out_free;
		nr_new_roots = new_roots->nnodes;
	}
	if (old_roots) {
		if (!maybe_fs_roots(old_roots))
			goto out_free;
		nr_old_roots = old_roots->nnodes;
	}

	/* Quick exit, either not fs tree roots, or won't affect any qgroup */
	if (nr_old_roots == 0 && nr_new_roots == 0)
		goto out_free;

	BUG_ON(!fs_info->quota_root);

	trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
					  num_bytes, nr_old_roots,
					  nr_new_roots);

	qgroups = ulist_alloc(GFP_NOFS);
	if (!qgroups) {
		ret = -ENOMEM;
		goto out_free;
	}
	tmp = ulist_alloc(GFP_NOFS);
	if (!tmp) {
		ret = -ENOMEM;
		goto out_free;
	}

	/*
	 * While a rescan is flagged, skip extents at or beyond its current
	 * progress position.
	 */
	mutex_lock(&fs_info->qgroup_rescan_lock);
	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
		if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
			mutex_unlock(&fs_info->qgroup_rescan_lock);
			ret = 0;
			goto out_free;
		}
	}
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	spin_lock(&fs_info->qgroup_lock);
	seq = fs_info->qgroup_seq;

	/* Update old refcnts using old_roots */
	ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
				   UPDATE_OLD);
	if (ret < 0)
		goto out;

	/* Update new refcnts using new_roots */
	ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
				   UPDATE_NEW);
	if (ret < 0)
		goto out;

	qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
			       num_bytes, seq);

	/*
	 * Bump qgroup_seq to avoid seq overlap
	 */
	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
out:
	spin_unlock(&fs_info->qgroup_lock);
out_free:
	ulist_free(tmp);
	ulist_free(qgroups);
	ulist_free(old_roots);
	ulist_free(new_roots);
	return ret;
}
2018-03-15 17:00:25 +03:00
int btrfs_qgroup_account_extents ( struct btrfs_trans_handle * trans )
2015-04-16 10:37:33 +03:00
{
2018-03-15 17:00:25 +03:00
struct btrfs_fs_info * fs_info = trans - > fs_info ;
2015-04-16 10:37:33 +03:00
struct btrfs_qgroup_extent_record * record ;
struct btrfs_delayed_ref_root * delayed_refs ;
struct ulist * new_roots = NULL ;
struct rb_node * node ;
2018-09-27 09:42:29 +03:00
u64 num_dirty_extents = 0 ;
2015-04-20 04:53:50 +03:00
u64 qgroup_to_skip ;
2015-04-16 10:37:33 +03:00
int ret = 0 ;
delayed_refs = & trans - > transaction - > delayed_refs ;
2015-04-20 04:53:50 +03:00
qgroup_to_skip = delayed_refs - > qgroup_to_skip ;
2015-04-16 10:37:33 +03:00
while ( ( node = rb_first ( & delayed_refs - > dirty_extent_root ) ) ) {
record = rb_entry ( node , struct btrfs_qgroup_extent_record ,
node ) ;
2018-09-27 09:42:29 +03:00
num_dirty_extents + + ;
2016-06-10 00:27:55 +03:00
trace_btrfs_qgroup_account_extents ( fs_info , record ) ;
2016-03-30 03:19:55 +03:00
2015-04-16 10:37:33 +03:00
if ( ! ret ) {
2017-02-27 10:10:35 +03:00
/*
* Old roots should be searched when inserting qgroup
* extent record
*/
if ( WARN_ON ( ! record - > old_roots ) ) {
/* Search commit root to find old_roots */
ret = btrfs_find_all_roots ( NULL , fs_info ,
record - > bytenr , 0 ,
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:45 +03:00
& record - > old_roots , false ) ;
2017-02-27 10:10:35 +03:00
if ( ret < 0 )
goto cleanup ;
}
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 10:15:12 +03:00
/* Free the reserved data space */
btrfs_qgroup_free_refroot ( fs_info ,
record - > data_rsv_refroot ,
record - > data_rsv ,
BTRFS_QGROUP_RSV_DATA ) ;
2015-04-16 10:37:33 +03:00
/*
2017-03-16 19:04:34 +03:00
* Use SEQ_LAST as time_seq to do special search , which
2015-04-16 10:37:33 +03:00
* doesn ' t lock tree or delayed_refs and search current
* root . It ' s safe inside commit_transaction ( ) .
*/
ret = btrfs_find_all_roots ( trans , fs_info ,
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:45 +03:00
record - > bytenr , SEQ_LAST , & new_roots , false ) ;
2015-04-16 10:37:33 +03:00
if ( ret < 0 )
goto cleanup ;
2017-02-27 10:10:35 +03:00
if ( qgroup_to_skip ) {
2015-04-20 04:53:50 +03:00
ulist_del ( new_roots , qgroup_to_skip , 0 ) ;
2017-02-27 10:10:35 +03:00
ulist_del ( record - > old_roots , qgroup_to_skip ,
0 ) ;
}
2018-07-18 09:45:39 +03:00
ret = btrfs_qgroup_account_extent ( trans , record - > bytenr ,
record - > num_bytes ,
record - > old_roots ,
new_roots ) ;
2015-04-16 10:37:33 +03:00
record - > old_roots = NULL ;
new_roots = NULL ;
}
cleanup :
ulist_free ( record - > old_roots ) ;
ulist_free ( new_roots ) ;
new_roots = NULL ;
rb_erase ( node , & delayed_refs - > dirty_extent_root ) ;
kfree ( record ) ;
}
2018-09-27 09:42:29 +03:00
trace_qgroup_num_dirty_extents ( fs_info , trans - > transid ,
num_dirty_extents ) ;
2015-04-16 10:37:33 +03:00
return ret ;
}
2012-06-28 20:03:02 +04:00
/*
* called from commit_transaction . Writes all changed qgroups to disk .
*/
2018-07-18 09:45:40 +03:00
int btrfs_run_qgroups ( struct btrfs_trans_handle * trans )
2012-06-28 20:03:02 +04:00
{
2018-07-18 09:45:40 +03:00
struct btrfs_fs_info * fs_info = trans - > fs_info ;
2012-06-28 20:03:02 +04:00
struct btrfs_root * quota_root = fs_info - > quota_root ;
int ret = 0 ;
if ( ! quota_root )
2018-01-31 11:52:04 +03:00
return ret ;
2012-06-28 20:03:02 +04:00
spin_lock ( & fs_info - > qgroup_lock ) ;
while ( ! list_empty ( & fs_info - > dirty_qgroups ) ) {
struct btrfs_qgroup * qgroup ;
qgroup = list_first_entry ( & fs_info - > dirty_qgroups ,
struct btrfs_qgroup , dirty ) ;
list_del_init ( & qgroup - > dirty ) ;
spin_unlock ( & fs_info - > qgroup_lock ) ;
2018-07-18 09:45:28 +03:00
ret = update_qgroup_info_item ( trans , qgroup ) ;
2014-11-21 05:04:56 +03:00
if ( ret )
fs_info - > qgroup_flags | =
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
2018-07-18 09:45:27 +03:00
ret = update_qgroup_limit_item ( trans , qgroup ) ;
2012-06-28 20:03:02 +04:00
if ( ret )
fs_info - > qgroup_flags | =
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
spin_lock ( & fs_info - > qgroup_lock ) ;
}
2016-09-02 22:40:02 +03:00
if ( test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) )
2012-06-28 20:03:02 +04:00
fs_info - > qgroup_flags | = BTRFS_QGROUP_STATUS_FLAG_ON ;
else
fs_info - > qgroup_flags & = ~ BTRFS_QGROUP_STATUS_FLAG_ON ;
spin_unlock ( & fs_info - > qgroup_lock ) ;
2018-07-18 09:45:29 +03:00
ret = update_qgroup_status_item ( trans ) ;
2012-06-28 20:03:02 +04:00
if ( ret )
fs_info - > qgroup_flags | = BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
return ret ;
}
/*
2016-05-20 04:18:45 +03:00
* Copy the accounting information between qgroups . This is necessary
2016-03-31 03:57:48 +03:00
* when a snapshot or a subvolume is created . Throwing an error will
* cause a transaction abort so we take extra care here to only error
* when a readonly fs is a reasonable outcome .
2012-06-28 20:03:02 +04:00
*/
2018-07-18 09:45:41 +03:00
int btrfs_qgroup_inherit ( struct btrfs_trans_handle * trans , u64 srcid ,
u64 objectid , struct btrfs_qgroup_inherit * inherit )
2012-06-28 20:03:02 +04:00
{
int ret = 0 ;
int i ;
u64 * i_qgroups ;
btrfs: qgroup: Don't hold qgroup_ioctl_lock in btrfs_qgroup_inherit()
[BUG]
Lockdep will report the following circular locking dependency:
WARNING: possible circular locking dependency detected
5.2.0-rc2-custom #24 Tainted: G O
------------------------------------------------------
btrfs/8631 is trying to acquire lock:
000000002536438c (&fs_info->qgroup_ioctl_lock#2){+.+.}, at: btrfs_qgroup_inherit+0x40/0x620 [btrfs]
but task is already holding lock:
000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&fs_info->tree_log_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x475/0xa00 [btrfs]
btrfs_commit_super+0x71/0x80 [btrfs]
close_ctree+0x2bd/0x320 [btrfs]
btrfs_put_super+0x15/0x20 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x16/0xa0 [btrfs]
deactivate_locked_super+0x3a/0x80
deactivate_super+0x51/0x60
cleanup_mnt+0x3f/0x80
__cleanup_mnt+0x12/0x20
task_work_run+0x94/0xb0
exit_to_usermode_loop+0xd8/0xe0
do_syscall_64+0x210/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #1 (&fs_info->reloc_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x40d/0xa00 [btrfs]
btrfs_quota_enable+0x2da/0x730 [btrfs]
btrfs_ioctl+0x2691/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (&fs_info->qgroup_ioctl_lock#2){+.+.}:
lock_acquire+0xa7/0x190
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_qgroup_inherit+0x40/0x620 [btrfs]
create_pending_snapshot+0x9d7/0xe60 [btrfs]
create_pending_snapshots+0x94/0xb0 [btrfs]
btrfs_commit_transaction+0x415/0xa00 [btrfs]
btrfs_mksubvol+0x496/0x4e0 [btrfs]
btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
btrfs_ioctl+0xa90/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
&fs_info->qgroup_ioctl_lock#2 --> &fs_info->reloc_mutex --> &fs_info->tree_log_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_info->tree_log_mutex);
lock(&fs_info->reloc_mutex);
lock(&fs_info->tree_log_mutex);
lock(&fs_info->qgroup_ioctl_lock#2);
*** DEADLOCK ***
6 locks held by btrfs/8631:
#0: 00000000ed8f23f6 (sb_writers#12){.+.+}, at: mnt_want_write_file+0x28/0x60
#1: 000000009fb1597a (&type->i_mutex_dir_key#10/1){+.+.}, at: btrfs_mksubvol+0x70/0x4e0 [btrfs]
#2: 0000000088c5ad88 (&fs_info->subvol_sem){++++}, at: btrfs_mksubvol+0x128/0x4e0 [btrfs]
#3: 000000009606fc3e (sb_internal#2){.+.+}, at: start_transaction+0x37a/0x520 [btrfs]
#4: 00000000f82bbdf5 (&fs_info->reloc_mutex){+.+.}, at: btrfs_commit_transaction+0x40d/0xa00 [btrfs]
#5: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
[CAUSE]
Due to the delayed subvolume creation, we need to call
btrfs_qgroup_inherit() inside the commit transaction code, with a lot of
other mutexes held.
This long lock chain can lead to the above problem.
[FIX]
On the other hand, we don't really need to hold qgroup_ioctl_lock if
we're in the context of create_pending_snapshot().
As in that context, we're the only one being able to modify qgroup.
All other qgroup functions which needs qgroup_ioctl_lock are either
holding a transaction handle, or will start a new transaction:
Functions will start a new transaction():
* btrfs_quota_enable()
* btrfs_quota_disable()
Functions hold a transaction handler:
* btrfs_add_qgroup_relation()
* btrfs_del_qgroup_relation()
* btrfs_create_qgroup()
* btrfs_remove_qgroup()
* btrfs_limit_qgroup()
* btrfs_qgroup_inherit() call inside create_subvol()
So we have a higher level protection provided by transaction, thus we
don't need to always hold qgroup_ioctl_lock in btrfs_qgroup_inherit().
Only the btrfs_qgroup_inherit() call in create_subvol() needs to hold
qgroup_ioctl_lock, while the btrfs_qgroup_inherit() call in
create_pending_snapshot() is already protected by transaction.
So the fix is to detect the context by checking
trans->transaction->state.
If we're at TRANS_STATE_COMMIT_DOING, then we're in commit transaction
context and no need to get the mutex.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-06-13 12:31:24 +03:00
bool committing = false ;
2018-07-18 09:45:41 +03:00
struct btrfs_fs_info * fs_info = trans - > fs_info ;
2018-11-19 19:20:34 +03:00
struct btrfs_root * quota_root ;
2012-06-28 20:03:02 +04:00
struct btrfs_qgroup * srcgroup ;
struct btrfs_qgroup * dstgroup ;
u32 level_size = 0 ;
2013-04-07 14:50:19 +04:00
u64 nums ;
2012-06-28 20:03:02 +04:00
btrfs: qgroup: Don't hold qgroup_ioctl_lock in btrfs_qgroup_inherit()
[BUG]
Lockdep will report the following circular locking dependency:
WARNING: possible circular locking dependency detected
5.2.0-rc2-custom #24 Tainted: G O
------------------------------------------------------
btrfs/8631 is trying to acquire lock:
000000002536438c (&fs_info->qgroup_ioctl_lock#2){+.+.}, at: btrfs_qgroup_inherit+0x40/0x620 [btrfs]
but task is already holding lock:
000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&fs_info->tree_log_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x475/0xa00 [btrfs]
btrfs_commit_super+0x71/0x80 [btrfs]
close_ctree+0x2bd/0x320 [btrfs]
btrfs_put_super+0x15/0x20 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x16/0xa0 [btrfs]
deactivate_locked_super+0x3a/0x80
deactivate_super+0x51/0x60
cleanup_mnt+0x3f/0x80
__cleanup_mnt+0x12/0x20
task_work_run+0x94/0xb0
exit_to_usermode_loop+0xd8/0xe0
do_syscall_64+0x210/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #1 (&fs_info->reloc_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x40d/0xa00 [btrfs]
btrfs_quota_enable+0x2da/0x730 [btrfs]
btrfs_ioctl+0x2691/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (&fs_info->qgroup_ioctl_lock#2){+.+.}:
lock_acquire+0xa7/0x190
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_qgroup_inherit+0x40/0x620 [btrfs]
create_pending_snapshot+0x9d7/0xe60 [btrfs]
create_pending_snapshots+0x94/0xb0 [btrfs]
btrfs_commit_transaction+0x415/0xa00 [btrfs]
btrfs_mksubvol+0x496/0x4e0 [btrfs]
btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
btrfs_ioctl+0xa90/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
&fs_info->qgroup_ioctl_lock#2 --> &fs_info->reloc_mutex --> &fs_info->tree_log_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_info->tree_log_mutex);
lock(&fs_info->reloc_mutex);
lock(&fs_info->tree_log_mutex);
lock(&fs_info->qgroup_ioctl_lock#2);
*** DEADLOCK ***
6 locks held by btrfs/8631:
#0: 00000000ed8f23f6 (sb_writers#12){.+.+}, at: mnt_want_write_file+0x28/0x60
#1: 000000009fb1597a (&type->i_mutex_dir_key#10/1){+.+.}, at: btrfs_mksubvol+0x70/0x4e0 [btrfs]
#2: 0000000088c5ad88 (&fs_info->subvol_sem){++++}, at: btrfs_mksubvol+0x128/0x4e0 [btrfs]
#3: 000000009606fc3e (sb_internal#2){.+.+}, at: start_transaction+0x37a/0x520 [btrfs]
#4: 00000000f82bbdf5 (&fs_info->reloc_mutex){+.+.}, at: btrfs_commit_transaction+0x40d/0xa00 [btrfs]
#5: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
[CAUSE]
Due to the delayed subvolume creation, we need to call
btrfs_qgroup_inherit() inside commit transaction code, with a lot of
other mutexes held.
This hell of a lock chain can lead to the above problem.
[FIX]
On the other hand, we don't really need to hold qgroup_ioctl_lock if
we're in the context of create_pending_snapshot().
As in that context, we're the only one able to modify qgroups.
All other qgroup functions which need qgroup_ioctl_lock are either
holding a transaction handle, or will start a new transaction:
Functions will start a new transaction():
* btrfs_quota_enable()
* btrfs_quota_disable()
Functions hold a transaction handler:
* btrfs_add_qgroup_relation()
* btrfs_del_qgroup_relation()
* btrfs_create_qgroup()
* btrfs_remove_qgroup()
* btrfs_limit_qgroup()
* btrfs_qgroup_inherit() call inside create_subvol()
So we have a higher level protection provided by transaction, thus we
don't need to always hold qgroup_ioctl_lock in btrfs_qgroup_inherit().
Only the btrfs_qgroup_inherit() call in create_subvol() needs to hold
qgroup_ioctl_lock, while the btrfs_qgroup_inherit() call in
create_pending_snapshot() is already protected by transaction.
So the fix is to detect the context by checking
trans->transaction->state.
If we're at TRANS_STATE_COMMIT_DOING, then we're in commit transaction
context and no need to get the mutex.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-06-13 12:31:24 +03:00
/*
* There are only two callers of this function .
*
* One in create_subvol ( ) in the ioctl context , which needs to hold
* the qgroup_ioctl_lock .
*
* The other one in create_pending_snapshot ( ) where no other qgroup
* code can modify the fs as they all need to either start a new trans
* or hold a trans handler , thus we don ' t need to hold
* qgroup_ioctl_lock .
* This would avoid long and complex lock chain and make lockdep happy .
*/
spin_lock ( & fs_info - > trans_lock ) ;
if ( trans - > transaction - > state = = TRANS_STATE_COMMIT_DOING )
committing = true ;
spin_unlock ( & fs_info - > trans_lock ) ;
if ( ! committing )
mutex_lock ( & fs_info - > qgroup_ioctl_lock ) ;
2016-09-02 22:40:02 +03:00
if ( ! test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) )
2013-04-07 14:50:16 +04:00
goto out ;
2012-06-28 20:03:02 +04:00
2018-11-19 19:20:34 +03:00
quota_root = fs_info - > quota_root ;
2013-04-07 14:50:16 +04:00
if ( ! quota_root ) {
ret = - EINVAL ;
goto out ;
}
2012-06-28 20:03:02 +04:00
2013-04-07 14:50:19 +04:00
if ( inherit ) {
i_qgroups = ( u64 * ) ( inherit + 1 ) ;
nums = inherit - > num_qgroups + 2 * inherit - > num_ref_copies +
2 * inherit - > num_excl_copies ;
for ( i = 0 ; i < nums ; + + i ) {
srcgroup = find_qgroup_rb ( fs_info , * i_qgroups ) ;
2014-11-11 15:18:22 +03:00
2016-03-31 03:57:48 +03:00
/*
* Zero out invalid groups so we can ignore
* them later .
*/
if ( ! srcgroup | |
( ( srcgroup - > qgroupid > > 48 ) < = ( objectid > > 48 ) ) )
* i_qgroups = 0ULL ;
2013-04-07 14:50:19 +04:00
+ + i_qgroups ;
}
}
2012-06-28 20:03:02 +04:00
/*
* create a tracking group for the subvol itself
*/
ret = add_qgroup_item ( trans , quota_root , objectid ) ;
if ( ret )
goto out ;
/*
* add qgroup to all inherited groups
*/
if ( inherit ) {
i_qgroups = ( u64 * ) ( inherit + 1 ) ;
2016-03-31 03:57:48 +03:00
for ( i = 0 ; i < inherit - > num_qgroups ; + + i , + + i_qgroups ) {
if ( * i_qgroups = = 0 )
continue ;
2018-07-18 09:45:24 +03:00
ret = add_qgroup_relation_item ( trans , objectid ,
* i_qgroups ) ;
2016-03-31 03:57:48 +03:00
if ( ret & & ret ! = - EEXIST )
2012-06-28 20:03:02 +04:00
goto out ;
2018-07-18 09:45:24 +03:00
ret = add_qgroup_relation_item ( trans , * i_qgroups ,
objectid ) ;
2016-03-31 03:57:48 +03:00
if ( ret & & ret ! = - EEXIST )
2012-06-28 20:03:02 +04:00
goto out ;
}
2016-03-31 03:57:48 +03:00
ret = 0 ;
2012-06-28 20:03:02 +04:00
}
spin_lock ( & fs_info - > qgroup_lock ) ;
dstgroup = add_qgroup_rb ( fs_info , objectid ) ;
2012-07-30 12:15:43 +04:00
if ( IS_ERR ( dstgroup ) ) {
ret = PTR_ERR ( dstgroup ) ;
2012-06-28 20:03:02 +04:00
goto unlock ;
2012-07-30 12:15:43 +04:00
}
2012-06-28 20:03:02 +04:00
2014-11-21 04:58:34 +03:00
if ( inherit & & inherit - > flags & BTRFS_QGROUP_INHERIT_SET_LIMITS ) {
dstgroup - > lim_flags = inherit - > lim . flags ;
dstgroup - > max_rfer = inherit - > lim . max_rfer ;
dstgroup - > max_excl = inherit - > lim . max_excl ;
dstgroup - > rsv_rfer = inherit - > lim . rsv_rfer ;
dstgroup - > rsv_excl = inherit - > lim . rsv_excl ;
2014-11-21 05:01:41 +03:00
2018-07-18 09:45:27 +03:00
ret = update_qgroup_limit_item ( trans , dstgroup ) ;
2014-11-21 05:01:41 +03:00
if ( ret ) {
fs_info - > qgroup_flags | = BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
2016-09-20 17:05:00 +03:00
btrfs_info ( fs_info ,
" unable to update quota limit for %llu " ,
dstgroup - > qgroupid ) ;
2014-11-21 05:01:41 +03:00
goto unlock ;
}
2014-11-21 04:58:34 +03:00
}
2012-06-28 20:03:02 +04:00
if ( srcid ) {
srcgroup = find_qgroup_rb ( fs_info , srcid ) ;
2012-09-15 04:06:30 +04:00
if ( ! srcgroup )
2012-06-28 20:03:02 +04:00
goto unlock ;
2014-05-14 04:30:47 +04:00
/*
* We call inherit after we clone the root in order to make sure
* our counts don ' t go crazy , so at this point the only
* difference between the two roots should be the root node .
*/
2018-07-17 11:58:22 +03:00
level_size = fs_info - > nodesize ;
2014-05-14 04:30:47 +04:00
dstgroup - > rfer = srcgroup - > rfer ;
dstgroup - > rfer_cmpr = srcgroup - > rfer_cmpr ;
dstgroup - > excl = level_size ;
dstgroup - > excl_cmpr = level_size ;
2012-06-28 20:03:02 +04:00
srcgroup - > excl = level_size ;
srcgroup - > excl_cmpr = level_size ;
2014-11-21 04:14:38 +03:00
/* inherit the limit info */
dstgroup - > lim_flags = srcgroup - > lim_flags ;
dstgroup - > max_rfer = srcgroup - > max_rfer ;
dstgroup - > max_excl = srcgroup - > max_excl ;
dstgroup - > rsv_rfer = srcgroup - > rsv_rfer ;
dstgroup - > rsv_excl = srcgroup - > rsv_excl ;
2012-06-28 20:03:02 +04:00
qgroup_dirty ( fs_info , dstgroup ) ;
qgroup_dirty ( fs_info , srcgroup ) ;
}
2012-09-15 04:06:30 +04:00
if ( ! inherit )
2012-06-28 20:03:02 +04:00
goto unlock ;
i_qgroups = ( u64 * ) ( inherit + 1 ) ;
for ( i = 0 ; i < inherit - > num_qgroups ; + + i ) {
2016-03-31 03:57:48 +03:00
if ( * i_qgroups ) {
2016-06-23 01:54:23 +03:00
ret = add_relation_rb ( fs_info , objectid , * i_qgroups ) ;
2016-03-31 03:57:48 +03:00
if ( ret )
goto unlock ;
}
2012-06-28 20:03:02 +04:00
+ + i_qgroups ;
}
2016-03-31 03:57:48 +03:00
for ( i = 0 ; i < inherit - > num_ref_copies ; + + i , i_qgroups + = 2 ) {
2012-06-28 20:03:02 +04:00
struct btrfs_qgroup * src ;
struct btrfs_qgroup * dst ;
2016-03-31 03:57:48 +03:00
if ( ! i_qgroups [ 0 ] | | ! i_qgroups [ 1 ] )
continue ;
2012-06-28 20:03:02 +04:00
src = find_qgroup_rb ( fs_info , i_qgroups [ 0 ] ) ;
dst = find_qgroup_rb ( fs_info , i_qgroups [ 1 ] ) ;
if ( ! src | | ! dst ) {
ret = - EINVAL ;
goto unlock ;
}
dst - > rfer = src - > rfer - level_size ;
dst - > rfer_cmpr = src - > rfer_cmpr - level_size ;
}
2016-03-31 03:57:48 +03:00
for ( i = 0 ; i < inherit - > num_excl_copies ; + + i , i_qgroups + = 2 ) {
2012-06-28 20:03:02 +04:00
struct btrfs_qgroup * src ;
struct btrfs_qgroup * dst ;
2016-03-31 03:57:48 +03:00
if ( ! i_qgroups [ 0 ] | | ! i_qgroups [ 1 ] )
continue ;
2012-06-28 20:03:02 +04:00
src = find_qgroup_rb ( fs_info , i_qgroups [ 0 ] ) ;
dst = find_qgroup_rb ( fs_info , i_qgroups [ 1 ] ) ;
if ( ! src | | ! dst ) {
ret = - EINVAL ;
goto unlock ;
}
dst - > excl = src - > excl + level_size ;
dst - > excl_cmpr = src - > excl_cmpr + level_size ;
}
unlock :
spin_unlock ( & fs_info - > qgroup_lock ) ;
out :
btrfs: qgroup: Don't hold qgroup_ioctl_lock in btrfs_qgroup_inherit()
[BUG]
Lockdep will report the following circular locking dependency:
WARNING: possible circular locking dependency detected
5.2.0-rc2-custom #24 Tainted: G O
------------------------------------------------------
btrfs/8631 is trying to acquire lock:
000000002536438c (&fs_info->qgroup_ioctl_lock#2){+.+.}, at: btrfs_qgroup_inherit+0x40/0x620 [btrfs]
but task is already holding lock:
000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&fs_info->tree_log_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x475/0xa00 [btrfs]
btrfs_commit_super+0x71/0x80 [btrfs]
close_ctree+0x2bd/0x320 [btrfs]
btrfs_put_super+0x15/0x20 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x16/0xa0 [btrfs]
deactivate_locked_super+0x3a/0x80
deactivate_super+0x51/0x60
cleanup_mnt+0x3f/0x80
__cleanup_mnt+0x12/0x20
task_work_run+0x94/0xb0
exit_to_usermode_loop+0xd8/0xe0
do_syscall_64+0x210/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #1 (&fs_info->reloc_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x40d/0xa00 [btrfs]
btrfs_quota_enable+0x2da/0x730 [btrfs]
btrfs_ioctl+0x2691/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (&fs_info->qgroup_ioctl_lock#2){+.+.}:
lock_acquire+0xa7/0x190
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_qgroup_inherit+0x40/0x620 [btrfs]
create_pending_snapshot+0x9d7/0xe60 [btrfs]
create_pending_snapshots+0x94/0xb0 [btrfs]
btrfs_commit_transaction+0x415/0xa00 [btrfs]
btrfs_mksubvol+0x496/0x4e0 [btrfs]
btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
btrfs_ioctl+0xa90/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
&fs_info->qgroup_ioctl_lock#2 --> &fs_info->reloc_mutex --> &fs_info->tree_log_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_info->tree_log_mutex);
lock(&fs_info->reloc_mutex);
lock(&fs_info->tree_log_mutex);
lock(&fs_info->qgroup_ioctl_lock#2);
*** DEADLOCK ***
6 locks held by btrfs/8631:
#0: 00000000ed8f23f6 (sb_writers#12){.+.+}, at: mnt_want_write_file+0x28/0x60
#1: 000000009fb1597a (&type->i_mutex_dir_key#10/1){+.+.}, at: btrfs_mksubvol+0x70/0x4e0 [btrfs]
#2: 0000000088c5ad88 (&fs_info->subvol_sem){++++}, at: btrfs_mksubvol+0x128/0x4e0 [btrfs]
#3: 000000009606fc3e (sb_internal#2){.+.+}, at: start_transaction+0x37a/0x520 [btrfs]
#4: 00000000f82bbdf5 (&fs_info->reloc_mutex){+.+.}, at: btrfs_commit_transaction+0x40d/0xa00 [btrfs]
#5: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
[CAUSE]
Due to the delayed subvolume creation, we need to call
btrfs_qgroup_inherit() inside commit transaction code, with a lot of
other mutexes held.
This hell of a lock chain can lead to the above problem.
[FIX]
On the other hand, we don't really need to hold qgroup_ioctl_lock if
we're in the context of create_pending_snapshot().
As in that context, we're the only one able to modify qgroups.
All other qgroup functions which need qgroup_ioctl_lock are either
holding a transaction handle, or will start a new transaction:
Functions will start a new transaction():
* btrfs_quota_enable()
* btrfs_quota_disable()
Functions hold a transaction handler:
* btrfs_add_qgroup_relation()
* btrfs_del_qgroup_relation()
* btrfs_create_qgroup()
* btrfs_remove_qgroup()
* btrfs_limit_qgroup()
* btrfs_qgroup_inherit() call inside create_subvol()
So we have a higher level protection provided by transaction, thus we
don't need to always hold qgroup_ioctl_lock in btrfs_qgroup_inherit().
Only the btrfs_qgroup_inherit() call in create_subvol() needs to hold
qgroup_ioctl_lock, while the btrfs_qgroup_inherit() call in
create_pending_snapshot() is already protected by transaction.
So the fix is to detect the context by checking
trans->transaction->state.
If we're at TRANS_STATE_COMMIT_DOING, then we're in commit transaction
context and no need to get the mutex.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-06-13 12:31:24 +03:00
if ( ! committing )
mutex_unlock ( & fs_info - > qgroup_ioctl_lock ) ;
2012-06-28 20:03:02 +04:00
return ret ;
}
2017-12-22 11:06:39 +03:00
/*
* Two limits to commit transaction in advance .
*
2019-01-25 02:55:27 +03:00
* For RATIO , it will be 1 / RATIO of the remaining limit as threshold .
2017-12-22 11:06:39 +03:00
* For SIZE , it will be in byte unit as threshold .
*/
2019-01-25 02:55:27 +03:00
# define QGROUP_FREE_RATIO 32
# define QGROUP_FREE_SIZE SZ_32M
2017-12-22 11:06:39 +03:00
/*
 * Check whether charging @num_bytes more to @qg keeps it within its limits.
 *
 * @fs_info:   filesystem the qgroup belongs to, only used to request a commit
 * @qg:        qgroup to check, read only
 * @num_bytes: size of the reservation about to be made
 *
 * Returns false if a max_rfer or max_excl limit is enabled in qg->lim_flags
 * and the current reservations plus current usage plus @num_bytes would
 * exceed that limit. Returns true otherwise.
 *
 * Side effect: when at least one limit is enabled and the space left under
 * it has dropped below the smaller of (limit / QGROUP_FREE_RATIO) and
 * QGROUP_FREE_SIZE, btrfs_commit_transaction_locksafe() is called. The
 * return value is still true in that case.
 */
static bool qgroup_check_limits ( struct btrfs_fs_info * fs_info ,
const struct btrfs_qgroup * qg , u64 num_bytes )
2017-01-25 17:50:33 +03:00
{
2019-01-25 02:55:27 +03:00
u64 free ;
2017-12-22 11:06:39 +03:00
u64 threshold ;
2017-01-25 17:50:33 +03:00
/* Hard check against the referenced-bytes limit, if enabled. */
if ( ( qg - > lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER ) & &
2017-12-12 10:34:25 +03:00
qgroup_rsv_total ( qg ) + ( s64 ) qg - > rfer + num_bytes > qg - > max_rfer )
2017-01-25 17:50:33 +03:00
return false ;
/* Hard check against the exclusive-bytes limit, if enabled. */
if ( ( qg - > lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL ) & &
2017-12-12 10:34:25 +03:00
qgroup_rsv_total ( qg ) + ( s64 ) qg - > excl + num_bytes > qg - > max_excl )
2017-01-25 17:50:33 +03:00
return false ;
2017-12-22 11:06:39 +03:00
/*
* Even if we passed the check , it ' s better to check if reservation
* for meta_pertrans is pushing us near limit .
* If there is too much pertrans reservation or it ' s near the limit ,
* let ' s try commit transaction to free some , using transaction_kthread
*/
if ( ( qg - > lim_flags & ( BTRFS_QGROUP_LIMIT_MAX_RFER |
BTRFS_QGROUP_LIMIT_MAX_EXCL ) ) ) {
2019-01-25 02:55:27 +03:00
/*
 * When both limits are enabled only the exclusive limit is used to
 * compute the remaining space and the threshold.
 */
if ( qg - > lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL ) {
free = qg - > max_excl - qgroup_rsv_total ( qg ) - qg - > excl ;
threshold = min_t ( u64 , qg - > max_excl / QGROUP_FREE_RATIO ,
QGROUP_FREE_SIZE ) ;
} else {
free = qg - > max_rfer - qgroup_rsv_total ( qg ) - qg - > rfer ;
threshold = min_t ( u64 , qg - > max_rfer / QGROUP_FREE_RATIO ,
QGROUP_FREE_SIZE ) ;
}
2017-12-22 11:06:39 +03:00
/*
* Use transaction_kthread to commit transaction , so we no
* longer need to bother nested transaction nor lock context .
*/
2019-01-25 02:55:27 +03:00
if ( free < threshold )
2017-12-22 11:06:39 +03:00
btrfs_commit_transaction_locksafe ( fs_info ) ;
}
2017-01-25 17:50:33 +03:00
return true ;
}
2017-12-12 10:34:25 +03:00
/*
 * Reserve @num_bytes of reservation type @type for the qgroup of @root and
 * for every qgroup reachable from it through the groups lists.
 *
 * @root:      root whose objectid selects the starting qgroup
 * @num_bytes: number of bytes to reserve
 * @enforce:   when true, fail with -EDQUOT if any visited qgroup would go
 *             over its limits according to qgroup_check_limits()
 * @type:      reservation type passed on to qgroup_rsv_add()
 *
 * Returns 0 on success, and also when there is nothing to do: @root is not
 * an fs tree, @num_bytes is 0, there is no quota root, or no qgroup exists
 * for @root. Returns -EDQUOT when a limit check fails, or the negative value
 * returned by ulist_add().
 *
 * The work is done in two passes over fs_info->qgroup_ulist so that nothing
 * is recorded unless every visited qgroup passed the limit check.
 */
static int qgroup_reserve ( struct btrfs_root * root , u64 num_bytes , bool enforce ,
enum btrfs_qgroup_rsv_type type )
2012-06-28 20:03:02 +04:00
{
struct btrfs_root * quota_root ;
struct btrfs_qgroup * qgroup ;
struct btrfs_fs_info * fs_info = root - > fs_info ;
u64 ref_root = root - > root_key . objectid ;
int ret = 0 ;
struct ulist_node * unode ;
struct ulist_iterator uiter ;
if ( ! is_fstree ( ref_root ) )
return 0 ;
if ( num_bytes = = 0 )
return 0 ;
2017-05-12 00:17:33 +03:00
/*
 * With the quota override flag set, a task holding CAP_SYS_RESOURCE
 * is not subject to limit enforcement. The reservation is still recorded.
 */
if ( test_bit ( BTRFS_FS_QUOTA_OVERRIDE , & fs_info - > flags ) & &
capable ( CAP_SYS_RESOURCE ) )
enforce = false ;
2012-06-28 20:03:02 +04:00
/* Everything below, including both passes, runs under qgroup_lock. */
spin_lock ( & fs_info - > qgroup_lock ) ;
quota_root = fs_info - > quota_root ;
if ( ! quota_root )
goto out ;
qgroup = find_qgroup_rb ( fs_info , ref_root ) ;
if ( ! qgroup )
goto out ;
/*
* in a first step , we check all affected qgroups if any limits would
* be exceeded
*/
2013-05-06 15:03:27 +04:00
/*
 * Seed the shared ulist with the starting qgroup. Allocation uses
 * GFP_ATOMIC; note that the qgroup_lock spinlock is held here.
 */
ulist_reinit ( fs_info - > qgroup_ulist ) ;
ret = ulist_add ( fs_info - > qgroup_ulist , qgroup - > qgroupid ,
2018-03-27 20:04:50 +03:00
qgroup_to_aux ( qgroup ) , GFP_ATOMIC ) ;
2013-04-17 18:00:36 +04:00
if ( ret < 0 )
goto out ;
2012-06-28 20:03:02 +04:00
ULIST_ITER_INIT ( & uiter ) ;
2013-05-06 15:03:27 +04:00
while ( ( unode = ulist_next ( fs_info - > qgroup_ulist , & uiter ) ) ) {
2012-06-28 20:03:02 +04:00
struct btrfs_qgroup * qg ;
struct btrfs_qgroup_list * glist ;
2016-10-26 17:23:50 +03:00
qg = unode_aux_to_qgroup ( unode ) ;
2012-06-28 20:03:02 +04:00
2017-12-22 11:06:39 +03:00
if ( enforce & & ! qgroup_check_limits ( fs_info , qg , num_bytes ) ) {
2012-06-28 20:03:02 +04:00
ret = - EDQUOT ;
2013-03-06 15:51:47 +04:00
goto out ;
}
2012-06-28 20:03:02 +04:00
/*
 * Queue every group this qgroup is a member of; entries appended
 * here are picked up by later iterations of the enclosing loop.
 */
list_for_each_entry ( glist , & qg - > groups , next_group ) {
2013-05-06 15:03:27 +04:00
ret = ulist_add ( fs_info - > qgroup_ulist ,
glist - > group - > qgroupid ,
2018-03-27 20:04:50 +03:00
qgroup_to_aux ( glist - > group ) , GFP_ATOMIC ) ;
2013-04-17 18:00:36 +04:00
if ( ret < 0 )
goto out ;
2012-06-28 20:03:02 +04:00
}
}
2013-04-17 18:00:36 +04:00
/* Only negative ulist_add() results are errors; normalise to 0. */
ret = 0 ;
2012-06-28 20:03:02 +04:00
/*
* no limits exceeded , now record the reservation into all qgroups
*/
ULIST_ITER_INIT ( & uiter ) ;
2013-05-06 15:03:27 +04:00
while ( ( unode = ulist_next ( fs_info - > qgroup_ulist , & uiter ) ) ) {
2012-06-28 20:03:02 +04:00
struct btrfs_qgroup * qg ;
2016-10-26 17:23:50 +03:00
qg = unode_aux_to_qgroup ( unode ) ;
2012-06-28 20:03:02 +04:00
2017-12-12 10:34:27 +03:00
qgroup_rsv_add ( fs_info , qg , num_bytes , type ) ;
2012-06-28 20:03:02 +04:00
}
out :
spin_unlock ( & fs_info - > qgroup_lock ) ;
return ret ;
}
2017-12-12 10:34:30 +03:00
/*
* Free @ num_bytes of reserved space with @ type for qgroup . ( Normally level 0
* qgroup ) .
*
* Will handle all higher level qgroup too .
*
* NOTE : If @ num_bytes is ( u64 ) - 1 , this means to free all bytes of this qgroup .
* This special case is only used for META_PERTRANS type .
*/
2015-09-08 12:08:37 +03:00
/*
 * @fs_info:   filesystem owning the qgroups
 * @ref_root:  objectid of the root whose qgroup is the starting point
 * @num_bytes: bytes to release, or (u64)-1 to release everything of @type
 * @type:      reservation type to release
 *
 * Nothing is done when @ref_root is not an fs tree, @num_bytes is 0, there
 * is no quota root, or no qgroup exists for @ref_root. The function returns
 * void, so a failing ulist_add() cannot be reported to the caller.
 */
void btrfs_qgroup_free_refroot ( struct btrfs_fs_info * fs_info ,
2017-12-12 10:34:23 +03:00
u64 ref_root , u64 num_bytes ,
enum btrfs_qgroup_rsv_type type )
2012-06-28 20:03:02 +04:00
{
struct btrfs_root * quota_root ;
struct btrfs_qgroup * qgroup ;
struct ulist_node * unode ;
struct ulist_iterator uiter ;
2013-04-17 18:00:36 +04:00
int ret = 0 ;
2012-06-28 20:03:02 +04:00
if ( ! is_fstree ( ref_root ) )
return ;
if ( num_bytes = = 0 )
return ;
2017-12-12 10:34:30 +03:00
/* The free-everything request is only accepted for META_PERTRANS. */
if ( num_bytes = = ( u64 ) - 1 & & type ! = BTRFS_QGROUP_RSV_META_PERTRANS ) {
WARN ( 1 , " %s: Invalid type to free " , __func__ ) ;
return ;
}
2012-06-28 20:03:02 +04:00
spin_lock ( & fs_info - > qgroup_lock ) ;
quota_root = fs_info - > quota_root ;
if ( ! quota_root )
goto out ;
qgroup = find_qgroup_rb ( fs_info , ref_root ) ;
if ( ! qgroup )
goto out ;
2017-12-12 10:34:30 +03:00
if ( num_bytes = = ( u64 ) - 1 )
2017-12-12 10:34:34 +03:00
/*
* We ' re freeing all pertrans rsv , get reserved value from
* level 0 qgroup as real num_bytes to free .
*/
2017-12-12 10:34:30 +03:00
num_bytes = qgroup - > rsv . values [ type ] ;
2013-05-06 15:03:27 +04:00
/* Seed the shared ulist with the starting qgroup, then walk upwards. */
ulist_reinit ( fs_info - > qgroup_ulist ) ;
ret = ulist_add ( fs_info - > qgroup_ulist , qgroup - > qgroupid ,
2018-03-27 20:04:50 +03:00
qgroup_to_aux ( qgroup ) , GFP_ATOMIC ) ;
2013-04-17 18:00:36 +04:00
if ( ret < 0 )
goto out ;
2012-06-28 20:03:02 +04:00
ULIST_ITER_INIT ( & uiter ) ;
2013-05-06 15:03:27 +04:00
while ( ( unode = ulist_next ( fs_info - > qgroup_ulist , & uiter ) ) ) {
2012-06-28 20:03:02 +04:00
struct btrfs_qgroup * qg ;
struct btrfs_qgroup_list * glist ;
2016-10-26 17:23:50 +03:00
qg = unode_aux_to_qgroup ( unode ) ;
2012-06-28 20:03:02 +04:00
2017-12-12 10:34:27 +03:00
/* The same @num_bytes is released from every visited qgroup. */
qgroup_rsv_release ( fs_info , qg , num_bytes , type ) ;
2012-06-28 20:03:02 +04:00
/*
 * Queue the groups this qgroup belongs to. If ulist_add() fails the
 * walk stops here: qgroups already visited have been released, the
 * remaining ones are left untouched.
 */
list_for_each_entry ( glist , & qg - > groups , next_group ) {
2013-05-06 15:03:27 +04:00
ret = ulist_add ( fs_info - > qgroup_ulist ,
glist - > group - > qgroupid ,
2018-03-27 20:04:50 +03:00
qgroup_to_aux ( glist - > group ) , GFP_ATOMIC ) ;
2013-04-17 18:00:36 +04:00
if ( ret < 0 )
goto out ;
2012-06-28 20:03:02 +04:00
}
}
out :
spin_unlock ( & fs_info - > qgroup_lock ) ;
}
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyond A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In the above case, tree block A and tree block A + 16K get accounted twice,
while qgroup rescan should stop once it has already reached the last leaf,
rather than continuing to use its qgroup_rescan_progress.
Such a case can happen by just looping btrfs/017, and with some
probability it hits this double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, rather than relying on the next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 04:38:13 +03:00
/*
* Check if the leaf is the last leaf . Which means all node pointers
* are at their last position .
*/
static bool is_last_leaf ( struct btrfs_path * path )
{
int i ;
for ( i = 1 ; i < BTRFS_MAX_LEVEL & & path - > nodes [ i ] ; i + + ) {
if ( path - > slots [ i ] ! = btrfs_header_nritems ( path - > nodes [ i ] ) - 1 )
return false ;
}
return true ;
}
2013-04-25 20:04:51 +04:00
/*
* returns < 0 on error , 0 when more leafs are to be scanned .
2015-02-27 11:24:24 +03:00
* returns 1 when done .
2013-04-25 20:04:51 +04:00
*/
2018-07-18 09:45:42 +03:00
/*
 * Rescan one leaf of the extent tree for qgroup accounting.
 *
 * @trans: transaction handle, also used to reach fs_info
 * @path:  caller supplied path used for the extent tree search
 *
 * Under qgroup_rescan_lock the extent tree is searched starting at
 * fs_info->qgroup_rescan_progress. If the search returns non-zero, the
 * progress objectid is set to (u64)-1 and that value is returned as is.
 * Otherwise the progress is advanced past the last key of the leaf, the leaf
 * is cloned, the path and the lock are released, and every EXTENT_ITEM and
 * METADATA_ITEM found in the clone from the search slot onwards is accounted.
 *
 * When the scanned leaf was the last leaf and no error occurred, 1 is
 * returned and the progress objectid is set to (u64)-1.
 */
static int qgroup_rescan_leaf ( struct btrfs_trans_handle * trans ,
struct btrfs_path * path )
2013-04-25 20:04:51 +04:00
{
2018-07-18 09:45:42 +03:00
struct btrfs_fs_info * fs_info = trans - > fs_info ;
2013-04-25 20:04:51 +04:00
struct btrfs_key found ;
2015-10-26 04:19:43 +03:00
struct extent_buffer * scratch_leaf = NULL ;
2013-04-25 20:04:51 +04:00
struct ulist * roots = NULL ;
2014-05-14 04:30:47 +04:00
u64 num_bytes ;
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyong A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In above case, tree block A, and tree block A + 16K get accounted twice,
while qgroup rescan should stop when it already reach the last leaf,
other than continue using its qgroup_rescan_progress.
Such case could happen by just looping btrfs/017 and with some
possibility it can hit such double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, other than relying on next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 04:38:13 +03:00
bool done ;
2013-04-25 20:04:51 +04:00
int slot ;
int ret ;
mutex_lock ( & fs_info - > qgroup_rescan_lock ) ;
ret = btrfs_search_slot_for_read ( fs_info - > extent_root ,
& fs_info - > qgroup_rescan_progress ,
path , 1 , 0 ) ;
2016-09-20 17:05:02 +03:00
btrfs_debug ( fs_info ,
" current progress key (%llu %u %llu), search_slot ret %d " ,
fs_info - > qgroup_rescan_progress . objectid ,
fs_info - > qgroup_rescan_progress . type ,
fs_info - > qgroup_rescan_progress . offset , ret ) ;
2013-04-25 20:04:51 +04:00
if ( ret ) {
/*
* The rescan is about to end , we will not be scanning any
* further blocks . We cannot unset the RESCAN flag here , because
* we want to commit the transaction if everything went well .
* To make the live accounting work in this phase , we set our
* scan progress pointer such that every real extent objectid
* will be smaller .
*/
fs_info - > qgroup_rescan_progress . objectid = ( u64 ) - 1 ;
btrfs_release_path ( path ) ;
mutex_unlock ( & fs_info - > qgroup_rescan_lock ) ;
return ret ;
}
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyong A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In above case, tree block A, and tree block A + 16K get accounted twice,
while qgroup rescan should stop when it already reach the last leaf,
other than continue using its qgroup_rescan_progress.
Such case could happen by just looping btrfs/017 and with some
possibility it can hit such double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, other than relying on next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 04:38:13 +03:00
/* Must be sampled before the path is released further down. */
done = is_last_leaf ( path ) ;
2013-04-25 20:04:51 +04:00
/*
 * Read the last key of the leaf and move the saved progress just past
 * its objectid, so the next call starts searching after this leaf.
 */
btrfs_item_key_to_cpu ( path - > nodes [ 0 ] , & found ,
btrfs_header_nritems ( path - > nodes [ 0 ] ) - 1 ) ;
fs_info - > qgroup_rescan_progress . objectid = found . objectid + 1 ;
2015-10-26 04:19:43 +03:00
/*
 * Work on a private clone of the leaf so the path and the rescan lock
 * can be dropped before the accounting loop runs.
 */
scratch_leaf = btrfs_clone_extent_buffer ( path - > nodes [ 0 ] ) ;
if ( ! scratch_leaf ) {
ret = - ENOMEM ;
mutex_unlock ( & fs_info - > qgroup_rescan_lock ) ;
goto out ;
}
2013-04-25 20:04:51 +04:00
slot = path - > slots [ 0 ] ;
btrfs_release_path ( path ) ;
mutex_unlock ( & fs_info - > qgroup_rescan_lock ) ;
/* Walk the cloned leaf from the slot the search landed on. */
for ( ; slot < btrfs_header_nritems ( scratch_leaf ) ; + + slot ) {
btrfs_item_key_to_cpu ( scratch_leaf , & found , slot ) ;
2014-01-24 01:45:10 +04:00
if ( found . type ! = BTRFS_EXTENT_ITEM_KEY & &
found . type ! = BTRFS_METADATA_ITEM_KEY )
2013-04-25 20:04:51 +04:00
continue ;
2014-01-24 01:45:10 +04:00
/*
 * For METADATA_ITEM keys the extent size is taken to be nodesize;
 * for EXTENT_ITEM keys it is read from the key offset.
 */
if ( found . type = = BTRFS_METADATA_ITEM_KEY )
2016-06-15 16:22:56 +03:00
num_bytes = fs_info - > nodesize ;
2014-01-24 01:45:10 +04:00
else
num_bytes = found . offset ;
2014-05-14 04:30:47 +04:00
ret = btrfs_find_all_roots ( NULL , fs_info , found . objectid , 0 ,
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:45 +03:00
& roots , false ) ;
2013-04-25 20:04:51 +04:00
if ( ret < 0 )
goto out ;
2015-04-13 06:02:16 +03:00
/* For rescan, just pass old_roots as NULL */
2018-07-18 09:45:39 +03:00
ret = btrfs_qgroup_account_extent ( trans , found . objectid ,
num_bytes , NULL , roots ) ;
2015-04-13 06:02:16 +03:00
if ( ret < 0 )
2014-05-14 04:30:47 +04:00
goto out ;
2013-04-25 20:04:51 +04:00
}
out :
2018-08-15 18:26:56 +03:00
if ( scratch_leaf )
2015-10-26 04:19:43 +03:00
free_extent_buffer ( scratch_leaf ) ;
2013-04-25 20:04:51 +04:00
2018-06-27 13:19:55 +03:00
if ( done & & ! ret ) {
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyong A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In above case, tree block A, and tree block A + 16K get accounted twice,
while qgroup rescan should stop when it already reach the last leaf,
other than continue using its qgroup_rescan_progress.
Such case could happen by just looping btrfs/017 and with some
possibility it can hit such double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, other than relying on next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 04:38:13 +03:00
/*
 * The last leaf was processed without error: report completion and park
 * the progress objectid at the maximum value.
 */
ret = 1 ;
2018-06-27 13:19:55 +03:00
fs_info - > qgroup_rescan_progress . objectid = ( u64 ) - 1 ;
}
2013-04-25 20:04:51 +04:00
return ret ;
}
2014-02-28 06:46:19 +04:00
static void btrfs_qgroup_rescan_worker ( struct btrfs_work * work )
2013-04-25 20:04:51 +04:00
{
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
struct btrfs_fs_info * fs_info = container_of ( work , struct btrfs_fs_info ,
qgroup_rescan_work ) ;
2013-04-25 20:04:51 +04:00
struct btrfs_path * path ;
struct btrfs_trans_handle * trans = NULL ;
int err = - ENOMEM ;
2015-02-27 11:24:25 +03:00
int ret = 0 ;
2013-04-25 20:04:51 +04:00
path = btrfs_alloc_path ( ) ;
if ( ! path )
goto out ;
2018-05-14 04:38:12 +03:00
/*
* Rescan should only search for commit root , and any later difference
* should be recorded by qgroup
*/
path - > search_commit_root = 1 ;
path - > skip_locking = 1 ;
2013-04-25 20:04:51 +04:00
err = 0 ;
2015-11-05 02:56:16 +03:00
while ( ! err & & ! btrfs_fs_closing ( fs_info ) ) {
2013-04-25 20:04:51 +04:00
trans = btrfs_start_transaction ( fs_info - > fs_root , 0 ) ;
if ( IS_ERR ( trans ) ) {
err = PTR_ERR ( trans ) ;
break ;
}
2016-09-02 22:40:02 +03:00
if ( ! test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) ) {
2013-04-25 20:04:51 +04:00
err = - EINTR ;
} else {
2018-07-18 09:45:42 +03:00
err = qgroup_rescan_leaf ( trans , path ) ;
2013-04-25 20:04:51 +04:00
}
if ( err > 0 )
2016-09-10 04:39:03 +03:00
btrfs_commit_transaction ( trans ) ;
2013-04-25 20:04:51 +04:00
else
2016-09-10 04:39:03 +03:00
btrfs_end_transaction ( trans ) ;
2013-04-25 20:04:51 +04:00
}
out :
btrfs_free_path ( path ) ;
mutex_lock ( & fs_info - > qgroup_rescan_lock ) ;
2015-11-05 02:56:16 +03:00
if ( ! btrfs_fs_closing ( fs_info ) )
fs_info - > qgroup_flags & = ~ BTRFS_QGROUP_STATUS_FLAG_RESCAN ;
2013-04-25 20:04:51 +04:00
2015-02-27 11:24:24 +03:00
if ( err > 0 & &
2013-04-25 20:04:51 +04:00
fs_info - > qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ) {
fs_info - > qgroup_flags & = ~ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
} else if ( err < 0 ) {
fs_info - > qgroup_flags | = BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
}
mutex_unlock ( & fs_info - > qgroup_rescan_lock ) ;
2015-02-27 11:24:25 +03:00
/*
2016-05-20 04:18:45 +03:00
* only update status , since the previous part has already updated the
2015-02-27 11:24:25 +03:00
* qgroup info .
*/
trans = btrfs_start_transaction ( fs_info - > quota_root , 1 ) ;
if ( IS_ERR ( trans ) ) {
err = PTR_ERR ( trans ) ;
btrfs_err ( fs_info ,
2017-07-13 16:32:18 +03:00
" fail to start transaction for status update: %d " ,
2015-02-27 11:24:25 +03:00
err ) ;
goto done ;
}
2018-07-18 09:45:29 +03:00
ret = update_qgroup_status_item ( trans ) ;
2015-02-27 11:24:25 +03:00
if ( ret < 0 ) {
err = ret ;
2016-09-20 17:05:02 +03:00
btrfs_err ( fs_info , " fail to update qgroup status: %d " , err ) ;
2015-02-27 11:24:25 +03:00
}
2016-09-10 04:39:03 +03:00
btrfs_end_transaction ( trans ) ;
2015-02-27 11:24:25 +03:00
2015-11-05 02:56:16 +03:00
if ( btrfs_fs_closing ( fs_info ) ) {
btrfs_info ( fs_info , " qgroup scan paused " ) ;
} else if ( err > = 0 ) {
2013-12-20 20:37:06 +04:00
btrfs_info ( fs_info , " qgroup scan completed%s " ,
2015-02-27 11:24:24 +03:00
err > 0 ? " (inconsistency flag cleared) " : " " ) ;
2013-04-25 20:04:51 +04:00
} else {
2013-12-20 20:37:06 +04:00
btrfs_err ( fs_info , " qgroup scan failed with %d " , err ) ;
2013-04-25 20:04:51 +04:00
}
2013-05-06 23:14:17 +04:00
2015-02-27 11:24:25 +03:00
done :
2016-08-15 19:10:33 +03:00
mutex_lock ( & fs_info - > qgroup_rescan_lock ) ;
fs_info - > qgroup_rescan_running = false ;
mutex_unlock ( & fs_info - > qgroup_rescan_lock ) ;
2013-05-06 23:14:17 +04:00
complete_all ( & fs_info - > qgroup_rescan_completion ) ;
2013-04-25 20:04:51 +04:00
}
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
/*
 * Prepare fs_info for a qgroup rescan.  The worker is NOT queued here.
 *
 * Setting up a rescan is split into reusable stages: (1) set the progress,
 * (2) commit the transaction, (3) set the counters, (4) start the worker.
 * This function does stage (1) and initialises the completion and the work
 * item; the rescan context lives in fs_info, so nothing is allocated.
 *
 * @fs_info:           filesystem whose rescan state is initialised
 * @progress_objectid: objectid stored in qgroup_rescan_progress
 * @init_flags:        non-zero to start a fresh rescan: fails if the RESCAN
 *                     flag is already set or quota is not on, otherwise sets
 *                     the RESCAN flag.  Zero when resuming at mount time:
 *                     both the RESCAN and ON flags must already be set.
 *
 * Return: 0 on success, -EINPROGRESS if a rescan is already flagged,
 * -EINVAL if a required flag is missing.
 */
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
		   int init_flags)
{
	int ret = 0;

	if (!init_flags) {
		/* we're resuming qgroup rescan at mount time */
		if (!(fs_info->qgroup_flags &
		      BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
			btrfs_warn(fs_info,
			"qgroup rescan init failed, qgroup rescan is not queued");
			ret = -EINVAL;
		} else if (!(fs_info->qgroup_flags &
			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
			btrfs_warn(fs_info,
			"qgroup rescan init failed, qgroup is not enabled");
			ret = -EINVAL;
		}

		if (ret)
			return ret;
	}

	mutex_lock(&fs_info->qgroup_rescan_lock);
	spin_lock(&fs_info->qgroup_lock);

	if (init_flags) {
		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
			btrfs_warn(fs_info,
				   "qgroup rescan is already in progress");
			ret = -EINPROGRESS;
		} else if (!(fs_info->qgroup_flags &
			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
			btrfs_warn(fs_info,
			"qgroup rescan init failed, qgroup is not enabled");
			ret = -EINVAL;
		}

		if (ret) {
			spin_unlock(&fs_info->qgroup_lock);
			mutex_unlock(&fs_info->qgroup_rescan_lock);
			return ret;
		}
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
	}

	memset(&fs_info->qgroup_rescan_progress, 0,
	       sizeof(fs_info->qgroup_rescan_progress));
	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
	init_completion(&fs_info->qgroup_rescan_completion);
	fs_info->qgroup_rescan_running = true;

	spin_unlock(&fs_info->qgroup_lock);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	memset(&fs_info->qgroup_rescan_work, 0,
	       sizeof(fs_info->qgroup_rescan_work));
	/*
	 * Each btrfs workqueue has its own helper (a wrapper of
	 * normal_work_helper) so that different queues do not share one
	 * work->func.
	 */
	btrfs_init_work(&fs_info->qgroup_rescan_work,
			btrfs_qgroup_rescan_helper,
			btrfs_qgroup_rescan_worker, NULL, NULL);
	return 0;
}
static void
qgroup_rescan_zero_tracking ( struct btrfs_fs_info * fs_info )
{
struct rb_node * n ;
struct btrfs_qgroup * qgroup ;
spin_lock ( & fs_info - > qgroup_lock ) ;
2013-04-25 20:04:51 +04:00
/* clear all current qgroup tracking information */
for ( n = rb_first ( & fs_info - > qgroup_tree ) ; n ; n = rb_next ( n ) ) {
qgroup = rb_entry ( n , struct btrfs_qgroup , node ) ;
qgroup - > rfer = 0 ;
qgroup - > rfer_cmpr = 0 ;
qgroup - > excl = 0 ;
qgroup - > excl_cmpr = 0 ;
2018-08-10 05:20:26 +03:00
qgroup_dirty ( fs_info , qgroup ) ;
2013-04-25 20:04:51 +04:00
}
spin_unlock ( & fs_info - > qgroup_lock ) ;
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
}
2013-04-25 20:04:51 +04:00
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
int
btrfs_qgroup_rescan ( struct btrfs_fs_info * fs_info )
{
int ret = 0 ;
struct btrfs_trans_handle * trans ;
ret = qgroup_rescan_init ( fs_info , 0 , 1 ) ;
if ( ret )
return ret ;
/*
* We have set the rescan_progress to 0 , which means no more
* delayed refs will be accounted by btrfs_qgroup_account_ref .
* However , btrfs_qgroup_account_ref may be right after its call
* to btrfs_find_all_roots , in which case it would still do the
* accounting .
* To solve this , we ' re committing the transaction , which will
* ensure we run all delayed refs and only after that , we are
* going to clear all tracking information for a clean start .
*/
trans = btrfs_join_transaction ( fs_info - > fs_root ) ;
if ( IS_ERR ( trans ) ) {
fs_info - > qgroup_flags & = ~ BTRFS_QGROUP_STATUS_FLAG_RESCAN ;
return PTR_ERR ( trans ) ;
}
2016-09-10 04:39:03 +03:00
ret = btrfs_commit_transaction ( trans ) ;
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
if ( ret ) {
fs_info - > qgroup_flags & = ~ BTRFS_QGROUP_STATUS_FLAG_RESCAN ;
return ret ;
}
qgroup_rescan_zero_tracking ( fs_info ) ;
2014-02-28 06:46:16 +04:00
btrfs_queue_work ( fs_info - > qgroup_rescan_workers ,
& fs_info - > qgroup_rescan_work ) ;
2013-04-25 20:04:51 +04:00
return 0 ;
}
2013-05-06 23:14:17 +04:00
2016-08-09 05:08:06 +03:00
/*
 * Wait until a running qgroup rescan has signalled its completion.
 *
 * fs_info->qgroup_rescan_running is sampled while holding both
 * qgroup_rescan_lock and qgroup_lock.  If it is not set there is nothing to
 * wait for and the function returns right away.
 *
 * @interruptible selects wait_for_completion_interruptible() instead of
 * wait_for_completion().
 *
 * Return 0 if no rescan was running or after the uninterruptible wait,
 * otherwise the return value of wait_for_completion_interruptible().
 */
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
				     bool interruptible)
{
	int rescan_active;

	mutex_lock(&fs_info->qgroup_rescan_lock);
	spin_lock(&fs_info->qgroup_lock);
	rescan_active = fs_info->qgroup_rescan_running;
	spin_unlock(&fs_info->qgroup_lock);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	if (!rescan_active)
		return 0;

	if (!interruptible) {
		wait_for_completion(&fs_info->qgroup_rescan_completion);
		return 0;
	}

	return wait_for_completion_interruptible(
			&fs_info->qgroup_rescan_completion);
}
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restructures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronization problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
/*
* this is only called from open_ctree where we ' re still single threaded , thus
* locking is omitted here .
*/
void
btrfs_qgroup_rescan_resume ( struct btrfs_fs_info * fs_info )
{
if ( fs_info - > qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN )
2014-02-28 06:46:16 +04:00
btrfs_queue_work ( fs_info - > qgroup_rescan_workers ,
& fs_info - > qgroup_rescan_work ) ;
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 19:47:24 +04:00
}
2015-10-12 11:05:40 +03:00
/*
* Reserve qgroup space for range [ start , start + len ) .
*
* This function will either reserve space from related qgroups or doing
* nothing if the range is already reserved .
*
* Return 0 for successful reserve
* Return < 0 for error ( including - EQUOT )
*
* NOTE : this function may sleep for memory allocation .
2017-02-27 10:10:38 +03:00
* if btrfs_qgroup_reserve_data ( ) is called multiple times with
* same @ reserved , caller must ensure when error happens it ' s OK
* to free * ALL * reserved space .
2015-10-12 11:05:40 +03:00
*/
2017-02-27 10:10:38 +03:00
int btrfs_qgroup_reserve_data ( struct inode * inode ,
struct extent_changeset * * reserved_ret , u64 start ,
u64 len )
2015-10-12 11:05:40 +03:00
{
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct ulist_node * unode ;
struct ulist_iterator uiter ;
2017-02-27 10:10:38 +03:00
struct extent_changeset * reserved ;
u64 orig_reserved ;
u64 to_reserve ;
2015-10-12 11:05:40 +03:00
int ret ;
2016-09-02 22:40:02 +03:00
if ( ! test_bit ( BTRFS_FS_QUOTA_ENABLED , & root - > fs_info - > flags ) | |
2018-08-06 08:25:24 +03:00
! is_fstree ( root - > root_key . objectid ) | | len = = 0 )
2015-10-12 11:05:40 +03:00
return 0 ;
2017-02-27 10:10:38 +03:00
/* @reserved parameter is mandatory for qgroup */
if ( WARN_ON ( ! reserved_ret ) )
return - EINVAL ;
if ( ! * reserved_ret ) {
* reserved_ret = extent_changeset_alloc ( ) ;
if ( ! * reserved_ret )
return - ENOMEM ;
}
reserved = * reserved_ret ;
/* Record already reserved space */
orig_reserved = reserved - > bytes_changed ;
2015-10-12 11:05:40 +03:00
ret = set_record_extent_bits ( & BTRFS_I ( inode ) - > io_tree , start ,
2017-02-27 10:10:38 +03:00
start + len - 1 , EXTENT_QGROUP_RESERVED , reserved ) ;
/* Newly reserved space */
to_reserve = reserved - > bytes_changed - orig_reserved ;
2015-09-28 11:57:53 +03:00
trace_btrfs_qgroup_reserve_data ( inode , start , len ,
2017-02-27 10:10:38 +03:00
to_reserve , QGROUP_RESERVE ) ;
2015-10-12 11:05:40 +03:00
if ( ret < 0 )
goto cleanup ;
2017-12-12 10:34:25 +03:00
ret = qgroup_reserve ( root , to_reserve , true , BTRFS_QGROUP_RSV_DATA ) ;
2015-10-12 11:05:40 +03:00
if ( ret < 0 )
goto cleanup ;
return ret ;
cleanup :
2017-02-27 10:10:38 +03:00
/* cleanup *ALL* already reserved ranges */
2015-10-12 11:05:40 +03:00
ULIST_ITER_INIT ( & uiter ) ;
2017-02-27 10:10:38 +03:00
while ( ( unode = ulist_next ( & reserved - > range_changed , & uiter ) ) )
2015-10-12 11:05:40 +03:00
clear_extent_bit ( & BTRFS_I ( inode ) - > io_tree , unode - > val ,
2017-10-31 18:37:52 +03:00
unode - > aux , EXTENT_QGROUP_RESERVED , 0 , 0 , NULL ) ;
2017-02-27 10:10:38 +03:00
extent_changeset_release ( reserved ) ;
2015-10-12 11:05:40 +03:00
return ret ;
}
2015-10-12 11:28:06 +03:00
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserve_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analysis)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 10:10:39 +03:00
/*
 * Free ranges specified by @reserved, normally in error path.
 *
 * Every range recorded in @reserved->range_changed is clipped against the
 * sector-aligned window computed from @start and @len.  For each clipped
 * range EXTENT_QGROUP_RESERVED is cleared from the inode's io_tree, and the
 * bytes that were really cleared are summed up and handed back to the root's
 * qgroup data reservation.
 *
 * Return the number of bytes freed (converted to int), or <0 if clearing
 * the extent bits failed.
 */
static int qgroup_free_reserved_data(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	struct extent_changeset changeset;
	/*
	 * Accumulate in u64: changeset.bytes_changed is summed over many
	 * ranges and the total is passed to btrfs_qgroup_free_refroot(); an
	 * int accumulator could overflow and be sign-extended into a bogus
	 * amount.
	 */
	u64 freed = 0;
	int ret;

	extent_changeset_init(&changeset);
	/*
	 * NOTE(review): @len is overwritten with the rounded-up *end* offset
	 * rather than a length, so the "start + len" bound used below can
	 * reach past the requested window.  Left untouched here; confirm
	 * whether that is intended.
	 */
	len = round_up(start + len, root->fs_info->sectorsize);
	start = round_down(start, root->fs_info->sectorsize);

	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
		u64 range_start = unode->val;
		/* unode->aux is the inclusive end */
		u64 range_len = unode->aux - range_start + 1;
		u64 free_start;
		u64 free_len;

		/* Start each iteration with an empty scratch changeset */
		extent_changeset_release(&changeset);

		/* Only free range in range [start, start + len) */
		if (range_start >= start + len ||
		    range_start + range_len <= start)
			continue;
		free_start = max(range_start, start);
		free_len = min(start + len, range_start + range_len) -
			   free_start;
		/*
		 * TODO: To also modify reserved->ranges_reserved to reflect
		 * the modification.
		 *
		 * However as long as we free qgroup reserved according to
		 * EXTENT_QGROUP_RESERVED, we won't double free.
		 * So not need to rush.
		 *
		 * The bit is set on io_tree by btrfs_qgroup_reserve_data(),
		 * so it has to be cleared from io_tree as well; any other
		 * tree would never contain it and nothing would be freed.
		 */
		ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
				free_start, free_start + free_len - 1,
				EXTENT_QGROUP_RESERVED, &changeset);
		if (ret < 0)
			goto out;
		freed += changeset.bytes_changed;
	}
	btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
				  BTRFS_QGROUP_RSV_DATA);
	/* The int return type is kept for callers; large totals truncate */
	ret = freed;
out:
	extent_changeset_release(&changeset);
	return ret;
}
/*
 * Common worker for btrfs_qgroup_free_data() and btrfs_qgroup_release_data().
 *
 * Clears EXTENT_QGROUP_RESERVED for [start, start + len) from the inode's
 * io_tree.  When @free is non-zero the cleared bytes are also returned to the
 * root's qgroup data reservation; when it is zero only the io_tree is
 * touched.
 *
 * If @free is set and @reserved is given, the work is delegated to
 * qgroup_free_reserved_data().  Passing @reserved without @free triggers a
 * warning.
 *
 * Return 0 if quota is disabled, the number of bytes cleared on success, or
 * <0 on error.
 */
static int __btrfs_qgroup_release_data(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len,
			int free)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_changeset cleared;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
		return 0;

	/* In release case, we shouldn't have @reserved */
	WARN_ON(!free && reserved);
	if (free && reserved)
		return qgroup_free_reserved_data(inode, reserved, start, len);

	extent_changeset_init(&cleared);
	ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
			start + len - 1, EXTENT_QGROUP_RESERVED, &cleared);
	if (ret >= 0) {
		trace_btrfs_qgroup_release_data(inode, start, len,
				cleared.bytes_changed,
				free ? QGROUP_FREE : QGROUP_RELEASE);
		if (free)
			btrfs_qgroup_free_refroot(root->fs_info,
					root->root_key.objectid,
					cleared.bytes_changed,
					BTRFS_QGROUP_RSV_DATA);
		ret = cleared.bytes_changed;
	}

	extent_changeset_release(&cleared);
	return ret;
}
/*
* Free a reserved space range from io_tree and related qgroups
*
* Should be called when a range of pages get invalidated before reaching disk .
* Or for error cleanup case .
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 10:10:39 +03:00
* if @ reserved is given , only reserved range in [ @ start , @ start + @ len ) will
* be freed .
2015-10-12 11:28:06 +03:00
*
* For data written to disk , use btrfs_qgroup_release_data ( ) .
*
* NOTE : This function may sleep for memory allocation .
*/
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 10:10:39 +03:00
int btrfs_qgroup_free_data ( struct inode * inode ,
struct extent_changeset * reserved , u64 start , u64 len )
2015-10-12 11:28:06 +03:00
{
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 10:10:39 +03:00
return __btrfs_qgroup_release_data ( inode , reserved , start , len , 1 ) ;
2015-10-12 11:28:06 +03:00
}
/*
 * Release a reserved space range from io_tree only.
 *
 * Should be called when a range of pages gets written to disk and the
 * corresponding FILE_EXTENT is inserted into the corresponding root.
 *
 * Since the new qgroup accounting framework only updates qgroup numbers at
 * commit_transaction() time, the reserved space must not be freed from the
 * related qgroups here.
 *
 * But the range still has to be released from io_tree, to allow further
 * writes to be COWed.
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
{
	const int do_free = 0;

	return __btrfs_qgroup_release_data(inode, NULL, start, len, do_free);
}
2015-09-08 12:08:38 +03:00
2017-12-12 10:34:34 +03:00
static void add_root_meta_rsv ( struct btrfs_root * root , int num_bytes ,
enum btrfs_qgroup_rsv_type type )
{
if ( type ! = BTRFS_QGROUP_RSV_META_PREALLOC & &
type ! = BTRFS_QGROUP_RSV_META_PERTRANS )
return ;
if ( num_bytes = = 0 )
return ;
spin_lock ( & root - > qgroup_meta_rsv_lock ) ;
if ( type = = BTRFS_QGROUP_RSV_META_PREALLOC )
root - > qgroup_meta_rsv_prealloc + = num_bytes ;
else
root - > qgroup_meta_rsv_pertrans + = num_bytes ;
spin_unlock ( & root - > qgroup_meta_rsv_lock ) ;
}
static int sub_root_meta_rsv ( struct btrfs_root * root , int num_bytes ,
enum btrfs_qgroup_rsv_type type )
{
if ( type ! = BTRFS_QGROUP_RSV_META_PREALLOC & &
type ! = BTRFS_QGROUP_RSV_META_PERTRANS )
return 0 ;
if ( num_bytes = = 0 )
return 0 ;
spin_lock ( & root - > qgroup_meta_rsv_lock ) ;
if ( type = = BTRFS_QGROUP_RSV_META_PREALLOC ) {
num_bytes = min_t ( u64 , root - > qgroup_meta_rsv_prealloc ,
num_bytes ) ;
root - > qgroup_meta_rsv_prealloc - = num_bytes ;
} else {
num_bytes = min_t ( u64 , root - > qgroup_meta_rsv_pertrans ,
num_bytes ) ;
root - > qgroup_meta_rsv_pertrans - = num_bytes ;
}
spin_unlock ( & root - > qgroup_meta_rsv_lock ) ;
return num_bytes ;
}
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reserve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 10:34:29 +03:00
/*
 * Reserve @num_bytes of qgroup metadata space of the given @type for @root.
 *
 * Nothing is done (and 0 is returned) when quota is disabled, the root is not
 * a fs tree, or @num_bytes is 0.  @num_bytes must be a multiple of the
 * filesystem nodesize, otherwise BUG_ON() fires.
 *
 * @enforce is passed through to qgroup_reserve().
 *
 * Return the result of qgroup_reserve(); on a negative result nothing is
 * recorded in the root.
 */
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;
	if (!is_fstree(root->root_key.objectid))
		return 0;
	if (!num_bytes)
		return 0;

	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_qgroup_meta_reserve(root, type, (s64)num_bytes);

	ret = qgroup_reserve(root, num_bytes, enforce, type);
	if (ret >= 0) {
		/*
		 * Record what we have reserved into root.
		 *
		 * To avoid quota disabled->enabled underflow.
		 * In that case, we may try to free space we haven't reserved
		 * (since quota was disabled), so record what we reserved into
		 * root.  And ensure later release won't underflow this number.
		 */
		add_root_meta_rsv(root, num_bytes, type);
	}
	return ret;
}
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reserve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 10:34:29 +03:00
void btrfs_qgroup_free_meta_all_pertrans ( struct btrfs_root * root )
2015-09-08 12:08:38 +03:00
{
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * fs_info = root - > fs_info ;
2015-09-08 12:08:38 +03:00
2016-06-23 01:54:23 +03:00
if ( ! test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) | |
2018-08-06 08:25:24 +03:00
! is_fstree ( root - > root_key . objectid ) )
2015-09-08 12:08:38 +03:00
return ;
2017-12-12 10:34:30 +03:00
/* TODO: Update trace point to handle such free */
2017-12-12 10:34:35 +03:00
trace_qgroup_meta_free_all_pertrans ( root ) ;
2017-12-12 10:34:30 +03:00
/* Special value -1 means to free all reserved space */
2018-08-06 08:25:24 +03:00
btrfs_qgroup_free_refroot ( fs_info , root - > root_key . objectid , ( u64 ) - 1 ,
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reseve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 10:34:29 +03:00
BTRFS_QGROUP_RSV_META_PERTRANS ) ;
2015-09-08 12:08:38 +03:00
}
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reserve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 10:34:29 +03:00
void __btrfs_qgroup_free_meta ( struct btrfs_root * root , int num_bytes ,
enum btrfs_qgroup_rsv_type type )
2015-09-08 12:08:38 +03:00
{
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * fs_info = root - > fs_info ;
if ( ! test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) | |
2018-08-06 08:25:24 +03:00
! is_fstree ( root - > root_key . objectid ) )
2015-09-08 12:08:38 +03:00
return ;
2017-12-12 10:34:34 +03:00
/*
* reservation for META_PREALLOC can happen before quota is enabled ,
* which can lead to underflow .
* Here ensure we will only free what we really have reserved .
*/
num_bytes = sub_root_meta_rsv ( root , num_bytes , type ) ;
2016-06-23 01:54:23 +03:00
BUG_ON ( num_bytes ! = round_down ( num_bytes , fs_info - > nodesize ) ) ;
2017-12-12 10:34:35 +03:00
trace_qgroup_meta_reserve ( root , type , - ( s64 ) num_bytes ) ;
2018-08-06 08:25:24 +03:00
btrfs_qgroup_free_refroot ( fs_info , root - > root_key . objectid ,
num_bytes , type ) ;
2015-09-08 12:08:38 +03:00
}
2015-10-13 04:53:10 +03:00
2017-12-12 10:34:31 +03:00
/*
 * Move @num_bytes of reservation from META_PREALLOC to META_PERTRANS for the
 * qgroup of @ref_root and for every qgroup reachable through the ->groups
 * lists, starting from that qgroup.
 *
 * Silently returns when @num_bytes is 0, when there is no quota root, when no
 * qgroup is found for @ref_root, or when growing the work list fails.
 */
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
				int num_bytes)
{
	struct btrfs_qgroup *start;
	struct ulist_iterator iter;
	struct ulist_node *cur;
	int err;

	if (num_bytes == 0 || !fs_info->quota_root)
		return;

	spin_lock(&fs_info->qgroup_lock);

	start = find_qgroup_rb(fs_info, ref_root);
	if (!start)
		goto unlock;

	/* Seed the work list with the qgroup of @ref_root itself */
	ulist_reinit(fs_info->qgroup_ulist);
	/* The spinlock is held here, hence GFP_ATOMIC */
	err = ulist_add(fs_info->qgroup_ulist, start->qgroupid,
			qgroup_to_aux(start), GFP_ATOMIC);
	if (err < 0)
		goto unlock;

	ULIST_ITER_INIT(&iter);
	for (cur = ulist_next(fs_info->qgroup_ulist, &iter); cur;
	     cur = ulist_next(fs_info->qgroup_ulist, &iter)) {
		struct btrfs_qgroup *qg = unode_aux_to_qgroup(cur);
		struct btrfs_qgroup_list *link;

		/* The conversion itself: same amount, different rsv type */
		qgroup_rsv_release(fs_info, qg, num_bytes,
				   BTRFS_QGROUP_RSV_META_PREALLOC);
		qgroup_rsv_add(fs_info, qg, num_bytes,
			       BTRFS_QGROUP_RSV_META_PERTRANS);

		/* Queue every group listed in qg->groups for the same update */
		list_for_each_entry(link, &qg->groups, next_group) {
			err = ulist_add(fs_info->qgroup_ulist,
					link->group->qgroupid,
					qgroup_to_aux(link->group), GFP_ATOMIC);
			if (err < 0)
				goto unlock;
		}
	}

unlock:
	spin_unlock(&fs_info->qgroup_lock);
}
void btrfs_qgroup_convert_reserved_meta ( struct btrfs_root * root , int num_bytes )
{
struct btrfs_fs_info * fs_info = root - > fs_info ;
if ( ! test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) | |
2018-08-06 08:25:24 +03:00
! is_fstree ( root - > root_key . objectid ) )
2017-12-12 10:34:31 +03:00
return ;
2017-12-12 10:34:34 +03:00
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv ( root , num_bytes ,
BTRFS_QGROUP_RSV_META_PREALLOC ) ;
2017-12-12 10:34:35 +03:00
trace_qgroup_meta_convert ( root , num_bytes ) ;
2018-08-06 08:25:24 +03:00
qgroup_convert_meta ( fs_info , root - > root_key . objectid , num_bytes ) ;
2017-12-12 10:34:31 +03:00
}
2015-10-13 04:53:10 +03:00
/*
2016-05-20 04:18:45 +03:00
* Check qgroup reserved space leaking , normally at destroy inode
2015-10-13 04:53:10 +03:00
* time
*/
void btrfs_qgroup_check_reserved_leak ( struct inode * inode )
{
struct extent_changeset changeset ;
struct ulist_node * unode ;
struct ulist_iterator iter ;
int ret ;
2017-02-27 10:10:38 +03:00
extent_changeset_init ( & changeset ) ;
2015-10-13 04:53:10 +03:00
ret = clear_record_extent_bits ( & BTRFS_I ( inode ) - > io_tree , 0 , ( u64 ) - 1 ,
2016-04-27 00:54:39 +03:00
EXTENT_QGROUP_RESERVED , & changeset ) ;
2015-10-13 04:53:10 +03:00
WARN_ON ( ret < 0 ) ;
if ( WARN_ON ( changeset . bytes_changed ) ) {
ULIST_ITER_INIT ( & iter ) ;
2017-02-13 15:42:29 +03:00
while ( ( unode = ulist_next ( & changeset . range_changed , & iter ) ) ) {
2015-10-13 04:53:10 +03:00
btrfs_warn ( BTRFS_I ( inode ) - > root - > fs_info ,
" leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu " ,
inode - > i_ino , unode - > val , unode - > aux ) ;
}
2017-02-13 16:24:35 +03:00
btrfs_qgroup_free_refroot ( BTRFS_I ( inode ) - > root - > fs_info ,
2018-08-06 08:25:24 +03:00
BTRFS_I ( inode ) - > root - > root_key . objectid ,
2017-12-12 10:34:23 +03:00
changeset . bytes_changed , BTRFS_QGROUP_RSV_DATA ) ;
2017-02-13 16:24:35 +03:00
2015-10-13 04:53:10 +03:00
}
2017-02-27 10:10:38 +03:00
extent_changeset_release ( & changeset ) ;
2015-10-13 04:53:10 +03:00
}
btrfs: qgroup: Introduce per-root swapped blocks infrastructure
To allow delayed subtree swap rescan, btrfs needs to record per-root
information about which tree blocks get swapped. This patch introduces
the required infrastructure.
The designed workflow will be:
1) Record the subtree root block that gets swapped.
During subtree swap:
O = Old tree blocks
N = New tree blocks
reloc tree subvolume tree X
Root Root
/ \ / \
NA OB OA OB
/ | | \ / | | \
NC ND OE OF OC OD OE OF
In this case, NA and OA are going to be swapped, record (NA, OA) into
subvolume tree X.
2) After subtree swap.
reloc tree subvolume tree X
Root Root
/ \ / \
OA OB NA OB
/ | | \ / | | \
OC OD OE OF NC ND OE OF
3a) COW happens for OB
If we are going to COW tree block OB, we check OB's bytenr against
tree X's swapped_blocks structure.
If it doesn't fit any, nothing will happen.
3b) COW happens for NA
Check NA's bytenr against tree X's swapped_blocks, and get a hit.
Then we do subtree scan on both subtrees OA and NA.
Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
Then no matter what we do to subvolume tree X, qgroup numbers will
still be correct.
Then NA's record gets removed from X's swapped_blocks.
4) Transaction commit
Any record in X's swapped_blocks gets removed, since there is no
modification to swapped subtrees, no need to trigger heavy qgroup
subtree rescan for them.
This will introduce 128 bytes overhead for each btrfs_root even if qgroup
is not enabled. This is to reduce memory allocations and potential
failures.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 10:15:16 +03:00
void btrfs_qgroup_init_swapped_blocks (
struct btrfs_qgroup_swapped_blocks * swapped_blocks )
{
int i ;
spin_lock_init ( & swapped_blocks - > lock ) ;
for ( i = 0 ; i < BTRFS_MAX_LEVEL ; i + + )
swapped_blocks - > blocks [ i ] = RB_ROOT ;
swapped_blocks - > swapped = false ;
}
/*
* Delete all swapped blocks record of @ root .
* Every record here means we skipped a full subtree scan for qgroup .
*
* Gets called when committing one transaction .
*/
void btrfs_qgroup_clean_swapped_blocks ( struct btrfs_root * root )
{
struct btrfs_qgroup_swapped_blocks * swapped_blocks ;
int i ;
swapped_blocks = & root - > swapped_blocks ;
spin_lock ( & swapped_blocks - > lock ) ;
if ( ! swapped_blocks - > swapped )
goto out ;
for ( i = 0 ; i < BTRFS_MAX_LEVEL ; i + + ) {
struct rb_root * cur_root = & swapped_blocks - > blocks [ i ] ;
struct btrfs_qgroup_swapped_block * entry ;
struct btrfs_qgroup_swapped_block * next ;
rbtree_postorder_for_each_entry_safe ( entry , next , cur_root ,
node )
kfree ( entry ) ;
swapped_blocks - > blocks [ i ] = RB_ROOT ;
}
swapped_blocks - > swapped = false ;
out :
spin_unlock ( & swapped_blocks - > lock ) ;
}
/*
 * Add subtree roots record into @subvol_root.
 *
 * @trans:		transaction handle (not used by this function)
 * @subvol_root:	tree root of the subvolume tree that gets swapped
 * @bg:			block group under balance, may be NULL
 * @subvol_parent/slot:	pointer to the subtree root in subvolume tree
 * @reloc_parent/slot:	pointer to the subtree root in reloc tree
 *			BOTH POINTERS ARE BEFORE TREE SWAP
 * @last_snapshot:	last snapshot generation of the subvolume tree
 *
 * Return 0 when quota is disabled, when the record got inserted, or when an
 * identical record is already present.
 * Return -EUCLEAN when the subvolume pointer generation is newer than the
 * reloc one, -ENOMEM when the record cannot be allocated, and -EEXIST when a
 * record with the same subvolume bytenr but different contents exists.
 * The latter two also mark the qgroup status as inconsistent.
 */
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
		struct btrfs_root *subvol_root,
		struct btrfs_block_group_cache *bg,
		struct extent_buffer *subvol_parent, int subvol_slot,
		struct extent_buffer *reloc_parent, int reloc_slot,
		u64 last_snapshot)
{
	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct rb_node **link;
	struct rb_node *parent = NULL;
	int level = btrfs_header_level(subvol_parent) - 1;
	u64 subvol_gen;
	u64 reloc_gen;
	int ret = 0;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	subvol_gen = btrfs_node_ptr_generation(subvol_parent, subvol_slot);
	reloc_gen = btrfs_node_ptr_generation(reloc_parent, reloc_slot);
	if (subvol_gen > reloc_gen) {
		btrfs_err_rl(fs_info,
			"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
			__func__, subvol_gen, reloc_gen);
		return -EUCLEAN;
	}

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * @reloc_parent/slot is still before swap, while @block is going to
	 * record the bytenr after swap, so we do the swap here.
	 */
	block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
	block->subvol_generation = reloc_gen;
	block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
	block->reloc_generation = subvol_gen;
	block->last_snapshot = last_snapshot;
	block->level = level;

	/*
	 * If we have bg == NULL, we're called from btrfs_recover_relocation(),
	 * no one else can modify tree blocks thus the qgroup will not change
	 * no matter the value of trace_leaf.
	 */
	block->trace_leaf = bg && (bg->flags & BTRFS_BLOCK_GROUP_DATA);
	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);

	/* Insert @block into @blocks, keyed by subvol_bytenr */
	spin_lock(&blocks->lock);
	link = &blocks->blocks[level].rb_node;
	while (*link) {
		struct btrfs_qgroup_swapped_block *entry;
		bool identical;

		parent = *link;
		entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
				 node);

		if (entry->subvol_bytenr < block->subvol_bytenr) {
			link = &parent->rb_left;
			continue;
		}
		if (entry->subvol_bytenr > block->subvol_bytenr) {
			link = &parent->rb_right;
			continue;
		}

		/* Same subvolume bytenr: the new record is never inserted */
		identical = entry->subvol_generation == block->subvol_generation &&
			    entry->reloc_bytenr == block->reloc_bytenr &&
			    entry->reloc_generation == block->reloc_generation;
		if (!identical) {
			/*
			 * Duplicated but mismatch entry found.
			 * Shouldn't happen.
			 *
			 * Marking qgroup inconsistent should be enough
			 * for end users.
			 */
			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
			ret = -EEXIST;
		}
		kfree(block);
		goto out_unlock;
	}
	rb_link_node(&block->node, parent, link);
	rb_insert_color(&block->node, &blocks->blocks[level]);
	blocks->swapped = true;

out_unlock:
	spin_unlock(&blocks->lock);
out:
	if (ret < 0)
		fs_info->qgroup_flags |=
			BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}
btrfs: qgroup: Use delayed subtree rescan for balance
Before this patch, qgroup code traces the whole subtree of subvolume and
reloc trees unconditionally.
This makes qgroup numbers consistent, but it could cause tons of
unnecessary extent tracing, which causes a lot of overhead.
However for subtree swap of balance, just swap both subtrees because
they contain the same contents and tree structure, so qgroup numbers
won't change.
It's the race window between subtree swap and transaction commit that
could cause qgroup numbers to change.
This patch will delay the qgroup subtree scan until COW happens for the
subtree root.
So if there are no other operations on the fs, balance won't cause extra
qgroup overhead. (best case scenario)
Depending on the workload, most of the subtree scan can still be
avoided.
Only for worst case scenario, it will fall back to old subtree swap
overhead. (scan all swapped subtrees)
[[Benchmark]]
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.
Mkfs parameter:
--nodesize 4K (To bump up tree size)
Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)
Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.
balance parameter:
-m
So the content should be pretty similar to a real world root fs layout.
And after file system population, there is no other activity, so it
should be the best case scenario.
| v4.20-rc1 | w/ patchset | diff
-----------------------------------------------------------------------
relocated extents | 22615 | 22457 | -0.1%
qgroup dirty extents | 163457 | 121606 | -25.6%
time (sys) | 22.884s | 18.842s | -17.6%
time (real) | 27.724s | 22.884s | -17.5%
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 10:15:17 +03:00
/*
* Check if the tree block is a subtree root , and if so do the needed
* delayed subtree trace for qgroup .
*
* This is called during btrfs_cow_block ( ) .
*/
int btrfs_qgroup_trace_subtree_after_cow ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ,
struct extent_buffer * subvol_eb )
{
struct btrfs_fs_info * fs_info = root - > fs_info ;
struct btrfs_qgroup_swapped_blocks * blocks = & root - > swapped_blocks ;
struct btrfs_qgroup_swapped_block * block ;
struct extent_buffer * reloc_eb = NULL ;
struct rb_node * node ;
bool found = false ;
bool swapped = false ;
int level = btrfs_header_level ( subvol_eb ) ;
int ret = 0 ;
int i ;
if ( ! test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) )
return 0 ;
if ( ! is_fstree ( root - > root_key . objectid ) | | ! root - > reloc_root )
return 0 ;
spin_lock ( & blocks - > lock ) ;
if ( ! blocks - > swapped ) {
spin_unlock ( & blocks - > lock ) ;
return 0 ;
}
node = blocks - > blocks [ level ] . rb_node ;
while ( node ) {
block = rb_entry ( node , struct btrfs_qgroup_swapped_block , node ) ;
if ( block - > subvol_bytenr < subvol_eb - > start ) {
node = node - > rb_left ;
} else if ( block - > subvol_bytenr > subvol_eb - > start ) {
node = node - > rb_right ;
} else {
found = true ;
break ;
}
}
if ( ! found ) {
spin_unlock ( & blocks - > lock ) ;
goto out ;
}
/* Found one, remove it from @blocks first and update blocks->swapped */
rb_erase ( & block - > node , & blocks - > blocks [ level ] ) ;
for ( i = 0 ; i < BTRFS_MAX_LEVEL ; i + + ) {
if ( RB_EMPTY_ROOT ( & blocks - > blocks [ i ] ) ) {
swapped = true ;
break ;
}
}
blocks - > swapped = swapped ;
spin_unlock ( & blocks - > lock ) ;
/* Read out reloc subtree root */
reloc_eb = read_tree_block ( fs_info , block - > reloc_bytenr ,
block - > reloc_generation , block - > level ,
& block - > first_key ) ;
if ( IS_ERR ( reloc_eb ) ) {
ret = PTR_ERR ( reloc_eb ) ;
reloc_eb = NULL ;
goto free_out ;
}
if ( ! extent_buffer_uptodate ( reloc_eb ) ) {
ret = - EIO ;
goto free_out ;
}
ret = qgroup_trace_subtree_swap ( trans , reloc_eb , subvol_eb ,
block - > last_snapshot , block - > trace_leaf ) ;
free_out :
kfree ( block ) ;
free_extent_buffer ( reloc_eb ) ;
out :
if ( ret < 0 ) {
btrfs_err_rl ( fs_info ,
" failed to account subtree at bytenr %llu: %d " ,
subvol_eb - > start , ret ) ;
fs_info - > qgroup_flags | = BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT ;
}
return ret ;
}