2018-04-03 19:23:33 +02:00
// SPDX-License-Identifier: GPL-2.0
2007-06-12 09:07:21 -04:00
/*
* Copyright ( C ) 2007 Oracle . All rights reserved .
*/
2014-02-17 09:13:57 +05:30
# include <linux/err.h>
2012-07-25 17:35:53 +02:00
# include <linux/uuid.h>
2007-03-13 16:47:54 -04:00
# include "ctree.h"
2022-10-19 10:50:51 -04:00
# include "fs.h"
# include "messages.h"
2007-06-22 14:16:25 -04:00
# include "transaction.h"
2007-03-13 16:47:54 -04:00
# include "disk-io.h"
# include "print-tree.h"
2019-06-19 15:12:01 -04:00
# include "qgroup.h"
# include "space-info.h"
2022-10-19 10:51:00 -04:00
# include "accessors.h"
2022-10-24 14:47:00 -04:00
# include "root-tree.h"
2022-10-26 15:08:41 -04:00
# include "orphan.h"
2007-03-13 16:47:54 -04:00
2012-07-25 17:35:53 +02:00
/*
* Read a root item from the tree . In case we detect a root item smaller then
* sizeof ( root_item ) , we know it ' s an old version of the root structure and
* initialize all new fields to zero . The same happens if we detect mismatching
* generation numbers as then we know the root was once mounted with an older
* kernel that was not aware of the root item structure change .
*/
2013-08-14 23:27:46 +03:00
static void btrfs_read_root_item ( struct extent_buffer * eb , int slot ,
struct btrfs_root_item * item )
2012-07-25 17:35:53 +02:00
{
2019-02-20 12:32:02 +00:00
u32 len ;
2012-07-25 17:35:53 +02:00
int need_reset = 0 ;
2021-10-21 14:58:35 -04:00
len = btrfs_item_size ( eb , slot ) ;
2012-07-25 17:35:53 +02:00
read_extent_buffer ( eb , item , btrfs_item_ptr_offset ( eb , slot ) ,
2019-02-20 12:32:02 +00:00
min_t ( u32 , len , sizeof ( * item ) ) ) ;
2012-07-25 17:35:53 +02:00
if ( len < sizeof ( * item ) )
need_reset = 1 ;
if ( ! need_reset & & btrfs_root_generation ( item )
! = btrfs_root_generation_v2 ( item ) ) {
if ( btrfs_root_generation_v2 ( item ) ! = 0 ) {
2015-10-08 11:37:06 +02:00
btrfs_warn ( eb - > fs_info ,
2016-09-20 10:05:00 -04:00
" mismatching generation and generation_v2 found in root item. This root was probably mounted with an older kernel. Resetting all new fields. " ) ;
2012-07-25 17:35:53 +02:00
}
need_reset = 1 ;
}
if ( need_reset ) {
2021-05-20 15:30:56 -07:00
/* Clear all members from generation_v2 onwards. */
memset_startat ( item , 0 , generation_v2 ) ;
2020-02-24 17:37:51 +02:00
generate_random_guid ( item - > uuid ) ;
2012-07-25 17:35:53 +02:00
}
}
2008-09-29 15:18:18 -04:00
/*
2013-05-15 07:48:19 +00:00
* btrfs_find_root - lookup the root by the key .
* root : the root of the root tree
* search_key : the key to search
* path : the path we search
* root_item : the root item of the tree we look for
2016-05-19 21:18:45 -04:00
* root_key : the root key of the tree we look for
2013-05-15 07:48:19 +00:00
*
2016-05-19 21:18:45 -04:00
* If - > offset of ' search_key ' is - 1ULL , it means we are not sure the offset
2013-05-15 07:48:19 +00:00
* of the search key , just lookup the root with the highest offset for a
* given objectid .
*
* If we find something return 0 , otherwise > 0 , < 0 on error .
2008-09-29 15:18:18 -04:00
*/
2017-01-17 23:24:37 -08:00
int btrfs_find_root ( struct btrfs_root * root , const struct btrfs_key * search_key ,
2013-05-15 07:48:19 +00:00
struct btrfs_path * path , struct btrfs_root_item * root_item ,
struct btrfs_key * root_key )
2007-03-13 16:47:54 -04:00
{
2007-10-15 16:14:19 -04:00
struct btrfs_key found_key ;
struct extent_buffer * l ;
2007-03-13 16:47:54 -04:00
int ret ;
int slot ;
2013-05-15 07:48:19 +00:00
ret = btrfs_search_slot ( NULL , root , search_key , path , 0 , 0 ) ;
2007-03-13 16:47:54 -04:00
if ( ret < 0 )
2013-05-15 07:48:19 +00:00
return ret ;
2007-10-15 16:14:19 -04:00
2013-05-15 07:48:19 +00:00
if ( search_key - > offset ! = - 1ULL ) { /* the search key is exact */
if ( ret > 0 )
goto out ;
} else {
BUG_ON ( ret = = 0 ) ; /* Logical error */
if ( path - > slots [ 0 ] = = 0 )
goto out ;
path - > slots [ 0 ] - - ;
ret = 0 ;
2009-09-21 16:00:26 -04:00
}
2013-05-15 07:48:19 +00:00
2007-10-15 16:14:19 -04:00
l = path - > nodes [ 0 ] ;
2013-05-15 07:48:19 +00:00
slot = path - > slots [ 0 ] ;
2007-10-15 16:14:19 -04:00
btrfs_item_key_to_cpu ( l , & found_key , slot ) ;
2013-05-15 07:48:19 +00:00
if ( found_key . objectid ! = search_key - > objectid | |
2009-09-21 16:00:26 -04:00
found_key . type ! = BTRFS_ROOT_ITEM_KEY ) {
2007-03-13 16:47:54 -04:00
ret = 1 ;
goto out ;
}
2012-07-25 17:35:53 +02:00
2013-05-15 07:48:19 +00:00
if ( root_item )
btrfs_read_root_item ( l , slot , root_item ) ;
if ( root_key )
memcpy ( root_key , & found_key , sizeof ( found_key ) ) ;
2007-03-13 16:47:54 -04:00
out :
2013-05-15 07:48:19 +00:00
btrfs_release_path ( path ) ;
2007-03-13 16:47:54 -04:00
return ret ;
}
2011-07-14 21:23:06 +00:00
void btrfs_set_root_node ( struct btrfs_root_item * item ,
struct extent_buffer * node )
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
{
btrfs_set_root_bytenr ( item , node - > start ) ;
btrfs_set_root_level ( item , btrfs_header_level ( node ) ) ;
btrfs_set_root_generation ( item , btrfs_header_generation ( node ) ) ;
}
2008-09-29 15:18:18 -04:00
/*
* copy the data in ' item ' into the btree
*/
2007-03-16 16:20:31 -04:00
int btrfs_update_root ( struct btrfs_trans_handle * trans , struct btrfs_root
* root , struct btrfs_key * key , struct btrfs_root_item
* item )
2007-03-13 16:47:54 -04:00
{
2016-06-22 18:54:23 -04:00
struct btrfs_fs_info * fs_info = root - > fs_info ;
2007-04-02 11:20:42 -04:00
struct btrfs_path * path ;
2007-10-15 16:14:19 -04:00
struct extent_buffer * l ;
2007-03-13 16:47:54 -04:00
int ret ;
int slot ;
2007-10-15 16:14:19 -04:00
unsigned long ptr ;
2015-08-24 21:56:28 +00:00
u32 old_len ;
2007-03-13 16:47:54 -04:00
2007-04-02 11:20:42 -04:00
path = btrfs_alloc_path ( ) ;
2011-10-03 23:22:44 -04:00
if ( ! path )
return - ENOMEM ;
2007-04-02 11:20:42 -04:00
ret = btrfs_search_slot ( trans , root , key , path , 0 , 1 ) ;
Btrfs: do not abort transaction at btrfs_update_root() after failure to COW path
Currently when we fail to COW a path at btrfs_update_root() we end up
always aborting the transaction. However all the current callers of
btrfs_update_root() are able to deal with errors returned from it, many do
end up aborting the transaction themselves (directly or not, such as the
transaction commit path), other BUG_ON() or just gracefully cancel whatever
they were doing.
When syncing the fsync log, we call btrfs_update_root() through
tree-log.c:update_log_root(), and if it returns an -ENOSPC error, the log
sync code does not abort the transaction, instead it gracefully handles
the error and returns -EAGAIN to the fsync handler, so that it falls back
to a transaction commit. Any other error different from -ENOSPC, makes the
log sync code abort the transaction.
So remove the transaction abort from btrfs_update_log() when we fail to
COW a path to update the root item, so that if an -ENOSPC failure happens
we avoid aborting the current transaction and have a chance of the fsync
succeeding after falling back to a transaction commit.
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203413
Fixes: 79787eaab46121 ("btrfs: replace many BUG_ONs with proper error handling")
Cc: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-29 13:08:14 +01:00
if ( ret < 0 )
2012-09-18 07:52:32 -06:00
goto out ;
2008-01-03 14:51:00 -05:00
2019-02-26 16:33:56 +08:00
if ( ret > 0 ) {
btrfs_crit ( fs_info ,
" unable to find root key (%llu %u %llu) in tree %llu " ,
key - > objectid , key - > type , key - > offset ,
root - > root_key . objectid ) ;
ret = - EUCLEAN ;
btrfs_abort_transaction ( trans , ret ) ;
goto out ;
2008-01-03 14:51:00 -05:00
}
2007-10-15 16:14:19 -04:00
l = path - > nodes [ 0 ] ;
2007-04-02 11:20:42 -04:00
slot = path - > slots [ 0 ] ;
2007-10-15 16:14:19 -04:00
ptr = btrfs_item_ptr_offset ( l , slot ) ;
2021-10-21 14:58:35 -04:00
old_len = btrfs_item_size ( l , slot ) ;
2012-07-25 17:35:53 +02:00
/*
* If this is the first time we update the root item which originated
* from an older kernel , we need to enlarge the item size to make room
* for the added fields .
*/
if ( old_len < sizeof ( * item ) ) {
btrfs_release_path ( path ) ;
ret = btrfs_search_slot ( trans , root , key , path ,
- 1 , 1 ) ;
2012-09-18 07:52:32 -06:00
if ( ret < 0 ) {
2016-06-10 18:19:25 -04:00
btrfs_abort_transaction ( trans , ret ) ;
2012-09-18 07:52:32 -06:00
goto out ;
}
2012-07-25 17:35:53 +02:00
ret = btrfs_del_item ( trans , root , path ) ;
2012-09-18 07:52:32 -06:00
if ( ret < 0 ) {
2016-06-10 18:19:25 -04:00
btrfs_abort_transaction ( trans , ret ) ;
2012-09-18 07:52:32 -06:00
goto out ;
}
2012-07-25 17:35:53 +02:00
btrfs_release_path ( path ) ;
ret = btrfs_insert_empty_item ( trans , root , path ,
key , sizeof ( * item ) ) ;
2012-09-18 07:52:32 -06:00
if ( ret < 0 ) {
2016-06-10 18:19:25 -04:00
btrfs_abort_transaction ( trans , ret ) ;
2012-09-18 07:52:32 -06:00
goto out ;
}
2012-07-25 17:35:53 +02:00
l = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
ptr = btrfs_item_ptr_offset ( l , slot ) ;
}
/*
* Update generation_v2 so at the next mount we know the new root
* fields are valid .
*/
btrfs_set_root_generation_v2 ( item , btrfs_root_generation ( item ) ) ;
2007-10-15 16:14:19 -04:00
write_extent_buffer ( l , item , ptr , sizeof ( * item ) ) ;
2007-04-02 11:20:42 -04:00
btrfs_mark_buffer_dirty ( path - > nodes [ 0 ] ) ;
2007-03-13 16:47:54 -04:00
out :
2007-04-02 11:20:42 -04:00
btrfs_free_path ( path ) ;
2007-03-13 16:47:54 -04:00
return ret ;
}
2011-10-03 23:22:34 -04:00
int btrfs_insert_root ( struct btrfs_trans_handle * trans , struct btrfs_root * root ,
2017-01-17 23:24:37 -08:00
const struct btrfs_key * key , struct btrfs_root_item * item )
2007-03-13 16:47:54 -04:00
{
2012-07-25 17:35:53 +02:00
/*
* Make sure generation v1 and v2 match . See update_root for details .
*/
btrfs_set_root_generation_v2 ( item , btrfs_root_generation ( item ) ) ;
2011-10-03 23:22:34 -04:00
return btrfs_insert_item ( trans , root , key , item , sizeof ( * item ) ) ;
2007-03-13 16:47:54 -04:00
}
2016-06-21 21:16:51 -04:00
int btrfs_find_orphan_roots ( struct btrfs_fs_info * fs_info )
2009-09-21 16:00:26 -04:00
{
2016-06-21 21:16:51 -04:00
struct btrfs_root * tree_root = fs_info - > tree_root ;
2009-09-21 16:00:26 -04:00
struct extent_buffer * leaf ;
struct btrfs_path * path ;
struct btrfs_key key ;
2010-05-16 10:49:58 -04:00
struct btrfs_root * root ;
2009-09-21 16:00:26 -04:00
int err = 0 ;
int ret ;
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
key . objectid = BTRFS_ORPHAN_OBJECTID ;
key . type = BTRFS_ORPHAN_ITEM_KEY ;
key . offset = 0 ;
while ( 1 ) {
2020-05-15 19:35:55 +02:00
u64 root_objectid ;
2009-09-21 16:00:26 -04:00
ret = btrfs_search_slot ( NULL , tree_root , & key , path , 0 , 0 ) ;
if ( ret < 0 ) {
err = ret ;
break ;
}
leaf = path - > nodes [ 0 ] ;
if ( path - > slots [ 0 ] > = btrfs_header_nritems ( leaf ) ) {
ret = btrfs_next_leaf ( tree_root , path ) ;
if ( ret < 0 )
err = ret ;
if ( ret ! = 0 )
break ;
leaf = path - > nodes [ 0 ] ;
}
btrfs_item_key_to_cpu ( leaf , & key , path - > slots [ 0 ] ) ;
2011-04-21 01:20:15 +02:00
btrfs_release_path ( path ) ;
2009-09-21 16:00:26 -04:00
if ( key . objectid ! = BTRFS_ORPHAN_OBJECTID | |
key . type ! = BTRFS_ORPHAN_ITEM_KEY )
break ;
2020-05-15 19:35:55 +02:00
root_objectid = key . offset ;
2010-05-16 10:49:58 -04:00
key . offset + + ;
2020-05-15 19:35:55 +02:00
root = btrfs_get_fs_root ( fs_info , root_objectid , false ) ;
2014-02-17 09:13:57 +05:30
err = PTR_ERR_OR_ZERO ( root ) ;
2013-06-27 11:32:16 -04:00
if ( err & & err ! = - ENOENT ) {
2013-05-15 07:48:19 +00:00
break ;
2013-06-27 11:32:16 -04:00
} else if ( err = = - ENOENT ) {
struct btrfs_trans_handle * trans ;
btrfs_release_path ( path ) ;
trans = btrfs_join_transaction ( tree_root ) ;
if ( IS_ERR ( trans ) ) {
err = PTR_ERR ( trans ) ;
2016-06-22 18:54:23 -04:00
btrfs_handle_fs_error ( fs_info , err ,
2016-09-20 10:05:00 -04:00
" Failed to start trans to delete orphan item " ) ;
2013-06-27 11:32:16 -04:00
break ;
}
err = btrfs_del_orphan_item ( trans , tree_root ,
2020-05-15 19:35:55 +02:00
root_objectid ) ;
2016-09-09 21:39:03 -04:00
btrfs_end_transaction ( trans ) ;
2013-06-27 11:32:16 -04:00
if ( err ) {
2016-06-22 18:54:23 -04:00
btrfs_handle_fs_error ( fs_info , err ,
2016-09-20 10:05:00 -04:00
" Failed to delete root orphan item " ) ;
2013-06-27 11:32:16 -04:00
break ;
}
continue ;
2013-05-15 07:48:19 +00:00
}
2020-01-24 09:32:20 -05:00
WARN_ON ( ! test_bit ( BTRFS_ROOT_ORPHAN_ITEM_INSERTED , & root - > state ) ) ;
2019-02-06 15:46:14 -05:00
if ( btrfs_root_refs ( & root - > root_item ) = = 0 ) {
2022-02-18 14:56:10 -05:00
struct btrfs_key drop_key ;
btrfs_disk_key_to_cpu ( & drop_key , & root - > root_item . drop_progress ) ;
/*
* If we have a non - zero drop_progress then we know we
* made it partly through deleting this snapshot , and
* thus we need to make sure we block any balance from
* happening until this snapshot is completely dropped .
*/
if ( drop_key . objectid ! = 0 | | drop_key . type ! = 0 | |
drop_key . offset ! = 0 ) {
set_bit ( BTRFS_FS_UNFINISHED_DROPS , & fs_info - > flags ) ;
set_bit ( BTRFS_ROOT_UNFINISHED_DROP , & root - > state ) ;
}
2019-02-06 15:46:14 -05:00
set_bit ( BTRFS_ROOT_DEAD_TREE , & root - > state ) ;
2013-09-25 21:47:43 +08:00
btrfs_add_dead_root ( root ) ;
2019-02-06 15:46:14 -05:00
}
2020-01-24 09:33:01 -05:00
btrfs_put_root ( root ) ;
2009-09-21 16:00:26 -04:00
}
btrfs_free_path ( path ) ;
return err ;
}
2017-08-17 10:25:11 -04:00
/* drop the root item for 'key' from the tree root */
int btrfs_del_root ( struct btrfs_trans_handle * trans ,
2018-08-01 11:32:27 +08:00
const struct btrfs_key * key )
2007-03-13 16:47:54 -04:00
{
2018-08-01 11:32:27 +08:00
struct btrfs_root * root = trans - > fs_info - > tree_root ;
2007-04-02 11:20:42 -04:00
struct btrfs_path * path ;
2007-03-13 16:47:54 -04:00
int ret ;
2007-04-02 11:20:42 -04:00
path = btrfs_alloc_path ( ) ;
2011-03-23 08:14:16 +00:00
if ( ! path )
return - ENOMEM ;
2007-04-02 11:20:42 -04:00
ret = btrfs_search_slot ( trans , root , key , path , - 1 , 1 ) ;
2007-03-13 16:47:54 -04:00
if ( ret < 0 )
goto out ;
2007-12-21 16:27:24 -05:00
2007-03-13 16:47:54 -04:00
BUG_ON ( ret ! = 0 ) ;
2007-04-10 09:27:04 -04:00
2007-06-22 14:16:25 -04:00
ret = btrfs_del_item ( trans , root , path ) ;
2007-03-13 16:47:54 -04:00
out :
2007-04-02 11:20:42 -04:00
btrfs_free_path ( path ) ;
2007-03-13 16:47:54 -04:00
return ret ;
}
2008-11-17 20:37:39 -05:00
2018-08-01 11:32:28 +08:00
/*
 * Delete the pair of btrfs_root_ref items linking 'root_id' and 'ref_id'.
 *
 * The BTRFS_ROOT_BACKREF_KEY item (root_id, ref_id) is removed first, then
 * the BTRFS_ROOT_REF_KEY item (ref_id, root_id). Each item must match
 * 'dirid' and 'name', otherwise -ENOENT is returned. The sequence stored in
 * each matching item is written to '*sequence' before it is deleted.
 *
 * Returns 0 on success, -ENOMEM, -ENOENT, or an error from the tree search
 * or item deletion.
 */
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
		       u64 ref_id, u64 dirid, u64 *sequence,
		       const struct fscrypt_str *name)
{
	struct btrfs_root *tree_root = trans->fs_info->tree_root;
	struct btrfs_path *path;
	struct btrfs_root_ref *ref;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long name_off;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = root_id;
	key.type = BTRFS_ROOT_BACKREF_KEY;
	key.offset = ref_id;
again:
	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
	/* The name is stored right after the fixed-size ref structure. */
	name_off = (unsigned long)(ref + 1);

	if (btrfs_root_ref_dirid(leaf, ref) != dirid ||
	    btrfs_root_ref_name_len(leaf, ref) != name->len ||
	    memcmp_extent_buffer(leaf, name->name, name_off, name->len)) {
		ret = -ENOENT;
		goto out;
	}

	*sequence = btrfs_root_ref_sequence(leaf, ref);

	ret = btrfs_del_item(trans, tree_root, path);
	if (ret)
		goto out;

	/* Back ref removed, repeat once for the forward ref. */
	if (key.type == BTRFS_ROOT_BACKREF_KEY) {
		btrfs_release_path(path);
		key.objectid = ref_id;
		key.type = BTRFS_ROOT_REF_KEY;
		key.offset = root_id;
		goto again;
	}

out:
	btrfs_free_path(path);
	return ret;
}
/*
* add a btrfs_root_ref item . type is either BTRFS_ROOT_REF_KEY
* or BTRFS_ROOT_BACKREF_KEY .
*
* The dirid , sequence , name and name_len refer to the directory entry
* that is referencing the root .
*
* For a forward ref , the root_id is the id of the tree referencing
* the root and ref_id is the id of the subvol or snapshot .
*
* For a back ref the root_id is the id of the subvol or snapshot and
* ref_id is the id of the tree referencing it .
2012-03-12 16:03:00 +01:00
*
* Will return 0 , - ENOMEM , or anything from the CoW path
2008-11-17 20:37:39 -05:00
*/
2018-08-01 11:32:29 +08:00
int btrfs_add_root_ref ( struct btrfs_trans_handle * trans , u64 root_id ,
2022-10-20 12:58:25 -04:00
u64 ref_id , u64 dirid , u64 sequence ,
2022-10-20 12:58:27 -04:00
const struct fscrypt_str * name )
2008-11-17 20:37:39 -05:00
{
2018-08-01 11:32:29 +08:00
struct btrfs_root * tree_root = trans - > fs_info - > tree_root ;
2008-11-17 20:37:39 -05:00
struct btrfs_key key ;
int ret ;
struct btrfs_path * path ;
struct btrfs_root_ref * ref ;
struct extent_buffer * leaf ;
unsigned long ptr ;
path = btrfs_alloc_path ( ) ;
2009-09-21 15:56:00 -04:00
if ( ! path )
return - ENOMEM ;
2008-11-17 20:37:39 -05:00
key . objectid = root_id ;
2009-09-21 15:56:00 -04:00
key . type = BTRFS_ROOT_BACKREF_KEY ;
2008-11-17 20:37:39 -05:00
key . offset = ref_id ;
2009-09-21 15:56:00 -04:00
again :
2008-11-17 20:37:39 -05:00
ret = btrfs_insert_empty_item ( trans , tree_root , path , & key ,
2022-10-20 12:58:25 -04:00
sizeof ( * ref ) + name - > len ) ;
2012-03-12 16:03:00 +01:00
if ( ret ) {
2016-06-10 18:19:25 -04:00
btrfs_abort_transaction ( trans , ret ) ;
2012-03-12 16:03:00 +01:00
btrfs_free_path ( path ) ;
return ret ;
}
2008-11-17 20:37:39 -05:00
leaf = path - > nodes [ 0 ] ;
ref = btrfs_item_ptr ( leaf , path - > slots [ 0 ] , struct btrfs_root_ref ) ;
btrfs_set_root_ref_dirid ( leaf , ref , dirid ) ;
btrfs_set_root_ref_sequence ( leaf , ref , sequence ) ;
2022-10-20 12:58:25 -04:00
btrfs_set_root_ref_name_len ( leaf , ref , name - > len ) ;
2008-11-17 20:37:39 -05:00
ptr = ( unsigned long ) ( ref + 1 ) ;
2022-10-20 12:58:25 -04:00
write_extent_buffer ( leaf , name - > name , ptr , name - > len ) ;
2008-11-17 20:37:39 -05:00
btrfs_mark_buffer_dirty ( leaf ) ;
2009-09-21 15:56:00 -04:00
if ( key . type = = BTRFS_ROOT_BACKREF_KEY ) {
2011-04-21 01:20:15 +02:00
btrfs_release_path ( path ) ;
2009-09-21 15:56:00 -04:00
key . objectid = ref_id ;
key . type = BTRFS_ROOT_REF_KEY ;
key . offset = root_id ;
goto again ;
}
2008-11-17 20:37:39 -05:00
btrfs_free_path ( path ) ;
2009-09-21 15:56:00 -04:00
return 0 ;
2008-11-17 20:37:39 -05:00
}
2011-03-28 02:01:25 +00:00
/*
* Old btrfs forgets to init root_item - > flags and root_item - > byte_limit
* for subvolumes . To work around this problem , we steal a bit from
* root_item - > inode_item - > flags , and use it to indicate if those fields
* have been properly initialized .
*/
void btrfs_check_and_init_root_item ( struct btrfs_root_item * root_item )
{
2013-07-16 11:19:18 +08:00
u64 inode_flags = btrfs_stack_inode_flags ( & root_item - > inode ) ;
2011-03-28 02:01:25 +00:00
if ( ! ( inode_flags & BTRFS_INODE_ROOT_ITEM_INIT ) ) {
inode_flags | = BTRFS_INODE_ROOT_ITEM_INIT ;
2013-07-16 11:19:18 +08:00
btrfs_set_stack_inode_flags ( & root_item - > inode , inode_flags ) ;
btrfs_set_root_flags ( root_item , 0 ) ;
btrfs_set_root_limit ( root_item , 0 ) ;
2011-03-28 02:01:25 +00:00
}
}
2012-07-25 17:35:53 +02:00
void btrfs_update_root_times ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root )
{
struct btrfs_root_item * item = & root - > root_item ;
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following Coccinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-08 19:36:02 -07:00
struct timespec64 ct ;
2012-07-25 17:35:53 +02:00
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following Coccinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-08 19:36:02 -07:00
ktime_get_real_ts64 ( & ct ) ;
2012-12-07 09:28:54 +00:00
spin_lock ( & root - > root_item_lock ) ;
2013-07-16 11:19:18 +08:00
btrfs_set_root_ctransid ( item , trans - > transid ) ;
btrfs_set_stack_timespec_sec ( & item - > ctime , ct . tv_sec ) ;
btrfs_set_stack_timespec_nsec ( & item - > ctime , ct . tv_nsec ) ;
2012-12-07 09:28:54 +00:00
spin_unlock ( & root - > root_item_lock ) ;
2012-07-25 17:35:53 +02:00
}
2019-06-19 15:12:01 -04:00
/*
* btrfs_subvolume_reserve_metadata() - reserve space for a subvolume operation
* root: the root of the parent directory
* rsv: block reservation
* items: the number of items we need to reserve space for
* use_global_rsv: allow fallback to the global block reservation
*
* This function is used to reserve the space for snapshot/subvolume
* creation and deletion. Those operations differ from the common
* file/directory operations: they change two fs/file trees and the
* root tree, and the number of items that the qgroup reserves differs
* from the free space reservation. So we can not use the space
* reservation mechanism in start_transaction().
*
* When BTRFS_FS_QUOTA_ENABLED is set, 3 * nodesize bytes of qgroup
* metadata are preallocated first and any failure there is returned
* immediately. The block reservation is then added to rsv; if that
* fails with -ENOSPC and use_global_rsv is true, the bytes are migrated
* from the global block reservation instead. If the block reservation
* still fails, the qgroup preallocation is freed again; on success the
* qgroup bytes are added to rsv->qgroup_rsv_reserved under rsv->lock.
*
* Returns 0 when the reservation succeeded, otherwise the non-zero
* value returned by the helper that failed.
*/
int btrfs_subvolume_reserve_metadata ( struct btrfs_root * root ,
struct btrfs_block_rsv * rsv , int items ,
bool use_global_rsv )
{
u64 qgroup_num_bytes = 0 ;
u64 num_bytes ;
int ret ;
struct btrfs_fs_info * fs_info = root - > fs_info ;
struct btrfs_block_rsv * global_rsv = & fs_info - > global_block_rsv ;
if ( test_bit ( BTRFS_FS_QUOTA_ENABLED , & fs_info - > flags ) ) {
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info - > nodesize ;
ret = btrfs_qgroup_reserve_meta_prealloc ( root ,
btrfs: avoid blocking on space revervation when doing nowait dio writes
When doing a NOWAIT direct IO write, if we can NOCOW then it means we can
proceed with the non-blocking, NOWAIT path. However reserving the metadata
space and qgroup meta space can often result in blocking - flushing
delalloc, wait for ordered extents to complete, trigger transaction
commits, etc, going against the semantics of a NOWAIT write.
So make the NOWAIT write path to try to reserve all the metadata it needs
without resulting in a blocking behaviour - if we get -ENOSPC or -EDQUOT
then return -EAGAIN to make the caller fallback to a blocking direct IO
write.
This is part of a patchset comprised of the following patches:
btrfs: avoid blocking on page locks with nowait dio on compressed range
btrfs: avoid blocking nowait dio when locking file range
btrfs: avoid double nocow check when doing nowait dio writes
btrfs: stop allocating a path when checking if cross reference exists
btrfs: free path at can_nocow_extent() before checking for checksum items
btrfs: release path earlier at can_nocow_extent()
btrfs: avoid blocking when allocating context for nowait dio read/write
btrfs: avoid blocking on space revervation when doing nowait dio writes
The following test was run before and after applying this patchset:
$ cat io-uring-nodatacow-test.sh
#!/bin/bash
DEV=/dev/sdc
MNT=/mnt/sdc
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
NUM_JOBS=4
FILE_SIZE=8G
RUN_TIME=300
cat <<EOF > /tmp/fio-job.ini
[io_uring_rw]
rw=randrw
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
filesize=$FILE_SIZE
runtime=$RUN_TIME
time_based
filename=foobar
directory=$MNT
numjobs=$NUM_JOBS
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The test was run a 12 cores box with 64G of ram, using a non-debug kernel
config (Debian's default config) and a spinning disk.
Result before the patchset:
READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
Result after the patchset:
READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec
WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec
That's about +7.2% throughput for reads and +6.9% for writes.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:30 +00:00
qgroup_num_bytes , true ,
false ) ;
2019-06-19 15:12:01 -04:00
if ( ret )
return ret ;
}
2019-08-22 15:14:33 -04:00
num_bytes = btrfs_calc_insert_metadata_size ( fs_info , items ) ;
2019-06-19 15:12:01 -04:00
rsv - > space_info = btrfs_find_space_info ( fs_info ,
BTRFS_BLOCK_GROUP_METADATA ) ;
2021-11-09 10:12:07 -05:00
ret = btrfs_block_rsv_add ( fs_info , rsv , num_bytes ,
2019-06-19 15:12:01 -04:00
BTRFS_RESERVE_FLUSH_ALL ) ;
if ( ret = = - ENOSPC & & use_global_rsv )
ret = btrfs_block_rsv_migrate ( global_rsv , rsv , num_bytes , true ) ;
if ( ret & & qgroup_num_bytes )
btrfs_qgroup_free_meta_prealloc ( root , qgroup_num_bytes ) ;
btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations
[BUG]
When quota is enabled for TEST_DEV, generic/013 sometimes fails like this:
generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg)
And with the following metadata leak:
BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152
------------[ cut here ]------------
WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs]
Call Trace:
btrfs_put_super+0x15/0x17 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x17/0x30 [btrfs]
deactivate_locked_super+0x3b/0xa0
deactivate_super+0x40/0x50
cleanup_mnt+0x135/0x190
__cleanup_mnt+0x12/0x20
task_work_run+0x64/0xb0
__prepare_exit_to_usermode+0x1bc/0x1c0
__syscall_return_slowpath+0x47/0x230
do_syscall_64+0x64/0xb0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
---[ end trace a6cfd45ba80e4e06 ]---
BTRFS error (device dm-3): qgroup reserved space leaked
BTRFS info (device dm-3): disk space caching is enabled
BTRFS info (device dm-3): has skinny extents
[CAUSE]
The qgroup preallocated meta rsv operations of that offending root are:
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
It's pretty obvious that, we reserve qgroup meta rsv in
btrfs_subvolume_reserve_metadata(), but doesn't have corresponding
release/convert calls in btrfs_subvolume_release_metadata().
This leads to the leakage.
[FIX]
To fix this bug, we should follow what we're doing in
btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and
add it to block_rsv->qgroup_rsv_reserved.
And free the qgroup reserved metadata space when releasing the
block_rsv.
To do this, we need to change the btrfs_subvolume_release_metadata() to
accept btrfs_root, and record the qgroup_to_release number, and call
btrfs_qgroup_convert_reserved_meta() for it.
Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-24 14:46:10 +08:00
if ( ! ret ) {
spin_lock ( & rsv - > lock ) ;
rsv - > qgroup_rsv_reserved + = qgroup_num_bytes ;
spin_unlock ( & rsv - > lock ) ;
}
2019-06-19 15:12:01 -04:00
return ret ;
}
btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations
[BUG]
When quota is enabled for TEST_DEV, generic/013 sometimes fails like this:
generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg)
And with the following metadata leak:
BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152
------------[ cut here ]------------
WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs]
Call Trace:
btrfs_put_super+0x15/0x17 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x17/0x30 [btrfs]
deactivate_locked_super+0x3b/0xa0
deactivate_super+0x40/0x50
cleanup_mnt+0x135/0x190
__cleanup_mnt+0x12/0x20
task_work_run+0x64/0xb0
__prepare_exit_to_usermode+0x1bc/0x1c0
__syscall_return_slowpath+0x47/0x230
do_syscall_64+0x64/0xb0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
---[ end trace a6cfd45ba80e4e06 ]---
BTRFS error (device dm-3): qgroup reserved space leaked
BTRFS info (device dm-3): disk space caching is enabled
BTRFS info (device dm-3): has skinny extents
[CAUSE]
The qgroup preallocated meta rsv operations of that offending root are:
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
It's pretty obvious that we reserve qgroup meta rsv in
btrfs_subvolume_reserve_metadata(), but don't have the corresponding
release/convert calls in btrfs_subvolume_release_metadata().
This leads to the leak.
[FIX]
To fix this bug, we should follow what we're doing in
btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and
add it to block_rsv->qgroup_rsv_reserved.
And free the qgroup reserved metadata space when releasing the
block_rsv.
To do this, we need to change the btrfs_subvolume_release_metadata() to
accept btrfs_root, and record the qgroup_to_release number, and call
btrfs_qgroup_convert_reserved_meta() for it.
Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-24 14:46:10 +08:00
/*
* btrfs_subvolume_release_metadata() - release a subvolume operation reservation
* root: root whose fs_info is used for the release; also passed on to
*       btrfs_qgroup_convert_reserved_meta()
* rsv: block reservation to release
*
* Releases rsv through btrfs_block_rsv_release() with a byte count of
* (u64)-1 and hands the qgroup amount reported by that call to
* btrfs_qgroup_convert_reserved_meta().
*/
void btrfs_subvolume_release_metadata ( struct btrfs_root * root ,
2019-06-19 15:12:01 -04:00
struct btrfs_block_rsv * rsv )
{
btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations
[BUG]
When quota is enabled for TEST_DEV, generic/013 sometimes fails like this:
generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg)
And with the following metadata leak:
BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152
------------[ cut here ]------------
WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs]
Call Trace:
btrfs_put_super+0x15/0x17 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x17/0x30 [btrfs]
deactivate_locked_super+0x3b/0xa0
deactivate_super+0x40/0x50
cleanup_mnt+0x135/0x190
__cleanup_mnt+0x12/0x20
task_work_run+0x64/0xb0
__prepare_exit_to_usermode+0x1bc/0x1c0
__syscall_return_slowpath+0x47/0x230
do_syscall_64+0x64/0xb0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
---[ end trace a6cfd45ba80e4e06 ]---
BTRFS error (device dm-3): qgroup reserved space leaked
BTRFS info (device dm-3): disk space caching is enabled
BTRFS info (device dm-3): has skinny extents
[CAUSE]
The qgroup preallocated meta rsv operations of that offending root are:
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
It's pretty obvious that, we reserve qgroup meta rsv in
btrfs_subvolume_reserve_metadata(), but doesn't have corresponding
release/convert calls in btrfs_subvolume_release_metadata().
This leads to the leakage.
[FIX]
To fix this bug, we should follow what we're doing in
btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and
add it to block_rsv->qgroup_rsv_reserved.
And free the qgroup reserved metadata space when releasing the
block_rsv.
To do this, we need to change the btrfs_subvolume_release_metadata() to
accept btrfs_root, and record the qgroup_to_release number, and call
btrfs_qgroup_convert_reserved_meta() for it.
Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-24 14:46:10 +08:00
struct btrfs_fs_info * fs_info = root - > fs_info ;
u64 qgroup_to_release ;
btrfs_block_rsv_release ( fs_info , rsv , ( u64 ) - 1 , & qgroup_to_release ) ;
btrfs_qgroup_convert_reserved_meta ( root , qgroup_to_release ) ;
2019-06-19 15:12:01 -04:00
}