2008-03-24 15:01:56 -04:00
/*
* Copyright ( C ) 2007 Oracle . All rights reserved .
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* General Public License for more details .
*
* You should have received a copy of the GNU General Public
* License along with this program ; if not , write to the
* Free Software Foundation , Inc . , 59 Temple Place - Suite 330 ,
* Boston , MA 021110 - 1307 , USA .
*/
# include <linux/sched.h>
# include <linux/bio.h>
2008-03-24 15:02:07 -04:00
# include <linux/buffer_head.h>
2008-04-21 10:03:05 -04:00
# include <linux/blkdev.h>
2008-04-28 15:29:42 -04:00
# include <linux/random.h>
2009-04-03 10:27:10 -04:00
# include <linux/iocontext.h>
2008-03-25 16:50:33 -04:00
# include <asm/div64.h>
2008-11-20 10:22:27 -05:00
# include "compat.h"
2008-03-24 15:01:56 -04:00
# include "ctree.h"
# include "extent_map.h"
# include "disk-io.h"
# include "transaction.h"
# include "print-tree.h"
# include "volumes.h"
2008-06-11 16:50:36 -04:00
# include "async-thread.h"
2008-03-24 15:01:56 -04:00
2008-03-25 16:50:33 -04:00
/*
 * Variable-length record: a fixed header followed by a flexible array of
 * btrfs_bio_stripe entries.  The allocation size for n stripes is given
 * by map_lookup_size(n).
 *
 * NOTE(review): the code that fills in and consumes these fields is not
 * visible here; the per-field notes below are inferred from the names
 * only and should be confirmed against the users of this struct.
 */
struct map_lookup {
u64 type ;
int io_align ;
int io_width ;
int stripe_len ;
int sector_size ;
/* presumably the number of valid entries in stripes[] -- confirm */
int num_stripes ;
2008-04-16 10:49:51 -04:00
int sub_stripes ;
2008-04-09 16:28:12 -04:00
/* flexible array member; must stay the last field */
struct btrfs_bio_stripe stripes [ ] ;
2008-03-25 16:50:33 -04:00
} ;
2008-11-17 21:11:30 -05:00
/*
 * Forward declarations of file-local (static) helpers so they can be
 * referenced before their definitions appear.
 */
static int init_first_rw_device ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ,
struct btrfs_device * device ) ;
static int btrfs_relocate_sys_chunks ( struct btrfs_root * root ) ;
2008-03-25 16:50:33 -04:00
/*
 * Size in bytes of a struct map_lookup whose trailing stripes[] array
 * holds n btrfs_bio_stripe entries.
 */
# define map_lookup_size(n) (sizeof(struct map_lookup) + \
2008-04-09 16:28:12 -04:00
( sizeof ( struct btrfs_bio_stripe ) * ( n ) ) )
2008-03-25 16:50:33 -04:00
2008-03-24 15:02:07 -04:00
/*
 * uuid_mutex: file-wide mutex.  In this file it is taken by
 * btrfs_lock_volumes(), btrfs_close_extra_devices(), btrfs_close_devices(),
 * btrfs_open_devices() and btrfs_scan_one_device().
 */
static DEFINE_MUTEX ( uuid_mutex ) ;
/*
 * fs_uuids: list of every known struct btrfs_fs_devices, linked through
 * its ->list member (see find_fsid() and device_list_add()).
 */
static LIST_HEAD ( fs_uuids ) ;
2008-05-07 11:43:44 -04:00
/* Non-static wrapper that acquires the file-local uuid_mutex. */
void btrfs_lock_volumes(void)
{
	mutex_lock(&uuid_mutex);
}
/* Non-static wrapper that releases the file-local uuid_mutex. */
void btrfs_unlock_volumes(void)
{
	mutex_unlock(&uuid_mutex);
}
2008-07-08 14:19:17 -04:00
static void lock_chunks ( struct btrfs_root * root )
{
mutex_lock ( & root - > fs_info - > chunk_mutex ) ;
}
static void unlock_chunks ( struct btrfs_root * root )
{
mutex_unlock ( & root - > fs_info - > chunk_mutex ) ;
}
2008-12-12 10:03:26 -05:00
static void free_fs_devices ( struct btrfs_fs_devices * fs_devices )
{
struct btrfs_device * device ;
WARN_ON ( fs_devices - > opened ) ;
while ( ! list_empty ( & fs_devices - > devices ) ) {
device = list_entry ( fs_devices - > devices . next ,
struct btrfs_device , dev_list ) ;
list_del ( & device - > dev_list ) ;
kfree ( device - > name ) ;
kfree ( device ) ;
}
kfree ( fs_devices ) ;
}
2008-03-24 15:02:07 -04:00
int btrfs_cleanup_fs_uuids ( void )
{
struct btrfs_fs_devices * fs_devices ;
2008-11-17 21:11:30 -05:00
while ( ! list_empty ( & fs_uuids ) ) {
fs_devices = list_entry ( fs_uuids . next ,
struct btrfs_fs_devices , list ) ;
list_del ( & fs_devices - > list ) ;
2008-12-12 10:03:26 -05:00
free_fs_devices ( fs_devices ) ;
2008-03-24 15:02:07 -04:00
}
return 0 ;
}
2008-09-05 16:09:51 -04:00
/*
 * Look up a device on @head (linked via dev_list) by @devid.
 *
 * When @uuid is non-NULL the device's uuid must also match the first
 * BTRFS_UUID_SIZE bytes of @uuid; a NULL @uuid matches on devid alone.
 *
 * Returns the first matching device, or NULL if there is none.
 */
static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *cand;

	list_for_each_entry(cand, head, dev_list) {
		if (cand->devid != devid)
			continue;
		if (uuid && memcmp(cand->uuid, uuid, BTRFS_UUID_SIZE) != 0)
			continue;
		return cand;
	}

	return NULL;
}
2008-09-05 16:09:51 -04:00
/*
 * Search the global fs_uuids list for the entry whose fsid equals the
 * BTRFS_FSID_SIZE bytes at @fsid.  Returns it, or NULL when not found.
 */
static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *cand;

	list_for_each_entry(cand, &fs_uuids, list) {
		if (!memcmp(fsid, cand->fsid, BTRFS_FSID_SIZE))
			return cand;
	}

	return NULL;
}
2009-04-20 15:50:09 -04:00
static void requeue_list ( struct btrfs_pending_bios * pending_bios ,
struct bio * head , struct bio * tail )
{
struct bio * old_head ;
old_head = pending_bios - > head ;
pending_bios - > head = head ;
if ( pending_bios - > tail )
tail - > bi_next = old_head ;
else
pending_bios - > tail = tail ;
}
2008-06-11 16:50:36 -04:00
/*
* we try to collect pending bios for a device so we don ' t get a large
* number of procs sending bios down to the same device . This greatly
* improves the schedulers ability to collect and merge the bios .
*
* But , it also turns into a long list of bios to process and that is sure
* to eventually make the worker thread block . The solution here is to
* make some progress and then put this work struct back at the end of
* the list if the block device is congested . This way , multiple devices
* can make progress from a single worker thread .
*/
2009-01-05 21:25:51 -05:00
static noinline int run_scheduled_bios ( struct btrfs_device * device )
2008-06-11 16:50:36 -04:00
{
struct bio * pending ;
struct backing_dev_info * bdi ;
2008-08-20 13:39:41 -04:00
struct btrfs_fs_info * fs_info ;
2009-04-20 15:50:09 -04:00
struct btrfs_pending_bios * pending_bios ;
2008-06-11 16:50:36 -04:00
struct bio * tail ;
struct bio * cur ;
int again = 0 ;
2009-04-20 15:50:09 -04:00
unsigned long num_run ;
unsigned long num_sync_run ;
2009-06-09 15:59:22 -04:00
unsigned long batch_run = 0 ;
2008-08-20 13:39:41 -04:00
unsigned long limit ;
2009-04-03 10:27:10 -04:00
unsigned long last_waited = 0 ;
2009-06-09 15:39:08 -04:00
int force_reg = 0 ;
2008-06-11 16:50:36 -04:00
2009-04-03 10:32:58 -04:00
bdi = blk_get_backing_dev_info ( device - > bdev ) ;
2008-08-20 13:39:41 -04:00
fs_info = device - > dev_root - > fs_info ;
limit = btrfs_async_submit_limit ( fs_info ) ;
limit = limit * 2 / 3 ;
2009-04-20 15:50:09 -04:00
/* we want to make sure that every time we switch from the sync
* list to the normal list , we unplug
*/
num_sync_run = 0 ;
2008-06-11 16:50:36 -04:00
loop :
spin_lock ( & device - > io_lock ) ;
2009-02-04 09:19:41 -05:00
loop_lock :
2009-06-09 15:39:08 -04:00
num_run = 0 ;
2009-04-20 15:50:09 -04:00
2008-06-11 16:50:36 -04:00
/* take all the bios off the list at once and process them
* later on ( without the lock held ) . But , remember the
* tail and other pointers so the bios can be properly reinserted
* into the list if we hit congestion
*/
2009-06-09 15:39:08 -04:00
if ( ! force_reg & & device - > pending_sync_bios . head ) {
2009-04-20 15:50:09 -04:00
pending_bios = & device - > pending_sync_bios ;
2009-06-09 15:39:08 -04:00
force_reg = 1 ;
} else {
2009-04-20 15:50:09 -04:00
pending_bios = & device - > pending_bios ;
2009-06-09 15:39:08 -04:00
force_reg = 0 ;
}
2009-04-20 15:50:09 -04:00
pending = pending_bios - > head ;
tail = pending_bios - > tail ;
2008-06-11 16:50:36 -04:00
WARN_ON ( pending & & ! tail ) ;
/*
* if pending was null this time around , no bios need processing
* at all and we can stop . Otherwise it ' ll loop back up again
* and do an additional check so no bios are missed .
*
* device - > running_pending is used to synchronize with the
* schedule_bio code .
*/
2009-04-20 15:50:09 -04:00
if ( device - > pending_sync_bios . head = = NULL & &
device - > pending_bios . head = = NULL ) {
2008-06-11 16:50:36 -04:00
again = 0 ;
device - > running_pending = 0 ;
2009-04-20 15:50:09 -04:00
} else {
again = 1 ;
device - > running_pending = 1 ;
2008-06-11 16:50:36 -04:00
}
2009-04-20 15:50:09 -04:00
pending_bios - > head = NULL ;
pending_bios - > tail = NULL ;
2008-06-11 16:50:36 -04:00
spin_unlock ( & device - > io_lock ) ;
2009-04-20 15:50:09 -04:00
/*
* if we ' re doing the regular priority list , make sure we unplug
* for any high prio bios we ' ve sent down
*/
if ( pending_bios = = & device - > pending_bios & & num_sync_run > 0 ) {
num_sync_run = 0 ;
blk_run_backing_dev ( bdi , NULL ) ;
}
2009-01-05 21:25:51 -05:00
while ( pending ) {
2009-04-20 15:50:09 -04:00
rmb ( ) ;
2009-06-09 15:39:08 -04:00
/* we want to work on both lists, but do more bios on the
* sync list than the regular list
*/
if ( ( num_run > 32 & &
pending_bios ! = & device - > pending_sync_bios & &
device - > pending_sync_bios . head ) | |
( num_run > 64 & & pending_bios = = & device - > pending_sync_bios & &
device - > pending_bios . head ) ) {
2009-04-20 15:50:09 -04:00
spin_lock ( & device - > io_lock ) ;
requeue_list ( pending_bios , pending , tail ) ;
goto loop_lock ;
}
2008-06-11 16:50:36 -04:00
cur = pending ;
pending = pending - > bi_next ;
cur - > bi_next = NULL ;
2008-08-20 13:39:41 -04:00
atomic_dec ( & fs_info - > nr_async_bios ) ;
if ( atomic_read ( & fs_info - > nr_async_bios ) < limit & &
waitqueue_active ( & fs_info - > async_submit_wait ) )
wake_up ( & fs_info - > async_submit_wait ) ;
2008-07-31 16:29:02 -04:00
BUG_ON ( atomic_read ( & cur - > bi_cnt ) = = 0 ) ;
2008-06-11 16:50:36 -04:00
submit_bio ( cur - > bi_rw , cur ) ;
num_run + + ;
2009-06-09 15:59:22 -04:00
batch_run + + ;
2009-09-11 14:32:04 +02:00
if ( bio_rw_flagged ( cur , BIO_RW_SYNCIO ) )
2009-04-20 15:50:09 -04:00
num_sync_run + + ;
if ( need_resched ( ) ) {
if ( num_sync_run ) {
blk_run_backing_dev ( bdi , NULL ) ;
num_sync_run = 0 ;
}
cond_resched ( ) ;
}
2008-06-11 16:50:36 -04:00
/*
* we made progress , there is more work to do and the bdi
* is now congested . Back off and let other work structs
* run instead
*/
2009-08-07 09:59:15 -04:00
if ( pending & & bdi_write_congested ( bdi ) & & batch_run > 8 & &
2008-11-07 18:22:45 -05:00
fs_info - > fs_devices - > open_devices > 1 ) {
2009-04-03 10:27:10 -04:00
struct io_context * ioc ;
2008-06-11 16:50:36 -04:00
2009-04-03 10:27:10 -04:00
ioc = current - > io_context ;
/*
* the main goal here is that we don ' t want to
* block if we ' re going to be able to submit
* more requests without blocking .
*
* This code does two great things , it pokes into
* the elevator code from a filesystem _and_
* it makes assumptions about how batching works .
*/
if ( ioc & & ioc - > nr_batch_requests > 0 & &
time_before ( jiffies , ioc - > last_waited + HZ / 50UL ) & &
( last_waited = = 0 | |
ioc - > last_waited = = last_waited ) ) {
/*
* we want to go through our batch of
* requests and stop . So , we copy out
* the ioc - > last_waited time and test
* against it before looping
*/
last_waited = ioc - > last_waited ;
2009-04-20 15:50:09 -04:00
if ( need_resched ( ) ) {
if ( num_sync_run ) {
blk_run_backing_dev ( bdi , NULL ) ;
num_sync_run = 0 ;
}
cond_resched ( ) ;
}
2009-04-03 10:27:10 -04:00
continue ;
}
2008-06-11 16:50:36 -04:00
spin_lock ( & device - > io_lock ) ;
2009-04-20 15:50:09 -04:00
requeue_list ( pending_bios , pending , tail ) ;
2009-02-04 09:19:41 -05:00
device - > running_pending = 1 ;
2008-06-11 16:50:36 -04:00
spin_unlock ( & device - > io_lock ) ;
btrfs_requeue_work ( & device - > work ) ;
goto done ;
}
}
2009-04-20 15:50:09 -04:00
if ( num_sync_run ) {
num_sync_run = 0 ;
blk_run_backing_dev ( bdi , NULL ) ;
}
2009-04-03 10:32:58 -04:00
/*
* IO has already been through a long path to get here . Checksumming ,
* async helper threads , perhaps compression . We ' ve done a pretty
* good job of collecting a batch of IO and should just unplug
* the device right away .
*
* This will help anyone who is waiting on the IO , they might have
* already unplugged , but managed to do so before the bio they
* cared about found its way down here .
*/
blk_run_backing_dev ( bdi , NULL ) ;
2010-03-10 15:33:32 -05:00
cond_resched ( ) ;
if ( again )
goto loop ;
spin_lock ( & device - > io_lock ) ;
if ( device - > pending_bios . head | | device - > pending_sync_bios . head )
goto loop_lock ;
spin_unlock ( & device - > io_lock ) ;
2008-06-11 16:50:36 -04:00
done :
return 0 ;
}
2008-12-02 09:54:17 -05:00
static void pending_bios_fn ( struct btrfs_work * work )
2008-06-11 16:50:36 -04:00
{
struct btrfs_device * device ;
device = container_of ( work , struct btrfs_device , work ) ;
run_scheduled_bios ( device ) ;
}
2008-09-05 16:09:51 -04:00
/*
 * Register the device described by @disk_super (found at @path) in the
 * in-memory device lists.
 *
 * The btrfs_fs_devices for the super block's fsid is looked up on
 * fs_uuids and created if missing.  If no device with @devid and the
 * super block's device uuid is on it yet, a new btrfs_device is
 * allocated and linked in.  latest_devid/latest_trans are advanced when
 * this super block carries a newer generation.
 *
 * btrfs_scan_one_device() calls this with uuid_mutex held.
 *
 * Returns 0 and stores the fs_devices in *@fs_devices_ret on success,
 * -ENOMEM on allocation failure, or -EBUSY when a new device would have
 * to be added to an fs_devices that is already opened.
 */
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	u64 super_gen = btrfs_super_generation(disk_super);
	struct btrfs_fs_devices *fsdevs;
	struct btrfs_device *dev = NULL;

	fsdevs = find_fsid(disk_super->fsid);
	if (fsdevs) {
		dev = __find_device(&fsdevs->devices, devid,
				    disk_super->dev_item.uuid);
	} else {
		/* first device seen for this fsid: create its container */
		fsdevs = kzalloc(sizeof(*fsdevs), GFP_NOFS);
		if (!fsdevs)
			return -ENOMEM;

		INIT_LIST_HEAD(&fsdevs->devices);
		INIT_LIST_HEAD(&fsdevs->alloc_list);
		list_add(&fsdevs->list, &fs_uuids);
		memcpy(fsdevs->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
		fsdevs->latest_devid = devid;
		fsdevs->latest_trans = super_gen;
		mutex_init(&fsdevs->device_list_mutex);
	}

	if (dev == NULL) {
		if (fsdevs->opened)
			return -EBUSY;

		dev = kzalloc(sizeof(*dev), GFP_NOFS);
		if (dev == NULL) {
			/* the fs_devices entry is deliberately left in place */
			return -ENOMEM;
		}

		dev->devid = devid;
		dev->work.func = pending_bios_fn;
		memcpy(dev->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
		dev->barriers = 1;
		spin_lock_init(&dev->io_lock);

		dev->name = kstrdup(path, GFP_NOFS);
		if (dev->name == NULL) {
			kfree(dev);
			return -ENOMEM;
		}
		INIT_LIST_HEAD(&dev->dev_alloc_list);

		mutex_lock(&fsdevs->device_list_mutex);
		list_add(&dev->dev_list, &fsdevs->devices);
		mutex_unlock(&fsdevs->device_list_mutex);

		dev->fs_devices = fsdevs;
		fsdevs->num_devices++;
	}

	if (super_gen > fsdevs->latest_trans) {
		fsdevs->latest_devid = devid;
		fsdevs->latest_trans = super_gen;
	}

	*fs_devices_ret = fsdevs;
	return 0;
}
2008-12-12 10:03:26 -05:00
static struct btrfs_fs_devices * clone_fs_devices ( struct btrfs_fs_devices * orig )
{
struct btrfs_fs_devices * fs_devices ;
struct btrfs_device * device ;
struct btrfs_device * orig_dev ;
fs_devices = kzalloc ( sizeof ( * fs_devices ) , GFP_NOFS ) ;
if ( ! fs_devices )
return ERR_PTR ( - ENOMEM ) ;
INIT_LIST_HEAD ( & fs_devices - > devices ) ;
INIT_LIST_HEAD ( & fs_devices - > alloc_list ) ;
INIT_LIST_HEAD ( & fs_devices - > list ) ;
2009-06-10 15:17:02 -04:00
mutex_init ( & fs_devices - > device_list_mutex ) ;
2008-12-12 10:03:26 -05:00
fs_devices - > latest_devid = orig - > latest_devid ;
fs_devices - > latest_trans = orig - > latest_trans ;
memcpy ( fs_devices - > fsid , orig - > fsid , sizeof ( fs_devices - > fsid ) ) ;
2009-06-10 15:17:02 -04:00
mutex_lock ( & orig - > device_list_mutex ) ;
2008-12-12 10:03:26 -05:00
list_for_each_entry ( orig_dev , & orig - > devices , dev_list ) {
device = kzalloc ( sizeof ( * device ) , GFP_NOFS ) ;
if ( ! device )
goto error ;
device - > name = kstrdup ( orig_dev - > name , GFP_NOFS ) ;
2009-09-29 13:51:04 -04:00
if ( ! device - > name ) {
kfree ( device ) ;
2008-12-12 10:03:26 -05:00
goto error ;
2009-09-29 13:51:04 -04:00
}
2008-12-12 10:03:26 -05:00
device - > devid = orig_dev - > devid ;
device - > work . func = pending_bios_fn ;
memcpy ( device - > uuid , orig_dev - > uuid , sizeof ( device - > uuid ) ) ;
device - > barriers = 1 ;
spin_lock_init ( & device - > io_lock ) ;
INIT_LIST_HEAD ( & device - > dev_list ) ;
INIT_LIST_HEAD ( & device - > dev_alloc_list ) ;
list_add ( & device - > dev_list , & fs_devices - > devices ) ;
device - > fs_devices = fs_devices ;
fs_devices - > num_devices + + ;
}
2009-06-10 15:17:02 -04:00
mutex_unlock ( & orig - > device_list_mutex ) ;
2008-12-12 10:03:26 -05:00
return fs_devices ;
error :
2009-06-10 15:17:02 -04:00
mutex_unlock ( & orig - > device_list_mutex ) ;
2008-12-12 10:03:26 -05:00
free_fs_devices ( fs_devices ) ;
return ERR_PTR ( - ENOMEM ) ;
}
2008-05-13 13:46:40 -04:00
int btrfs_close_extra_devices ( struct btrfs_fs_devices * fs_devices )
{
2009-01-21 10:59:08 -05:00
struct btrfs_device * device , * next ;
2008-05-13 13:46:40 -04:00
mutex_lock ( & uuid_mutex ) ;
again :
2009-06-10 15:17:02 -04:00
mutex_lock ( & fs_devices - > device_list_mutex ) ;
2009-01-21 10:59:08 -05:00
list_for_each_entry_safe ( device , next , & fs_devices - > devices , dev_list ) {
2008-11-17 21:11:30 -05:00
if ( device - > in_fs_metadata )
continue ;
if ( device - > bdev ) {
2008-11-19 21:17:22 -05:00
close_bdev_exclusive ( device - > bdev , device - > mode ) ;
2008-11-17 21:11:30 -05:00
device - > bdev = NULL ;
fs_devices - > open_devices - - ;
}
if ( device - > writeable ) {
list_del_init ( & device - > dev_alloc_list ) ;
device - > writeable = 0 ;
fs_devices - > rw_devices - - ;
}
2008-12-12 10:03:26 -05:00
list_del_init ( & device - > dev_list ) ;
fs_devices - > num_devices - - ;
kfree ( device - > name ) ;
kfree ( device ) ;
2008-05-13 13:46:40 -04:00
}
2009-06-10 15:17:02 -04:00
mutex_unlock ( & fs_devices - > device_list_mutex ) ;
2008-11-17 21:11:30 -05:00
if ( fs_devices - > seed ) {
fs_devices = fs_devices - > seed ;
goto again ;
}
2008-05-13 13:46:40 -04:00
mutex_unlock ( & uuid_mutex ) ;
return 0 ;
}
2008-05-13 16:03:06 -04:00
2008-11-17 21:11:30 -05:00
static int __btrfs_close_devices ( struct btrfs_fs_devices * fs_devices )
2008-03-24 15:02:07 -04:00
{
struct btrfs_device * device ;
2008-12-12 10:03:26 -05:00
2008-11-17 21:11:30 -05:00
if ( - - fs_devices - > opened > 0 )
return 0 ;
2008-03-24 15:02:07 -04:00
2009-01-21 10:59:08 -05:00
list_for_each_entry ( device , & fs_devices - > devices , dev_list ) {
2008-03-24 15:02:07 -04:00
if ( device - > bdev ) {
2008-11-19 21:17:22 -05:00
close_bdev_exclusive ( device - > bdev , device - > mode ) ;
2008-05-13 16:03:06 -04:00
fs_devices - > open_devices - - ;
2008-03-24 15:02:07 -04:00
}
2008-11-17 21:11:30 -05:00
if ( device - > writeable ) {
list_del_init ( & device - > dev_alloc_list ) ;
fs_devices - > rw_devices - - ;
}
2008-03-24 15:02:07 -04:00
device - > bdev = NULL ;
2008-11-17 21:11:30 -05:00
device - > writeable = 0 ;
2008-05-13 13:46:40 -04:00
device - > in_fs_metadata = 0 ;
2008-03-24 15:02:07 -04:00
}
2008-12-12 10:03:26 -05:00
WARN_ON ( fs_devices - > open_devices ) ;
WARN_ON ( fs_devices - > rw_devices ) ;
2008-11-17 21:11:30 -05:00
fs_devices - > opened = 0 ;
fs_devices - > seeding = 0 ;
2008-03-24 15:02:07 -04:00
return 0 ;
}
2008-11-17 21:11:30 -05:00
int btrfs_close_devices ( struct btrfs_fs_devices * fs_devices )
{
2008-12-12 10:03:26 -05:00
struct btrfs_fs_devices * seed_devices = NULL ;
2008-11-17 21:11:30 -05:00
int ret ;
mutex_lock ( & uuid_mutex ) ;
ret = __btrfs_close_devices ( fs_devices ) ;
2008-12-12 10:03:26 -05:00
if ( ! fs_devices - > opened ) {
seed_devices = fs_devices - > seed ;
fs_devices - > seed = NULL ;
}
2008-11-17 21:11:30 -05:00
mutex_unlock ( & uuid_mutex ) ;
2008-12-12 10:03:26 -05:00
while ( seed_devices ) {
fs_devices = seed_devices ;
seed_devices = fs_devices - > seed ;
__btrfs_close_devices ( fs_devices ) ;
free_fs_devices ( fs_devices ) ;
}
2008-11-17 21:11:30 -05:00
return ret ;
}
2008-12-12 10:03:26 -05:00
/*
 * Open every not-yet-open device on @fs_devices that has a name.
 *
 * Each device is opened exclusively with @flags / @holder, its super
 * block is read, and the device is accepted only if the on-disk devid
 * and device uuid match the in-memory btrfs_device.  Devices that fail
 * any step are skipped, not treated as fatal.  The device with the
 * highest super block generation is recorded as the "latest" one.
 *
 * fs_devices->seeding ends up set only if every accepted device has
 * BTRFS_SUPER_FLAG_SEEDING in its super block.
 *
 * Returns 0 on success, or -EIO when fs_devices->open_devices is still
 * zero after the scan.
 */
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct block_device *latest_bdev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 latest_devid = 0;
	u64 latest_transid = 0;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		bdev = open_bdev_exclusive(device->name, flags, holder);
		if (IS_ERR(bdev)) {
			printk(KERN_INFO " open %s failed \n ", device->name);
			goto error;
		}
		set_blocksize(bdev, 4096);

		bh = btrfs_read_dev_super(bdev);
		if (!bh)
			goto error_close;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = le64_to_cpu(disk_super->dev_item.devid);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_transid || device->generation > latest_transid) {
			latest_devid = devid;
			latest_transid = device->generation;
			latest_bdev = bdev;
		}

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
			fs_devices->rotating = 1;

		fs_devices->open_devices++;
		if (device->writeable) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}

		/*
		 * Everything needed from the super block has been copied
		 * out, so drop the buffer head here as well; otherwise one
		 * reference is leaked per successfully opened device.
		 */
		brelse(bh);
		continue;

error_brelse:
		brelse(bh);
error_close:
		/* close with the same mode the device was opened with */
		close_bdev_exclusive(bdev, flags);
error:
		continue;
	}

	if (fs_devices->open_devices == 0) {
		ret = -EIO;
		goto out;
	}

	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}
int btrfs_open_devices ( struct btrfs_fs_devices * fs_devices ,
2008-12-02 06:36:09 -05:00
fmode_t flags , void * holder )
2008-11-17 21:11:30 -05:00
{
int ret ;
mutex_lock ( & uuid_mutex ) ;
if ( fs_devices - > opened ) {
2008-12-12 10:03:26 -05:00
fs_devices - > opened + + ;
ret = 0 ;
2008-11-17 21:11:30 -05:00
} else {
2008-11-19 21:17:22 -05:00
ret = __btrfs_open_devices ( fs_devices , flags , holder ) ;
2008-11-17 21:11:30 -05:00
}
2008-03-24 15:02:07 -04:00
mutex_unlock ( & uuid_mutex ) ;
return ret ;
}
2008-12-02 06:36:09 -05:00
/*
 * Probe the block device at @path for a btrfs super block and, if one
 * is found, register the device via device_list_add().
 *
 * The device is opened exclusively with @flags / @holder only for the
 * duration of the probe and is closed again before returning.  The
 * whole operation runs under uuid_mutex.
 *
 * Returns 0 on success with *@fs_devices_ret set by device_list_add(),
 * the error from opening the device or setting its block size, -EIO
 * when no super block could be read, or the error from
 * device_list_add().
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *sb;
	struct block_device *bdev;
	struct buffer_head *bh;
	u64 transid;
	u64 devid;
	int ret;

	mutex_lock(&uuid_mutex);

	bdev = open_bdev_exclusive(path, flags, holder);
	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto out_unlock;
	}

	ret = set_blocksize(bdev, 4096);
	if (ret)
		goto out_close;

	bh = btrfs_read_dev_super(bdev);
	if (!bh) {
		ret = -EIO;
		goto out_close;
	}

	sb = (struct btrfs_super_block *)bh->b_data;
	devid = le64_to_cpu(sb->dev_item.devid);
	transid = btrfs_super_generation(sb);

	if (!sb->label[0]) {
		/* FIXME, make a real uuid parser */
		printk(KERN_INFO " device fsid %llx-%llx ",
		       *(unsigned long long *)sb->fsid,
		       *(unsigned long long *)(sb->fsid + 8));
	} else {
		printk(KERN_INFO " device label %s ", sb->label);
	}
	printk(KERN_CONT " devid %llu transid %llu %s \n ",
	       (unsigned long long)devid, (unsigned long long)transid, path);

	ret = device_list_add(path, sb, devid, fs_devices_ret);

	brelse(bh);
out_close:
	close_bdev_exclusive(bdev, flags);
out_unlock:
	mutex_unlock(&uuid_mutex);
	return ret;
}
2008-03-24 15:01:56 -04:00
/*
* this uses a pretty simple search , the expectation is that it is
* called very infrequently and that a given device has a small number
* of extents
*/
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have a lot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and if we can't, return -ENOSPC so that when we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
/*
 * Find a hole of at least num_bytes on a device by walking its
 * BTRFS_DEV_EXTENT_KEY items in device->dev_root.
 *
 * The search never starts below 1MB, and starts at fs_info->alloc_start
 * instead when alloc_start + num_bytes still fits inside the device.
 *
 * On success *start holds the byte offset of the hole.  *max_avail is
 * raised to the largest gap between two extents seen during the walk; it
 * is only ever compared against and increased, never reset, so the caller
 * has to initialize it.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOSPC if
 * no suitable hole exists below device->total_bytes, or a negative error
 * from the tree search helpers.
 */
int find_free_dev_extent ( struct btrfs_trans_handle * trans ,
struct btrfs_device * device , u64 num_bytes ,
u64 * start , u64 * max_avail )
2008-03-24 15:01:56 -04:00
{
struct btrfs_key key ;
struct btrfs_root * root = device - > dev_root ;
struct btrfs_dev_extent * dev_extent = NULL ;
2008-11-17 21:11:30 -05:00
struct btrfs_path * path ;
2008-03-24 15:01:56 -04:00
u64 hole_size = 0 ;
u64 last_byte = 0 ;
u64 search_start = 0 ;
u64 search_end = device - > total_bytes ;
int ret ;
int slot = 0 ;
int start_found ;
struct extent_buffer * l ;
2008-11-17 21:11:30 -05:00
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
2008-03-24 15:01:56 -04:00
path - > reada = 2 ;
2008-11-17 21:11:30 -05:00
start_found = 0 ;
2008-03-24 15:01:56 -04:00
/* FIXME use last free of some kind */
2008-03-24 15:02:07 -04:00
/* we don't want to overwrite the superblock on the drive,
* so we make sure to start at an offset of at least 1 MB
*/
search_start = max ( ( u64 ) 1024 * 1024 , search_start ) ;
2008-04-25 16:53:30 -04:00
/* honour alloc_start only if a num_bytes extent placed there would fit */
if ( root - > fs_info - > alloc_start + num_bytes < = device - > total_bytes )
search_start = max ( root - > fs_info - > alloc_start , search_start ) ;
2008-03-24 15:01:56 -04:00
key . objectid = device - > devid ;
key . offset = search_start ;
key . type = BTRFS_DEV_EXTENT_KEY ;
ret = btrfs_search_slot ( trans , root , & key , path , 0 , 0 ) ;
if ( ret < 0 )
goto error ;
2009-07-24 11:06:53 -04:00
/*
* no exact match : step back to the previous item with this objectid
* and type . If btrfs_previous_item ( ) returns > 0 the walk is treated
* as already started ( start_found = 1 ) .
*/
if ( ret > 0 ) {
ret = btrfs_previous_item ( root , path , key . objectid , key . type ) ;
if ( ret < 0 )
goto error ;
if ( ret > 0 )
start_found = 1 ;
}
2008-03-24 15:01:56 -04:00
l = path - > nodes [ 0 ] ;
btrfs_item_key_to_cpu ( l , & key , path - > slots [ 0 ] ) ;
/* walk the items in key order , tracking the end of the last extent seen */
while ( 1 ) {
l = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
if ( slot > = btrfs_header_nritems ( l ) ) {
ret = btrfs_next_leaf ( root , path ) ;
if ( ret = = 0 )
continue ;
if ( ret < 0 )
goto error ;
/*
* also reached by goto from below once the walk runs past this
* device ' s items : whatever lies after the last extent ( or after
* search_start if no extent was seen ) is the candidate hole .
* NOTE(review): *max_avail is not updated on this path .
*/
no_more_items :
if ( ! start_found ) {
if ( search_start > = search_end ) {
ret = - ENOSPC ;
goto error ;
}
* start = search_start ;
start_found = 1 ;
goto check_pending ;
}
* start = last_byte > search_start ?
last_byte : search_start ;
if ( search_end < = * start ) {
ret = - ENOSPC ;
goto error ;
}
goto check_pending ;
}
btrfs_item_key_to_cpu ( l , & key , slot ) ;
if ( key . objectid < device - > devid )
goto next ;
if ( key . objectid > device - > devid )
goto no_more_items ;
/* gap between the end of the previous extent and this item ' s offset */
if ( key . offset > = search_start & & key . offset > last_byte & &
start_found ) {
if ( last_byte < search_start )
last_byte = search_start ;
hole_size = key . offset - last_byte ;
2009-07-24 16:41:41 -04:00
if ( hole_size > * max_avail )
* max_avail = hole_size ;
2008-03-24 15:01:56 -04:00
if ( key . offset > last_byte & &
hole_size > = num_bytes ) {
* start = last_byte ;
goto check_pending ;
}
}
2009-01-05 21:25:51 -05:00
if ( btrfs_key_type ( & key ) ! = BTRFS_DEV_EXTENT_KEY )
2008-03-24 15:01:56 -04:00
goto next ;
start_found = 1 ;
dev_extent = btrfs_item_ptr ( l , slot , struct btrfs_dev_extent ) ;
last_byte = key . offset + btrfs_dev_extent_length ( l , dev_extent ) ;
next :
path - > slots [ 0 ] + + ;
cond_resched ( ) ;
}
check_pending :
/* we have to make sure we didn't find an extent that has already
* been allocated by the map tree or the original allocation
*/
BUG_ON ( * start < search_start ) ;
2008-03-24 15:01:59 -04:00
/* the hole must hold the whole request before the end of the device */
if ( * start + num_bytes > search_end ) {
2008-03-24 15:01:56 -04:00
ret = - ENOSPC ;
goto error ;
}
/* check for pending inserts here */
2008-11-17 21:11:30 -05:00
ret = 0 ;
2008-03-24 15:01:56 -04:00
/* common exit : the path is freed on success and on failure alike */
error :
2008-11-17 21:11:30 -05:00
btrfs_free_path ( path ) ;
2008-03-24 15:01:56 -04:00
return ret ;
}
2008-12-02 09:54:17 -05:00
/*
 * Delete the dev extent item of @device that starts at, or covers,
 * byte offset @start, and subtract its length from device->bytes_used
 * (unless bytes_used is already zero).
 *
 * Returns -ENOMEM if no path could be allocated, otherwise 0.  Any
 * lookup or deletion failure triggers BUG_ON().
 */
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start)
{
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *item = NULL;
	struct extent_buffer *eb = NULL;
	struct btrfs_path *path;
	struct btrfs_key search;
	struct btrfs_key hit;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search.objectid = device->devid;
	search.offset = start;
	search.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(trans, root, &search, path, -1, 1);
	if (ret == 0) {
		/* exact hit: an extent begins at @start */
		eb = path->nodes[0];
		item = btrfs_item_ptr(eb, path->slots[0],
				      struct btrfs_dev_extent);
	} else if (ret > 0) {
		/* no exact hit: the previous extent must cover @start */
		ret = btrfs_previous_item(root, path, search.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		BUG_ON(ret);
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &hit, path->slots[0]);
		item = btrfs_item_ptr(eb, path->slots[0],
				      struct btrfs_dev_extent);
		BUG_ON(hit.offset > start || hit.offset +
		       btrfs_dev_extent_length(eb, item) < start);
		ret = 0;
	}
	/* a negative return from the search lands here */
	BUG_ON(ret);

	if (device->bytes_used > 0)
		device->bytes_used -= btrfs_dev_extent_length(eb, item);

	ret = btrfs_del_item(trans, root, path);
	BUG_ON(ret);

	btrfs_free_path(path);
	return ret;
}
2008-11-17 21:11:30 -05:00
/*
 * Insert a dev extent item for @device, keyed (devid, DEV_EXTENT, @start),
 * into the device's dev_root.
 *
 * The new item records the owning chunk (@chunk_tree, @chunk_objectid,
 * @chunk_offset), the chunk tree uuid taken from fs_info, and the extent
 * length @num_bytes.
 *
 * Returns 0 on success or -ENOMEM if no path could be allocated.  A failed
 * item insertion trips a BUG_ON().
 */
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_device *device,
			   u64 chunk_tree, u64 chunk_objectid,
			   u64 chunk_offset, u64 start, u64 num_bytes)
{
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_key key;
	unsigned long uuid_offset;
	int ret;

	WARN_ON(!device->in_fs_metadata);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = start;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	BUG_ON(ret);

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);

	/* back reference to the chunk this extent belongs to */
	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	uuid_offset = (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent);
	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
			    uuid_offset, BTRFS_UUID_SIZE);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);

	btrfs_free_path(path);
	return ret;
}
2008-09-05 16:09:51 -04:00
/*
 * Compute the offset just past the last chunk item of @objectid in @root.
 *
 * The tree is searched for the largest possible key of @objectid and the
 * previous CHUNK_ITEM is inspected.  *@offset is set to that item's offset
 * plus its chunk length, or to 0 when there is no previous chunk item or
 * it belongs to a different objectid.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the
 * negative error returned by the tree search.
 */
static noinline int find_next_chunk(struct btrfs_root *root,
				    u64 objectid, u64 *offset)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_key found_key;

	path = btrfs_alloc_path();
	/* fail the call instead of crashing on allocation failure */
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	/* an item with offset (u64)-1 must never exist */
	BUG_ON(ret == 0);

	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
	if (ret) {
		*offset = 0;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid != objectid) {
			*offset = 0;
		} else {
			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
					       struct btrfs_chunk);
			*offset = found_key.offset +
				btrfs_chunk_length(path->nodes[0], chunk);
		}
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
2008-11-17 21:11:30 -05:00
/*
 * Pick the next device id: one more than the offset of the last DEV_ITEM
 * found in the chunk root, or 1 when there is no DEV_ITEM at all.
 *
 * Returns 0 with *@objectid filled in, -ENOMEM if no path could be
 * allocated, or the negative error returned by the tree search.
 */
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	/* a device item with offset (u64)-1 must never exist */
	BUG_ON(ret == 0);

	ret = btrfs_previous_item(chunk_root, path, BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret == 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*objectid = found_key.offset + 1;
	} else {
		/* no device items yet */
		*objectid = 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
/*
* the device information is stored in the chunk root
* the btrfs_device struct should be fully filled in
*/
int btrfs_add_device ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ,
struct btrfs_device * device )
{
int ret ;
struct btrfs_path * path ;
struct btrfs_dev_item * dev_item ;
struct extent_buffer * leaf ;
struct btrfs_key key ;
unsigned long ptr ;
root = root - > fs_info - > chunk_root ;
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
key . objectid = BTRFS_DEV_ITEMS_OBJECTID ;
key . type = BTRFS_DEV_ITEM_KEY ;
2008-11-17 21:11:30 -05:00
key . offset = device - > devid ;
2008-03-24 15:01:56 -04:00
ret = btrfs_insert_empty_item ( trans , root , path , & key ,
2008-03-24 15:02:07 -04:00
sizeof ( * dev_item ) ) ;
2008-03-24 15:01:56 -04:00
if ( ret )
goto out ;
leaf = path - > nodes [ 0 ] ;
dev_item = btrfs_item_ptr ( leaf , path - > slots [ 0 ] , struct btrfs_dev_item ) ;
btrfs_set_device_id ( leaf , dev_item , device - > devid ) ;
2008-11-17 21:11:30 -05:00
btrfs_set_device_generation ( leaf , dev_item , 0 ) ;
2008-03-24 15:01:56 -04:00
btrfs_set_device_type ( leaf , dev_item , device - > type ) ;
btrfs_set_device_io_align ( leaf , dev_item , device - > io_align ) ;
btrfs_set_device_io_width ( leaf , dev_item , device - > io_width ) ;
btrfs_set_device_sector_size ( leaf , dev_item , device - > sector_size ) ;
btrfs_set_device_total_bytes ( leaf , dev_item , device - > total_bytes ) ;
btrfs_set_device_bytes_used ( leaf , dev_item , device - > bytes_used ) ;
2008-04-15 15:41:47 -04:00
btrfs_set_device_group ( leaf , dev_item , 0 ) ;
btrfs_set_device_seek_speed ( leaf , dev_item , 0 ) ;
btrfs_set_device_bandwidth ( leaf , dev_item , 0 ) ;
2008-12-08 16:40:21 -05:00
btrfs_set_device_start_offset ( leaf , dev_item , 0 ) ;
2008-03-24 15:01:56 -04:00
ptr = ( unsigned long ) btrfs_device_uuid ( dev_item ) ;
2008-04-15 15:41:47 -04:00
write_extent_buffer ( leaf , device - > uuid , ptr , BTRFS_UUID_SIZE ) ;
2008-11-17 21:11:30 -05:00
ptr = ( unsigned long ) btrfs_device_fsid ( dev_item ) ;
write_extent_buffer ( leaf , root - > fs_info - > fsid , ptr , BTRFS_UUID_SIZE ) ;
2008-03-24 15:01:56 -04:00
btrfs_mark_buffer_dirty ( leaf ) ;
2008-11-17 21:11:30 -05:00
ret = 0 ;
2008-03-24 15:01:56 -04:00
out :
btrfs_free_path ( path ) ;
return ret ;
}
2008-04-25 16:53:30 -04:00
2008-05-07 11:43:44 -04:00
static int btrfs_rm_dev_item ( struct btrfs_root * root ,
struct btrfs_device * device )
{
int ret ;
struct btrfs_path * path ;
struct btrfs_key key ;
struct btrfs_trans_handle * trans ;
root = root - > fs_info - > chunk_root ;
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
trans = btrfs_start_transaction ( root , 1 ) ;
key . objectid = BTRFS_DEV_ITEMS_OBJECTID ;
key . type = BTRFS_DEV_ITEM_KEY ;
key . offset = device - > devid ;
2008-07-08 14:19:17 -04:00
lock_chunks ( root ) ;
2008-05-07 11:43:44 -04:00
ret = btrfs_search_slot ( trans , root , & key , path , - 1 , 1 ) ;
if ( ret < 0 )
goto out ;
if ( ret > 0 ) {
ret = - ENOENT ;
goto out ;
}
ret = btrfs_del_item ( trans , root , path ) ;
if ( ret )
goto out ;
out :
btrfs_free_path ( path ) ;
2008-07-08 14:19:17 -04:00
unlock_chunks ( root ) ;
2008-05-07 11:43:44 -04:00
btrfs_commit_transaction ( trans , root ) ;
return ret ;
}
int btrfs_rm_device ( struct btrfs_root * root , char * device_path )
{
struct btrfs_device * device ;
2008-11-17 21:11:30 -05:00
struct btrfs_device * next_device ;
2008-05-07 11:43:44 -04:00
struct block_device * bdev ;
2008-05-13 13:46:40 -04:00
struct buffer_head * bh = NULL ;
2008-05-07 11:43:44 -04:00
struct btrfs_super_block * disk_super ;
u64 all_avail ;
u64 devid ;
2008-11-17 21:11:30 -05:00
u64 num_devices ;
u8 * dev_uuid ;
2008-05-07 11:43:44 -04:00
int ret = 0 ;
mutex_lock ( & uuid_mutex ) ;
2008-07-08 14:19:17 -04:00
mutex_lock ( & root - > fs_info - > volume_mutex ) ;
2008-05-07 11:43:44 -04:00
all_avail = root - > fs_info - > avail_data_alloc_bits |
root - > fs_info - > avail_system_alloc_bits |
root - > fs_info - > avail_metadata_alloc_bits ;
if ( ( all_avail & BTRFS_BLOCK_GROUP_RAID10 ) & &
2010-01-27 02:09:38 +00:00
root - > fs_info - > fs_devices - > num_devices < = 4 ) {
2009-01-05 21:25:51 -05:00
printk ( KERN_ERR " btrfs: unable to go below four devices "
" on raid10 \n " ) ;
2008-05-07 11:43:44 -04:00
ret = - EINVAL ;
goto out ;
}
if ( ( all_avail & BTRFS_BLOCK_GROUP_RAID1 ) & &
2010-01-27 02:09:38 +00:00
root - > fs_info - > fs_devices - > num_devices < = 2 ) {
2009-01-05 21:25:51 -05:00
printk ( KERN_ERR " btrfs: unable to go below two "
" devices on raid1 \n " ) ;
2008-05-07 11:43:44 -04:00
ret = - EINVAL ;
goto out ;
}
2008-05-13 13:46:40 -04:00
if ( strcmp ( device_path , " missing " ) = = 0 ) {
struct list_head * devices ;
struct btrfs_device * tmp ;
2008-05-07 11:43:44 -04:00
2008-05-13 13:46:40 -04:00
device = NULL ;
devices = & root - > fs_info - > fs_devices - > devices ;
2009-06-10 15:17:02 -04:00
mutex_lock ( & root - > fs_info - > fs_devices - > device_list_mutex ) ;
2009-01-21 10:59:08 -05:00
list_for_each_entry ( tmp , devices , dev_list ) {
2008-05-13 13:46:40 -04:00
if ( tmp - > in_fs_metadata & & ! tmp - > bdev ) {
device = tmp ;
break ;
}
}
2009-06-10 15:17:02 -04:00
mutex_unlock ( & root - > fs_info - > fs_devices - > device_list_mutex ) ;
2008-05-13 13:46:40 -04:00
bdev = NULL ;
bh = NULL ;
disk_super = NULL ;
if ( ! device ) {
2009-01-05 21:25:51 -05:00
printk ( KERN_ERR " btrfs: no missing devices found to "
" remove \n " ) ;
2008-05-13 13:46:40 -04:00
goto out ;
}
} else {
2008-12-02 06:36:09 -05:00
bdev = open_bdev_exclusive ( device_path , FMODE_READ ,
2008-05-13 13:46:40 -04:00
root - > fs_info - > bdev_holder ) ;
if ( IS_ERR ( bdev ) ) {
ret = PTR_ERR ( bdev ) ;
goto out ;
}
2008-05-07 11:43:44 -04:00
2008-11-17 21:11:30 -05:00
set_blocksize ( bdev , 4096 ) ;
2008-12-08 16:46:26 -05:00
bh = btrfs_read_dev_super ( bdev ) ;
2008-05-13 13:46:40 -04:00
if ( ! bh ) {
ret = - EIO ;
goto error_close ;
}
disk_super = ( struct btrfs_super_block * ) bh - > b_data ;
devid = le64_to_cpu ( disk_super - > dev_item . devid ) ;
2008-11-17 21:11:30 -05:00
dev_uuid = disk_super - > dev_item . uuid ;
device = btrfs_find_device ( root , devid , dev_uuid ,
disk_super - > fsid ) ;
2008-05-13 13:46:40 -04:00
if ( ! device ) {
ret = - ENOENT ;
goto error_brelse ;
}
2008-11-17 21:11:30 -05:00
}
2008-05-13 13:46:40 -04:00
2008-11-17 21:11:30 -05:00
if ( device - > writeable & & root - > fs_info - > fs_devices - > rw_devices = = 1 ) {
2009-01-05 21:25:51 -05:00
printk ( KERN_ERR " btrfs: unable to remove the only writeable "
" device \n " ) ;
2008-11-17 21:11:30 -05:00
ret = - EINVAL ;
goto error_brelse ;
}
if ( device - > writeable ) {
list_del_init ( & device - > dev_alloc_list ) ;
root - > fs_info - > fs_devices - > rw_devices - - ;
2008-05-13 13:46:40 -04:00
}
2008-05-07 11:43:44 -04:00
ret = btrfs_shrink_device ( device , 0 ) ;
if ( ret )
goto error_brelse ;
ret = btrfs_rm_dev_item ( root - > fs_info - > chunk_root , device ) ;
if ( ret )
goto error_brelse ;
2008-11-17 21:11:30 -05:00
device - > in_fs_metadata = 0 ;
2009-06-10 15:17:02 -04:00
/*
* the device list mutex makes sure that we don ' t change
* the device list while someone else is writing out all
* the device supers .
*/
mutex_lock ( & root - > fs_info - > fs_devices - > device_list_mutex ) ;
2008-12-12 10:03:26 -05:00
list_del_init ( & device - > dev_list ) ;
2009-06-10 15:17:02 -04:00
mutex_unlock ( & root - > fs_info - > fs_devices - > device_list_mutex ) ;
2008-12-12 10:03:26 -05:00
device - > fs_devices - > num_devices - - ;
2008-11-17 21:11:30 -05:00
next_device = list_entry ( root - > fs_info - > fs_devices - > devices . next ,
struct btrfs_device , dev_list ) ;
if ( device - > bdev = = root - > fs_info - > sb - > s_bdev )
root - > fs_info - > sb - > s_bdev = next_device - > bdev ;
if ( device - > bdev = = root - > fs_info - > fs_devices - > latest_bdev )
root - > fs_info - > fs_devices - > latest_bdev = next_device - > bdev ;
2008-12-12 10:03:26 -05:00
if ( device - > bdev ) {
close_bdev_exclusive ( device - > bdev , device - > mode ) ;
device - > bdev = NULL ;
device - > fs_devices - > open_devices - - ;
}
2008-11-17 21:11:30 -05:00
num_devices = btrfs_super_num_devices ( & root - > fs_info - > super_copy ) - 1 ;
btrfs_set_super_num_devices ( & root - > fs_info - > super_copy , num_devices ) ;
2008-12-12 10:03:26 -05:00
if ( device - > fs_devices - > open_devices = = 0 ) {
struct btrfs_fs_devices * fs_devices ;
fs_devices = root - > fs_info - > fs_devices ;
while ( fs_devices ) {
if ( fs_devices - > seed = = device - > fs_devices )
break ;
fs_devices = fs_devices - > seed ;
2008-11-17 21:11:30 -05:00
}
2008-12-12 10:03:26 -05:00
fs_devices - > seed = device - > fs_devices - > seed ;
device - > fs_devices - > seed = NULL ;
__btrfs_close_devices ( device - > fs_devices ) ;
free_fs_devices ( device - > fs_devices ) ;
2008-11-17 21:11:30 -05:00
}
/*
* at this point , the device is zero sized . We want to
* remove it from the devices list and zero out the old super
*/
if ( device - > writeable ) {
2008-05-13 13:46:40 -04:00
/* make sure this device isn't detected as part of
* the FS anymore
*/
memset ( & disk_super - > magic , 0 , sizeof ( disk_super - > magic ) ) ;
set_buffer_dirty ( bh ) ;
sync_dirty_buffer ( bh ) ;
}
2008-05-07 11:43:44 -04:00
kfree ( device - > name ) ;
kfree ( device ) ;
ret = 0 ;
error_brelse :
brelse ( bh ) ;
error_close :
2008-05-13 13:46:40 -04:00
if ( bdev )
2008-12-02 06:36:09 -05:00
close_bdev_exclusive ( bdev , FMODE_READ ) ;
2008-05-07 11:43:44 -04:00
out :
2008-07-08 14:19:17 -04:00
mutex_unlock ( & root - > fs_info - > volume_mutex ) ;
2008-05-07 11:43:44 -04:00
mutex_unlock ( & uuid_mutex ) ;
return ret ;
}
2008-11-17 21:11:30 -05:00
/*
* does all the dirty work required for changing file system ' s UUID .
*/
static int btrfs_prepare_sprout ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root )
{
struct btrfs_fs_devices * fs_devices = root - > fs_info - > fs_devices ;
struct btrfs_fs_devices * old_devices ;
2008-12-12 10:03:26 -05:00
struct btrfs_fs_devices * seed_devices ;
2008-11-17 21:11:30 -05:00
struct btrfs_super_block * disk_super = & root - > fs_info - > super_copy ;
struct btrfs_device * device ;
u64 super_flags ;
BUG_ON ( ! mutex_is_locked ( & uuid_mutex ) ) ;
2008-12-12 10:03:26 -05:00
if ( ! fs_devices - > seeding )
2008-11-17 21:11:30 -05:00
return - EINVAL ;
2008-12-12 10:03:26 -05:00
seed_devices = kzalloc ( sizeof ( * fs_devices ) , GFP_NOFS ) ;
if ( ! seed_devices )
2008-11-17 21:11:30 -05:00
return - ENOMEM ;
2008-12-12 10:03:26 -05:00
old_devices = clone_fs_devices ( fs_devices ) ;
if ( IS_ERR ( old_devices ) ) {
kfree ( seed_devices ) ;
return PTR_ERR ( old_devices ) ;
2008-11-17 21:11:30 -05:00
}
2008-12-12 10:03:26 -05:00
2008-11-17 21:11:30 -05:00
list_add ( & old_devices - > list , & fs_uuids ) ;
2008-12-12 10:03:26 -05:00
memcpy ( seed_devices , fs_devices , sizeof ( * seed_devices ) ) ;
seed_devices - > opened = 1 ;
INIT_LIST_HEAD ( & seed_devices - > devices ) ;
INIT_LIST_HEAD ( & seed_devices - > alloc_list ) ;
2009-06-10 15:17:02 -04:00
mutex_init ( & seed_devices - > device_list_mutex ) ;
2008-12-12 10:03:26 -05:00
list_splice_init ( & fs_devices - > devices , & seed_devices - > devices ) ;
list_splice_init ( & fs_devices - > alloc_list , & seed_devices - > alloc_list ) ;
list_for_each_entry ( device , & seed_devices - > devices , dev_list ) {
device - > fs_devices = seed_devices ;
}
2008-11-17 21:11:30 -05:00
fs_devices - > seeding = 0 ;
fs_devices - > num_devices = 0 ;
fs_devices - > open_devices = 0 ;
2008-12-12 10:03:26 -05:00
fs_devices - > seed = seed_devices ;
2008-11-17 21:11:30 -05:00
generate_random_uuid ( fs_devices - > fsid ) ;
memcpy ( root - > fs_info - > fsid , fs_devices - > fsid , BTRFS_FSID_SIZE ) ;
memcpy ( disk_super - > fsid , fs_devices - > fsid , BTRFS_FSID_SIZE ) ;
super_flags = btrfs_super_flags ( disk_super ) &
~ BTRFS_SUPER_FLAG_SEEDING ;
btrfs_set_super_flags ( disk_super , super_flags ) ;
return 0 ;
}
/*
* strore the expected generation for seed devices in device items .
*/
static int btrfs_finish_sprout ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root )
{
struct btrfs_path * path ;
struct extent_buffer * leaf ;
struct btrfs_dev_item * dev_item ;
struct btrfs_device * device ;
struct btrfs_key key ;
u8 fs_uuid [ BTRFS_UUID_SIZE ] ;
u8 dev_uuid [ BTRFS_UUID_SIZE ] ;
u64 devid ;
int ret ;
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
root = root - > fs_info - > chunk_root ;
key . objectid = BTRFS_DEV_ITEMS_OBJECTID ;
key . offset = 0 ;
key . type = BTRFS_DEV_ITEM_KEY ;
while ( 1 ) {
ret = btrfs_search_slot ( trans , root , & key , path , 0 , 1 ) ;
if ( ret < 0 )
goto error ;
leaf = path - > nodes [ 0 ] ;
next_slot :
if ( path - > slots [ 0 ] > = btrfs_header_nritems ( leaf ) ) {
ret = btrfs_next_leaf ( root , path ) ;
if ( ret > 0 )
break ;
if ( ret < 0 )
goto error ;
leaf = path - > nodes [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key , path - > slots [ 0 ] ) ;
btrfs_release_path ( root , path ) ;
continue ;
}
btrfs_item_key_to_cpu ( leaf , & key , path - > slots [ 0 ] ) ;
if ( key . objectid ! = BTRFS_DEV_ITEMS_OBJECTID | |
key . type ! = BTRFS_DEV_ITEM_KEY )
break ;
dev_item = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_dev_item ) ;
devid = btrfs_device_id ( leaf , dev_item ) ;
read_extent_buffer ( leaf , dev_uuid ,
( unsigned long ) btrfs_device_uuid ( dev_item ) ,
BTRFS_UUID_SIZE ) ;
read_extent_buffer ( leaf , fs_uuid ,
( unsigned long ) btrfs_device_fsid ( dev_item ) ,
BTRFS_UUID_SIZE ) ;
device = btrfs_find_device ( root , devid , dev_uuid , fs_uuid ) ;
BUG_ON ( ! device ) ;
if ( device - > fs_devices - > seeding ) {
btrfs_set_device_generation ( leaf , dev_item ,
device - > generation ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
}
path - > slots [ 0 ] + + ;
goto next_slot ;
}
ret = 0 ;
error :
btrfs_free_path ( path ) ;
return ret ;
}
2008-04-28 15:29:42 -04:00
int btrfs_init_new_device ( struct btrfs_root * root , char * device_path )
{
struct btrfs_trans_handle * trans ;
struct btrfs_device * device ;
struct block_device * bdev ;
struct list_head * devices ;
2008-11-17 21:11:30 -05:00
struct super_block * sb = root - > fs_info - > sb ;
2008-04-28 15:29:42 -04:00
u64 total_bytes ;
2008-11-17 21:11:30 -05:00
int seeding_dev = 0 ;
2008-04-28 15:29:42 -04:00
int ret = 0 ;
2008-11-17 21:11:30 -05:00
if ( ( sb - > s_flags & MS_RDONLY ) & & ! root - > fs_info - > fs_devices - > seeding )
return - EINVAL ;
2008-04-28 15:29:42 -04:00
2008-11-19 21:17:22 -05:00
bdev = open_bdev_exclusive ( device_path , 0 , root - > fs_info - > bdev_holder ) ;
2010-01-27 02:09:00 +00:00
if ( IS_ERR ( bdev ) )
return PTR_ERR ( bdev ) ;
2008-06-25 16:01:30 -04:00
2008-11-17 21:11:30 -05:00
if ( root - > fs_info - > fs_devices - > seeding ) {
seeding_dev = 1 ;
down_write ( & sb - > s_umount ) ;
mutex_lock ( & uuid_mutex ) ;
}
2008-09-29 11:19:10 -04:00
filemap_write_and_wait ( bdev - > bd_inode - > i_mapping ) ;
2008-07-08 14:19:17 -04:00
mutex_lock ( & root - > fs_info - > volume_mutex ) ;
2008-06-25 16:01:30 -04:00
2008-04-28 15:29:42 -04:00
devices = & root - > fs_info - > fs_devices - > devices ;
2009-06-10 15:17:02 -04:00
/*
* we have the volume lock , so we don ' t need the extra
* device list mutex while reading the list here .
*/
2009-01-21 10:59:08 -05:00
list_for_each_entry ( device , devices , dev_list ) {
2008-04-28 15:29:42 -04:00
if ( device - > bdev = = bdev ) {
ret = - EEXIST ;
2008-11-17 21:11:30 -05:00
goto error ;
2008-04-28 15:29:42 -04:00
}
}
device = kzalloc ( sizeof ( * device ) , GFP_NOFS ) ;
if ( ! device ) {
/* we can safely leave the fs_devices entry around */
ret = - ENOMEM ;
2008-11-17 21:11:30 -05:00
goto error ;
2008-04-28 15:29:42 -04:00
}
device - > name = kstrdup ( device_path , GFP_NOFS ) ;
if ( ! device - > name ) {
kfree ( device ) ;
2008-11-17 21:11:30 -05:00
ret = - ENOMEM ;
goto error ;
2008-04-28 15:29:42 -04:00
}
2008-11-17 21:11:30 -05:00
ret = find_next_devid ( root , & device - > devid ) ;
if ( ret ) {
kfree ( device ) ;
goto error ;
}
trans = btrfs_start_transaction ( root , 1 ) ;
lock_chunks ( root ) ;
device - > barriers = 1 ;
device - > writeable = 1 ;
device - > work . func = pending_bios_fn ;
generate_random_uuid ( device - > uuid ) ;
spin_lock_init ( & device - > io_lock ) ;
device - > generation = trans - > transid ;
2008-04-28 15:29:42 -04:00
device - > io_width = root - > sectorsize ;
device - > io_align = root - > sectorsize ;
device - > sector_size = root - > sectorsize ;
device - > total_bytes = i_size_read ( bdev - > bd_inode ) ;
2009-06-04 09:23:50 -04:00
device - > disk_total_bytes = device - > total_bytes ;
2008-04-28 15:29:42 -04:00
device - > dev_root = root - > fs_info - > dev_root ;
device - > bdev = bdev ;
2008-05-13 13:46:40 -04:00
device - > in_fs_metadata = 1 ;
2008-11-19 21:17:22 -05:00
device - > mode = 0 ;
2008-11-17 21:11:30 -05:00
set_blocksize ( device - > bdev , 4096 ) ;
2008-04-28 15:29:42 -04:00
2008-11-17 21:11:30 -05:00
if ( seeding_dev ) {
sb - > s_flags & = ~ MS_RDONLY ;
ret = btrfs_prepare_sprout ( trans , root ) ;
BUG_ON ( ret ) ;
}
2008-04-28 15:29:42 -04:00
2008-11-17 21:11:30 -05:00
device - > fs_devices = root - > fs_info - > fs_devices ;
2009-06-10 15:17:02 -04:00
/*
* we don ' t want write_supers to jump in here with our device
* half setup
*/
mutex_lock ( & root - > fs_info - > fs_devices - > device_list_mutex ) ;
2008-11-17 21:11:30 -05:00
list_add ( & device - > dev_list , & root - > fs_info - > fs_devices - > devices ) ;
list_add ( & device - > dev_alloc_list ,
& root - > fs_info - > fs_devices - > alloc_list ) ;
root - > fs_info - > fs_devices - > num_devices + + ;
root - > fs_info - > fs_devices - > open_devices + + ;
root - > fs_info - > fs_devices - > rw_devices + + ;
root - > fs_info - > fs_devices - > total_rw_bytes + = device - > total_bytes ;
2008-09-05 16:43:54 -04:00
2009-06-10 09:51:32 -04:00
if ( ! blk_queue_nonrot ( bdev_get_queue ( bdev ) ) )
root - > fs_info - > fs_devices - > rotating = 1 ;
2008-04-28 15:29:42 -04:00
total_bytes = btrfs_super_total_bytes ( & root - > fs_info - > super_copy ) ;
btrfs_set_super_total_bytes ( & root - > fs_info - > super_copy ,
total_bytes + device - > total_bytes ) ;
total_bytes = btrfs_super_num_devices ( & root - > fs_info - > super_copy ) ;
btrfs_set_super_num_devices ( & root - > fs_info - > super_copy ,
total_bytes + 1 ) ;
2009-06-10 15:17:02 -04:00
mutex_unlock ( & root - > fs_info - > fs_devices - > device_list_mutex ) ;
2008-04-28 15:29:42 -04:00
2008-11-17 21:11:30 -05:00
if ( seeding_dev ) {
ret = init_first_rw_device ( trans , root , device ) ;
BUG_ON ( ret ) ;
ret = btrfs_finish_sprout ( trans , root ) ;
BUG_ON ( ret ) ;
} else {
ret = btrfs_add_device ( trans , root , device ) ;
}
2009-03-10 13:17:18 -04:00
/*
* we ' ve got more storage , clear any full flags on the space
* infos
*/
btrfs_clear_space_info_full ( root - > fs_info ) ;
2008-07-08 14:19:17 -04:00
unlock_chunks ( root ) ;
2008-11-17 21:11:30 -05:00
btrfs_commit_transaction ( trans , root ) ;
2008-06-25 16:01:30 -04:00
2008-11-17 21:11:30 -05:00
if ( seeding_dev ) {
mutex_unlock ( & uuid_mutex ) ;
up_write ( & sb - > s_umount ) ;
2008-04-28 15:29:42 -04:00
2008-11-17 21:11:30 -05:00
ret = btrfs_relocate_sys_chunks ( root ) ;
BUG_ON ( ret ) ;
}
out :
mutex_unlock ( & root - > fs_info - > volume_mutex ) ;
return ret ;
error :
2008-11-19 21:17:22 -05:00
close_bdev_exclusive ( bdev , 0 ) ;
2008-11-17 21:11:30 -05:00
if ( seeding_dev ) {
mutex_unlock ( & uuid_mutex ) ;
up_write ( & sb - > s_umount ) ;
}
2008-04-28 15:29:42 -04:00
goto out ;
}
2009-01-05 21:25:51 -05:00
static noinline int btrfs_update_device ( struct btrfs_trans_handle * trans ,
struct btrfs_device * device )
2008-03-24 15:01:56 -04:00
{
int ret ;
struct btrfs_path * path ;
struct btrfs_root * root ;
struct btrfs_dev_item * dev_item ;
struct extent_buffer * leaf ;
struct btrfs_key key ;
root = device - > dev_root - > fs_info - > chunk_root ;
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
key . objectid = BTRFS_DEV_ITEMS_OBJECTID ;
key . type = BTRFS_DEV_ITEM_KEY ;
key . offset = device - > devid ;
ret = btrfs_search_slot ( trans , root , & key , path , 0 , 1 ) ;
if ( ret < 0 )
goto out ;
if ( ret > 0 ) {
ret = - ENOENT ;
goto out ;
}
leaf = path - > nodes [ 0 ] ;
dev_item = btrfs_item_ptr ( leaf , path - > slots [ 0 ] , struct btrfs_dev_item ) ;
btrfs_set_device_id ( leaf , dev_item , device - > devid ) ;
btrfs_set_device_type ( leaf , dev_item , device - > type ) ;
btrfs_set_device_io_align ( leaf , dev_item , device - > io_align ) ;
btrfs_set_device_io_width ( leaf , dev_item , device - > io_width ) ;
btrfs_set_device_sector_size ( leaf , dev_item , device - > sector_size ) ;
2009-04-27 07:29:03 -04:00
btrfs_set_device_total_bytes ( leaf , dev_item , device - > disk_total_bytes ) ;
2008-03-24 15:01:56 -04:00
btrfs_set_device_bytes_used ( leaf , dev_item , device - > bytes_used ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
out :
btrfs_free_path ( path ) ;
return ret ;
}
2008-07-08 14:19:17 -04:00
/*
 * Grow @device to @new_size bytes and persist the new size.
 *
 * The size difference is added to the super block copy's total bytes
 * and to fs_devices->total_rw_bytes, the device's total_bytes and
 * disk_total_bytes are set to @new_size, the space info full flags are
 * cleared, and the device item is rewritten.
 *
 * Returns -EACCES if the device is not writeable, -EINVAL if @new_size
 * is not larger than the current size, otherwise the result of
 * btrfs_update_device().
 */
static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
			       struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
	struct btrfs_super_block *super_copy = &fs_info->super_copy;
	u64 old_total;
	u64 grow_by;

	if (!device->writeable)
		return -EACCES;
	if (new_size <= device->total_bytes)
		return -EINVAL;

	old_total = btrfs_super_total_bytes(super_copy);
	grow_by = new_size - device->total_bytes;

	btrfs_set_super_total_bytes(super_copy, old_total + grow_by);
	device->fs_devices->total_rw_bytes += grow_by;

	device->total_bytes = new_size;
	device->disk_total_bytes = new_size;
	btrfs_clear_space_info_full(fs_info);

	return btrfs_update_device(trans, device);
}
2008-07-08 14:19:17 -04:00
/*
 * Grow @device to @new_size while holding the chunk lock of the
 * device's dev_root.  Returns whatever __btrfs_grow_device() returns.
 */
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_root *dev_root = device->dev_root;
	int err;

	lock_chunks(dev_root);
	err = __btrfs_grow_device(trans, device, new_size);
	unlock_chunks(device->dev_root);

	return err;
}
2008-04-25 16:53:30 -04:00
/*
 * Delete the chunk item keyed (@chunk_objectid, CHUNK_ITEM, @chunk_offset)
 * from the chunk root.
 *
 * Returns 0 on success or -ENOMEM if no path could be allocated.  A
 * missing item or a failed search/deletion trips a BUG_ON().
 */
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    u64 chunk_tree, u64 chunk_objectid,
			    u64 chunk_offset)
{
	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = chunk_objectid;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	/* the chunk item must exist */
	err = btrfs_search_slot(trans, chunk_root, &key, path, -1, 1);
	BUG_ON(err);

	err = btrfs_del_item(trans, chunk_root, path);
	BUG_ON(err);

	btrfs_free_path(path);
	return 0;
}
2008-12-02 09:54:17 -05:00
/*
 * Remove the entry for (@chunk_objectid, @chunk_offset) from the
 * sys_chunk_array kept in the super block copy.
 *
 * The array is a packed sequence of (disk key, chunk item) pairs.  Each
 * matching pair is cut out by moving the tail of the array down over
 * it, and the recorded array size is shrunk accordingly.
 *
 * Returns 0 once the whole array has been walked, or -EIO as soon as an
 * entry whose key type is not CHUNK_ITEM is encountered.
 */
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
			       u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	u8 *entry = super_copy->sys_chunk_array;
	u32 array_size = btrfs_super_sys_array_size(super_copy);
	u32 pos = 0;
	u32 entry_len;
	u32 num_stripes;

	while (pos < array_size) {
		disk_key = (struct btrfs_disk_key *)entry;
		btrfs_disk_key_to_cpu(&key, disk_key);

		/* only chunk items are expected in the array */
		if (key.type != BTRFS_CHUNK_ITEM_KEY)
			return -EIO;

		chunk = (struct btrfs_chunk *)(entry + sizeof(*disk_key));
		num_stripes = btrfs_stack_chunk_num_stripes(chunk);
		entry_len = sizeof(*disk_key) +
			    btrfs_chunk_item_size(num_stripes);

		if (key.objectid != chunk_objectid ||
		    key.offset != chunk_offset) {
			/* not ours: step over it */
			entry += entry_len;
			pos += entry_len;
			continue;
		}

		/* close the gap; the next entry now sits at 'entry' */
		memmove(entry, entry + entry_len,
			array_size - (pos + entry_len));
		array_size -= entry_len;
		btrfs_set_super_sys_array_size(super_copy, array_size);
	}

	return 0;
}
2008-12-02 09:54:17 -05:00
/*
 * Move everything out of the chunk at @chunk_offset and then delete the
 * chunk.
 *
 * @root:           any root of the filesystem; it is only used to reach
 *                  fs_info and is immediately replaced by the chunk root
 * @chunk_tree:     passed through to btrfs_free_chunk()
 * @chunk_objectid: objectid of the chunk item
 * @chunk_offset:   offset (logical start) of the chunk
 *
 * Returns -ENOSPC when btrfs_can_relocate() reports that the chunk cannot
 * be relocated, 0 on success.  Every other failure trips a BUG_ON().
 */
static int btrfs_relocate_chunk(struct btrfs_root *root,
				u64 chunk_tree, u64 chunk_objectid,
				u64 chunk_offset)
{
	struct extent_map_tree *em_tree;
	struct btrfs_root *extent_root;
	struct btrfs_trans_handle *trans;
	struct extent_map *em;
	struct map_lookup *map;
	int ret;
	int i;

	root = root->fs_info->chunk_root;
	extent_root = root->fs_info->extent_root;
	em_tree = &root->fs_info->mapping_tree.map_tree;

	ret = btrfs_can_relocate(extent_root, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
	BUG_ON(ret);

	trans = btrfs_start_transaction(root, 1);
	BUG_ON(!trans);

	lock_chunks(root);

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	/*
	 * A missing mapping is checked first so that it is reported as a
	 * BUG instead of a NULL pointer dereference in the range test.
	 */
	BUG_ON(!em || em->start > chunk_offset ||
	       em->start + em->len < chunk_offset);

	/* for chunk mappings the map_lookup is stashed in em->bdev */
	map = (struct map_lookup *)em->bdev;

	for (i = 0; i < map->num_stripes; i++) {
		/*
		 * NOTE(review): the device pointer is handed to
		 * btrfs_free_dev_extent() before the NULL check below;
		 * confirm that helper tolerates a NULL device.
		 */
		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
					    map->stripes[i].physical);
		BUG_ON(ret);

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			BUG_ON(ret);
		}
	}
	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
			       chunk_offset);
	BUG_ON(ret);

	/* system chunks are also listed in the super block's array */
	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
		BUG_ON(ret);
	}

	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
	BUG_ON(ret);

	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	kfree(map);
	em->bdev = NULL;

	/* once for the tree */
	free_extent_map(em);
	/* once for us */
	free_extent_map(em);

	unlock_chunks(root);
	btrfs_end_transaction(trans, root);
	return 0;
}
static int btrfs_relocate_sys_chunks ( struct btrfs_root * root )
{
struct btrfs_root * chunk_root = root - > fs_info - > chunk_root ;
struct btrfs_path * path ;
struct extent_buffer * leaf ;
struct btrfs_chunk * chunk ;
struct btrfs_key key ;
struct btrfs_key found_key ;
u64 chunk_tree = chunk_root - > root_key . objectid ;
u64 chunk_type ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
bool retried = false ;
int failed = 0 ;
2008-11-17 21:11:30 -05:00
int ret ;
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
again :
2008-11-17 21:11:30 -05:00
key . objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID ;
key . offset = ( u64 ) - 1 ;
key . type = BTRFS_CHUNK_ITEM_KEY ;
while ( 1 ) {
ret = btrfs_search_slot ( NULL , chunk_root , & key , path , 0 , 0 ) ;
if ( ret < 0 )
goto error ;
BUG_ON ( ret = = 0 ) ;
ret = btrfs_previous_item ( chunk_root , path , key . objectid ,
key . type ) ;
if ( ret < 0 )
goto error ;
if ( ret > 0 )
break ;
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 10:09:34 -04:00
2008-11-17 21:11:30 -05:00
leaf = path - > nodes [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & found_key , path - > slots [ 0 ] ) ;
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 10:09:34 -04:00
2008-11-17 21:11:30 -05:00
chunk = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_chunk ) ;
chunk_type = btrfs_chunk_type ( leaf , chunk ) ;
btrfs_release_path ( chunk_root , path ) ;
2008-04-25 16:53:30 -04:00
2008-11-17 21:11:30 -05:00
if ( chunk_type & BTRFS_BLOCK_GROUP_SYSTEM ) {
ret = btrfs_relocate_chunk ( chunk_root , chunk_tree ,
found_key . objectid ,
found_key . offset ) ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
if ( ret = = - ENOSPC )
failed + + ;
else if ( ret )
BUG ( ) ;
2008-11-17 21:11:30 -05:00
}
2008-04-25 16:53:30 -04:00
2008-11-17 21:11:30 -05:00
if ( found_key . offset = = 0 )
break ;
key . offset = found_key . offset - 1 ;
}
ret = 0 ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
if ( failed & & ! retried ) {
failed = 0 ;
retried = true ;
goto again ;
} else if ( failed & & retried ) {
WARN_ON ( 1 ) ;
ret = - ENOSPC ;
}
2008-11-17 21:11:30 -05:00
error :
btrfs_free_path ( path ) ;
return ret ;
2008-04-25 16:53:30 -04:00
}
2008-04-28 15:29:52 -04:00
/*
 * Return @factor tenths of @num, i.e. num * factor / 10 computed with
 * do_div().  A @factor of 10 hands @num back without any arithmetic.
 */
static u64 div_factor(u64 num, int factor)
{
	u64 scaled;

	if (factor == 10)
		return num;

	scaled = num * factor;
	do_div(scaled, 10);
	return scaled;
}
int btrfs_balance ( struct btrfs_root * dev_root )
{
int ret ;
struct list_head * devices = & dev_root - > fs_info - > fs_devices - > devices ;
struct btrfs_device * device ;
u64 old_size ;
u64 size_to_free ;
struct btrfs_path * path ;
struct btrfs_key key ;
struct btrfs_chunk * chunk ;
struct btrfs_root * chunk_root = dev_root - > fs_info - > chunk_root ;
struct btrfs_trans_handle * trans ;
struct btrfs_key found_key ;
2008-11-17 21:11:30 -05:00
if ( dev_root - > fs_info - > sb - > s_flags & MS_RDONLY )
return - EROFS ;
2008-04-28 15:29:52 -04:00
2008-07-08 14:19:17 -04:00
mutex_lock ( & dev_root - > fs_info - > volume_mutex ) ;
2008-04-28 15:29:52 -04:00
dev_root = dev_root - > fs_info - > dev_root ;
/* step one make some room on all the devices */
2009-01-21 10:59:08 -05:00
list_for_each_entry ( device , devices , dev_list ) {
2008-04-28 15:29:52 -04:00
old_size = device - > total_bytes ;
size_to_free = div_factor ( old_size , 1 ) ;
size_to_free = min ( size_to_free , ( u64 ) 1 * 1024 * 1024 ) ;
2008-11-17 21:11:30 -05:00
if ( ! device - > writeable | |
device - > total_bytes - device - > bytes_used > size_to_free )
2008-04-28 15:29:52 -04:00
continue ;
ret = btrfs_shrink_device ( device , old_size - size_to_free ) ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
if ( ret = = - ENOSPC )
break ;
2008-04-28 15:29:52 -04:00
BUG_ON ( ret ) ;
trans = btrfs_start_transaction ( dev_root , 1 ) ;
BUG_ON ( ! trans ) ;
ret = btrfs_grow_device ( trans , device , old_size ) ;
BUG_ON ( ret ) ;
btrfs_end_transaction ( trans , dev_root ) ;
}
/* step two, relocate all the chunks */
path = btrfs_alloc_path ( ) ;
BUG_ON ( ! path ) ;
key . objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID ;
key . offset = ( u64 ) - 1 ;
key . type = BTRFS_CHUNK_ITEM_KEY ;
2009-01-05 21:25:51 -05:00
while ( 1 ) {
2008-04-28 15:29:52 -04:00
ret = btrfs_search_slot ( NULL , chunk_root , & key , path , 0 , 0 ) ;
if ( ret < 0 )
goto error ;
/*
* this shouldn ' t happen , it means the last relocate
* failed
*/
if ( ret = = 0 )
break ;
ret = btrfs_previous_item ( chunk_root , path , 0 ,
BTRFS_CHUNK_ITEM_KEY ) ;
2008-07-08 14:19:17 -04:00
if ( ret )
2008-04-28 15:29:52 -04:00
break ;
2008-07-08 14:19:17 -04:00
2008-04-28 15:29:52 -04:00
btrfs_item_key_to_cpu ( path - > nodes [ 0 ] , & found_key ,
path - > slots [ 0 ] ) ;
if ( found_key . objectid ! = key . objectid )
break ;
2008-07-08 14:19:17 -04:00
2008-04-28 15:29:52 -04:00
chunk = btrfs_item_ptr ( path - > nodes [ 0 ] ,
path - > slots [ 0 ] ,
struct btrfs_chunk ) ;
/* chunk zero is special */
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
if ( found_key . offset = = 0 )
2008-04-28 15:29:52 -04:00
break ;
2008-07-08 14:19:17 -04:00
btrfs_release_path ( chunk_root , path ) ;
2008-04-28 15:29:52 -04:00
ret = btrfs_relocate_chunk ( chunk_root ,
chunk_root - > root_key . objectid ,
found_key . objectid ,
found_key . offset ) ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
BUG_ON ( ret & & ret ! = - ENOSPC ) ;
key . offset = found_key . offset - 1 ;
2008-04-28 15:29:52 -04:00
}
ret = 0 ;
error :
btrfs_free_path ( path ) ;
2008-07-08 14:19:17 -04:00
mutex_unlock ( & dev_root - > fs_info - > volume_mutex ) ;
2008-04-28 15:29:52 -04:00
return ret ;
}
2008-04-25 16:53:30 -04:00
/*
* shrinking a device means finding all of the device extents past
* the new size , and then following the back refs to the chunks .
* The chunk relocation code actually frees the device extent
*/
int btrfs_shrink_device ( struct btrfs_device * device , u64 new_size )
{
struct btrfs_trans_handle * trans ;
struct btrfs_root * root = device - > dev_root ;
struct btrfs_dev_extent * dev_extent = NULL ;
struct btrfs_path * path ;
u64 length ;
u64 chunk_tree ;
u64 chunk_objectid ;
u64 chunk_offset ;
int ret ;
int slot ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
int failed = 0 ;
bool retried = false ;
2008-04-25 16:53:30 -04:00
struct extent_buffer * l ;
struct btrfs_key key ;
struct btrfs_super_block * super_copy = & root - > fs_info - > super_copy ;
u64 old_total = btrfs_super_total_bytes ( super_copy ) ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
u64 old_size = device - > total_bytes ;
2008-04-25 16:53:30 -04:00
u64 diff = device - > total_bytes - new_size ;
2008-11-17 21:11:30 -05:00
if ( new_size > = device - > total_bytes )
return - EINVAL ;
2008-04-25 16:53:30 -04:00
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
path - > reada = 2 ;
2008-07-08 14:19:17 -04:00
lock_chunks ( root ) ;
2008-04-25 16:53:30 -04:00
device - > total_bytes = new_size ;
2008-11-17 21:11:30 -05:00
if ( device - > writeable )
device - > fs_devices - > total_rw_bytes - = diff ;
2008-07-08 14:19:17 -04:00
unlock_chunks ( root ) ;
2008-04-25 16:53:30 -04:00
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
again :
2008-04-25 16:53:30 -04:00
key . objectid = device - > devid ;
key . offset = ( u64 ) - 1 ;
key . type = BTRFS_DEV_EXTENT_KEY ;
while ( 1 ) {
ret = btrfs_search_slot ( NULL , root , & key , path , 0 , 0 ) ;
if ( ret < 0 )
goto done ;
ret = btrfs_previous_item ( root , path , 0 , key . type ) ;
if ( ret < 0 )
goto done ;
if ( ret ) {
ret = 0 ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
btrfs_release_path ( root , path ) ;
2009-07-22 09:59:00 -04:00
break ;
2008-04-25 16:53:30 -04:00
}
l = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
btrfs_item_key_to_cpu ( l , & key , path - > slots [ 0 ] ) ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
if ( key . objectid ! = device - > devid ) {
btrfs_release_path ( root , path ) ;
2009-07-22 09:59:00 -04:00
break ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
}
2008-04-25 16:53:30 -04:00
dev_extent = btrfs_item_ptr ( l , slot , struct btrfs_dev_extent ) ;
length = btrfs_dev_extent_length ( l , dev_extent ) ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
if ( key . offset + length < = new_size ) {
btrfs_release_path ( root , path ) ;
2009-04-27 07:29:03 -04:00
break ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
}
2008-04-25 16:53:30 -04:00
chunk_tree = btrfs_dev_extent_chunk_tree ( l , dev_extent ) ;
chunk_objectid = btrfs_dev_extent_chunk_objectid ( l , dev_extent ) ;
chunk_offset = btrfs_dev_extent_chunk_offset ( l , dev_extent ) ;
btrfs_release_path ( root , path ) ;
ret = btrfs_relocate_chunk ( root , chunk_tree , chunk_objectid ,
chunk_offset ) ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
if ( ret & & ret ! = - ENOSPC )
2008-04-25 16:53:30 -04:00
goto done ;
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-11 16:11:19 -04:00
if ( ret = = - ENOSPC )
failed + + ;
key . offset - = 1 ;
}
if ( failed & & ! retried ) {
failed = 0 ;
retried = true ;
goto again ;
} else if ( failed & & retried ) {
ret = - ENOSPC ;
lock_chunks ( root ) ;
device - > total_bytes = old_size ;
if ( device - > writeable )
device - > fs_devices - > total_rw_bytes + = diff ;
unlock_chunks ( root ) ;
goto done ;
2008-04-25 16:53:30 -04:00
}
2009-04-27 07:29:03 -04:00
/* Shrinking succeeded, else we would be at "done". */
trans = btrfs_start_transaction ( root , 1 ) ;
if ( ! trans ) {
ret = - ENOMEM ;
goto done ;
}
lock_chunks ( root ) ;
device - > disk_total_bytes = new_size ;
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device ( trans , device ) ;
if ( ret ) {
unlock_chunks ( root ) ;
btrfs_end_transaction ( trans , root ) ;
goto done ;
}
WARN_ON ( diff > old_total ) ;
btrfs_set_super_total_bytes ( super_copy , old_total - diff ) ;
unlock_chunks ( root ) ;
btrfs_end_transaction ( trans , root ) ;
2008-04-25 16:53:30 -04:00
done :
btrfs_free_path ( path ) ;
return ret ;
}
2008-12-02 09:54:17 -05:00
static int btrfs_add_system_chunk ( struct btrfs_trans_handle * trans ,
2008-03-24 15:01:56 -04:00
struct btrfs_root * root ,
struct btrfs_key * key ,
struct btrfs_chunk * chunk , int item_size )
{
struct btrfs_super_block * super_copy = & root - > fs_info - > super_copy ;
struct btrfs_disk_key disk_key ;
u32 array_size ;
u8 * ptr ;
array_size = btrfs_super_sys_array_size ( super_copy ) ;
if ( array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE )
return - EFBIG ;
ptr = super_copy - > sys_chunk_array + array_size ;
btrfs_cpu_key_to_disk ( & disk_key , key ) ;
memcpy ( ptr , & disk_key , sizeof ( disk_key ) ) ;
ptr + = sizeof ( disk_key ) ;
memcpy ( ptr , chunk , item_size ) ;
item_size + = sizeof ( disk_key ) ;
btrfs_set_super_sys_array_size ( super_copy , array_size + item_size ) ;
return 0 ;
}
2009-01-05 21:25:51 -05:00
/*
 * Translate a per-device stripe size into the number of logical bytes the
 * chunk provides for the given block group profile:
 *   RAID1 / DUP : one stripe worth of bytes
 *   RAID10      : one stripe per group of sub_stripes
 *   otherwise   : one stripe per device stripe
 */
static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
					int num_stripes, int sub_stripes)
{
	int data_stripes;

	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
		data_stripes = 1;
	else if (type & BTRFS_BLOCK_GROUP_RAID10)
		data_stripes = num_stripes / sub_stripes;
	else
		data_stripes = num_stripes;

	return calc_size * data_stripes;
}
2008-11-17 21:11:30 -05:00
/*
 * First half of chunk allocation: choose the devices and device offsets for
 * a new chunk of the given type at logical address start, describe it in a
 * freshly allocated map_lookup, insert that mapping into the in-memory
 * mapping tree, create the block group and allocate one dev extent per
 * stripe.  The chunk item itself is not inserted here; see
 * __finish_chunk_alloc().
 *
 * On success returns 0 and fills in:
 *   *map_ret     - the new map_lookup (also stored in the extent map)
 *   *num_bytes   - logical size of the chunk (chunk_bytes_by_type())
 *   *stripe_size - bytes used on each device stripe
 *
 * Returns -ENOSPC when no suitable set of devices is found and -ENOMEM when
 * the map or the extent map cannot be allocated.  Failures of the later
 * steps (mapping insert, block group creation, dev extent allocation) hit
 * BUG_ON().
 */
static int __btrfs_alloc_chunk ( struct btrfs_trans_handle * trans ,
struct btrfs_root * extent_root ,
struct map_lookup * * map_ret ,
u64 * num_bytes , u64 * stripe_size ,
u64 start , u64 type )
2008-03-24 15:01:56 -04:00
{
2008-03-25 16:50:33 -04:00
struct btrfs_fs_info * info = extent_root - > fs_info ;
2008-03-24 15:01:56 -04:00
struct btrfs_device * device = NULL ;
2008-11-17 21:11:30 -05:00
struct btrfs_fs_devices * fs_devices = info - > fs_devices ;
2008-03-24 15:01:59 -04:00
struct list_head * cur ;
2008-11-17 21:11:30 -05:00
struct map_lookup * map = NULL ;
2008-03-24 15:01:56 -04:00
struct extent_map_tree * em_tree ;
struct extent_map * em ;
2008-11-17 21:11:30 -05:00
struct list_head private_devs ;
2008-04-18 11:55:51 -04:00
int min_stripe_size = 1 * 1024 * 1024 ;
2008-03-24 15:01:56 -04:00
u64 calc_size = 1024 * 1024 * 1024 ;
2008-04-18 10:29:51 -04:00
u64 max_chunk_size = calc_size ;
u64 min_free ;
2008-03-24 15:01:59 -04:00
u64 avail ;
u64 max_avail = 0 ;
2008-11-17 21:11:30 -05:00
u64 dev_offset ;
2008-03-24 15:01:59 -04:00
int num_stripes = 1 ;
2008-04-18 11:55:51 -04:00
int min_stripes = 1 ;
2008-04-16 10:49:51 -04:00
int sub_stripes = 0 ;
2008-03-24 15:01:59 -04:00
int looped = 0 ;
2008-03-24 15:01:56 -04:00
int ret ;
2008-03-24 15:01:59 -04:00
int index ;
2008-03-25 16:50:33 -04:00
int stripe_len = 64 * 1024 ;
2008-03-24 15:01:56 -04:00
2008-04-28 15:29:52 -04:00
/* RAID1 and DUP requested together: warn and keep only RAID1 */
if ( ( type & BTRFS_BLOCK_GROUP_RAID1 ) & &
( type & BTRFS_BLOCK_GROUP_DUP ) ) {
WARN_ON ( 1 ) ;
type & = ~ BTRFS_BLOCK_GROUP_DUP ;
}
2008-11-17 21:11:30 -05:00
if ( list_empty ( & fs_devices - > alloc_list ) )
2008-03-24 15:01:59 -04:00
return - ENOSPC ;
2008-03-25 16:50:33 -04:00
2008-04-18 11:55:51 -04:00
/*
 * Pick the wanted stripe count (num_stripes) and the smallest acceptable
 * one (min_stripes) from the profile bits in type.
 */
if ( type & ( BTRFS_BLOCK_GROUP_RAID0 ) ) {
2008-11-17 21:11:30 -05:00
num_stripes = fs_devices - > rw_devices ;
2008-04-18 11:55:51 -04:00
min_stripes = 2 ;
}
if ( type & ( BTRFS_BLOCK_GROUP_DUP ) ) {
2008-04-03 16:29:03 -04:00
num_stripes = 2 ;
2008-04-18 11:55:51 -04:00
min_stripes = 2 ;
}
2008-04-03 16:29:03 -04:00
if ( type & ( BTRFS_BLOCK_GROUP_RAID1 ) ) {
2008-11-17 21:11:30 -05:00
num_stripes = min_t ( u64 , 2 , fs_devices - > rw_devices ) ;
2008-04-18 10:29:51 -04:00
if ( num_stripes < 2 )
return - ENOSPC ;
2008-04-18 11:55:51 -04:00
min_stripes = 2 ;
2008-04-03 16:29:03 -04:00
}
2008-04-16 10:49:51 -04:00
if ( type & ( BTRFS_BLOCK_GROUP_RAID10 ) ) {
2008-11-17 21:11:30 -05:00
num_stripes = fs_devices - > rw_devices ;
2008-04-16 10:49:51 -04:00
if ( num_stripes < 4 )
return - ENOSPC ;
/* round the stripe count down to an even number */
num_stripes & = ~ ( u32 ) 1 ;
sub_stripes = 2 ;
2008-04-18 11:55:51 -04:00
min_stripes = 4 ;
2008-04-16 10:49:51 -04:00
}
2008-04-18 10:29:51 -04:00
/* size limits depend on whether this is data, metadata or system */
if ( type & BTRFS_BLOCK_GROUP_DATA ) {
max_chunk_size = 10 * calc_size ;
2008-04-18 11:55:51 -04:00
min_stripe_size = 64 * 1024 * 1024 ;
2008-04-18 10:29:51 -04:00
} else if ( type & BTRFS_BLOCK_GROUP_METADATA ) {
2009-12-07 21:45:59 +00:00
max_chunk_size = 256 * 1024 * 1024 ;
2008-04-18 11:55:51 -04:00
min_stripe_size = 32 * 1024 * 1024 ;
} else if ( type & BTRFS_BLOCK_GROUP_SYSTEM ) {
calc_size = 8 * 1024 * 1024 ;
max_chunk_size = calc_size * 2 ;
min_stripe_size = 1 * 1024 * 1024 ;
2008-04-18 10:29:51 -04:00
}
2008-11-17 21:11:30 -05:00
/* we don't want a chunk larger than 10% of writeable space */
max_chunk_size = min ( div_factor ( fs_devices - > total_rw_bytes , 1 ) ,
max_chunk_size ) ;
2008-04-18 10:29:51 -04:00
2008-04-18 11:55:51 -04:00
/*
 * Retry point: reached again with a smaller num_stripes and/or a smaller
 * calc_size when the device scan below could not place every stripe.
 */
again :
2009-07-24 16:41:41 -04:00
max_avail = 0 ;
2008-11-17 21:11:30 -05:00
/* (re)allocate the map whenever the stripe count changed */
if ( ! map | | map - > num_stripes ! = num_stripes ) {
kfree ( map ) ;
map = kmalloc ( map_lookup_size ( num_stripes ) , GFP_NOFS ) ;
if ( ! map )
return - ENOMEM ;
map - > num_stripes = num_stripes ;
}
2008-04-18 10:29:51 -04:00
/*
 * Keep calc_size * num_stripes within max_chunk_size, rounding the
 * per-stripe size down to a multiple of stripe_len.
 */
if ( calc_size * num_stripes > max_chunk_size ) {
calc_size = max_chunk_size ;
do_div ( calc_size , num_stripes ) ;
do_div ( calc_size , stripe_len ) ;
calc_size * = stripe_len ;
}
/* we don't want tiny stripes */
2008-04-18 11:55:51 -04:00
calc_size = max_t ( u64 , min_stripe_size , calc_size ) ;
2008-04-18 10:29:51 -04:00
do_div ( calc_size , stripe_len ) ;
calc_size * = stripe_len ;
2008-11-17 21:11:30 -05:00
cur = fs_devices - > alloc_list . next ;
2008-03-24 15:01:59 -04:00
index = 0 ;
2008-04-03 16:29:03 -04:00
/* DUP places both stripes on one device, so it needs twice the room */
if ( type & BTRFS_BLOCK_GROUP_DUP )
min_free = calc_size * 2 ;
2008-04-18 10:29:51 -04:00
else
min_free = calc_size ;
2008-04-03 16:29:03 -04:00
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-23 13:14:11 -04:00
/*
* we add 1 MB because we never use the first 1 MB of the device , unless
* we ' ve looped , then we are likely allocating the maximum amount of
* space left already
*/
if ( ! looped )
min_free + = 1024 * 1024 ;
2008-04-21 08:28:10 -04:00
2008-11-17 21:11:30 -05:00
INIT_LIST_HEAD ( & private_devs ) ;
2009-01-05 21:25:51 -05:00
/*
 * Walk the allocation list once.  Every device that yields a free extent
 * of min_free bytes is moved to private_devs and recorded in map->stripes;
 * devices that are too small only contribute to max_avail.
 */
while ( index < num_stripes ) {
2008-04-22 09:22:07 -04:00
device = list_entry ( cur , struct btrfs_device , dev_alloc_list ) ;
2008-11-17 21:11:30 -05:00
BUG_ON ( ! device - > writeable ) ;
2008-05-13 13:46:40 -04:00
if ( device - > total_bytes > device - > bytes_used )
avail = device - > total_bytes - device - > bytes_used ;
else
avail = 0 ;
2008-03-24 15:01:59 -04:00
/* advance first: the current device may be moved off this list below */
cur = cur - > next ;
2008-04-25 16:53:30 -04:00
2008-05-13 13:46:40 -04:00
if ( device - > in_fs_metadata & & avail > = min_free ) {
2008-11-17 21:11:30 -05:00
ret = find_free_dev_extent ( trans , device ,
2009-07-24 16:41:41 -04:00
min_free , & dev_offset ,
& max_avail ) ;
2008-04-25 16:53:30 -04:00
if ( ret = = 0 ) {
list_move_tail ( & device - > dev_alloc_list ,
& private_devs ) ;
2008-11-17 21:11:30 -05:00
map - > stripes [ index ] . dev = device ;
map - > stripes [ index ] . physical = dev_offset ;
2008-04-03 16:29:03 -04:00
index + + ;
2008-11-17 21:11:30 -05:00
/* DUP: second stripe sits right behind the first on the same device */
if ( type & BTRFS_BLOCK_GROUP_DUP ) {
map - > stripes [ index ] . dev = device ;
map - > stripes [ index ] . physical =
dev_offset + calc_size ;
2008-04-25 16:53:30 -04:00
index + + ;
2008-11-17 21:11:30 -05:00
}
2008-04-25 16:53:30 -04:00
}
2008-05-13 13:46:40 -04:00
} else if ( device - > in_fs_metadata & & avail > max_avail )
2008-04-18 11:55:51 -04:00
max_avail = avail ;
2008-11-17 21:11:30 -05:00
if ( cur = = & fs_devices - > alloc_list )
2008-03-24 15:01:59 -04:00
break ;
}
2008-11-17 21:11:30 -05:00
/* put the devices picked above back on the allocation list */
list_splice ( & private_devs , & fs_devices - > alloc_list ) ;
2008-03-24 15:01:59 -04:00
/*
 * Not every stripe could be placed.  Retry with fewer stripes when at
 * least min_stripes were found; otherwise retry once with the stripe size
 * set to the largest space seen (max_avail); otherwise give up.
 */
if ( index < num_stripes ) {
2008-04-18 11:55:51 -04:00
if ( index > = min_stripes ) {
num_stripes = index ;
if ( type & ( BTRFS_BLOCK_GROUP_RAID10 ) ) {
num_stripes / = sub_stripes ;
num_stripes * = sub_stripes ;
}
looped = 1 ;
goto again ;
}
2008-03-24 15:01:59 -04:00
if ( ! looped & & max_avail > 0 ) {
looped = 1 ;
calc_size = max_avail ;
goto again ;
}
2008-11-17 21:11:30 -05:00
kfree ( map ) ;
2008-03-24 15:01:59 -04:00
return - ENOSPC ;
}
2008-11-17 21:11:30 -05:00
map - > sector_size = extent_root - > sectorsize ;
map - > stripe_len = stripe_len ;
map - > io_align = stripe_len ;
map - > io_width = stripe_len ;
map - > type = type ;
map - > num_stripes = num_stripes ;
map - > sub_stripes = sub_stripes ;
2008-03-24 15:01:56 -04:00
2008-11-17 21:11:30 -05:00
* map_ret = map ;
* stripe_size = calc_size ;
* num_bytes = chunk_bytes_by_type ( type , calc_size ,
num_stripes , sub_stripes ) ;
2008-03-24 15:01:56 -04:00
2008-11-17 21:11:30 -05:00
em = alloc_extent_map ( GFP_NOFS ) ;
if ( ! em ) {
kfree ( map ) ;
2008-03-25 16:50:33 -04:00
return - ENOMEM ;
}
2008-11-17 21:11:30 -05:00
/* the bdev field of the extent map carries the map_lookup pointer */
em - > bdev = ( struct block_device * ) map ;
em - > start = start ;
em - > len = * num_bytes ;
em - > block_start = 0 ;
em - > block_len = em - > len ;
2008-03-25 16:50:33 -04:00
2008-11-17 21:11:30 -05:00
em_tree = & extent_root - > fs_info - > mapping_tree . map_tree ;
2009-09-02 16:24:52 -04:00
write_lock ( & em_tree - > lock ) ;
2008-11-17 21:11:30 -05:00
ret = add_extent_mapping ( em_tree , em ) ;
2009-09-02 16:24:52 -04:00
write_unlock ( & em_tree - > lock ) ;
2008-11-17 21:11:30 -05:00
BUG_ON ( ret ) ;
free_extent_map ( em ) ;
2008-03-24 15:01:56 -04:00
2008-11-17 21:11:30 -05:00
ret = btrfs_make_block_group ( trans , extent_root , 0 , type ,
BTRFS_FIRST_CHUNK_TREE_OBJECTID ,
start , * num_bytes ) ;
BUG_ON ( ret ) ;
2008-04-03 16:29:03 -04:00
2008-11-17 21:11:30 -05:00
/* allocate one dev extent of calc_size bytes for every stripe */
index = 0 ;
while ( index < map - > num_stripes ) {
device = map - > stripes [ index ] . dev ;
dev_offset = map - > stripes [ index ] . physical ;
2008-03-24 15:01:56 -04:00
ret = btrfs_alloc_dev_extent ( trans , device ,
2008-11-17 21:11:30 -05:00
info - > chunk_root - > root_key . objectid ,
BTRFS_FIRST_CHUNK_TREE_OBJECTID ,
start , dev_offset , calc_size ) ;
2008-03-24 15:01:56 -04:00
BUG_ON ( ret ) ;
2008-11-17 21:11:30 -05:00
index + + ;
}
return 0 ;
}
/*
 * Second half of chunk allocation: write the chunk described by @map into
 * the chunk tree.
 *
 * Each stripe's device has @stripe_size added to bytes_used and is passed to
 * btrfs_update_device().  A chunk item covering @chunk_size bytes is then
 * built from @map and inserted into the chunk root under
 * (BTRFS_FIRST_CHUNK_TREE_OBJECTID, BTRFS_CHUNK_ITEM_KEY, @chunk_offset).
 * System chunks are additionally appended to the super block's system chunk
 * array.
 *
 * Returns 0, or -ENOMEM if the temporary chunk item cannot be allocated.
 * Any later failure trips BUG_ON().
 */
static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
				struct btrfs_root *extent_root,
				struct map_lookup *map, u64 chunk_offset,
				u64 chunk_size, u64 stripe_size)
{
	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct btrfs_device *dev;
	struct btrfs_key key;
	int ret;
	int i;

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk)
		return -ENOMEM;

	/* account the new stripe on every device that carries one */
	for (i = 0; i < map->num_stripes; i++) {
		dev = map->stripes[i].dev;
		dev->bytes_used += stripe_size;
		ret = btrfs_update_device(trans, dev);
		BUG_ON(ret);
	}

	/* fill in the stripe array that trails the chunk item */
	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++, stripe++) {
		dev = map->stripes[i].dev;
		btrfs_set_stack_stripe_devid(stripe, dev->devid);
		btrfs_set_stack_stripe_offset(stripe,
					      map->stripes[i].physical);
		memcpy(stripe->dev_uuid, dev->uuid, BTRFS_UUID_SIZE);
	}

	btrfs_set_stack_chunk_length(chunk, chunk_size);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	BUG_ON(ret);

	/* system chunks are mirrored into the super block as well */
	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
					     item_size);
		BUG_ON(ret);
	}

	kfree(chunk);
	return 0;
}
2008-03-24 15:01:56 -04:00
2008-11-17 21:11:30 -05:00
/*
 * Chunk allocation is split into two parts.  The first part
 * (__btrfs_alloc_chunk) does the work that makes the newly allocated chunk
 * usable without performing any operation that modifies the chunk tree.
 * The second part (__finish_chunk_alloc) does the work that requires
 * modifying the chunk tree.  This division is important for the bootstrap
 * process of adding storage to a seed btrfs.
 *
 * btrfs_alloc_chunk() runs both parts back to back for a chunk of @type
 * placed at the next free logical offset.  Errors from finding the offset
 * or from the first part are returned; a failure of the second part trips
 * BUG_ON().
 */
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
		      struct btrfs_root *extent_root, u64 type)
{
	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
	struct map_lookup *map;
	u64 logical;
	u64 num_bytes;
	u64 stripe_size;
	int err;

	err = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
			      &logical);
	if (!err)
		err = __btrfs_alloc_chunk(trans, extent_root, &map,
					  &num_bytes, &stripe_size,
					  logical, type);
	if (err)
		return err;

	err = __finish_chunk_alloc(trans, extent_root, map, logical,
				   num_bytes, stripe_size);
	BUG_ON(err);
	return 0;
}
2009-01-05 21:25:51 -05:00
static noinline int init_first_rw_device ( struct btrfs_trans_handle * trans ,
2008-11-17 21:11:30 -05:00
struct btrfs_root * root ,
struct btrfs_device * device )
{
u64 chunk_offset ;
u64 sys_chunk_offset ;
u64 chunk_size ;
u64 sys_chunk_size ;
u64 stripe_size ;
u64 sys_stripe_size ;
u64 alloc_profile ;
struct map_lookup * map ;
struct map_lookup * sys_map ;
struct btrfs_fs_info * fs_info = root - > fs_info ;
struct btrfs_root * extent_root = fs_info - > extent_root ;
int ret ;
ret = find_next_chunk ( fs_info - > chunk_root ,
BTRFS_FIRST_CHUNK_TREE_OBJECTID , & chunk_offset ) ;
BUG_ON ( ret ) ;
alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
( fs_info - > metadata_alloc_profile &
fs_info - > avail_metadata_alloc_bits ) ;
alloc_profile = btrfs_reduce_alloc_profile ( root , alloc_profile ) ;
ret = __btrfs_alloc_chunk ( trans , extent_root , & map , & chunk_size ,
& stripe_size , chunk_offset , alloc_profile ) ;
BUG_ON ( ret ) ;
sys_chunk_offset = chunk_offset + chunk_size ;
alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
( fs_info - > system_alloc_profile &
fs_info - > avail_system_alloc_bits ) ;
alloc_profile = btrfs_reduce_alloc_profile ( root , alloc_profile ) ;
ret = __btrfs_alloc_chunk ( trans , extent_root , & sys_map ,
& sys_chunk_size , & sys_stripe_size ,
sys_chunk_offset , alloc_profile ) ;
BUG_ON ( ret ) ;
ret = btrfs_add_device ( trans , fs_info - > chunk_root , device ) ;
BUG_ON ( ret ) ;
/*
* Modifying chunk tree needs allocating new blocks from both
* system block group and metadata block group . So we only can
* do operations require modifying the chunk tree after both
* block groups were created .
*/
ret = __finish_chunk_alloc ( trans , extent_root , map , chunk_offset ,
chunk_size , stripe_size ) ;
BUG_ON ( ret ) ;
ret = __finish_chunk_alloc ( trans , extent_root , sys_map ,
sys_chunk_offset , sys_chunk_size ,
sys_stripe_size ) ;
2008-04-14 09:48:18 -04:00
BUG_ON ( ret ) ;
2008-11-17 21:11:30 -05:00
return 0 ;
}
/*
 * Report whether the chunk mapped at @chunk_offset has to be treated as
 * read-only.
 *
 * Returns 1 when no mapping exists at @chunk_offset or when any stripe of
 * the chunk lives on a device that is not writeable.  Returns 0 otherwise,
 * and always 0 for an existing chunk when the DEGRADED mount option is set.
 */
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
{
	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	struct extent_map *em;
	struct map_lookup *map;
	int readonly = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);
	if (!em)
		return 1;

	/* degraded mounts skip the per-device check */
	if (btrfs_test_opt(root, DEGRADED))
		goto out;

	map = (struct map_lookup *)em->bdev;
	for (i = 0; i < map->num_stripes && !readonly; i++) {
		if (!map->stripes[i].dev->writeable)
			readonly = 1;
	}
out:
	free_extent_map(em);
	return readonly;
}
void btrfs_mapping_init ( struct btrfs_mapping_tree * tree )
{
extent_map_tree_init ( & tree - > map_tree , GFP_NOFS ) ;
}
void btrfs_mapping_tree_free ( struct btrfs_mapping_tree * tree )
{
struct extent_map * em ;
2009-01-05 21:25:51 -05:00
while ( 1 ) {
2009-09-02 16:24:52 -04:00
write_lock ( & tree - > map_tree . lock ) ;
2008-03-24 15:01:56 -04:00
em = lookup_extent_mapping ( & tree - > map_tree , 0 , ( u64 ) - 1 ) ;
if ( em )
remove_extent_mapping ( & tree - > map_tree , em ) ;
2009-09-02 16:24:52 -04:00
write_unlock ( & tree - > map_tree . lock ) ;
2008-03-24 15:01:56 -04:00
if ( ! em )
break ;
kfree ( em - > bdev ) ;
/* once for us */
free_extent_map ( em ) ;
/* once for the tree */
free_extent_map ( em ) ;
}
}
2008-04-09 16:28:12 -04:00
/*
 * Return the number of copies kept for the range starting at @logical,
 * based on the profile of the chunk that maps it:
 *   DUP / RAID1 : num_stripes
 *   RAID10      : sub_stripes
 *   otherwise   : 1
 *
 * A missing mapping, or one that does not contain @logical, trips BUG_ON().
 */
int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
{
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	struct extent_map *em;
	struct map_lookup *map;
	int copies = 1;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, len);
	read_unlock(&em_tree->lock);
	BUG_ON(!em);

	BUG_ON(em->start > logical || em->start + em->len < logical);
	map = (struct map_lookup *)em->bdev;

	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
		copies = map->num_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		copies = map->sub_stripes;

	free_extent_map(em);
	return copies;
}
2008-05-13 13:46:40 -04:00
static int find_live_mirror ( struct map_lookup * map , int first , int num ,
int optimal )
{
int i ;
if ( map - > stripes [ optimal ] . dev - > bdev )
return optimal ;
for ( i = first ; i < first + num ; i + + ) {
if ( map - > stripes [ i ] . dev - > bdev )
return i ;
}
/* we couldn't find one that doesn't fail. Just return something
* and the io error handling code will clean up eventually
*/
return optimal ;
}
2008-04-21 10:03:05 -04:00
/*
 * Translate a logical address into per-device stripe locations.
 *
 * @map_tree:    mapping tree searched for the extent map covering @logical
 * @rw:          request flags; only the BIO_RW (write) bit is examined here
 * @logical:     logical byte address to map
 * @length:      in: length used for the lookup; out: number of bytes usable
 *               starting at @logical (clamped to the current stripe for
 *               RAID0/RAID1/RAID10/DUP, otherwise to the end of the extent
 *               map)
 * @multi_ret:   if non-NULL, receives a kzalloc'ed btrfs_multi_bio that
 *               describes the target stripes
 * @mirror_num:  1-based mirror to read from, 0 to let this function choose;
 *               values above map->num_stripes are treated as 0
 * @unplug_page: if non-NULL, the selected devices get their unplug_io_fn
 *               called instead of being recorded in a multi bio
 *
 * Returns 0 on success (including the unplug case where no mapping exists)
 * and -ENOMEM if the multi bio cannot be allocated.  A missing mapping in
 * any other case is a BUG().
 */
static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
			     u64 logical, u64 *length,
			     struct btrfs_multi_bio **multi_ret,
			     int mirror_num, struct page *unplug_page)
{
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	struct btrfs_multi_bio *multi = NULL;
	struct extent_map *em;
	struct map_lookup *map;
	const int is_write = rw & (1 << BIO_RW);
	u64 offset;
	u64 stripe_start;
	u64 stripe_offset;
	u64 stripe_nr;
	int stripes_allocated = 8;
	int stripes_required = 1;
	int max_errors = 0;
	int num_stripes;
	int stripe_index;
	int i;

	/* reads start out with room for a single stripe */
	if (multi_ret && !is_write)
		stripes_allocated = 1;
again:
	if (multi_ret) {
		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
				GFP_NOFS);
		if (!multi)
			return -ENOMEM;
		atomic_set(&multi->error, 0);
	}

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, *length);
	read_unlock(&em_tree->lock);

	if (!em) {
		/* an unplug request for an unmapped range is simply dropped */
		if (unplug_page) {
			kfree(multi);
			return 0;
		}
		printk(KERN_CRIT " unable to find logical %llu len %llu \n ",
		       (unsigned long long)logical,
		       (unsigned long long)*length);
		BUG();
	}

	BUG_ON(em->start > logical || em->start + em->len < logical);
	map = (struct map_lookup *)em->bdev;
	offset = logical - em->start;

	if (mirror_num > map->num_stripes)
		mirror_num = 0;

	/*
	 * writes go to every copy; with more than one copy a single failed
	 * stripe is tolerated
	 */
	if (is_write) {
		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
				 BTRFS_BLOCK_GROUP_DUP)) {
			stripes_required = map->num_stripes;
			max_errors = 1;
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripes_required = map->sub_stripes;
			max_errors = 1;
		}
	}

	/* if our multi bio struct is too small, back off and try again */
	if (multi_ret && is_write && stripes_allocated < stripes_required) {
		stripes_allocated = map->num_stripes;
		free_extent_map(em);
		kfree(multi);
		goto again;
	}

	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = offset;
	do_div(stripe_nr, map->stripe_len);

	stripe_start = stripe_nr * map->stripe_len;
	BUG_ON(offset < stripe_start);

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_start;

	if (!(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			   BTRFS_BLOCK_GROUP_RAID1 |
			   BTRFS_BLOCK_GROUP_RAID10 |
			   BTRFS_BLOCK_GROUP_DUP))) {
		*length = em->len - offset;
	} else {
		/* we limit the length of each bio to what fits in a stripe */
		*length = min_t(u64, em->len - offset,
				map->stripe_len - stripe_offset);
	}

	/* caller only wanted the length */
	if (!multi_ret && !unplug_page)
		goto out;

	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		if (unplug_page || is_write) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			stripe_index = find_live_mirror(map, 0,
					map->num_stripes,
					current->pid % map->num_stripes);
		}
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (is_write)
			num_stripes = map->num_stripes;
		else if (mirror_num)
			stripe_index = mirror_num - 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;

		/*
		 * do_div leaves the quotient in stripe_nr and hands back the
		 * remainder, which selects the group of sub_stripes mirrors
		 */
		stripe_index = do_div(stripe_nr, factor);
		stripe_index *= map->sub_stripes;

		if (unplug_page || is_write) {
			num_stripes = map->sub_stripes;
		} else if (mirror_num) {
			stripe_index += mirror_num - 1;
		} else {
			stripe_index = find_live_mirror(map, stripe_index,
					map->sub_stripes, stripe_index +
					current->pid % map->sub_stripes);
		}
	} else {
		/*
		 * after this do_div call, stripe_nr is the number of stripes
		 * on this device we have to walk to find the data, and
		 * stripe_index is the number of our device in the stripe array
		 */
		stripe_index = do_div(stripe_nr, map->num_stripes);
	}
	BUG_ON(stripe_index >= map->num_stripes);

	for (i = 0; i < num_stripes; i++, stripe_index++) {
		struct btrfs_bio_stripe *stripe = &map->stripes[stripe_index];

		if (unplug_page) {
			struct btrfs_device *device = stripe->dev;
			struct backing_dev_info *bdi;

			/* devices without a bdev have nothing to unplug */
			if (!device->bdev)
				continue;
			bdi = blk_get_backing_dev_info(device->bdev);
			if (bdi->unplug_io_fn)
				bdi->unplug_io_fn(bdi, unplug_page);
			continue;
		}

		multi->stripes[i].physical = stripe->physical +
			stripe_offset + stripe_nr * map->stripe_len;
		multi->stripes[i].dev = stripe->dev;
	}

	if (multi_ret) {
		*multi_ret = multi;
		multi->num_stripes = num_stripes;
		multi->max_errors = max_errors;
	}
out:
	free_extent_map(em);
	return 0;
}
2008-04-21 10:03:05 -04:00
/*
 * Map a logical range to its stripes.  All arguments are forwarded to
 * __btrfs_map_block() with no unplug page, and its return value is passed
 * back unchanged.
 */
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
		    u64 logical, u64 *length,
		    struct btrfs_multi_bio **multi_ret, int mirror_num)
{
	struct page *no_unplug_page = NULL;

	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
				 mirror_num, no_unplug_page);
}
2008-12-08 16:46:26 -05:00
/*
 * Reverse mapping: find the logical addresses inside the chunk starting at
 * @chunk_start whose stripes contain the device offset @physical.
 *
 * @map_tree:    mapping tree holding the chunk
 * @chunk_start: logical start of the chunk; a mapping starting exactly here
 *               must exist (BUG_ON otherwise)
 * @physical:    byte offset on the device
 * @devid:       only stripes on this device are considered; 0 matches any
 * @logical:     out: kzalloc'ed array with room for map->num_stripes
 *               entries, holding the distinct logical addresses found
 *               (presumably freed by the caller; it is not freed here)
 * @naddrs:      out: number of valid entries in *@logical
 * @stripe_len:  out: stripe length of the chunk
 *
 * Returns 0 on success or -ENOMEM if the result array cannot be allocated;
 * in the latter case none of the output parameters are written.
 */
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
		     u64 chunk_start, u64 physical, u64 devid,
		     u64 **logical, int *naddrs, int *stripe_len)
{
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 length;
	u64 stripe_nr;
	int i, j, nr = 0;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_start, 1);
	read_unlock(&em_tree->lock);

	BUG_ON(!em || em->start != chunk_start);
	map = (struct map_lookup *)em->bdev;

	/* length is the number of bytes of the chunk held by one stripe */
	length = em->len;
	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		do_div(length, map->num_stripes / map->sub_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
		do_div(length, map->num_stripes);

	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
	if (!buf) {
		/* an allocation failure is not worth crashing the box for */
		free_extent_map(em);
		return -ENOMEM;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (devid && map->stripes[i].dev->devid != devid)
			continue;
		/* skip stripes that do not contain @physical */
		if (map->stripes[i].physical > physical ||
		    map->stripes[i].physical + length <= physical)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		do_div(stripe_nr, map->stripe_len);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			do_div(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		}
		bytenr = chunk_start + stripe_nr * map->stripe_len;
		WARN_ON(nr >= map->num_stripes);

		/* mirrors resolve to the same address; record it only once */
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr)
				break;
		}
		if (j == nr) {
			WARN_ON(nr >= map->num_stripes);
			buf[nr++] = bytenr;
		}
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = map->stripe_len;

	free_extent_map(em);
	return 0;
}
2008-04-21 10:03:05 -04:00
/*
 * Call the unplug hook of the devices backing the PAGE_CACHE_SIZE range
 * that starts at @logical.  This is a READ-flavoured __btrfs_map_block()
 * call with no multi bio and no mirror preference; its return value is
 * passed back unchanged.
 */
int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
		      u64 logical, struct page *page)
{
	struct btrfs_multi_bio **no_multi = NULL;
	const int any_mirror = 0;
	u64 range = PAGE_CACHE_SIZE;

	return __btrfs_map_block(map_tree, READ, logical, &range,
				 no_multi, any_mirror, page);
}
2008-04-03 16:29:03 -04:00
static void end_bio_multi_stripe ( struct bio * bio , int err )
{
2008-04-09 16:28:12 -04:00
struct btrfs_multi_bio * multi = bio - > bi_private ;
2008-08-05 10:13:57 -04:00
int is_orig_bio = 0 ;
2008-04-03 16:29:03 -04:00
if ( err )
2008-04-29 09:38:00 -04:00
atomic_inc ( & multi - > error ) ;
2008-04-03 16:29:03 -04:00
2008-08-05 10:13:57 -04:00
if ( bio = = multi - > orig_bio )
is_orig_bio = 1 ;
2008-04-09 16:28:12 -04:00
if ( atomic_dec_and_test ( & multi - > stripes_pending ) ) {
2008-08-05 10:13:57 -04:00
if ( ! is_orig_bio ) {
bio_put ( bio ) ;
bio = multi - > orig_bio ;
}
2008-04-03 16:29:03 -04:00
bio - > bi_private = multi - > private ;
bio - > bi_end_io = multi - > end_io ;
2008-04-29 09:38:00 -04:00
/* only send an error to the higher layers if it is
* beyond the tolerance of the multi - bio
*/
2008-05-12 13:39:03 -04:00
if ( atomic_read ( & multi - > error ) > multi - > max_errors ) {
2008-04-29 09:38:00 -04:00
err = - EIO ;
2008-05-12 13:39:03 -04:00
} else if ( err ) {
/*
* this bio is actually up to date , we didn ' t
* go over the max number of errors
*/
set_bit ( BIO_UPTODATE , & bio - > bi_flags ) ;
2008-04-29 09:38:00 -04:00
err = 0 ;
2008-05-12 13:39:03 -04:00
}
2008-04-03 16:29:03 -04:00
kfree ( multi ) ;
bio_endio ( bio , err ) ;
2008-08-05 10:13:57 -04:00
} else if ( ! is_orig_bio ) {
2008-04-03 16:29:03 -04:00
bio_put ( bio ) ;
}
}
2008-06-11 16:50:36 -04:00
/*
 * Bundles a bio, an rw value, a btrfs_fs_info pointer and a btrfs_work
 * item.
 * NOTE(review): nothing in the visible part of this file uses this struct;
 * presumably it carries a bio to a worker for deferred submission --
 * confirm before relying on it.
 */
struct async_sched {
struct bio * bio ;
int rw ;
struct btrfs_fs_info * info ;
struct btrfs_work work ;
} ;
/*
* see run_scheduled_bios for a description of why bios are collected for
* async submit .
*
* This will add one bio to the pending list for a device and make sure
* the work struct is scheduled .
*/
2009-01-05 21:25:51 -05:00
static noinline int schedule_bio ( struct btrfs_root * root ,
2008-09-05 16:09:51 -04:00
struct btrfs_device * device ,
int rw , struct bio * bio )
2008-06-11 16:50:36 -04:00
{
int should_queue = 1 ;
2009-04-20 15:50:09 -04:00
struct btrfs_pending_bios * pending_bios ;
2008-06-11 16:50:36 -04:00
/* don't bother with additional async steps for reads, right now */
if ( ! ( rw & ( 1 < < BIO_RW ) ) ) {
2008-07-31 16:29:02 -04:00
bio_get ( bio ) ;
2008-06-11 16:50:36 -04:00
submit_bio ( rw , bio ) ;
2008-07-31 16:29:02 -04:00
bio_put ( bio ) ;
2008-06-11 16:50:36 -04:00
return 0 ;
}
/*
2008-08-15 15:34:15 -04:00
* nr_async_bios allows us to reliably return congestion to the
2008-06-11 16:50:36 -04:00
* higher layers . Otherwise , the async bio makes it appear we have
* made progress against dirty pages when we ' ve really just put it
* on a queue for later
*/
2008-08-15 15:34:15 -04:00
atomic_inc ( & root - > fs_info - > nr_async_bios ) ;
2008-07-31 16:29:02 -04:00
WARN_ON ( bio - > bi_next ) ;
2008-06-11 16:50:36 -04:00
bio - > bi_next = NULL ;
bio - > bi_rw | = rw ;
spin_lock ( & device - > io_lock ) ;
2009-09-11 14:32:04 +02:00
if ( bio_rw_flagged ( bio , BIO_RW_SYNCIO ) )
2009-04-20 15:50:09 -04:00
pending_bios = & device - > pending_sync_bios ;
else
pending_bios = & device - > pending_bios ;
2008-06-11 16:50:36 -04:00
2009-04-20 15:50:09 -04:00
if ( pending_bios - > tail )
pending_bios - > tail - > bi_next = bio ;
2008-06-11 16:50:36 -04:00
2009-04-20 15:50:09 -04:00
pending_bios - > tail = bio ;
if ( ! pending_bios - > head )
pending_bios - > head = bio ;
2008-06-11 16:50:36 -04:00
if ( device - > running_pending )
should_queue = 0 ;
spin_unlock ( & device - > io_lock ) ;
if ( should_queue )
2008-06-12 14:46:17 -04:00
btrfs_queue_worker ( & root - > fs_info - > submit_workers ,
& device - > work ) ;
2008-06-11 16:50:36 -04:00
return 0 ;
}
2008-04-09 16:28:12 -04:00
int btrfs_map_bio ( struct btrfs_root * root , int rw , struct bio * bio ,
2008-06-11 16:50:36 -04:00
int mirror_num , int async_submit )
2008-03-24 15:01:56 -04:00
{
struct btrfs_mapping_tree * map_tree ;
struct btrfs_device * dev ;
2008-04-03 16:29:03 -04:00
struct bio * first_bio = bio ;
2008-10-03 16:31:08 -04:00
u64 logical = ( u64 ) bio - > bi_sector < < 9 ;
2008-03-24 15:01:56 -04:00
u64 length = 0 ;
u64 map_length ;
2008-04-09 16:28:12 -04:00
struct btrfs_multi_bio * multi = NULL ;
2008-03-24 15:01:56 -04:00
int ret ;
2008-04-03 16:29:03 -04:00
int dev_nr = 0 ;
int total_devs = 1 ;
2008-03-24 15:01:56 -04:00
2008-04-21 10:03:05 -04:00
length = bio - > bi_size ;
2008-03-24 15:01:56 -04:00
map_tree = & root - > fs_info - > mapping_tree ;
map_length = length ;
2008-04-09 16:28:12 -04:00
2008-04-09 16:28:12 -04:00
ret = btrfs_map_block ( map_tree , rw , logical , & map_length , & multi ,
mirror_num ) ;
2008-04-09 16:28:12 -04:00
BUG_ON ( ret ) ;
total_devs = multi - > num_stripes ;
if ( map_length < length ) {
2009-01-05 21:25:51 -05:00
printk ( KERN_CRIT " mapping failed logical %llu bio len %llu "
" len %llu \n " , ( unsigned long long ) logical ,
( unsigned long long ) length ,
( unsigned long long ) map_length ) ;
2008-04-09 16:28:12 -04:00
BUG ( ) ;
}
multi - > end_io = first_bio - > bi_end_io ;
multi - > private = first_bio - > bi_private ;
2008-08-05 10:13:57 -04:00
multi - > orig_bio = first_bio ;
2008-04-09 16:28:12 -04:00
atomic_set ( & multi - > stripes_pending , multi - > num_stripes ) ;
2009-01-05 21:25:51 -05:00
while ( dev_nr < total_devs ) {
2008-04-03 16:29:03 -04:00
if ( total_devs > 1 ) {
if ( dev_nr < total_devs - 1 ) {
bio = bio_clone ( first_bio , GFP_NOFS ) ;
BUG_ON ( ! bio ) ;
} else {
bio = first_bio ;
}
bio - > bi_private = multi ;
bio - > bi_end_io = end_bio_multi_stripe ;
}
2008-04-09 16:28:12 -04:00
bio - > bi_sector = multi - > stripes [ dev_nr ] . physical > > 9 ;
dev = multi - > stripes [ dev_nr ] . dev ;
2008-11-17 21:11:30 -05:00
BUG_ON ( rw = = WRITE & & ! dev - > writeable ) ;
2008-05-13 13:46:40 -04:00
if ( dev & & dev - > bdev ) {
bio - > bi_bdev = dev - > bdev ;
2008-06-11 16:50:36 -04:00
if ( async_submit )
schedule_bio ( root , dev , rw , bio ) ;
else
submit_bio ( rw , bio ) ;
2008-05-13 13:46:40 -04:00
} else {
bio - > bi_bdev = root - > fs_info - > fs_devices - > latest_bdev ;
bio - > bi_sector = logical > > 9 ;
bio_endio ( bio , - EIO ) ;
}
2008-04-03 16:29:03 -04:00
dev_nr + + ;
}
2008-04-09 16:28:12 -04:00
if ( total_devs = = 1 )
kfree ( multi ) ;
2008-03-24 15:01:56 -04:00
return 0 ;
}
2008-04-18 10:29:38 -04:00
/*
 * Find a device by @devid and @uuid.
 *
 * The search starts at root->fs_info->fs_devices and follows the ->seed
 * chain.  When @fsid is non-NULL only fs_devices with a matching fsid are
 * searched; a NULL @fsid searches every entry on the chain.
 *
 * Returns the first device __find_device() reports, or NULL.
 */
struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
				       u8 *uuid, u8 *fsid)
{
	struct btrfs_fs_devices *devs;

	for (devs = root->fs_info->fs_devices; devs; devs = devs->seed) {
		struct btrfs_device *found;

		if (fsid && memcmp(devs->fsid, fsid, BTRFS_UUID_SIZE) != 0)
			continue;

		found = __find_device(&devs->devices, devid, uuid);
		if (found)
			return found;
	}
	return NULL;
}
2008-05-13 13:46:40 -04:00
/*
 * Create a stand-in btrfs_device for @devid / @dev_uuid and add it to the
 * filesystem's device list.
 *
 * The structure comes from kzalloc(), so every field not set here
 * (including bdev) stays zero.  fs_devices->num_devices is incremented.
 *
 * Returns the new device, or NULL if the allocation fails.
 */
static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_fs_devices *owner = root->fs_info->fs_devices;
	struct btrfs_device *missing = kzalloc(sizeof(*missing), GFP_NOFS);

	if (!missing)
		return NULL;

	list_add(&missing->dev_list, &owner->devices);

	missing->barriers = 1;
	missing->dev_root = root->fs_info->dev_root;
	missing->devid = devid;
	missing->work.func = pending_bios_fn;
	missing->fs_devices = owner;
	owner->num_devices++;

	spin_lock_init(&missing->io_lock);
	INIT_LIST_HEAD(&missing->dev_alloc_list);
	memcpy(missing->uuid, dev_uuid, BTRFS_UUID_SIZE);

	return missing;
}
2008-03-24 15:01:56 -04:00
static int read_one_chunk ( struct btrfs_root * root , struct btrfs_key * key ,
struct extent_buffer * leaf ,
struct btrfs_chunk * chunk )
{
struct btrfs_mapping_tree * map_tree = & root - > fs_info - > mapping_tree ;
struct map_lookup * map ;
struct extent_map * em ;
u64 logical ;
u64 length ;
u64 devid ;
2008-04-18 10:29:38 -04:00
u8 uuid [ BTRFS_UUID_SIZE ] ;
2008-03-25 16:50:33 -04:00
int num_stripes ;
2008-03-24 15:01:56 -04:00
int ret ;
2008-03-25 16:50:33 -04:00
int i ;
2008-03-24 15:01:56 -04:00
2008-04-15 15:41:47 -04:00
logical = key - > offset ;
length = btrfs_chunk_length ( leaf , chunk ) ;
2008-05-07 11:43:44 -04:00
2009-09-02 16:24:52 -04:00
read_lock ( & map_tree - > map_tree . lock ) ;
2008-03-24 15:01:56 -04:00
em = lookup_extent_mapping ( & map_tree - > map_tree , logical , 1 ) ;
2009-09-02 16:24:52 -04:00
read_unlock ( & map_tree - > map_tree . lock ) ;
2008-03-24 15:01:56 -04:00
/* already mapped? */
if ( em & & em - > start < = logical & & em - > start + em - > len > logical ) {
free_extent_map ( em ) ;
return 0 ;
} else if ( em ) {
free_extent_map ( em ) ;
}
em = alloc_extent_map ( GFP_NOFS ) ;
if ( ! em )
return - ENOMEM ;
2008-03-25 16:50:33 -04:00
num_stripes = btrfs_chunk_num_stripes ( leaf , chunk ) ;
map = kmalloc ( map_lookup_size ( num_stripes ) , GFP_NOFS ) ;
2008-03-24 15:01:56 -04:00
if ( ! map ) {
free_extent_map ( em ) ;
return - ENOMEM ;
}
em - > bdev = ( struct block_device * ) map ;
em - > start = logical ;
em - > len = length ;
em - > block_start = 0 ;
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
em - > block_len = em - > len ;
2008-03-24 15:01:56 -04:00
2008-03-25 16:50:33 -04:00
map - > num_stripes = num_stripes ;
map - > io_width = btrfs_chunk_io_width ( leaf , chunk ) ;
map - > io_align = btrfs_chunk_io_align ( leaf , chunk ) ;
map - > sector_size = btrfs_chunk_sector_size ( leaf , chunk ) ;
map - > stripe_len = btrfs_chunk_stripe_len ( leaf , chunk ) ;
map - > type = btrfs_chunk_type ( leaf , chunk ) ;
2008-04-16 10:49:51 -04:00
map - > sub_stripes = btrfs_chunk_sub_stripes ( leaf , chunk ) ;
2008-03-25 16:50:33 -04:00
for ( i = 0 ; i < num_stripes ; i + + ) {
map - > stripes [ i ] . physical =
btrfs_stripe_offset_nr ( leaf , chunk , i ) ;
devid = btrfs_stripe_devid_nr ( leaf , chunk , i ) ;
2008-04-18 10:29:38 -04:00
read_extent_buffer ( leaf , uuid , ( unsigned long )
btrfs_stripe_dev_uuid_nr ( chunk , i ) ,
BTRFS_UUID_SIZE ) ;
2008-11-17 21:11:30 -05:00
map - > stripes [ i ] . dev = btrfs_find_device ( root , devid , uuid ,
NULL ) ;
2008-05-13 13:46:40 -04:00
if ( ! map - > stripes [ i ] . dev & & ! btrfs_test_opt ( root , DEGRADED ) ) {
2008-03-25 16:50:33 -04:00
kfree ( map ) ;
free_extent_map ( em ) ;
return - EIO ;
}
2008-05-13 13:46:40 -04:00
if ( ! map - > stripes [ i ] . dev ) {
map - > stripes [ i ] . dev =
add_missing_dev ( root , devid , uuid ) ;
if ( ! map - > stripes [ i ] . dev ) {
kfree ( map ) ;
free_extent_map ( em ) ;
return - EIO ;
}
}
map - > stripes [ i ] . dev - > in_fs_metadata = 1 ;
2008-03-24 15:01:56 -04:00
}
2009-09-02 16:24:52 -04:00
write_lock ( & map_tree - > map_tree . lock ) ;
2008-03-24 15:01:56 -04:00
ret = add_extent_mapping ( & map_tree - > map_tree , em ) ;
2009-09-02 16:24:52 -04:00
write_unlock ( & map_tree - > map_tree . lock ) ;
2008-04-14 09:48:18 -04:00
BUG_ON ( ret ) ;
2008-03-24 15:01:56 -04:00
free_extent_map ( em ) ;
return 0 ;
}
static int fill_device_from_item ( struct extent_buffer * leaf ,
struct btrfs_dev_item * dev_item ,
struct btrfs_device * device )
{
unsigned long ptr ;
device - > devid = btrfs_device_id ( leaf , dev_item ) ;
2009-04-27 07:29:03 -04:00
device - > disk_total_bytes = btrfs_device_total_bytes ( leaf , dev_item ) ;
device - > total_bytes = device - > disk_total_bytes ;
2008-03-24 15:01:56 -04:00
device - > bytes_used = btrfs_device_bytes_used ( leaf , dev_item ) ;
device - > type = btrfs_device_type ( leaf , dev_item ) ;
device - > io_align = btrfs_device_io_align ( leaf , dev_item ) ;
device - > io_width = btrfs_device_io_width ( leaf , dev_item ) ;
device - > sector_size = btrfs_device_sector_size ( leaf , dev_item ) ;
ptr = ( unsigned long ) btrfs_device_uuid ( dev_item ) ;
2008-04-15 15:41:47 -04:00
read_extent_buffer ( leaf , device - > uuid , ptr , BTRFS_UUID_SIZE ) ;
2008-03-24 15:01:56 -04:00
return 0 ;
}
2008-11-17 21:11:30 -05:00
/*
 * Make sure the seed filesystem identified by @fsid is on the seed chain
 * of root->fs_info->fs_devices.
 *
 * If it is not there yet, the fs_devices registered for @fsid are cloned,
 * opened with FMODE_READ and, provided they are marked as seeding, linked
 * in at the head of the seed chain.  Runs under uuid_mutex.
 *
 * Returns 0 on success (or if the seed is already on the chain), -ENOENT
 * if no fs_devices are known for @fsid, -EINVAL if they are not a seed
 * filesystem, or the error from clone_fs_devices() /
 * __btrfs_open_devices().
 */
static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	mutex_lock(&uuid_mutex);

	/* nothing to do if this seed is already on the chain */
	fs_devices = root->fs_info->fs_devices->seed;
	while (fs_devices) {
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
			ret = 0;
			goto out;
		}
		fs_devices = fs_devices->seed;
	}

	fs_devices = find_fsid(fsid);
	if (!fs_devices) {
		ret = -ENOENT;
		goto out;
	}

	/* from here on fs_devices is our private clone */
	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices)) {
		ret = PTR_ERR(fs_devices);
		goto out;
	}

	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
				   root->fs_info->bdev_holder);
	if (ret) {
		/*
		 * the clone is not linked anywhere yet, so it must be freed
		 * here or it is lost.
		 * NOTE(review): assumes a failed __btrfs_open_devices()
		 * leaves no device open, so no __btrfs_close_devices() call
		 * is needed -- confirm.
		 */
		free_fs_devices(fs_devices);
		goto out;
	}

	if (!fs_devices->seeding) {
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
		ret = -EINVAL;
		goto out;
	}

	/* link the clone in at the head of the seed chain */
	fs_devices->seed = root->fs_info->fs_devices->seed;
	root->fs_info->fs_devices->seed = fs_devices;
out:
	mutex_unlock(&uuid_mutex);
	return ret;
}
2008-03-24 15:02:07 -04:00
static int read_one_dev ( struct btrfs_root * root ,
2008-03-24 15:01:56 -04:00
struct extent_buffer * leaf ,
struct btrfs_dev_item * dev_item )
{
struct btrfs_device * device ;
u64 devid ;
int ret ;
2008-11-17 21:11:30 -05:00
u8 fs_uuid [ BTRFS_UUID_SIZE ] ;
2008-04-18 10:29:38 -04:00
u8 dev_uuid [ BTRFS_UUID_SIZE ] ;
2008-03-24 15:01:56 -04:00
devid = btrfs_device_id ( leaf , dev_item ) ;
2008-04-18 10:29:38 -04:00
read_extent_buffer ( leaf , dev_uuid ,
( unsigned long ) btrfs_device_uuid ( dev_item ) ,
BTRFS_UUID_SIZE ) ;
2008-11-17 21:11:30 -05:00
read_extent_buffer ( leaf , fs_uuid ,
( unsigned long ) btrfs_device_fsid ( dev_item ) ,
BTRFS_UUID_SIZE ) ;
if ( memcmp ( fs_uuid , root - > fs_info - > fsid , BTRFS_UUID_SIZE ) ) {
ret = open_seed_devices ( root , fs_uuid ) ;
2008-12-12 10:03:26 -05:00
if ( ret & & ! btrfs_test_opt ( root , DEGRADED ) )
2008-11-17 21:11:30 -05:00
return ret ;
}
device = btrfs_find_device ( root , devid , dev_uuid , fs_uuid ) ;
if ( ! device | | ! device - > bdev ) {
2008-12-12 10:03:26 -05:00
if ( ! btrfs_test_opt ( root , DEGRADED ) )
2008-11-17 21:11:30 -05:00
return - EIO ;
if ( ! device ) {
2009-01-05 21:25:51 -05:00
printk ( KERN_WARNING " warning devid %llu missing \n " ,
( unsigned long long ) devid ) ;
2008-11-17 21:11:30 -05:00
device = add_missing_dev ( root , devid , dev_uuid ) ;
if ( ! device )
return - ENOMEM ;
}
}
if ( device - > fs_devices ! = root - > fs_info - > fs_devices ) {
BUG_ON ( device - > writeable ) ;
if ( device - > generation ! =
btrfs_device_generation ( leaf , dev_item ) )
return - EINVAL ;
2008-03-24 15:01:59 -04:00
}
2008-03-24 15:01:56 -04:00
fill_device_from_item ( leaf , dev_item , device ) ;
device - > dev_root = root - > fs_info - > dev_root ;
2008-05-13 13:46:40 -04:00
device - > in_fs_metadata = 1 ;
2008-11-17 21:11:30 -05:00
if ( device - > writeable )
device - > fs_devices - > total_rw_bytes + = device - > total_bytes ;
2008-03-24 15:01:56 -04:00
ret = 0 ;
return ret ;
}
2008-03-24 15:02:07 -04:00
/*
 * Read the dev item embedded in the super block held in @buf.
 *
 * The item is passed to read_one_dev() as its byte offset within
 * struct btrfs_super_block, cast to a pointer, not as a real address.
 * Returns whatever read_one_dev() returns.
 */
int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
{
	const unsigned long item_offset =
		offsetof(struct btrfs_super_block, dev_item);

	return read_one_dev(root, buf, (struct btrfs_dev_item *)item_offset);
}
2008-12-12 10:03:26 -05:00
int btrfs_read_sys_array ( struct btrfs_root * root )
2008-03-24 15:01:56 -04:00
{
struct btrfs_super_block * super_copy = & root - > fs_info - > super_copy ;
2008-05-07 11:43:44 -04:00
struct extent_buffer * sb ;
2008-03-24 15:01:56 -04:00
struct btrfs_disk_key * disk_key ;
struct btrfs_chunk * chunk ;
2008-04-25 09:04:37 -04:00
u8 * ptr ;
unsigned long sb_ptr ;
int ret = 0 ;
2008-03-24 15:01:56 -04:00
u32 num_stripes ;
u32 array_size ;
u32 len = 0 ;
u32 cur ;
2008-04-25 09:04:37 -04:00
struct btrfs_key key ;
2008-03-24 15:01:56 -04:00
2008-12-12 10:03:26 -05:00
sb = btrfs_find_create_tree_block ( root , BTRFS_SUPER_INFO_OFFSET ,
2008-05-07 11:43:44 -04:00
BTRFS_SUPER_INFO_SIZE ) ;
if ( ! sb )
return - ENOMEM ;
btrfs_set_buffer_uptodate ( sb ) ;
2009-02-12 14:09:45 -05:00
btrfs_set_buffer_lockdep_class ( sb , 0 ) ;
2008-05-07 11:43:44 -04:00
write_extent_buffer ( sb , super_copy , 0 , BTRFS_SUPER_INFO_SIZE ) ;
2008-03-24 15:01:56 -04:00
array_size = btrfs_super_sys_array_size ( super_copy ) ;
ptr = super_copy - > sys_chunk_array ;
sb_ptr = offsetof ( struct btrfs_super_block , sys_chunk_array ) ;
cur = 0 ;
while ( cur < array_size ) {
disk_key = ( struct btrfs_disk_key * ) ptr ;
btrfs_disk_key_to_cpu ( & key , disk_key ) ;
2008-05-07 11:43:44 -04:00
len = sizeof ( * disk_key ) ; ptr + = len ;
2008-03-24 15:01:56 -04:00
sb_ptr + = len ;
cur + = len ;
2008-03-24 15:02:07 -04:00
if ( key . type = = BTRFS_CHUNK_ITEM_KEY ) {
2008-03-24 15:01:56 -04:00
chunk = ( struct btrfs_chunk * ) sb_ptr ;
2008-03-24 15:02:07 -04:00
ret = read_one_chunk ( root , & key , sb , chunk ) ;
2008-04-25 09:04:37 -04:00
if ( ret )
break ;
2008-03-24 15:01:56 -04:00
num_stripes = btrfs_chunk_num_stripes ( sb , chunk ) ;
len = btrfs_chunk_item_size ( num_stripes ) ;
} else {
2008-04-25 09:04:37 -04:00
ret = - EIO ;
break ;
2008-03-24 15:01:56 -04:00
}
ptr + = len ;
sb_ptr + = len ;
cur + = len ;
}
2008-05-07 11:43:44 -04:00
free_extent_buffer ( sb ) ;
2008-04-25 09:04:37 -04:00
return ret ;
2008-03-24 15:01:56 -04:00
}
int btrfs_read_chunk_tree ( struct btrfs_root * root )
{
struct btrfs_path * path ;
struct extent_buffer * leaf ;
struct btrfs_key key ;
struct btrfs_key found_key ;
int ret ;
int slot ;
root = root - > fs_info - > chunk_root ;
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
/* first we search for all of the device items, and then we
* read in all of the chunk items . This way we can create chunk
* mappings that reference all of the devices that are afound
*/
key . objectid = BTRFS_DEV_ITEMS_OBJECTID ;
key . offset = 0 ;
key . type = 0 ;
again :
ret = btrfs_search_slot ( NULL , root , & key , path , 0 , 0 ) ;
2009-01-05 21:25:51 -05:00
while ( 1 ) {
2008-03-24 15:01:56 -04:00
leaf = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
if ( slot > = btrfs_header_nritems ( leaf ) ) {
ret = btrfs_next_leaf ( root , path ) ;
if ( ret = = 0 )
continue ;
if ( ret < 0 )
goto error ;
break ;
}
btrfs_item_key_to_cpu ( leaf , & found_key , slot ) ;
if ( key . objectid = = BTRFS_DEV_ITEMS_OBJECTID ) {
if ( found_key . objectid ! = BTRFS_DEV_ITEMS_OBJECTID )
break ;
if ( found_key . type = = BTRFS_DEV_ITEM_KEY ) {
struct btrfs_dev_item * dev_item ;
dev_item = btrfs_item_ptr ( leaf , slot ,
struct btrfs_dev_item ) ;
2008-03-24 15:02:07 -04:00
ret = read_one_dev ( root , leaf , dev_item ) ;
2008-11-17 21:11:30 -05:00
if ( ret )
goto error ;
2008-03-24 15:01:56 -04:00
}
} else if ( found_key . type = = BTRFS_CHUNK_ITEM_KEY ) {
struct btrfs_chunk * chunk ;
chunk = btrfs_item_ptr ( leaf , slot , struct btrfs_chunk ) ;
ret = read_one_chunk ( root , & found_key , leaf , chunk ) ;
2008-11-17 21:11:30 -05:00
if ( ret )
goto error ;
2008-03-24 15:01:56 -04:00
}
path - > slots [ 0 ] + + ;
}
if ( key . objectid = = BTRFS_DEV_ITEMS_OBJECTID ) {
key . objectid = 0 ;
btrfs_release_path ( root , path ) ;
goto again ;
}
ret = 0 ;
error :
2008-11-17 21:11:30 -05:00
btrfs_free_path ( path ) ;
2008-03-24 15:01:56 -04:00
return ret ;
}