/*
   rbd.c -- Export ceph rados objects as a Linux block device

   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
# include <linux/ceph/libceph.h>
# include <linux/ceph/osd_client.h>
# include <linux/ceph/mon_client.h>
# include <linux/ceph/decode.h>
2011-03-22 01:10:11 +03:00
# include <linux/parser.h>
2010-08-13 03:11:25 +04:00
# include <linux/kernel.h>
# include <linux/device.h>
# include <linux/module.h>
# include <linux/fs.h>
# include <linux/blkdev.h>
# include "rbd_types.h"
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */
#define U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Pseudo snapshot name used to request mapping the live (head) image */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Default for the "read_only"/"read_write" mount option */
#define RBD_READ_ONLY_DEFAULT	false
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;		/* image size in bytes */
	char *object_prefix;	/* kmalloc'd prefix for data object names */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;	/* encryption type from on-disk options */
	__u8 comp_type;		/* compression type from on-disk options */
	struct ceph_snap_context *snapc;	/* snapshot ids for this image */
	char *snap_names;	/* consecutive NUL-terminated snapshot names */
	u64 *snap_sizes;	/* image size at each snapshot, same order */

	u64 obj_version;	/* version of the header this was read from */
};
/* User-specified mapping options, parsed by parse_rbd_opts_token() */
struct rbd_options {
	bool read_only;		/* map the image read-only */
};
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying ceph client session */
	struct kref kref;		/* reference count */
	struct list_head node;		/* entry in rbd_client_list */
};
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;		/* nonzero once the request has completed */
	int rc;			/* completion result */
	u64 bytes;		/* bytes transferred */
};
/*
 * a collection of requests
 *
 * Reference counted via kref; freed by rbd_coll_release().  The
 * status array has one entry per member request, allocated together
 * with the struct.
 *
 * Fix: "status[0]" was a GNU zero-length array; use a C99 flexible
 * array member instead, which is the standard-conforming idiom for
 * trailing variable-length data.
 */
struct rbd_req_coll {
	int total;			/* number of requests in the collection */
	int num_done;			/* how many have completed so far */
	struct kref kref;		/* reference count */
	struct rbd_req_status status[];	/* one slot per request */
};
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* request length in bytes */
	int coll_index;			/* this request's slot in the collection */
	struct rbd_req_coll *coll;	/* collection this request belongs to */
};
/* In-memory representation of one image snapshot (with sysfs device) */
struct rbd_snap {
	struct device dev;		/* sysfs device for this snapshot */
	const char *name;		/* snapshot name */
	u64 size;			/* image size at this snapshot */
	struct list_head node;		/* entry in rbd_device->snaps */
	u64 id;				/* snapshot id */
};
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */
	struct rbd_options rbd_opts;	/* options given at map time */

	struct rbd_client *rbd_client;	/* possibly shared ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* in-memory copy of image header */
	char *image_name;	/* rbd image this device maps */
	size_t image_name_len;
	char *header_name;	/* name of the image's header object */
	char *pool_name;	/* rados pool the image lives in */
	int pool_id;

	/* osd event/request used to watch for header changes */
	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	/* name of the snapshot this device reads from */
	char *snap_name;
	/* id of the snapshot this device reads from */
	u64 snap_id;		/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool snap_exists;
	bool read_only;		/* effective ro state (forced for snapshots) */

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for functions defined later in this file */
static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* /sys/bus/rbd/{add,remove}: write-only (root) map/unmap controls */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Release callback for rbd_root_dev; the device is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
#ifdef RBD_DEBUG
/*
 * rbd_assert() - verify an invariant; log the failure and BUG().
 *
 * Fix: wrapped in do { } while (0) so the macro expands to a single
 * statement; the previous bare "if" form had a dangling-else hazard
 * when used as "if (x) rbd_assert(y); else ...".
 * Compiles to a no-op when RBD_DEBUG is not defined.
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
/* Take a reference on the rbd device's embedded device; pairs with rbd_put_dev() */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
/* Drop the reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);

/*
 * Block device open: refuse writable opens of read-only mappings,
 * pin the device, and propagate the ro flag to the block device.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->read_only);

	return 0;
}
/* Block device release: drop the reference taken in rbd_open() */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
/* Block device operations for /dev/rbd* nodes */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
/*
 * Initialize an rbd client instance.
 * We own *ceph_opts: ownership passes to the ceph client on success,
 * otherwise the options are destroyed on the error paths below.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("rbd_client_create\n");
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		/*
		 * NOTE(review): the ceph_create_client() error code is
		 * discarded here; the caller always sees -ENOMEM.
		 */
		goto out_mutex;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);

	dout("rbd_client_create created %p\n", rbdc);
	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	return ERR_PTR(ret);
}
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 *
 * Returns NULL when no matching client exists, or when sharing is
 * disabled via the CEPH_OPT_NOSHARE flag.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* Match: take a reference while the lock is held */
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
/*
 * mount options
 *
 * Token values are grouped by argument type; the Opt_last_* markers
 * delimit the groups so parse_rbd_opts_token() can tell how to parse
 * each token's argument.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
/*
 * Parse one rbd-specific mount option.  Passed as a callback to
 * ceph_parse_options(); "private" is the struct rbd_options being
 * filled in.  Returns 0 on success, -EINVAL for an unrecognized
 * token, or the match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The Opt_last_* markers tell us what kind of argument follows */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() succeeded, so this should be unreachable */
		rbd_assert(false);
		break;
	}
	return 0;
}
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success rbd_dev->rbd_client holds a
 * reference to the (possibly shared) client; returns 0, or a negative
 * errno from option parsing or client creation.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; it now owns no new options */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() consumes ceph_opts either way */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
/*
 * Destroy ceph client
 *
 * Kref release callback; unlinks the client from rbd_client_list
 * (taking rbd_client_list_lock itself) before tearing it down.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.  Clears rbd_dev->rbd_client so the stale pointer can't be reused.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;
}
/*
 * Destroy requests collection (kref release callback).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
/*
 * Sanity-check an on-disk image header: verify the magic text and
 * reject snapshot counts / name lengths that could not be represented
 * in a size_t when the in-memory header is built later.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 *
 * On success every dynamically sized field of *header is freshly
 * allocated and must be released with rbd_header_free().  Returns 0
 * on success, -ENOMEM on allocation failure, or -EIO when the on-disk
 * snapshot name area cannot be represented in a size_t.  On error all
 * partially allocated fields are freed and reset to NULL.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk)
{
	int ret = -ENOMEM;
	u32 snap_count;
	size_t len;
	size_t size;
	u32 i;

	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	/* Copy (and NUL-terminate) the object name prefix */
	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, len);
	header->object_prefix[len] = '\0';

	if (snap_count) {
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* Save a copy of the snapshot names */

		if (snap_names_len > (u64) SIZE_MAX) {
			/*
			 * Fix: the original returned -EIO directly here,
			 * leaking header->object_prefix.  Go through the
			 * common cleanup path instead.
			 */
			ret = -EIO;
			goto out_err;
		}
		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;
		/*
		 * Note that rbd_dev_v1_header_read() guarantees
		 * the ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			snap_names_len);

		/* Record each snapshot's size */

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
		for (i = 0; i < snap_count; i++)
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
	} else {
		/* No snapshots, so there should be no snapshot names */
		WARN_ON(ondisk->snap_names_len);
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	/* Allocate and fill in the snapshot context */

	size = sizeof (struct ceph_snap_context);
	size += snap_count * sizeof (header->snapc->snaps[0]);
	header->snapc = kzalloc(size, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;
	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	for (i = 0; i < snap_count; i++)
		header->snapc->snaps[i] =
			le64_to_cpu(ondisk->snaps[i].id);

	return 0;

out_err:
	/* Free everything allocated so far; NULL out to be explicit */
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return ret;
}
/*
 * Look up a snapshot by name in the given header.
 *
 * Walks the consecutive NUL-terminated names in header->snap_names;
 * the code assumes the names parallel the ids in header->snapc->snaps[]
 * and the sizes in header->snap_sizes[].  On success returns the
 * snapshot's index and fills in *seq (snapshot id) and/or *size when
 * those pointers are non-NULL; returns -ENOENT if no match.
 */
static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
			u64 *seq, u64 *size)
{
	int i;
	char *p = header->snap_names;

	rbd_assert(header->snapc != NULL);
	for (i = 0; i < header->snapc->num_snaps; i++) {
		if (!strcmp(snap_name, p)) {

			/* Found it.  Pass back its id and/or size */

			if (seq)
				*seq = header->snapc->snaps[i];
			if (size)
				*size = header->snap_sizes[i];
			return i;
		}
		p += strlen(p) + 1;	/* Skip ahead to the next name */
	}
	return -ENOENT;
}
/*
 * Set the device's mapped snapshot from rbd_dev->snap_name.
 *
 * The special head name maps the live image (writable unless the
 * read_only option was given); any other name maps that snapshot,
 * which is always read-only.  If size is non-NULL it receives the
 * image size for what was mapped.  Returns 0, or -ENOENT when the
 * named snapshot doesn't exist.  Takes header_rwsem for writing.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the whole (live) image, not a snapshot */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = rbd_dev->rbd_opts.read_only;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
					&snap_id, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snap_id;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = true;	/* No choice for snapshots */
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
/*
 * Release everything rbd_header_from_disk() allocated.  Pointers are
 * reset to NULL so a subsequent free or reuse is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
/*
 * Build the object name for the segment containing image byte "offset":
 * "<object_prefix>.<segment number, 12 hex digits>".
 *
 * Returns a GFP_NOIO allocation the caller must kfree(), or NULL on
 * allocation or formatting failure.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	/*
	 * NOTE(review): the buffer holds RBD_MAX_SEG_NAME_LEN + 1 bytes
	 * but snprintf() is bounded to RBD_MAX_SEG_NAME_LEN, so the last
	 * byte is never used -- confirm whether the bound was meant to
	 * be RBD_MAX_SEG_NAME_LEN + 1.
	 */
	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
/*
 * Return the byte offset of "offset" within its containing segment.
 * Segments are (1 << obj_order) bytes, so this is just the low-order
 * bits of the image offset.
 */
static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_mask = ((u64) 1 << rbd_dev->header.obj_order) - 1;

	return offset & segment_mask;
}
/*
 * Return how many bytes of [offset, offset + length) lie within the
 * segment containing "offset"; i.e. clamp "length" so it does not
 * cross a segment boundary.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
	u64 offset_in_segment = offset & (segment_size - 1);
	u64 bytes_to_boundary;

	rbd_assert(length <= U64_MAX - offset_in_segment);

	bytes_to_boundary = segment_size - offset_in_segment;

	return length > bytes_to_boundary ? bytes_to_boundary : length;
}
/*
 * Number of segments spanned by the byte range [ofs, ofs + len).
 * Returns 0 for an empty range, or -ERANGE if ofs + len would
 * overflow a u64.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 first;
	u64 last;

	if (!len)
		return 0;
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	first = ofs >> header->obj_order;
	last = (ofs + len - 1) >> header->obj_order;

	return last - first + 1;
}
/*
 * returns the size of an object in the image
 *
 * Fix: the shift was previously done in (signed, 32-bit) int
 * arithmetic ("1 << obj_order"), which is undefined/truncating for
 * orders >= 31 even though the function returns u64.  Shift a u64
 * instead.
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	return (u64) 1 << header->obj_order;
}
/*
 * bio helpers
 */

/* Drop one reference on every bio in the singly-linked chain. */
static void bio_chain_put(struct bio *chain)
{
	while (chain) {
		struct bio *next = chain->bi_next;

		bio_put(chain);
		chain = next;
	}
}
/*
 * Zero the tail of a bio chain: every data byte at or beyond offset
 * start_ofs (measured from the start of the chain) is cleared.
 * Passing start_ofs == 0 zeroes the whole chain.
 */
static void zero_bio_chain ( struct bio * chain , int start_ofs )
{
struct bio_vec * bv ;
unsigned long flags ;
void * buf ;
int i ;
int pos = 0 ;
while ( chain ) {
bio_for_each_segment ( bv , chain , i ) {
if ( pos + bv - > bv_len > start_ofs ) {
/* zero from start_ofs within this vec, or all of it */
int remainder = max ( start_ofs - pos , 0 ) ;
/* irq-safe temporary mapping (flags saved/restored) */
buf = bvec_kmap_irq ( bv , & flags ) ;
memset ( buf + remainder , 0 ,
bv - > bv_len - remainder ) ;
2010-10-11 23:15:11 +04:00
bvec_kunmap_irq ( buf , & flags ) ;
2010-08-13 03:11:25 +04:00
}
pos + = bv - > bv_len ;
}
chain = chain - > bi_next ;
}
}
/*
 * bio_chain_clone - clone a chain of bios up to a certain length .
 * might return a bio_pair that will need to be released .
 *
 * On success *old is advanced past the cloned portion and *next
 * points at the first un-cloned bio (or the second half of a split).
 * On failure the partial clone chain is released and NULL returned.
 */
static struct bio * bio_chain_clone ( struct bio * * old , struct bio * * next ,
struct bio_pair * * bp ,
int len , gfp_t gfpmask )
{
2012-08-09 21:33:25 +04:00
struct bio * old_chain = * old ;
struct bio * new_chain = NULL ;
struct bio * tail ;
2010-08-13 03:11:25 +04:00
int total = 0 ;
/* release any split left over from a previous call */
if ( * bp ) {
bio_pair_release ( * bp ) ;
* bp = NULL ;
}
while ( old_chain & & ( total < len ) ) {
2012-08-09 21:33:25 +04:00
struct bio * tmp ;
2010-08-13 03:11:25 +04:00
tmp = bio_kmalloc ( gfpmask , old_chain - > bi_max_vecs ) ;
if ( ! tmp )
goto err_out ;
2012-08-09 21:33:25 +04:00
gfpmask & = ~ __GFP_WAIT ; /* can't wait after the first */
2010-08-13 03:11:25 +04:00
if ( total + old_chain - > bi_size > len ) {
struct bio_pair * bp ;
/*
* this split can only happen with a single paged bio ,
* split_bio will BUG_ON if this is not the case
*/
dout ( " bio_chain_clone split! total=%d remaining=%d "
2012-07-14 05:35:11 +04:00
" bi_size=%u \n " ,
total , len - total , old_chain - > bi_size ) ;
2010-08-13 03:11:25 +04:00
/* split the bio. We'll release it either in the next
call , or it will have to be released outside */
2012-02-07 22:03:37 +04:00
bp = bio_split ( old_chain , ( len - total ) / SECTOR_SIZE ) ;
2010-08-13 03:11:25 +04:00
if ( ! bp )
goto err_out ;
__bio_clone ( tmp , & bp - > bio1 ) ;
* next = & bp - > bio2 ;
} else {
__bio_clone ( tmp , old_chain ) ;
* next = old_chain - > bi_next ;
}
tmp - > bi_bdev = NULL ;
tmp - > bi_next = NULL ;
/* append the clone; tail is only read once new_chain is set */
2012-08-09 21:33:25 +04:00
if ( new_chain )
2010-08-13 03:11:25 +04:00
tail - > bi_next = tmp ;
2012-08-09 21:33:25 +04:00
else
new_chain = tmp ;
tail = tmp ;
2010-08-13 03:11:25 +04:00
old_chain = old_chain - > bi_next ;
total + = tmp - > bi_size ;
}
/* callers size their requests so the chain covers exactly len bytes */
2012-09-07 01:00:54 +04:00
rbd_assert ( total = = len ) ;
2010-08-13 03:11:25 +04:00
* old = old_chain ;
return new_chain ;
err_out :
dout ( " bio_chain_clone with err \n " ) ;
bio_chain_put ( new_chain ) ;
return NULL ;
}
/*
 * helpers for osd request op vectors.
 */

/*
 * Allocate a zeroed vector of @num_ops ops plus a zero terminator and
 * initialize the first entry with @opcode and @payload_len.  Returns
 * NULL on allocation failure; release with rbd_destroy_ops().
 */
static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
						 int opcode, u32 payload_len)
{
	struct ceph_osd_req_op *ops;

	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
	if (!ops)
		return NULL;

	ops[0].op = opcode;
	/*
	 * op extent offset and length will be set later on
	 * in calc_raw_layout()
	 */
	ops[0].payload_len = payload_len;

	return ops;
}
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops ( struct ceph_osd_req_op * ops )
{
kfree ( ops ) ;
}
2011-05-14 00:52:56 +04:00
/*
 * Record completion status for slot @index of collection @coll and
 * complete, in order, the prefix of the block request covered by
 * consecutively finished slots.  With no collection the whole request
 * is completed at once.  Each finished slot drops one collection ref.
 */
static void rbd_coll_end_req_index ( struct request * rq ,
struct rbd_req_coll * coll ,
int index ,
int ret , u64 len )
{
struct request_queue * q ;
int min , max , i ;
2012-07-14 05:35:11 +04:00
dout ( " rbd_coll_end_req_index %p index %d ret %d len %llu \n " ,
coll , index , ret , ( unsigned long long ) len ) ;
2011-05-14 00:52:56 +04:00
if ( ! rq )
return ;
if ( ! coll ) {
blk_end_request ( rq , ret , len ) ;
return ;
}
q = rq - > q ;
/* queue_lock serializes status[] updates and request completion */
spin_lock_irq ( q - > queue_lock ) ;
coll - > status [ index ] . done = 1 ;
coll - > status [ index ] . rc = ret ;
coll - > status [ index ] . bytes = len ;
max = min = coll - > num_done ;
/* advance past every consecutively completed slot */
while ( max < coll - > total & & coll - > status [ max ] . done )
max + + ;
for ( i = min ; i < max ; i + + ) {
__blk_end_request ( rq , coll - > status [ i ] . rc ,
coll - > status [ i ] . bytes ) ;
coll - > num_done + + ;
kref_put ( & coll - > kref , rbd_coll_release ) ;
}
spin_unlock_irq ( q - > queue_lock ) ;
}
/* Complete the collection slot recorded in @req with @ret / @len. */
static void rbd_coll_end_req ( struct rbd_request * req ,
int ret , u64 len )
{
rbd_coll_end_req_index ( req - > rq , req - > coll , req - > coll_index , ret , len ) ;
}
2010-08-13 03:11:25 +04:00
/*
 * Send ceph osd request
 *
 * Builds and submits an osd request for @object_name covering
 * [ofs, ofs + len).  Data travels through @bio (block requests) or
 * @pages (sync ops).  With a callback the request completes
 * asynchronously; without one this waits for the reply, optionally
 * returning the reassert version through @ver.  @linger_req, when
 * non-NULL, registers the request to be resent across connection
 * resets (used for watch).
 */
static int rbd_do_request ( struct request * rq ,
2012-07-04 01:01:18 +04:00
struct rbd_device * rbd_dev ,
2010-08-13 03:11:25 +04:00
struct ceph_snap_context * snapc ,
u64 snapid ,
2012-07-04 01:01:18 +04:00
const char * object_name , u64 ofs , u64 len ,
2010-08-13 03:11:25 +04:00
struct bio * bio ,
struct page * * pages ,
int num_pages ,
int flags ,
struct ceph_osd_req_op * ops ,
2011-05-14 00:52:56 +04:00
struct rbd_req_coll * coll ,
int coll_index ,
2010-08-13 03:11:25 +04:00
void ( * rbd_cb ) ( struct ceph_osd_request * req ,
2011-03-22 01:10:11 +03:00
struct ceph_msg * msg ) ,
struct ceph_osd_request * * linger_req ,
u64 * ver )
2010-08-13 03:11:25 +04:00
{
struct ceph_osd_request * req ;
struct ceph_file_layout * layout ;
int ret ;
u64 bno ;
struct timespec mtime = CURRENT_TIME ;
struct rbd_request * req_data ;
struct ceph_osd_request_head * reqhead ;
2012-01-24 20:08:37 +04:00
struct ceph_osd_client * osdc ;
2010-08-13 03:11:25 +04:00
req_data = kzalloc ( sizeof ( * req_data ) , GFP_NOIO ) ;
2011-05-14 00:52:56 +04:00
if ( ! req_data ) {
/* fail the collection slot so the block request makes progress */
if ( coll )
rbd_coll_end_req_index ( rq , coll , coll_index ,
- ENOMEM , len ) ;
return - ENOMEM ;
}
if ( coll ) {
req_data - > coll = coll ;
req_data - > coll_index = coll_index ;
}
2010-08-13 03:11:25 +04:00
2012-07-14 05:35:11 +04:00
dout ( " rbd_do_request object_name=%s ofs=%llu len=%llu \n " , object_name ,
( unsigned long long ) ofs , ( unsigned long long ) len ) ;
2010-08-13 03:11:25 +04:00
2012-07-04 01:01:18 +04:00
osdc = & rbd_dev - > rbd_client - > client - > osdc ;
2012-01-24 20:08:37 +04:00
req = ceph_osdc_alloc_request ( osdc , flags , snapc , ops ,
false , GFP_NOIO , pages , bio ) ;
2011-05-03 20:23:36 +04:00
if ( ! req ) {
ret = - ENOMEM ;
2010-08-13 03:11:25 +04:00
goto done_pages ;
}
req - > r_callback = rbd_cb ;
req_data - > rq = rq ;
req_data - > bio = bio ;
req_data - > pages = pages ;
req_data - > len = len ;
req - > r_priv = req_data ;
reqhead = req - > r_request - > front . iov_base ;
reqhead - > snapid = cpu_to_le64 ( CEPH_NOSNAP ) ;
/* NOTE(review): strncpy does not NUL-terminate if object_name fills
 * r_oid exactly; presumably names are always shorter -- verify. */
2012-07-04 01:01:18 +04:00
strncpy ( req - > r_oid , object_name , sizeof ( req - > r_oid ) ) ;
2010-08-13 03:11:25 +04:00
req - > r_oid_len = strlen ( req - > r_oid ) ;
layout = & req - > r_file_layout ;
memset ( layout , 0 , sizeof ( * layout ) ) ;
layout - > fl_stripe_unit = cpu_to_le32 ( 1 < < RBD_MAX_OBJ_ORDER ) ;
layout - > fl_stripe_count = cpu_to_le32 ( 1 ) ;
layout - > fl_object_size = cpu_to_le32 ( 1 < < RBD_MAX_OBJ_ORDER ) ;
2012-07-04 01:01:18 +04:00
layout - > fl_pg_pool = cpu_to_le32 ( rbd_dev - > pool_id ) ;
2012-01-24 20:08:37 +04:00
ceph_calc_raw_layout ( osdc , layout , snapid , ofs , & len , & bno ,
req , ops ) ;
2010-08-13 03:11:25 +04:00
ceph_osdc_build_request ( req , ofs , & len ,
ops ,
snapc ,
& mtime ,
req - > r_oid , req - > r_oid_len ) ;
2011-03-22 01:10:11 +03:00
if ( linger_req ) {
2012-01-24 20:08:37 +04:00
ceph_osdc_set_request_linger ( osdc , req ) ;
2011-03-22 01:10:11 +03:00
* linger_req = req ;
}
2012-01-24 20:08:37 +04:00
ret = ceph_osdc_start_request ( osdc , req , false ) ;
2010-08-13 03:11:25 +04:00
if ( ret < 0 )
goto done_err ;
/* no callback: this is a synchronous request -- wait here */
if ( ! rbd_cb ) {
2012-01-24 20:08:37 +04:00
ret = ceph_osdc_wait_request ( osdc , req ) ;
2011-03-22 01:10:11 +03:00
if ( ver )
* ver = le64_to_cpu ( req - > r_reassert_version . version ) ;
2012-07-14 05:35:11 +04:00
dout ( " reassert_ver=%llu \n " ,
( unsigned long long )
le64_to_cpu ( req - > r_reassert_version . version ) ) ;
2010-08-13 03:11:25 +04:00
ceph_osdc_put_request ( req ) ;
}
return ret ;
done_err :
bio_chain_put ( req_data - > bio ) ;
ceph_osdc_put_request ( req ) ;
done_pages :
2011-05-14 00:52:56 +04:00
rbd_coll_end_req ( req_data , ret , len ) ;
2010-08-13 03:11:25 +04:00
kfree ( req_data ) ;
return ret ;
}
/*
 * Ceph osd op callback
 *
 * Completion handler for async ops submitted by rbd_do_op().  For
 * reads, a missing object (-ENOENT) or a short read is papered over
 * by zero-filling the (remainder of the) bio chain, matching sparse
 * object semantics.  Completes the collection slot and drops the
 * request's resources.
 */
static void rbd_req_cb ( struct ceph_osd_request * req , struct ceph_msg * msg )
{
struct rbd_request * req_data = req - > r_priv ;
struct ceph_osd_reply_head * replyhead ;
struct ceph_osd_op * op ;
__s32 rc ;
u64 bytes ;
int read_op ;
/* parse reply */
replyhead = msg - > front . iov_base ;
WARN_ON ( le32_to_cpu ( replyhead - > num_ops ) = = 0 ) ;
op = ( void * ) ( replyhead + 1 ) ;
rc = le32_to_cpu ( replyhead - > result ) ;
bytes = le64_to_cpu ( op - > extent . length ) ;
2012-06-06 18:15:33 +04:00
read_op = ( le16_to_cpu ( op - > op ) = = CEPH_OSD_OP_READ ) ;
2010-08-13 03:11:25 +04:00
2012-07-14 05:35:11 +04:00
dout ( " rbd_req_cb bytes=%llu readop=%d rc=%d \n " ,
( unsigned long long ) bytes , read_op , ( int ) rc ) ;
2010-08-13 03:11:25 +04:00
/* nonexistent object: treat as reading all zeroes */
if ( rc = = - ENOENT & & read_op ) {
zero_bio_chain ( req_data - > bio , 0 ) ;
rc = 0 ;
} else if ( rc = = 0 & & read_op & & bytes < req_data - > len ) {
/* short read: zero the tail and report the full length */
zero_bio_chain ( req_data - > bio , bytes ) ;
bytes = req_data - > len ;
}
2011-05-14 00:52:56 +04:00
rbd_coll_end_req ( req_data , rc , bytes ) ;
2010-08-13 03:11:25 +04:00
if ( req_data - > bio )
bio_chain_put ( req_data - > bio ) ;
ceph_osdc_put_request ( req ) ;
kfree ( req_data ) ;
}
2011-03-22 01:10:11 +03:00
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb ( struct ceph_osd_request * req , struct ceph_msg * msg )
{
ceph_osdc_put_request ( req ) ;
}
2010-08-13 03:11:25 +04:00
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector big enough for [ofs, ofs + len), submits
 * the op through rbd_do_request() with no callback (so it waits for
 * the reply), and for reads copies up to the returned byte count
 * from the pages into @buf.  Returns the osd result or a negative
 * errno; the page vector is always released.
 */
2012-07-04 01:01:18 +04:00
static int rbd_req_sync_op ( struct rbd_device * rbd_dev ,
2010-08-13 03:11:25 +04:00
struct ceph_snap_context * snapc ,
u64 snapid ,
int flags ,
2012-06-26 23:57:03 +04:00
struct ceph_osd_req_op * ops ,
2012-07-04 01:01:18 +04:00
const char * object_name ,
2010-08-13 03:11:25 +04:00
u64 ofs , u64 len ,
2011-03-22 01:10:11 +03:00
char * buf ,
struct ceph_osd_request * * linger_req ,
u64 * ver )
2010-08-13 03:11:25 +04:00
{
int ret ;
struct page * * pages ;
int num_pages ;
2012-06-26 23:57:03 +04:00
2012-09-07 01:00:54 +04:00
rbd_assert ( ops ! = NULL ) ;
2010-08-13 03:11:25 +04:00
num_pages = calc_pages_for ( ofs , len ) ;
pages = ceph_alloc_page_vector ( num_pages , GFP_KERNEL ) ;
2010-10-11 23:14:23 +04:00
if ( IS_ERR ( pages ) )
return PTR_ERR ( pages ) ;
2010-08-13 03:11:25 +04:00
2012-07-04 01:01:18 +04:00
ret = rbd_do_request ( NULL , rbd_dev , snapc , snapid ,
2012-07-04 01:01:18 +04:00
object_name , ofs , len , NULL ,
2010-08-13 03:11:25 +04:00
pages , num_pages ,
flags ,
ops ,
2011-05-14 00:52:56 +04:00
NULL , 0 ,
2011-03-22 01:10:11 +03:00
NULL ,
linger_req , ver ) ;
2010-08-13 03:11:25 +04:00
if ( ret < 0 )
2012-06-26 23:57:03 +04:00
goto done ;
2010-08-13 03:11:25 +04:00
/* on a successful read, ret is the number of bytes returned */
if ( ( flags & CEPH_OSD_FLAG_READ ) & & buf )
ret = ceph_copy_from_page_vector ( pages , buf , ofs , ret ) ;
done :
ceph_release_page_vector ( pages , num_pages ) ;
return ret ;
}
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps image range [ofs, ofs + len) onto its backing segment object
 * and submits a single read or write op for it, completing through
 * rbd_req_cb() into slot @coll_index of @coll.  Callers must have
 * pre-clipped @len to a single segment (asserted below).
 */
static int rbd_do_op ( struct request * rq ,
2012-07-04 01:01:18 +04:00
struct rbd_device * rbd_dev ,
2010-08-13 03:11:25 +04:00
struct ceph_snap_context * snapc ,
u64 snapid ,
2012-06-26 23:57:03 +04:00
int opcode , int flags ,
2010-08-13 03:11:25 +04:00
u64 ofs , u64 len ,
2011-05-14 00:52:56 +04:00
struct bio * bio ,
struct rbd_req_coll * coll ,
int coll_index )
2010-08-13 03:11:25 +04:00
{
char * seg_name ;
u64 seg_ofs ;
u64 seg_len ;
int ret ;
struct ceph_osd_req_op * ops ;
u32 payload_len ;
2012-08-09 21:33:26 +04:00
seg_name = rbd_segment_name ( rbd_dev , ofs ) ;
2010-08-13 03:11:25 +04:00
if ( ! seg_name )
return - ENOMEM ;
2012-08-09 21:33:26 +04:00
seg_len = rbd_segment_length ( rbd_dev , ofs , len ) ;
seg_ofs = rbd_segment_offset ( rbd_dev , ofs ) ;
2010-08-13 03:11:25 +04:00
/* only writes carry a data payload */
payload_len = ( flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0 ) ;
2012-06-26 23:57:03 +04:00
ret = - ENOMEM ;
ops = rbd_create_rw_ops ( 1 , opcode , payload_len ) ;
if ( ! ops )
2010-08-13 03:11:25 +04:00
goto done ;
/* we've taken care of segment sizes earlier when we
cloned the bios . We should never have a segment
truncated at this point */
2012-09-07 01:00:54 +04:00
rbd_assert ( seg_len = = len ) ;
2010-08-13 03:11:25 +04:00
ret = rbd_do_request ( rq , rbd_dev , snapc , snapid ,
seg_name , seg_ofs , seg_len ,
bio ,
NULL , 0 ,
flags ,
ops ,
2011-05-14 00:52:56 +04:00
coll , coll_index ,
2011-03-22 01:10:11 +03:00
rbd_req_cb , 0 , NULL ) ;
2011-05-13 03:13:54 +04:00
rbd_destroy_ops ( ops ) ;
2010-08-13 03:11:25 +04:00
done :
kfree ( seg_name ) ;
return ret ;
}
/*
 * Request async osd write
 *
 * Issue an asynchronous write of [ofs, ofs + len) from @bio against
 * the image head, reporting completion through slot @coll_index of
 * @coll.
 */
static int rbd_req_write(struct request *rq,
			 struct rbd_device *rbd_dev,
			 struct ceph_snap_context *snapc,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	/* writes always target the head and require on-disk ack */
	int flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;

	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
			 CEPH_OSD_OP_WRITE, flags,
			 ofs, len, bio, coll, coll_index);
}
/*
 * Request async osd read
 *
 * Issue an asynchronous read of [ofs, ofs + len) from snapshot
 * @snapid into @bio, reporting completion through slot @coll_index
 * of @coll.
 */
static int rbd_req_read(struct request *rq,
			 struct rbd_device *rbd_dev,
			 u64 snapid,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	/* reads carry no snapshot context */
	return rbd_do_op(rq, rbd_dev, NULL, snapid,
			 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
			 ofs, len, bio, coll, coll_index);
}
/*
 * Request sync osd read
 *
 * Synchronously read [ofs, ofs + len) of @object_name at snapshot
 * @snapid into @buf.  If @ver is non-NULL it receives the object
 * version.  Returns the number of bytes read or a negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL, snapid, CEPH_OSD_FLAG_READ,
			      ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
/*
2011-03-22 01:10:11 +03:00
 * Acknowledge a watch notification on the header object (the original
 * comment said "watch", but this sends a NOTIFY_ACK).
 */
2012-07-04 01:01:18 +04:00
static int rbd_req_sync_notify_ack ( struct rbd_device * rbd_dev ,
2011-03-22 01:10:11 +03:00
u64 ver ,
2012-07-25 18:32:40 +04:00
u64 notify_id )
2011-03-22 01:10:11 +03:00
{
struct ceph_osd_req_op * ops ;
2011-05-13 03:13:54 +04:00
int ret ;
2012-06-26 23:57:03 +04:00
ops = rbd_create_rw_ops ( 1 , CEPH_OSD_OP_NOTIFY_ACK , 0 ) ;
if ( ! ops )
return - ENOMEM ;
2011-03-22 01:10:11 +03:00
2011-12-06 06:10:44 +04:00
ops [ 0 ] . watch . ver = cpu_to_le64 ( ver ) ;
/* NOTE(review): cookie is not byte-swapped while ver is; presumably
 * notify_id arrives already little-endian from the wire -- verify. */
2011-03-22 01:10:11 +03:00
ops [ 0 ] . watch . cookie = notify_id ;
ops [ 0 ] . watch . flag = 0 ;
2012-07-04 01:01:18 +04:00
ret = rbd_do_request ( NULL , rbd_dev , NULL , CEPH_NOSNAP ,
2012-07-25 18:32:40 +04:00
rbd_dev - > header_name , 0 , 0 , NULL ,
2012-07-04 01:01:19 +04:00
NULL , 0 ,
2011-03-22 01:10:11 +03:00
CEPH_OSD_FLAG_READ ,
ops ,
2011-05-14 00:52:56 +04:00
NULL , 0 ,
2011-03-22 01:10:11 +03:00
rbd_simple_req_cb , 0 , NULL ) ;
rbd_destroy_ops ( ops ) ;
return ret ;
}
/*
 * Watch event callback for the header object: refresh the in-core
 * header (the image may have changed) and acknowledge the
 * notification so the notifier is unblocked.
 */
static void rbd_watch_cb ( u64 ver , u64 notify_id , u8 opcode , void * data )
{
2012-07-04 01:01:18 +04:00
struct rbd_device * rbd_dev = ( struct rbd_device * ) data ;
2011-12-06 06:10:44 +04:00
u64 hver ;
2011-05-13 03:08:30 +04:00
int rc ;
2012-07-04 01:01:18 +04:00
if ( ! rbd_dev )
2011-03-22 01:10:11 +03:00
return ;
2012-07-14 05:35:11 +04:00
dout ( " rbd_watch_cb %s notify_id=%llu opcode=%u \n " ,
rbd_dev - > header_name , ( unsigned long long ) notify_id ,
( unsigned int ) opcode ) ;
2012-07-25 18:32:41 +04:00
rc = rbd_refresh_header ( rbd_dev , & hver ) ;
2011-05-13 03:08:30 +04:00
if ( rc )
2012-01-29 23:57:44 +04:00
pr_warning ( RBD_DRV_NAME " %d got notification but failed to "
2012-07-04 01:01:18 +04:00
" update snaps: %d \n " , rbd_dev - > major , rc ) ;
2011-03-22 01:10:11 +03:00
/* ack even on refresh failure so the notifier isn't left waiting */
2012-07-25 18:32:40 +04:00
rbd_req_sync_notify_ack ( rbd_dev , hver , notify_id ) ;
2011-03-22 01:10:11 +03:00
}
/*
 * Request sync osd watch
 *
 * Register a watch on the header object: create the osd event that
 * dispatches to rbd_watch_cb(), then issue a lingering WATCH op so
 * the registration survives connection resets.  On failure the event
 * is torn down and a negative errno returned.
 */
2012-07-25 18:32:40 +04:00
static int rbd_req_sync_watch ( struct rbd_device * rbd_dev )
2011-03-22 01:10:11 +03:00
{
struct ceph_osd_req_op * ops ;
2012-07-04 01:01:18 +04:00
struct ceph_osd_client * osdc = & rbd_dev - > rbd_client - > client - > osdc ;
2012-06-26 23:57:03 +04:00
int ret ;
2011-03-22 01:10:11 +03:00
2012-06-26 23:57:03 +04:00
ops = rbd_create_rw_ops ( 1 , CEPH_OSD_OP_WATCH , 0 ) ;
if ( ! ops )
return - ENOMEM ;
2011-03-22 01:10:11 +03:00
ret = ceph_osdc_create_event ( osdc , rbd_watch_cb , 0 ,
2012-07-04 01:01:18 +04:00
( void * ) rbd_dev , & rbd_dev - > watch_event ) ;
2011-03-22 01:10:11 +03:00
if ( ret < 0 )
goto fail ;
/* flag = 1 requests registration (vs. a watch-check) */
2012-07-25 18:32:40 +04:00
ops [ 0 ] . watch . ver = cpu_to_le64 ( rbd_dev - > header . obj_version ) ;
2012-07-04 01:01:18 +04:00
ops [ 0 ] . watch . cookie = cpu_to_le64 ( rbd_dev - > watch_event - > cookie ) ;
2011-03-22 01:10:11 +03:00
ops [ 0 ] . watch . flag = 1 ;
2012-07-04 01:01:18 +04:00
ret = rbd_req_sync_op ( rbd_dev , NULL ,
2011-03-22 01:10:11 +03:00
CEPH_NOSNAP ,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK ,
ops ,
2012-07-25 18:32:40 +04:00
rbd_dev - > header_name ,
0 , 0 , NULL ,
2012-07-04 01:01:18 +04:00
& rbd_dev - > watch_request , NULL ) ;
2011-03-22 01:10:11 +03:00
if ( ret < 0 )
goto fail_event ;
rbd_destroy_ops ( ops ) ;
return 0 ;
fail_event :
2012-07-04 01:01:18 +04:00
ceph_osdc_cancel_event ( rbd_dev - > watch_event ) ;
rbd_dev - > watch_event = NULL ;
2011-03-22 01:10:11 +03:00
fail :
rbd_destroy_ops ( ops ) ;
return ret ;
}
2011-07-13 03:56:57 +04:00
/*
 * Request sync osd unwatch
 *
 * Tear down the header-object watch set up by rbd_req_sync_watch():
 * send a WATCH op with flag = 0 to unregister, then cancel the
 * associated osd event regardless of the op's outcome.
 */
2012-07-25 18:32:41 +04:00
static int rbd_req_sync_unwatch ( struct rbd_device * rbd_dev )
2011-07-13 03:56:57 +04:00
{
struct ceph_osd_req_op * ops ;
2012-06-26 23:57:03 +04:00
int ret ;
2011-07-13 03:56:57 +04:00
2012-06-26 23:57:03 +04:00
ops = rbd_create_rw_ops ( 1 , CEPH_OSD_OP_WATCH , 0 ) ;
if ( ! ops )
return - ENOMEM ;
2011-07-13 03:56:57 +04:00
/* flag = 0 means unregister the existing watch */
ops [ 0 ] . watch . ver = 0 ;
2012-07-04 01:01:18 +04:00
ops [ 0 ] . watch . cookie = cpu_to_le64 ( rbd_dev - > watch_event - > cookie ) ;
2011-07-13 03:56:57 +04:00
ops [ 0 ] . watch . flag = 0 ;
2012-07-04 01:01:18 +04:00
ret = rbd_req_sync_op ( rbd_dev , NULL ,
2011-07-13 03:56:57 +04:00
CEPH_NOSNAP ,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK ,
ops ,
2012-07-25 18:32:41 +04:00
rbd_dev - > header_name ,
0 , 0 , NULL , NULL , NULL ) ;
2011-07-13 03:56:57 +04:00
rbd_destroy_ops ( ops ) ;
2012-07-04 01:01:18 +04:00
ceph_osdc_cancel_event ( rbd_dev - > watch_event ) ;
rbd_dev - > watch_event = NULL ;
2011-07-13 03:56:57 +04:00
return ret ;
}
2011-03-22 01:10:11 +03:00
/* Context handed to rbd_notify_cb() via the osd event data pointer. */
struct rbd_notify_info {
2012-07-04 01:01:18 +04:00
struct rbd_device * rbd_dev ;
2011-03-22 01:10:11 +03:00
} ;
/*
 * Callback for a self-sent notify; only logs the event.
 * NOTE(review): rbd_req_sync_notify() registers this with
 * (void *)&info (a struct rbd_notify_info *), but the cast below
 * treats data as a struct rbd_device * -- the types do not match;
 * it only "works" because rbd_dev is info's first member. Verify.
 */
static void rbd_notify_cb ( u64 ver , u64 notify_id , u8 opcode , void * data )
{
2012-07-04 01:01:18 +04:00
struct rbd_device * rbd_dev = ( struct rbd_device * ) data ;
if ( ! rbd_dev )
2011-03-22 01:10:11 +03:00
return ;
2012-07-14 05:35:11 +04:00
dout ( " rbd_notify_cb %s notify_id=%llu opcode=%u \n " ,
rbd_dev - > header_name , ( unsigned long long ) notify_id ,
( unsigned int ) opcode ) ;
2011-03-22 01:10:11 +03:00
}
/*
 * Request sync osd notify
 *
 * Send a NOTIFY on the header object and wait (bounded) for watchers
 * to acknowledge it.
 * NOTE(review): on the success path the event created below is never
 * canceled with ceph_osdc_cancel_event() -- looks like a leak; only
 * the fail_event path cancels it.  Verify against osd_client rules.
 */
2012-07-25 18:32:40 +04:00
static int rbd_req_sync_notify ( struct rbd_device * rbd_dev )
2011-03-22 01:10:11 +03:00
{
struct ceph_osd_req_op * ops ;
2012-07-04 01:01:18 +04:00
struct ceph_osd_client * osdc = & rbd_dev - > rbd_client - > client - > osdc ;
2011-03-22 01:10:11 +03:00
struct ceph_osd_event * event ;
struct rbd_notify_info info ;
/* payload: protocol version + timeout, both u32 */
int payload_len = sizeof ( u32 ) + sizeof ( u32 ) ;
int ret ;
2012-06-26 23:57:03 +04:00
ops = rbd_create_rw_ops ( 1 , CEPH_OSD_OP_NOTIFY , payload_len ) ;
if ( ! ops )
return - ENOMEM ;
2011-03-22 01:10:11 +03:00
2012-07-04 01:01:18 +04:00
info . rbd_dev = rbd_dev ;
2011-03-22 01:10:11 +03:00
ret = ceph_osdc_create_event ( osdc , rbd_notify_cb , 1 ,
( void * ) & info , & event ) ;
if ( ret < 0 )
goto fail ;
ops [ 0 ] . watch . ver = 1 ;
ops [ 0 ] . watch . flag = 1 ;
ops [ 0 ] . watch . cookie = event - > cookie ;
ops [ 0 ] . watch . prot_ver = RADOS_NOTIFY_VER ;
ops [ 0 ] . watch . timeout = 12 ;
2012-07-04 01:01:18 +04:00
ret = rbd_req_sync_op ( rbd_dev , NULL ,
2011-03-22 01:10:11 +03:00
CEPH_NOSNAP ,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK ,
ops ,
2012-07-25 18:32:40 +04:00
rbd_dev - > header_name ,
0 , 0 , NULL , NULL , NULL ) ;
2011-03-22 01:10:11 +03:00
if ( ret < 0 )
goto fail_event ;
/* wait for watchers to ack (or the default timeout) */
ret = ceph_osdc_wait_event ( event , CEPH_OSD_TIMEOUT_DEFAULT ) ;
dout ( " ceph_osdc_wait_event returned %d \n " , ret ) ;
rbd_destroy_ops ( ops ) ;
return 0 ;
fail_event :
ceph_osdc_cancel_event ( event ) ;
fail :
rbd_destroy_ops ( ops ) ;
return ret ;
}
2010-08-13 03:11:25 +04:00
/*
 * Synchronously execute a "class method" (@class_name.@method_name)
 * on @object_name, passing @data as input.  (The original comment
 * said "Request sync osd read"; this is a CALL op, not a read.)
 */
2012-07-04 01:01:18 +04:00
static int rbd_req_sync_exec ( struct rbd_device * rbd_dev ,
2012-07-04 01:01:18 +04:00
const char * object_name ,
const char * class_name ,
const char * method_name ,
2010-08-13 03:11:25 +04:00
const char * data ,
2011-03-22 01:10:11 +03:00
int len ,
u64 * ver )
2010-08-13 03:11:25 +04:00
{
struct ceph_osd_req_op * ops ;
2012-07-04 01:01:18 +04:00
int class_name_len = strlen ( class_name ) ;
int method_name_len = strlen ( method_name ) ;
2012-06-26 23:57:03 +04:00
int ret ;
/* payload carries class name, method name and input data */
ops = rbd_create_rw_ops ( 1 , CEPH_OSD_OP_CALL ,
2012-07-04 01:01:18 +04:00
class_name_len + method_name_len + len ) ;
2012-06-26 23:57:03 +04:00
if ( ! ops )
return - ENOMEM ;
2010-08-13 03:11:25 +04:00
2012-07-04 01:01:18 +04:00
ops [ 0 ] . cls . class_name = class_name ;
ops [ 0 ] . cls . class_len = ( __u8 ) class_name_len ;
ops [ 0 ] . cls . method_name = method_name ;
ops [ 0 ] . cls . method_len = ( __u8 ) method_name_len ;
2010-08-13 03:11:25 +04:00
ops [ 0 ] . cls . argc = 0 ;
ops [ 0 ] . cls . indata = data ;
ops [ 0 ] . cls . indata_len = len ;
2012-07-04 01:01:18 +04:00
ret = rbd_req_sync_op ( rbd_dev , NULL ,
2010-08-13 03:11:25 +04:00
CEPH_NOSNAP ,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK ,
ops ,
2012-06-26 23:57:03 +04:00
object_name , 0 , 0 , NULL , NULL , ver ) ;
2010-08-13 03:11:25 +04:00
rbd_destroy_ops ( ops ) ;
dout ( " cls_exec returned %d \n " , ret ) ;
return ret ;
}
2011-05-14 00:52:56 +04:00
/*
 * Allocate and initialize a request collection that can track the
 * completion status of @num_reqs segment requests.  Allocated
 * atomically because this runs from the request-queue callback.
 * Returns NULL on allocation failure.
 */
static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
{
	struct rbd_req_coll *coll;
	size_t size;

	size = sizeof (struct rbd_req_coll) +
		sizeof (struct rbd_req_status) * num_reqs;
	coll = kzalloc(size, GFP_ATOMIC);
	if (!coll)
		return NULL;

	coll->total = num_reqs;
	kref_init(&coll->kref);

	return coll;
}
2010-08-13 03:11:25 +04:00
/*
 * block device queue callback
 *
 * Pulls requests off the queue, splits each into per-segment bio
 * chains and submits one async osd op per segment.  The queue lock
 * is held on entry to each loop iteration and must be dropped while
 * sleeping allocations and osd submission happen, then retaken
 * before fetching the next request.
 */
static void rbd_rq_fn ( struct request_queue * q )
{
struct rbd_device * rbd_dev = q - > queuedata ;
struct request * rq ;
struct bio_pair * bp = NULL ;
2012-02-07 22:03:36 +04:00
while ( ( rq = blk_fetch_request ( q ) ) ) {
2010-08-13 03:11:25 +04:00
struct bio * bio ;
struct bio * rq_bio , * next_bio = NULL ;
bool do_write ;
2012-07-14 05:35:11 +04:00
unsigned int size ;
u64 op_size = 0 ;
2010-08-13 03:11:25 +04:00
u64 ofs ;
2011-05-14 00:52:56 +04:00
int num_segs , cur_seg = 0 ;
struct rbd_req_coll * coll ;
2011-12-06 02:03:05 +04:00
struct ceph_snap_context * snapc ;
2010-08-13 03:11:25 +04:00
dout ( " fetched request \n " ) ;
/* filter out block requests we don't understand */
if ( ( rq - > cmd_type ! = REQ_TYPE_FS ) ) {
__blk_end_request_all ( rq , 0 ) ;
2012-02-07 22:03:36 +04:00
continue ;
2010-08-13 03:11:25 +04:00
}
/* deduce our operation (read, write) */
do_write = ( rq_data_dir ( rq ) = = WRITE ) ;
size = blk_rq_bytes ( rq ) ;
2012-02-07 22:03:37 +04:00
ofs = blk_rq_pos ( rq ) * SECTOR_SIZE ;
2010-08-13 03:11:25 +04:00
rq_bio = rq - > bio ;
if ( do_write & & rbd_dev - > read_only ) {
__blk_end_request_all ( rq , - EROFS ) ;
2012-02-07 22:03:36 +04:00
continue ;
2010-08-13 03:11:25 +04:00
}
/* drop the queue lock across blocking work below */
spin_unlock_irq ( q - > queue_lock ) ;
2011-12-06 02:03:05 +04:00
down_read ( & rbd_dev - > header_rwsem ) ;
2011-11-22 06:14:25 +04:00
/* a mapped snapshot may have been deleted underneath us */
2011-12-06 02:03:05 +04:00
if ( rbd_dev - > snap_id ! = CEPH_NOSNAP & & ! rbd_dev - > snap_exists ) {
2011-11-22 06:14:25 +04:00
up_read ( & rbd_dev - > header_rwsem ) ;
2011-12-06 02:03:05 +04:00
dout ( " request for non-existent snapshot " ) ;
spin_lock_irq ( q - > queue_lock ) ;
__blk_end_request_all ( rq , - ENXIO ) ;
continue ;
2011-11-22 06:14:25 +04:00
}
/* pin the snapshot context for the duration of this request */
2011-12-06 02:03:05 +04:00
snapc = ceph_get_snap_context ( rbd_dev - > header . snapc ) ;
up_read ( & rbd_dev - > header_rwsem ) ;
2010-08-13 03:11:25 +04:00
dout ( " %s 0x%x bytes at 0x%llx \n " ,
do_write ? " write " : " read " ,
2012-07-14 05:35:11 +04:00
size , ( unsigned long long ) blk_rq_pos ( rq ) * SECTOR_SIZE ) ;
2010-08-13 03:11:25 +04:00
2011-05-14 00:52:56 +04:00
num_segs = rbd_get_num_segments ( & rbd_dev - > header , ofs , size ) ;
2012-08-09 21:33:26 +04:00
if ( num_segs < = 0 ) {
spin_lock_irq ( q - > queue_lock ) ;
__blk_end_request_all ( rq , num_segs ) ;
ceph_put_snap_context ( snapc ) ;
continue ;
}
2011-05-14 00:52:56 +04:00
coll = rbd_alloc_coll ( num_segs ) ;
if ( ! coll ) {
spin_lock_irq ( q - > queue_lock ) ;
__blk_end_request_all ( rq , - ENOMEM ) ;
2011-12-06 02:03:05 +04:00
ceph_put_snap_context ( snapc ) ;
2012-02-07 22:03:36 +04:00
continue ;
2011-05-14 00:52:56 +04:00
}
/* one osd op per segment the request touches */
2010-08-13 03:11:25 +04:00
do {
/* a bio clone to be passed down to OSD req */
2012-07-14 05:35:11 +04:00
dout ( " rq->bio->bi_vcnt=%hu \n " , rq - > bio - > bi_vcnt ) ;
2012-08-09 21:33:26 +04:00
op_size = rbd_segment_length ( rbd_dev , ofs , size ) ;
2011-05-14 00:52:56 +04:00
kref_get ( & coll - > kref ) ;
2010-08-13 03:11:25 +04:00
bio = bio_chain_clone ( & rq_bio , & next_bio , & bp ,
op_size , GFP_ATOMIC ) ;
if ( ! bio ) {
2011-05-14 00:52:56 +04:00
rbd_coll_end_req_index ( rq , coll , cur_seg ,
- ENOMEM , op_size ) ;
goto next_seg ;
2010-08-13 03:11:25 +04:00
}
2011-05-14 00:52:56 +04:00
2010-08-13 03:11:25 +04:00
/* init OSD command: write or read */
if ( do_write )
rbd_req_write ( rq , rbd_dev ,
2011-12-06 02:03:05 +04:00
snapc ,
2010-08-13 03:11:25 +04:00
ofs ,
2011-05-14 00:52:56 +04:00
op_size , bio ,
coll , cur_seg ) ;
2010-08-13 03:11:25 +04:00
else
rbd_req_read ( rq , rbd_dev ,
2011-11-22 01:04:42 +04:00
rbd_dev - > snap_id ,
2010-08-13 03:11:25 +04:00
ofs ,
2011-05-14 00:52:56 +04:00
op_size , bio ,
coll , cur_seg ) ;
2010-08-13 03:11:25 +04:00
2011-05-14 00:52:56 +04:00
next_seg :
2010-08-13 03:11:25 +04:00
size - = op_size ;
ofs + = op_size ;
2011-05-14 00:52:56 +04:00
cur_seg + + ;
2010-08-13 03:11:25 +04:00
rq_bio = next_bio ;
} while ( size > 0 ) ;
/* drop the submission reference taken by rbd_alloc_coll() */
2011-05-14 00:52:56 +04:00
kref_put ( & coll - > kref , rbd_coll_release ) ;
2010-08-13 03:11:25 +04:00
if ( bp )
bio_pair_release ( bp ) ;
spin_lock_irq ( q - > queue_lock ) ;
2011-12-06 02:03:05 +04:00
ceph_put_snap_context ( snapc ) ;
2010-08-13 03:11:25 +04:00
}
}
/*
 * a queue callback . Makes sure that we don ' t create a bio that spans across
 * multiple osd objects . One exception would be with a single page bios ,
 * which we handle later at bio_chain_clone
 *
 * Returns how many bytes of @bvec may be added to the bio described
 * by @bmd without crossing an object (segment) boundary.
 */
static int rbd_merge_bvec ( struct request_queue * q , struct bvec_merge_data * bmd ,
struct bio_vec * bvec )
{
struct rbd_device * rbd_dev = q - > queuedata ;
2012-02-07 22:03:37 +04:00
unsigned int chunk_sectors ;
sector_t sector ;
unsigned int bio_sectors ;
2010-08-13 03:11:25 +04:00
int max ;
/* object size expressed in 512-byte sectors */
2012-02-07 22:03:37 +04:00
chunk_sectors = 1 < < ( rbd_dev - > header . obj_order - SECTOR_SHIFT ) ;
sector = bmd - > bi_sector + get_start_sect ( bmd - > bi_bdev ) ;
bio_sectors = bmd - > bi_size > > SECTOR_SHIFT ;
2010-08-13 03:11:25 +04:00
/* bytes remaining in the current object after this bio */
max = ( chunk_sectors - ( ( sector & ( chunk_sectors - 1 ) )
2012-02-07 22:03:37 +04:00
+ bio_sectors ) ) < < SECTOR_SHIFT ;
2010-08-13 03:11:25 +04:00
if ( max < 0 )
max = 0 ; /* bio_add cannot handle a negative return */
/* an empty bio may always take at least one full bvec */
if ( max < = bvec - > bv_len & & bio_sectors = = 0 )
return bvec - > bv_len ;
return max ;
}
/*
 * Tear down the gendisk for @rbd_dev: free the in-core header, remove
 * the disk if it was registered, destroy its queue and drop the disk
 * reference.  Safe to call when no disk was ever created.
 */
static void rbd_free_disk ( struct rbd_device * rbd_dev )
{
struct gendisk * disk = rbd_dev - > disk ;
if ( ! disk )
return ;
rbd_header_free ( & rbd_dev - > header ) ;
if ( disk - > flags & GENHD_FL_UP )
del_gendisk ( disk ) ;
if ( disk - > queue )
blk_cleanup_queue ( disk - > queue ) ;
put_disk ( disk ) ;
}
/*
2012-08-02 20:29:46 +04:00
 * Read the complete header for the given rbd device .
 *
 * Returns a pointer to a dynamically - allocated buffer containing
 * the complete and validated header . Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read .
 *
 * Returns a pointer - coded errno if a failure occurs .
2010-08-13 03:11:25 +04:00
 */
2012-08-02 20:29:46 +04:00
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read ( struct rbd_device * rbd_dev , u64 * version )
2010-08-13 03:11:25 +04:00
{
2012-08-02 20:29:46 +04:00
struct rbd_image_header_ondisk * ondisk = NULL ;
2012-04-21 00:49:44 +04:00
u32 snap_count = 0 ;
2012-08-02 20:29:46 +04:00
u64 names_size = 0 ;
u32 want_count ;
int ret ;
2010-08-13 03:11:25 +04:00
2012-02-07 22:03:36 +04:00
/*
2012-08-02 20:29:46 +04:00
 * The complete header will include an array of its 64 - bit
 * snapshot ids , followed by the names of those snapshots as
 * a contiguous block of NUL - terminated strings . Note that
 * the number of snapshots could change by the time we read
 * it in , in which case we re - read it .
2012-02-07 22:03:36 +04:00
 */
2012-08-02 20:29:46 +04:00
do {
size_t size ;
/* free the previous (too small) attempt, if any */
kfree ( ondisk ) ;
size = sizeof ( * ondisk ) ;
size + = snap_count * sizeof ( struct rbd_image_snap_ondisk ) ;
size + = names_size ;
ondisk = kmalloc ( size , GFP_KERNEL ) ;
if ( ! ondisk )
return ERR_PTR ( - ENOMEM ) ;
ret = rbd_req_sync_read ( rbd_dev , CEPH_NOSNAP ,
2012-07-04 01:01:18 +04:00
rbd_dev - > header_name ,
2012-08-02 20:29:46 +04:00
0 , size ,
( char * ) ondisk , version ) ;
if ( ret < 0 )
goto out_err ;
if ( WARN_ON ( ( size_t ) ret < size ) ) {
ret = - ENXIO ;
pr_warning ( " short header read for image %s "
" (want %zd got %d) \n " ,
rbd_dev - > image_name , size , ret ) ;
goto out_err ;
}
if ( ! rbd_dev_ondisk_valid ( ondisk ) ) {
ret = - ENXIO ;
pr_warning ( " invalid header for image %s \n " ,
rbd_dev - > image_name ) ;
goto out_err ;
2011-11-16 02:49:53 +04:00
}
2010-08-13 03:11:25 +04:00
/* re-read if the snapshot count changed while we were reading */
2012-08-02 20:29:46 +04:00
names_size = le64_to_cpu ( ondisk - > snap_names_len ) ;
want_count = snap_count ;
snap_count = le32_to_cpu ( ondisk - > snap_count ) ;
} while ( snap_count ! = want_count ) ;
2012-02-07 22:03:36 +04:00
2012-08-02 20:29:46 +04:00
return ondisk ;
2012-02-07 22:03:36 +04:00
2012-08-02 20:29:46 +04:00
out_err :
kfree ( ondisk ) ;
return ERR_PTR ( ret ) ;
}
/*
 * (Re)read the on-disk header object and populate the in-core image
 * header.  On success header->obj_version records the version of the
 * header object that was read.  Returns 0 or a negative errno.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);

	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	kfree(ondisk);

	return ret;
}
/*
 * create a snapshot
 *
 * Allocate a new snapshot id from the monitor and invoke the
 * "snap_add" class method on the image header object.  Only valid
 * when the device is mapped at the head (not at a snapshot).
 *
 * Returns 0 on success or a negative errno.
 *
 * Fix: the encode-failure path (reached via the goto inside
 * ceph_encode_*_safe when the buffer is too small) previously
 * returned -ERANGE without freeing the encode buffer, leaking it.
 */
static int rbd_header_add_snap(struct rbd_device *rbd_dev,
			       const char *snap_name,
			       gfp_t gfp_flags)
{
	int name_len = strlen(snap_name);
	u64 new_snapid;
	int ret;
	void *data, *p, *e;
	struct ceph_mon_client *monc;

	/* we should create a snapshot only if we're pointing at the head */
	if (rbd_dev->snap_id != CEPH_NOSNAP)
		return -EINVAL;

	monc = &rbd_dev->rbd_client->client->monc;
	ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
	dout("created snapid=%llu\n", (unsigned long long) new_snapid);
	if (ret < 0)
		return ret;

	/* room for the length-prefixed name plus the 64-bit snapid */
	data = kmalloc(name_len + 16, gfp_flags);
	if (!data)
		return -ENOMEM;

	p = data;
	e = data + name_len + 16;

	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
	ceph_encode_64_safe(&p, e, new_snapid, bad);

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "snap_add",
				data, p - data, NULL);

	kfree(data);

	return ret < 0 ? ret : 0;
bad:
	kfree(data);		/* was leaked on this path before */
	return -ERANGE;
}
2010-11-20 01:51:04 +03:00
/* Tear down every snapshot device hanging off this rbd device. */
static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *tmp;

	/* _safe iteration: each entry is unlinked as it is destroyed */
	list_for_each_entry_safe(snap, tmp, &rbd_dev->snaps, node)
		__rbd_remove_snap_dev(snap);
}
2010-08-13 03:11:25 +04:00
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the image header, then (under header_rwsem) replaces the
 * cached snapshot data with the freshly read copy.  Ownership of
 * h.snapc, h.snap_names and h.snap_sizes transfers to rbd_dev; the
 * duplicate object prefix is discarded.  If hver is non-NULL the new
 * header object version is also returned through it.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? (only meaningful when mapping the head, not a snapshot) */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the sysfs snapshot devices with the new context */
	ret = rbd_dev_snap_devs_update(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
2012-07-25 18:32:41 +04:00
/* Refresh the image header while holding the control mutex. */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	ret = __rbd_refresh_header(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
2010-08-13 03:11:25 +04:00
/*
 * Read the image header, set up the snapshot list and the mapped
 * snapshot, then allocate and register the gendisk and its request
 * queue.  On success the disk has been announced via add_disk().
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snap_devs_update(rbd_dev);
	if (rc)
		return rc;

	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long) total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
2010-11-20 01:51:04 +03:00
/*
   sysfs
*/

/* Map a struct device embedded in an rbd_device back to its owner. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
2010-11-20 01:51:04 +03:00
/* sysfs "size": mapped image capacity in bytes. */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t nr_sectors;

	/* Snapshot capacity under the header lock against refresh */
	down_read(&rbd_dev->header_rwsem);
	nr_sectors = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n",
			(unsigned long long) nr_sectors * SECTOR_SIZE);
}
/* sysfs "major": block device major number. */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", dev_to_rbd_dev(dev)->major);
}
/* sysfs "client_id": global id of the ceph client instance. */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct ceph_client *client = dev_to_rbd_dev(dev)->rbd_client->client;

	return sprintf(buf, "client%lld\n", ceph_client_id(client));
}
2010-11-20 01:51:04 +03:00
/* sysfs "pool": name of the rados pool holding the image. */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%s\n", dev_to_rbd_dev(dev)->pool_name);
}
2012-07-12 19:46:35 +04:00
/* sysfs "pool_id": numeric id of the rados pool. */
static ssize_t rbd_pool_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", dev_to_rbd_dev(dev)->pool_id);
}
2010-11-20 01:51:04 +03:00
/* sysfs "name": rbd image name. */
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%s\n", dev_to_rbd_dev(dev)->image_name);
}
/* sysfs "current_snap": name of the mapped snapshot (or head). */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	return sprintf(buf, "%s\n", dev_to_rbd_dev(dev)->snap_name);
}
/* sysfs "refresh" (write-only): force a header re-read. */
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		return ret;

	return size;
}
2010-08-13 03:11:25 +04:00
2010-11-20 01:51:04 +03:00
/* Per-device sysfs attributes; see Documentation/ABI/testing/sysfs-bus-rbd */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Nothing to free here; rbd_dev teardown happens in rbd_dev_release() */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
/*
   sysfs - snapshots
*/

/* sysfs "snap_size": size of the image at this snapshot, in bytes. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	unsigned long long size;

	size = container_of(dev, struct rbd_snap, dev)->size;

	return sprintf(buf, "%llu\n", size);
}
/* sysfs "snap_id": rados snapshot id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	unsigned long long id;

	id = container_of(dev, struct rbd_snap, dev)->id;

	return sprintf(buf, "%llu\n", id);
}
/* Per-snapshot sysfs attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Final put on the snapshot device frees the rbd_snap itself */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2012-07-19 18:09:27 +04:00
/*
 * Unlink a snapshot from its device's list and unregister its
 * sysfs device.  NOTE(review): unregistering may drop the last
 * reference, in which case *snap is freed by the release callback.
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2012-07-19 18:09:27 +04:00
/* Initialize and register the sysfs device for one snapshot. */
static int rbd_register_snap_dev(struct rbd_snap *snap,
				 struct device *parent)
{
	struct device *dev = &snap->dev;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "snap_%s", snap->name);

	return device_register(dev);
}
2012-07-11 05:30:10 +04:00
/*
 * Allocate an rbd_snap for entry i of the device's snapshot context,
 * copying the given name.  If the rbd device is already registered
 * in sysfs the new snapshot device is registered too.  Returns the
 * new snapshot or an ERR_PTR.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   int i, const char *name)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof(*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->size = rbd_dev->header.snap_sizes[i];
	snap->id = rbd_dev->header.snapc->snaps[i];
	if (device_is_registered(&rbd_dev->dev)) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			goto err;
	}

	return snap;

err:
	/* kfree(NULL) is safe when the name allocation failed */
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	char *snap_name = rbd_dev->header.snap_names;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			if (rbd_dev->snap_id == snap->id)
				rbd_dev->snap_exists = false;
			/*
			 * Log before removing: __rbd_remove_snap_dev()
			 * unregisters the device, and the resulting
			 * release callback may free *snap, so it must
			 * not be dereferenced afterward.
			 */
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->snap_id == snap->id ? "mapped " : "",
				(unsigned long long) snap->id);
			__rbd_remove_snap_dev(snap);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		/* Report the entry index, not the total snapshot count */
		dout("entry %u: snap_id = %llu\n", (unsigned int) index,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, index,
							snap_name);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout("failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			/* (original format embedded a stray "\n" here) */
			dout("added dev%s\n", snap ? "" : " at end");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout("already present\n");

			rbd_assert(snap->size ==
					rbd_dev->header.snap_sizes[index]);
			rbd_assert(!strcmp(snap->name, snap_name));

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
		snap_name += strlen(snap_name) + 1;
	}
	dout("%s: done\n", __func__);

	return 0;
}
/*
 * Register the rbd device (and all of its known snapshots) on the
 * rbd bus in sysfs, under the control mutex.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	/* On a snapshot registration failure, stop and report the error */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2010-11-20 01:51:04 +03:00
/* Remove the device from sysfs; cleanup continues in rbd_dev_release(). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2011-03-22 01:10:11 +03:00
/*
 * Register a watch on the header object.  -ERANGE indicates our
 * cached header version is stale; refresh the header and retry
 * until the watch either succeeds or fails for another reason.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_refresh_header(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2012-08-30 02:11:06 +04:00
/* Highest device id handed out so far (0 means none yet) */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
2012-01-29 23:57:43 +04:00
2012-01-29 23:57:44 +04:00
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		/*
		 * Compare each entry's id, not the (loop-invariant)
		 * id being released; the old test against rbd_id
		 * never tracked the surviving devices' maximum.
		 */
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout("max dev id has been reset\n");
}
2012-02-02 18:13:30 +04:00
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip leading whitespace */

	return strcspn(*buf, spaces);	/* length of the token found */
}
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* Same whitespace set as next_token(); inlined here */
	static const char spaces[] = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, spaces);
	len = strcspn(*buf, spaces);
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2012-07-10 06:04:23 +04:00
/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	size_t len = next_token(buf);
	char *copy;

	copy = kmalloc(len + 1, GFP_KERNEL);
	if (!copy)
		return NULL;

	memcpy(copy, *buf, len);
	copy[len] = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return copy;
}
2012-02-02 18:13:30 +04:00
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	/* Monitor addresses: returned by reference into the caller's buf */
	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object: "<image>" RBD_SUFFIX */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* Undo all allocations; pool_name doubles as the "allocated" flag */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2011-03-22 01:10:11 +03:00
/*
 * Handle a write to /sys/bus/rbd/add: parse the mapping spec,
 * connect to the cluster, register the block device and sysfs
 * device, and start watching the image header.  Returns count on
 * success or a negative errno.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* count bounds the options token, which is shorter than buf */
	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_nomem;
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_nomem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* parse add command */
	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
				options, count);
	if (rc)
		goto err_put_id;

	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
	if (rc < 0)
		goto err_put_id;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;

	/* register our block device */
	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->major = rc;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 *
	 * Set up and announce blkdev mapping.
	 */
	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */
	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
	rbd_put_client(rbd_dev);
err_put_id:
	/* pool_name set implies the other parsed strings were allocated */
	if (rbd_dev->pool_name) {
		kfree(rbd_dev->snap_name);
		kfree(rbd_dev->header_name);
		kfree(rbd_dev->image_name);
		kfree(rbd_dev->pool_name);
	}
	rbd_dev_id_put(rbd_dev);
err_nomem:
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
2012-07-04 01:01:19 +04:00
static struct rbd_device * __rbd_get_dev ( unsigned long dev_id )
2010-08-13 03:11:25 +04:00
{
struct list_head * tmp ;
struct rbd_device * rbd_dev ;
2012-01-29 23:57:44 +04:00
spin_lock ( & rbd_dev_list_lock ) ;
2010-08-13 03:11:25 +04:00
list_for_each ( tmp , & rbd_dev_list ) {
rbd_dev = list_entry ( tmp , struct rbd_device , node ) ;
2012-07-04 01:01:19 +04:00
if ( rbd_dev - > dev_id = = dev_id ) {
2012-01-29 23:57:44 +04:00
spin_unlock ( & rbd_dev_list_lock ) ;
2010-08-13 03:11:25 +04:00
return rbd_dev ;
2012-01-29 23:57:44 +04:00
}
2010-08-13 03:11:25 +04:00
}
2012-01-29 23:57:44 +04:00
spin_unlock ( & rbd_dev_list_lock ) ;
2010-08-13 03:11:25 +04:00
return NULL ;
}
2010-11-20 01:51:04 +03:00
/*
 * Release callback for the rbd device's embedded struct device; runs
 * once the last sysfs reference is dropped after rbd_bus_del_dev().
 * Tears down what rbd_add() set up; the statement order below is
 * significant (watch teardown before the client ref is dropped, all
 * frees before the device id is returned to the pool).
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* stop the lingering watch request before unwatching */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref, matching the reference taken when mapping */
	module_put(THIS_MODULE);
}
2010-11-20 01:51:04 +03:00
/*
 * sysfs "remove" handler: parse a device id out of @buf, locate the
 * corresponding mapped device, and tear it down (snapshots first,
 * then the device itself).  Returns @count on success, -EINVAL for an
 * out-of-range id, -ENOENT if no such device is mapped.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev;
	unsigned long id;
	int target_id;
	int ret;

	ret = strict_strtoul(buf, 10, &id);
	if (ret)
		return ret;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int)id;
	if (target_id != id)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (rbd_dev) {
		__rbd_remove_all_snaps(rbd_dev);
		rbd_bus_del_dev(rbd_dev);
		ret = count;
	} else {
		ret = -ENOENT;
	}

	mutex_unlock(&ctl_mutex);

	return ret;
}
2010-11-20 01:51:04 +03:00
/*
 * sysfs handler: create a snapshot named by @buf for this device.
 *
 * Issues the snapshot-create request, refreshes the in-core header,
 * then notifies watchers (best effort) after dropping ctl_mutex.
 * Returns @count on success, negative errno on failure.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * NOTE(review): with a size of "count", snprintf copies at most
	 * count - 1 bytes, so the final character of @buf is dropped.
	 * For "echo name > ..." that strips the trailing newline, but a
	 * write with no newline loses the name's last character -- confirm
	 * intended (count + 1 would copy all of @buf).
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
/*
* create control files in sysfs
2010-11-20 01:51:04 +03:00
* / sys / bus / rbd / . . .
2010-08-13 03:11:25 +04:00
*/
static int rbd_sysfs_init ( void )
{
2010-11-20 01:51:04 +03:00
int ret ;
2010-08-13 03:11:25 +04:00
2012-02-07 22:03:36 +04:00
ret = device_register ( & rbd_root_dev ) ;
2012-01-24 20:08:36 +04:00
if ( ret < 0 )
2010-11-20 01:51:04 +03:00
return ret ;
2010-08-13 03:11:25 +04:00
2012-02-07 22:03:36 +04:00
ret = bus_register ( & rbd_bus_type ) ;
if ( ret < 0 )
device_unregister ( & rbd_root_dev ) ;
2010-08-13 03:11:25 +04:00
return ret ;
}
/*
 * Remove the sysfs entry points created by rbd_sysfs_init(), in the
 * reverse order of their registration.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
/*
 * Module initialization: set up the sysfs interface.  Devices are
 * mapped later through the sysfs control files.
 */
int __init rbd_init(void)
{
	int rc = rbd_sysfs_init();

	if (rc == 0)
		pr_info("loaded " RBD_DRV_NAME_LONG "\n");

	return rc;
}
/*
 * Module exit: tear down the sysfs interface created at init time.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata exposed via modinfo */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
MODULE_LICENSE("GPL");