2009-09-26 03:07:19 +04:00
/*
drbd_req . c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg .
Copyright ( C ) 2001 - 2008 , LINBIT Information Technologies GmbH .
Copyright ( C ) 1999 - 2008 , Philipp Reisner < philipp . reisner @ linbit . com > .
Copyright ( C ) 2002 - 2008 , Lars Ellenberg < lars . ellenberg @ linbit . com > .
drbd is free software ; you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation ; either version 2 , or ( at your option )
any later version .
drbd is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
along with drbd ; see the file COPYING . If not , write to
the Free Software Foundation , 675 Mass Ave , Cambridge , MA 02139 , USA .
*/
# include <linux/module.h>
# include <linux/slab.h>
# include <linux/drbd.h>
# include "drbd_int.h"
# include "drbd_req.h"
2011-12-03 14:18:56 +04:00
static bool drbd_may_do_local_read ( struct drbd_conf * mdev , sector_t sector , int size ) ;
2009-09-26 03:07:19 +04:00
/* Update disk stats at start of I/O request */
static void _drbd_start_io_acct ( struct drbd_conf * mdev , struct drbd_request * req , struct bio * bio )
{
const int rw = bio_data_dir ( bio ) ;
int cpu ;
cpu = part_stat_lock ( ) ;
2012-02-23 15:56:26 +04:00
part_round_stats ( cpu , & mdev - > vdisk - > part0 ) ;
2009-09-26 03:07:19 +04:00
part_stat_inc ( cpu , & mdev - > vdisk - > part0 , ios [ rw ] ) ;
part_stat_add ( cpu , & mdev - > vdisk - > part0 , sectors [ rw ] , bio_sectors ( bio ) ) ;
2011-11-07 13:54:28 +04:00
( void ) cpu ; /* The macro invocations above want the cpu argument, I do not like
the compiler warning about cpu only assigned but never used . . . */
2009-11-18 17:52:51 +03:00
part_inc_in_flight ( & mdev - > vdisk - > part0 , rw ) ;
2009-09-26 03:07:19 +04:00
part_stat_unlock ( ) ;
}
/* Update disk stats when completing request upwards */
static void _drbd_end_io_acct ( struct drbd_conf * mdev , struct drbd_request * req )
{
int rw = bio_data_dir ( req - > master_bio ) ;
unsigned long duration = jiffies - req - > start_time ;
int cpu ;
cpu = part_stat_lock ( ) ;
part_stat_add ( cpu , & mdev - > vdisk - > part0 , ticks [ rw ] , duration ) ;
part_round_stats ( cpu , & mdev - > vdisk - > part0 ) ;
2009-11-18 17:52:51 +03:00
part_dec_in_flight ( & mdev - > vdisk - > part0 , rw ) ;
2009-09-26 03:07:19 +04:00
part_stat_unlock ( ) ;
}
2011-01-26 20:45:11 +03:00
static struct drbd_request * drbd_req_new ( struct drbd_conf * mdev ,
struct bio * bio_src )
{
struct drbd_request * req ;
req = mempool_alloc ( drbd_request_mempool , GFP_NOIO ) ;
if ( ! req )
return NULL ;
drbd_req_make_private_bio ( req , bio_src ) ;
req - > rq_state = bio_data_dir ( bio_src ) = = WRITE ? RQ_WRITE : 0 ;
2011-02-08 17:08:49 +03:00
req - > w . mdev = mdev ;
2011-01-26 20:45:11 +03:00
req - > master_bio = bio_src ;
req - > epoch = 0 ;
2011-01-28 12:31:04 +03:00
2011-01-26 20:45:11 +03:00
drbd_clear_interval ( & req - > i ) ;
req - > i . sector = bio_src - > bi_sector ;
req - > i . size = bio_src - > bi_size ;
2011-01-27 16:42:51 +03:00
req - > i . local = true ;
2011-01-28 12:31:04 +03:00
req - > i . waiting = false ;
2011-01-26 20:45:11 +03:00
INIT_LIST_HEAD ( & req - > tl_requests ) ;
INIT_LIST_HEAD ( & req - > w . list ) ;
2012-01-24 20:19:42 +04:00
/* one reference to be put by __drbd_make_request */
2012-01-24 19:58:11 +04:00
atomic_set ( & req - > completion_ref , 1 ) ;
2012-01-24 20:19:42 +04:00
/* one kref as long as completion_ref > 0 */
2012-01-24 19:58:11 +04:00
kref_init ( & req - > kref ) ;
2011-01-26 20:45:11 +03:00
return req ;
}
2012-07-24 12:12:36 +04:00
void drbd_req_destroy ( struct kref * kref )
2009-09-26 03:07:19 +04:00
{
2012-01-24 19:58:11 +04:00
struct drbd_request * req = container_of ( kref , struct drbd_request , kref ) ;
struct drbd_conf * mdev = req - > w . mdev ;
2012-01-24 20:19:42 +04:00
const unsigned s = req - > rq_state ;
if ( ( req - > master_bio & & ! ( s & RQ_POSTPONED ) ) | |
atomic_read ( & req - > completion_ref ) | |
( s & RQ_LOCAL_PENDING ) | |
( ( s & RQ_NET_MASK ) & & ! ( s & RQ_NET_DONE ) ) ) {
dev_err ( DEV , " drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d \n " ,
s , atomic_read ( & req - > completion_ref ) ) ;
return ;
}
2010-05-27 17:07:43 +04:00
/* remove it from the transfer log.
* well , only if it had been there in the first
* place . . . if it had not ( local only or conflicting
* and never sent ) , it should still be " empty " as
* initialized in drbd_req_new ( ) , so we can list_del ( ) it
* here unconditionally */
drbd: fix potential deadlock during "restart" of conflicting writes
w_restart_write(), run from worker context, calls __drbd_make_request()
and further drbd_al_begin_io(, delegate=true), which then
potentially deadlocks. The previous patch moved a BUG_ON to expose
such call paths, which would now be triggered.
Also, if we call __drbd_make_request() from resource worker context,
like w_restart_write() did, and that should block for whatever reason
(!drbd_state_is_stable(), resource suspended, ...),
we potentially deadlock the whole resource, as the worker
is needed for state changes and other things.
Create a dedicated retry workqueue for this instead.
Also make sure that inc_ap_bio()/dec_ap_bio() are properly paired,
even if do_retry() needs to retry itself,
in case __drbd_make_request() returns != 0.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
2011-11-24 13:36:25 +04:00
list_del_init ( & req - > tl_requests ) ;
2010-05-27 17:07:43 +04:00
2009-09-26 03:07:19 +04:00
/* if it was a write, we may have to set the corresponding
* bit ( s ) out - of - sync first . If it had a local part , we need to
* release the reference to the activity log . */
2012-01-24 19:58:11 +04:00
if ( s & RQ_WRITE ) {
2009-09-26 03:07:19 +04:00
/* Set out-of-sync unless both OK flags are set
* ( local only or remote failed ) .
* Other places where we set out - of - sync :
* READ with local io - error */
if ( ! ( s & RQ_NET_OK ) | | ! ( s & RQ_LOCAL_OK ) )
2011-01-03 19:09:58 +03:00
drbd_set_out_of_sync ( mdev , req - > i . sector , req - > i . size ) ;
2009-09-26 03:07:19 +04:00
if ( ( s & RQ_NET_OK ) & & ( s & RQ_LOCAL_OK ) & & ( s & RQ_NET_SIS ) )
2011-01-03 19:09:58 +03:00
drbd_set_in_sync ( mdev , req - > i . sector , req - > i . size ) ;
2009-09-26 03:07:19 +04:00
/* one might be tempted to move the drbd_al_complete_io
2011-02-17 18:46:59 +03:00
* to the local io completion callback drbd_request_endio .
2009-09-26 03:07:19 +04:00
* but , if this was a mirror write , we may only
* drbd_al_complete_io after this is RQ_NET_DONE ,
* otherwise the extent could be dropped from the al
* before it has actually been written on the peer .
* if we crash before our peer knows about the request ,
* but after the extent has been dropped from the al ,
* we would forget to resync the corresponding extent .
*/
if ( s & RQ_LOCAL_MASK ) {
if ( get_ldev_if_state ( mdev , D_FAILED ) ) {
2010-08-31 14:00:50 +04:00
if ( s & RQ_IN_ACT_LOG )
2011-03-31 17:18:56 +04:00
drbd_al_complete_io ( mdev , & req - > i ) ;
2009-09-26 03:07:19 +04:00
put_ldev ( mdev ) ;
} else if ( __ratelimit ( & drbd_ratelimit_state ) ) {
2011-03-31 17:18:56 +04:00
dev_warn ( DEV , " Should have called drbd_al_complete_io(, %llu, %u), "
" but my Disk seems to have failed :( \n " ,
( unsigned long long ) req - > i . sector , req - > i . size ) ;
2009-09-26 03:07:19 +04:00
}
}
}
2012-07-24 12:12:36 +04:00
mempool_free ( req , drbd_request_mempool ) ;
2009-09-26 03:07:19 +04:00
}
2011-11-28 18:04:49 +04:00
static void wake_all_senders ( struct drbd_tconn * tconn ) {
wake_up ( & tconn - > sender_work . q_wait ) ;
2009-09-26 03:07:19 +04:00
}
2011-11-28 18:04:49 +04:00
/* must hold resource->req_lock */
static void start_new_tl_epoch ( struct drbd_tconn * tconn )
2009-09-26 03:07:19 +04:00
{
2011-11-28 18:04:49 +04:00
tconn - > current_tle_writes = 0 ;
atomic_inc ( & tconn - > current_tle_nr ) ;
wake_all_senders ( tconn ) ;
2009-09-26 03:07:19 +04:00
}
void complete_master_bio ( struct drbd_conf * mdev ,
struct bio_and_error * m )
{
bio_endio ( m - > bio , m - > error ) ;
dec_ap_bio ( mdev ) ;
}
2011-01-28 12:31:04 +03:00
static void drbd_remove_request_interval ( struct rb_root * root ,
struct drbd_request * req )
{
2011-02-08 17:08:49 +03:00
struct drbd_conf * mdev = req - > w . mdev ;
2011-01-28 12:31:04 +03:00
struct drbd_interval * i = & req - > i ;
drbd_remove_interval ( root , i ) ;
/* Wake up any processes waiting for this request to complete. */
if ( i - > waiting )
wake_up ( & mdev - > misc_wait ) ;
}
2009-09-26 03:07:19 +04:00
/* Helper for __req_mod().
* Set m - > bio to the master bio , if it is fit to be completed ,
* or leave it alone ( it is initialized to NULL in __req_mod ) ,
* if it has already been completed , or cannot be completed yet .
* If m - > bio is set , the error status to be returned is placed in m - > error .
*/
2012-03-26 19:02:45 +04:00
static
2012-01-24 20:19:42 +04:00
void drbd_req_complete ( struct drbd_request * req , struct bio_and_error * m )
2009-09-26 03:07:19 +04:00
{
2012-01-24 20:19:42 +04:00
const unsigned s = req - > rq_state ;
2011-02-08 17:08:49 +03:00
struct drbd_conf * mdev = req - > w . mdev ;
2012-01-24 20:19:42 +04:00
int rw ;
int error , ok ;
2009-09-26 03:07:19 +04:00
/* we must not complete the master bio, while it is
* still being processed by _drbd_send_zc_bio ( drbd_send_dblock )
* not yet acknowledged by the peer
* not yet completed by the local io subsystem
* these flags may get cleared in any order by
* the worker ,
* the receiver ,
* the bio_endio completion callbacks .
*/
2012-01-24 20:19:42 +04:00
if ( ( s & RQ_LOCAL_PENDING & & ! ( s & RQ_LOCAL_ABORTED ) ) | |
( s & RQ_NET_QUEUED ) | | ( s & RQ_NET_PENDING ) | |
( s & RQ_COMPLETION_SUSP ) ) {
dev_err ( DEV , " drbd_req_complete: Logic BUG rq_state = 0x%x \n " , s ) ;
2009-09-26 03:07:19 +04:00
return ;
2012-01-24 20:19:42 +04:00
}
if ( ! req - > master_bio ) {
dev_err ( DEV , " drbd_req_complete: Logic BUG, master_bio == NULL! \n " ) ;
2009-09-26 03:07:19 +04:00
return ;
2012-01-24 20:19:42 +04:00
}
2009-09-26 03:07:19 +04:00
2012-01-24 20:19:42 +04:00
rw = bio_rw ( req - > master_bio ) ;
2012-01-24 19:58:11 +04:00
2012-01-24 20:19:42 +04:00
/*
* figure out whether to report success or failure .
*
* report success when at least one of the operations succeeded .
* or , to put the other way ,
* only report failure , when both operations failed .
*
* what to do about the failures is handled elsewhere .
* what we need to do here is just : complete the master_bio .
*
* local completion error , if any , has been stored as ERR_PTR
* in private_bio within drbd_request_endio .
*/
ok = ( s & RQ_LOCAL_OK ) | | ( s & RQ_NET_OK ) ;
error = PTR_ERR ( req - > private_bio ) ;
2012-03-26 19:29:30 +04:00
2012-01-24 20:19:42 +04:00
/* remove the request from the conflict detection
* respective block_id verification hash */
if ( ! drbd_interval_empty ( & req - > i ) ) {
struct rb_root * root ;
2009-09-26 03:07:19 +04:00
2012-01-24 20:19:42 +04:00
if ( rw = = WRITE )
root = & mdev - > write_requests ;
else
root = & mdev - > read_requests ;
drbd_remove_request_interval ( root , req ) ;
} else if ( ! ( s & RQ_POSTPONED ) )
D_ASSERT ( ( s & ( RQ_NET_MASK & ~ RQ_NET_DONE ) ) = = 0 ) ;
/* Before we can signal completion to the upper layers,
* we may need to close the current transfer log epoch .
* We are within the request lock , so we can simply compare
* the request epoch number with the current transfer log
* epoch number . If they match , increase the current_tle_nr ,
* and reset the transfer log epoch write_cnt .
*/
if ( rw = = WRITE & &
req - > epoch = = atomic_read ( & mdev - > tconn - > current_tle_nr ) )
start_new_tl_epoch ( mdev - > tconn ) ;
2009-09-26 03:07:19 +04:00
2012-01-24 20:19:42 +04:00
/* Update disk stats */
_drbd_end_io_acct ( mdev , req ) ;
2009-09-26 03:07:19 +04:00
2012-01-24 20:19:42 +04:00
/* If READ failed,
* have it be pushed back to the retry work queue ,
* so it will re - enter __drbd_make_request ( ) ,
* and be re - assigned to a suitable local or remote path ,
* or failed if we do not have access to good data anymore .
*
* Unless it was failed early by __drbd_make_request ( ) ,
* because no path was available , in which case
* it was not even added to the transfer_log .
*
* READA may fail , and will not be retried .
*
* WRITE should have used all available paths already .
*/
if ( ! ok & & rw = = READ & & ! list_empty ( & req - > tl_requests ) )
req - > rq_state | = RQ_POSTPONED ;
if ( ! ( req - > rq_state & RQ_POSTPONED ) ) {
m - > error = ok ? 0 : ( error ? : - EIO ) ;
m - > bio = req - > master_bio ;
req - > master_bio = NULL ;
2009-09-26 03:07:19 +04:00
}
}
2012-01-24 20:19:42 +04:00
static int drbd_req_put_completion_ref ( struct drbd_request * req , struct bio_and_error * m , int put )
2010-06-23 19:18:51 +04:00
{
2011-02-08 17:08:49 +03:00
struct drbd_conf * mdev = req - > w . mdev ;
2012-01-24 20:19:42 +04:00
D_ASSERT ( m | | ( req - > rq_state & RQ_POSTPONED ) ) ;
if ( ! atomic_sub_and_test ( put , & req - > completion_ref ) )
return 0 ;
if ( drbd_suspended ( mdev ) ) {
/* We do not allow completion while suspended. Re-get a
* reference , so whatever happens when this is resumed
* may put and complete . */
2010-06-23 19:18:51 +04:00
2012-01-24 20:19:42 +04:00
D_ASSERT ( ! ( req - > rq_state & RQ_COMPLETION_SUSP ) ) ;
req - > rq_state | = RQ_COMPLETION_SUSP ;
atomic_inc ( & req - > completion_ref ) ;
return 0 ;
}
/* else */
drbd_req_complete ( req , m ) ;
2012-07-24 12:12:36 +04:00
if ( req - > rq_state & RQ_POSTPONED ) {
/* don't destroy the req object just yet,
* but queue it for retry */
drbd_restart_request ( req ) ;
return 0 ;
}
2012-01-24 20:19:42 +04:00
return 1 ;
}
/* I'd like this to be the only place that manipulates
* req - > completion_ref and req - > kref . */
static void mod_rq_state ( struct drbd_request * req , struct bio_and_error * m ,
int clear , int set )
{
struct drbd_conf * mdev = req - > w . mdev ;
unsigned s = req - > rq_state ;
int c_put = 0 ;
int k_put = 0 ;
/* apply */
req - > rq_state & = ~ clear ;
req - > rq_state | = set ;
/* no change? */
if ( req - > rq_state = = s )
return ;
/* intent: get references */
if ( ! ( s & RQ_LOCAL_PENDING ) & & ( set & RQ_LOCAL_PENDING ) )
atomic_inc ( & req - > completion_ref ) ;
if ( ! ( s & RQ_NET_PENDING ) & & ( set & RQ_NET_PENDING ) ) {
inc_ap_pending ( mdev ) ;
atomic_inc ( & req - > completion_ref ) ;
}
if ( ! ( s & RQ_NET_QUEUED ) & & ( set & RQ_NET_QUEUED ) )
atomic_inc ( & req - > completion_ref ) ;
if ( ! ( s & RQ_EXP_BARR_ACK ) & & ( set & RQ_EXP_BARR_ACK ) )
kref_get ( & req - > kref ) ; /* wait for the DONE */
if ( ! ( s & RQ_NET_SENT ) & & ( set & RQ_NET_SENT ) )
atomic_add ( req - > i . size > > 9 , & mdev - > ap_in_flight ) ;
/* progress: put references */
if ( ( s & RQ_COMPLETION_SUSP ) & & ( clear & RQ_COMPLETION_SUSP ) )
+ + c_put ;
if ( ! ( s & RQ_LOCAL_ABORTED ) & & ( set & RQ_LOCAL_ABORTED ) ) {
D_ASSERT ( req - > rq_state & RQ_LOCAL_PENDING ) ;
/* local completion may still come in later,
* we need to keep the req object around . */
kref_get ( & req - > kref ) ;
+ + c_put ;
}
if ( ( s & RQ_LOCAL_PENDING ) & & ( clear & RQ_LOCAL_PENDING ) ) {
if ( req - > rq_state & RQ_LOCAL_ABORTED )
+ + k_put ;
else
+ + c_put ;
}
if ( ( s & RQ_NET_PENDING ) & & ( clear & RQ_NET_PENDING ) ) {
dec_ap_pending ( mdev ) ;
+ + c_put ;
}
if ( ( s & RQ_NET_QUEUED ) & & ( clear & RQ_NET_QUEUED ) )
+ + c_put ;
if ( ( s & RQ_EXP_BARR_ACK ) & & ! ( s & RQ_NET_DONE ) & & ( set & RQ_NET_DONE ) ) {
if ( req - > rq_state & RQ_NET_SENT )
atomic_sub ( req - > i . size > > 9 , & mdev - > ap_in_flight ) ;
+ + k_put ;
}
/* potentially complete and destroy */
if ( k_put | | c_put ) {
/* Completion does it's own kref_put. If we are going to
* kref_sub below , we need req to be still around then . */
int at_least = k_put + ! ! c_put ;
int refcount = atomic_read ( & req - > kref . refcount ) ;
if ( refcount < at_least )
dev_err ( DEV ,
" mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d \n " ,
s , req - > rq_state , refcount , at_least ) ;
}
/* If we made progress, retry conflicting peer requests, if any. */
if ( req - > i . waiting )
wake_up ( & mdev - > misc_wait ) ;
if ( c_put )
k_put + = drbd_req_put_completion_ref ( req , m , c_put ) ;
if ( k_put )
kref_sub ( & req - > kref , k_put , drbd_req_destroy ) ;
2010-06-23 19:18:51 +04:00
}
2009-09-26 03:07:19 +04:00
/* obviously this could be coded as many single functions
* instead of one huge switch ,
* or by putting the code directly in the respective locations
* ( as it has been before ) .
*
* but having it this way
* enforces that it is all in this one place , where it is easier to audit ,
* it makes it obvious that whatever " event " " happens " to a request should
* happen " atomically " within the req_lock ,
* and it enforces that we have to think in a very structured manner
* about the " events " that may happen to a request during its life time . . .
*/
2010-06-09 16:07:43 +04:00
int __req_mod ( struct drbd_request * req , enum drbd_req_event what ,
2009-09-26 03:07:19 +04:00
struct bio_and_error * m )
{
2011-02-08 17:08:49 +03:00
struct drbd_conf * mdev = req - > w . mdev ;
2011-04-19 19:10:19 +04:00
struct net_conf * nc ;
2011-04-14 03:24:47 +04:00
int p , rv = 0 ;
2011-02-22 04:15:32 +03:00
if ( m )
m - > bio = NULL ;
2009-09-26 03:07:19 +04:00
switch ( what ) {
default :
dev_err ( DEV , " LOGIC BUG in %s:%u \n " , __FILE__ , __LINE__ ) ;
break ;
/* does not happen...
* initialization done in drbd_req_new
2011-01-25 17:37:43 +03:00
case CREATED :
2009-09-26 03:07:19 +04:00
break ;
*/
2011-01-25 17:37:43 +03:00
case TO_BE_SENT : /* via network */
2011-02-22 04:15:32 +03:00
/* reached via __drbd_make_request
2009-09-26 03:07:19 +04:00
* and from w_read_retry_remote */
D_ASSERT ( ! ( req - > rq_state & RQ_NET_MASK ) ) ;
2011-04-19 19:10:19 +04:00
rcu_read_lock ( ) ;
nc = rcu_dereference ( mdev - > tconn - > net_conf ) ;
p = nc - > wire_protocol ;
rcu_read_unlock ( ) ;
2011-04-14 03:24:47 +04:00
req - > rq_state | =
p = = DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
p = = DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0 ;
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , 0 , RQ_NET_PENDING ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-01-25 17:37:43 +03:00
case TO_BE_SUBMITTED : /* locally */
2011-02-22 04:15:32 +03:00
/* reached via __drbd_make_request */
2009-09-26 03:07:19 +04:00
D_ASSERT ( ! ( req - > rq_state & RQ_LOCAL_MASK ) ) ;
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , 0 , RQ_LOCAL_PENDING ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-01-25 17:37:43 +03:00
case COMPLETED_OK :
2011-07-05 17:38:59 +04:00
if ( req - > rq_state & RQ_WRITE )
2011-01-03 19:09:58 +03:00
mdev - > writ_cnt + = req - > i . size > > 9 ;
2009-09-26 03:07:19 +04:00
else
2011-01-03 19:09:58 +03:00
mdev - > read_cnt + = req - > i . size > > 9 ;
2009-09-26 03:07:19 +04:00
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , RQ_LOCAL_PENDING ,
RQ_LOCAL_COMPLETED | RQ_LOCAL_OK ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-07-05 17:38:59 +04:00
case ABORT_DISK_IO :
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , 0 , RQ_LOCAL_ABORTED ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-01-25 17:37:43 +03:00
case READ_COMPLETED_WITH_ERROR :
2011-01-03 19:09:58 +03:00
drbd_set_out_of_sync ( mdev , req - > i . sector , req - > i . size ) ;
2012-01-24 20:19:42 +04:00
/* fall through. */
case WRITE_COMPLETED_WITH_ERROR :
2010-12-09 17:03:57 +03:00
__drbd_chk_io_error ( mdev , false ) ;
2012-01-24 20:19:42 +04:00
/* fall through. */
case READ_AHEAD_COMPLETED_WITH_ERROR :
/* it is legal to fail READA, no __drbd_chk_io_error in that case. */
mod_rq_state ( req , m , RQ_LOCAL_PENDING , RQ_LOCAL_COMPLETED ) ;
2012-03-26 19:29:30 +04:00
break ;
2009-09-26 03:07:19 +04:00
2011-01-25 17:37:43 +03:00
case QUEUE_FOR_NET_READ :
2009-09-26 03:07:19 +04:00
/* READ or READA, and
* no local disk ,
* or target area marked as invalid ,
* or just got an io - error . */
2011-02-22 04:15:32 +03:00
/* from __drbd_make_request
2009-09-26 03:07:19 +04:00
* or from bio_endio during read io - error recovery */
2012-03-26 19:02:45 +04:00
/* So we can verify the handle in the answer packet.
* Corresponding drbd_remove_request_interval is in
2012-01-24 20:19:42 +04:00
* drbd_req_complete ( ) */
2011-07-16 01:52:44 +04:00
D_ASSERT ( drbd_interval_empty ( & req - > i ) ) ;
2011-01-21 19:18:39 +03:00
drbd_insert_interval ( & mdev - > read_requests , & req - > i ) ;
2009-09-26 03:07:19 +04:00
2009-11-03 04:22:06 +03:00
set_bit ( UNPLUG_REMOTE , & mdev - > flags ) ;
2009-09-26 03:07:19 +04:00
D_ASSERT ( req - > rq_state & RQ_NET_PENDING ) ;
2012-03-26 19:29:30 +04:00
D_ASSERT ( ( req - > rq_state & RQ_LOCAL_MASK ) = = 0 ) ;
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , 0 , RQ_NET_QUEUED ) ;
2012-03-26 19:29:30 +04:00
req - > w . cb = w_send_read_req ;
2011-11-14 18:42:37 +04:00
drbd_queue_work ( & mdev - > tconn - > sender_work , & req - > w ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-01-25 17:37:43 +03:00
case QUEUE_FOR_NET_WRITE :
2009-09-26 03:07:19 +04:00
/* assert something? */
2011-02-22 04:15:32 +03:00
/* from __drbd_make_request only */
2009-09-26 03:07:19 +04:00
2012-03-26 19:02:45 +04:00
/* Corresponding drbd_remove_request_interval is in
2012-01-24 20:19:42 +04:00
* drbd_req_complete ( ) */
2011-07-16 01:52:44 +04:00
D_ASSERT ( drbd_interval_empty ( & req - > i ) ) ;
2011-01-20 17:00:24 +03:00
drbd_insert_interval ( & mdev - > write_requests , & req - > i ) ;
2009-09-26 03:07:19 +04:00
/* NOTE
* In case the req ended up on the transfer log before being
* queued on the worker , it could lead to this request being
* missed during cleanup after connection loss .
* So we have to do both operations here ,
* within the same lock that protects the transfer log .
*
* _req_add_to_epoch ( req ) ; this has to be after the
* _maybe_start_new_epoch ( req ) ; which happened in
2011-02-22 04:15:32 +03:00
* __drbd_make_request , because we now may set the bit
2009-09-26 03:07:19 +04:00
* again ourselves to close the current epoch .
*
* Add req to the ( now ) current epoch ( barrier ) . */
2009-11-03 04:22:06 +03:00
/* otherwise we may lose an unplug, which may cause some remote
* io - scheduler timeout to expire , increasing maximum latency ,
* hurting performance . */
set_bit ( UNPLUG_REMOTE , & mdev - > flags ) ;
2009-09-26 03:07:19 +04:00
/* queue work item to send data */
D_ASSERT ( req - > rq_state & RQ_NET_PENDING ) ;
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , 0 , RQ_NET_QUEUED | RQ_EXP_BARR_ACK ) ;
2009-09-26 03:07:19 +04:00
req - > w . cb = w_send_dblock ;
2011-11-14 18:42:37 +04:00
drbd_queue_work ( & mdev - > tconn - > sender_work , & req - > w ) ;
2009-09-26 03:07:19 +04:00
/* close the epoch, in case it outgrew the limit */
2011-04-19 19:10:19 +04:00
rcu_read_lock ( ) ;
nc = rcu_dereference ( mdev - > tconn - > net_conf ) ;
p = nc - > max_epoch_size ;
rcu_read_unlock ( ) ;
2011-11-28 18:04:49 +04:00
if ( mdev - > tconn - > current_tle_writes > = p )
start_new_tl_epoch ( mdev - > tconn ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-01-25 17:37:43 +03:00
case QUEUE_FOR_SEND_OOS :
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , 0 , RQ_NET_QUEUED ) ;
2010-12-20 01:53:14 +03:00
req - > w . cb = w_send_out_of_sync ;
2011-11-14 18:42:37 +04:00
drbd_queue_work ( & mdev - > tconn - > sender_work , & req - > w ) ;
2010-10-27 16:33:00 +04:00
break ;
2012-03-26 18:46:39 +04:00
case READ_RETRY_REMOTE_CANCELED :
2011-01-25 17:37:43 +03:00
case SEND_CANCELED :
case SEND_FAILED :
2009-09-26 03:07:19 +04:00
/* real cleanup will be done from tl_clear. just update flags
* so it is no longer marked as on the worker queue */
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , RQ_NET_QUEUED , 0 ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-01-25 17:37:43 +03:00
case HANDED_OVER_TO_NETWORK :
2009-09-26 03:07:19 +04:00
/* assert something? */
if ( bio_data_dir ( req - > master_bio ) = = WRITE & &
2011-04-14 03:24:47 +04:00
! ( req - > rq_state & ( RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK ) ) ) {
2009-09-26 03:07:19 +04:00
/* this is what is dangerous about protocol A:
* pretend it was successfully written on the peer . */
2012-01-24 20:19:42 +04:00
if ( req - > rq_state & RQ_NET_PENDING )
mod_rq_state ( req , m , RQ_NET_PENDING , RQ_NET_OK ) ;
/* else: neg-ack was faster... */
2009-09-26 03:07:19 +04:00
/* it is still not yet RQ_NET_DONE until the
* corresponding epoch barrier got acked as well ,
* so we know what to dirty on connection loss */
}
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , RQ_NET_QUEUED , RQ_NET_SENT ) ;
2012-03-26 18:44:59 +04:00
break ;
case OOS_HANDED_TO_NETWORK :
/* Was not set PENDING, no longer QUEUED, so is now DONE
* as far as this connection is concerned . */
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , RQ_NET_QUEUED , RQ_NET_DONE ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-01-25 17:37:43 +03:00
case CONNECTION_LOST_WHILE_PENDING :
2009-09-26 03:07:19 +04:00
/* transfer log cleanup after connection loss */
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m ,
RQ_NET_OK | RQ_NET_PENDING | RQ_COMPLETION_SUSP ,
RQ_NET_DONE ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-02-22 04:15:32 +03:00
case DISCARD_WRITE :
2009-09-26 03:07:19 +04:00
/* for discarded conflicting writes of multiple primaries,
* there is no need to keep anything in the tl , potential
2012-07-24 11:31:18 +04:00
* node crashes are covered by the activity log .
*
* If this request had been marked as RQ_POSTPONED before ,
* it will actually not be discarded , but " restarted " ,
* resubmitted from the retry worker context . */
D_ASSERT ( req - > rq_state & RQ_NET_PENDING ) ;
D_ASSERT ( req - > rq_state & RQ_EXP_WRITE_ACK ) ;
mod_rq_state ( req , m , RQ_NET_PENDING , RQ_NET_DONE | RQ_NET_OK ) ;
break ;
2012-03-26 18:51:11 +04:00
case WRITE_ACKED_BY_PEER_AND_SIS :
2012-07-24 11:31:18 +04:00
req - > rq_state | = RQ_NET_SIS ;
2011-01-25 17:37:43 +03:00
case WRITE_ACKED_BY_PEER :
2011-04-14 03:24:47 +04:00
D_ASSERT ( req - > rq_state & RQ_EXP_WRITE_ACK ) ;
2009-09-26 03:07:19 +04:00
/* protocol C; successfully written on peer.
2012-03-26 18:51:11 +04:00
* Nothing more to do here .
2009-09-26 03:07:19 +04:00
* We want to keep the tl in place for all protocols , to cater
2012-03-26 18:51:11 +04:00
* for volatile write - back caches on lower level devices . */
2009-09-26 03:07:19 +04:00
2011-04-14 03:24:47 +04:00
goto ack_common ;
2011-01-25 17:37:43 +03:00
case RECV_ACKED_BY_PEER :
2011-04-14 03:24:47 +04:00
D_ASSERT ( req - > rq_state & RQ_EXP_RECEIVE_ACK ) ;
2009-09-26 03:07:19 +04:00
/* protocol B; pretends to be successfully written on peer.
2011-01-25 17:37:43 +03:00
* see also notes above in HANDED_OVER_TO_NETWORK about
2009-09-26 03:07:19 +04:00
* protocol ! = C */
2011-04-14 03:24:47 +04:00
ack_common :
2009-09-26 03:07:19 +04:00
D_ASSERT ( req - > rq_state & RQ_NET_PENDING ) ;
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , RQ_NET_PENDING , RQ_NET_OK ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-02-22 04:15:32 +03:00
case POSTPONE_WRITE :
2011-04-14 03:24:47 +04:00
D_ASSERT ( req - > rq_state & RQ_EXP_WRITE_ACK ) ;
/* If this node has already detected the write conflict, the
2011-02-22 04:15:32 +03:00
* worker will be waiting on misc_wait . Wake it up once this
* request has completed locally .
*/
D_ASSERT ( req - > rq_state & RQ_NET_PENDING ) ;
req - > rq_state | = RQ_POSTPONED ;
2012-01-24 20:19:42 +04:00
if ( req - > i . waiting )
wake_up ( & mdev - > misc_wait ) ;
/* Do not clear RQ_NET_PENDING. This request will make further
* progress via restart_conflicting_writes ( ) or
* fail_postponed_requests ( ) . Hopefully . */
2011-02-22 04:15:32 +03:00
break ;
2011-01-25 17:37:43 +03:00
case NEG_ACKED :
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , RQ_NET_OK | RQ_NET_PENDING , RQ_NET_DONE ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-01-25 17:37:43 +03:00
case FAIL_FROZEN_DISK_IO :
2010-05-31 12:14:17 +04:00
if ( ! ( req - > rq_state & RQ_LOCAL_COMPLETED ) )
break ;
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , RQ_COMPLETION_SUSP , 0 ) ;
2010-05-31 12:14:17 +04:00
break ;
2011-01-25 17:37:43 +03:00
case RESTART_FROZEN_DISK_IO :
2010-05-31 12:14:17 +04:00
if ( ! ( req - > rq_state & RQ_LOCAL_COMPLETED ) )
break ;
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m ,
RQ_COMPLETION_SUSP | RQ_LOCAL_COMPLETED ,
RQ_LOCAL_PENDING ) ;
2010-05-31 12:14:17 +04:00
rv = MR_READ ;
if ( bio_data_dir ( req - > master_bio ) = = WRITE )
rv = MR_WRITE ;
2012-01-24 20:19:42 +04:00
get_ldev ( mdev ) ; /* always succeeds in this call path */
2010-05-31 12:14:17 +04:00
req - > w . cb = w_restart_disk_io ;
2011-11-14 18:42:37 +04:00
drbd_queue_work ( & mdev - > tconn - > sender_work , & req - > w ) ;
2010-05-31 12:14:17 +04:00
break ;
2011-01-25 17:37:43 +03:00
case RESEND :
2010-05-12 19:08:26 +04:00
/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
2012-01-24 20:19:42 +04:00
before the connection loss ( B & C only ) ; only P_BARRIER_ACK
( or the local completion ? ) was missing when we suspended .
2012-03-26 19:02:45 +04:00
Throwing them out of the TL here by pretending we got a BARRIER_ACK .
During connection handshake , we ensure that the peer was not rebooted . */
2010-05-12 19:08:26 +04:00
if ( ! ( req - > rq_state & RQ_NET_OK ) ) {
2012-01-24 20:19:42 +04:00
/* FIXME could this possibly be a req->w.cb == w_send_out_of_sync?
* in that case we must not set RQ_NET_PENDING . */
mod_rq_state ( req , m , RQ_COMPLETION_SUSP , RQ_NET_QUEUED | RQ_NET_PENDING ) ;
2010-05-12 19:08:26 +04:00
if ( req - > w . cb ) {
2011-11-14 18:42:37 +04:00
drbd_queue_work ( & mdev - > tconn - > sender_work , & req - > w ) ;
2010-05-12 19:08:26 +04:00
rv = req - > rq_state & RQ_WRITE ? MR_WRITE : MR_READ ;
2012-01-24 20:19:42 +04:00
} /* else: FIXME can this happen? */
2010-05-12 19:08:26 +04:00
break ;
}
2011-01-25 17:37:43 +03:00
/* else, fall through to BARRIER_ACKED */
2010-05-12 19:08:26 +04:00
2011-01-25 17:37:43 +03:00
case BARRIER_ACKED :
2012-01-24 20:19:42 +04:00
/* barrier ack for READ requests does not make sense */
2010-05-27 17:07:43 +04:00
if ( ! ( req - > rq_state & RQ_WRITE ) )
break ;
2009-09-26 03:07:19 +04:00
if ( req - > rq_state & RQ_NET_PENDING ) {
2011-08-17 14:43:25 +04:00
/* barrier came in before all requests were acked.
2009-09-26 03:07:19 +04:00
* this is bad , because if the connection is lost now ,
* we won ' t be able to clean them up . . . */
2011-01-25 17:37:43 +03:00
dev_err ( DEV , " FIXME (BARRIER_ACKED but pending) \n " ) ;
2009-09-26 03:07:19 +04:00
}
2012-01-24 20:19:42 +04:00
/* Allowed to complete requests, even while suspended.
* As this is called for all requests within a matching epoch ,
* we need to filter , and only set RQ_NET_DONE for those that
* have actually been on the wire . */
mod_rq_state ( req , m , RQ_COMPLETION_SUSP ,
( req - > rq_state & RQ_NET_MASK ) ? RQ_NET_DONE : 0 ) ;
2009-09-26 03:07:19 +04:00
break ;
2011-01-25 17:37:43 +03:00
case DATA_RECEIVED :
2009-09-26 03:07:19 +04:00
D_ASSERT ( req - > rq_state & RQ_NET_PENDING ) ;
2012-01-24 20:19:42 +04:00
mod_rq_state ( req , m , RQ_NET_PENDING , RQ_NET_OK | RQ_NET_DONE ) ;
2009-09-26 03:07:19 +04:00
break ;
} ;
2010-06-09 16:07:43 +04:00
return rv ;
2009-09-26 03:07:19 +04:00
}
/* we may do a local read if:
* - we are consistent ( of course ) ,
* - or we are generally inconsistent ,
* BUT we are still / already IN SYNC for this area .
* since size may be bigger than BM_BLOCK_SIZE ,
* we may need to check several bits .
*/
2010-12-19 22:48:29 +03:00
static bool drbd_may_do_local_read ( struct drbd_conf * mdev , sector_t sector , int size )
2009-09-26 03:07:19 +04:00
{
unsigned long sbnr , ebnr ;
sector_t esector , nr_sectors ;
if ( mdev - > state . disk = = D_UP_TO_DATE )
2010-12-19 22:48:29 +03:00
return true ;
2011-02-18 16:13:07 +03:00
if ( mdev - > state . disk ! = D_INCONSISTENT )
2010-12-19 22:48:29 +03:00
return false ;
2009-09-26 03:07:19 +04:00
esector = sector + ( size > > 9 ) - 1 ;
2011-02-21 14:34:58 +03:00
nr_sectors = drbd_get_capacity ( mdev - > this_bdev ) ;
2009-09-26 03:07:19 +04:00
D_ASSERT ( sector < nr_sectors ) ;
D_ASSERT ( esector < nr_sectors ) ;
sbnr = BM_SECT_TO_BIT ( sector ) ;
ebnr = BM_SECT_TO_BIT ( esector ) ;
2010-12-19 22:48:29 +03:00
return drbd_bm_count_bits ( mdev , sbnr , ebnr ) = = 0 ;
2009-09-26 03:07:19 +04:00
}
2012-03-29 19:04:14 +04:00
/*
 * Apply the configured read balancing policy @rbm to a READ at @sector.
 *
 * Returns true if the policy wants this read to be served by the peer,
 * false if it should be served locally.
 */
static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector,
		enum drbd_read_balancing rbm)
{
	switch (rbm) {
	case RB_CONGESTED_REMOTE: {
		/* go remote if the backing device reports read congestion */
		struct backing_dev_info *bdi =
			&mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info;

		return bdi_read_congested(bdi);
	}
	case RB_LEAST_PENDING: {
		int pending_local = atomic_read(&mdev->local_cnt);
		int pending_remote = atomic_read(&mdev->ap_pending_cnt) +
				     atomic_read(&mdev->rs_pending_cnt);

		return pending_local > pending_remote;
	}
	case RB_32K_STRIPING:	/* stripe_shift = 15 */
	case RB_64K_STRIPING:
	case RB_128K_STRIPING:
	case RB_256K_STRIPING:
	case RB_512K_STRIPING:
	case RB_1M_STRIPING: {	/* stripe_shift = 20 */
		/* The stripe size in bytes is 1 << stripe_shift; this
		 * relies on the striping enum values being consecutive.
		 * Alternate between local and remote per stripe;
		 * sectors are 512 (1 << 9) bytes. */
		int stripe_shift = rbm - RB_32K_STRIPING + 15;

		return ((sector >> (stripe_shift - 9)) & 1) != 0;
	}
	case RB_ROUND_ROBIN:
		/* flip the flag on each call, the previous value decides */
		return test_and_change_bit(READ_BALANCE_RR, &mdev->flags);
	case RB_PREFER_REMOTE:
		return true;
	case RB_PREFER_LOCAL:
	default:
		return false;
	}
}
2011-01-28 17:53:51 +03:00
/*
* complete_conflicting_writes - wait for any conflicting write requests
*
* The write_requests tree contains all active write requests which we
* currently know about . Wait for any requests to complete which conflict with
* the new one .
2012-03-26 22:12:24 +04:00
*
* Only way out : remove the conflicting intervals from the tree .
2011-01-28 17:53:51 +03:00
*/
2012-03-26 22:12:24 +04:00
static void complete_conflicting_writes ( struct drbd_request * req )
2011-01-28 17:53:51 +03:00
{
2012-03-26 22:12:24 +04:00
DEFINE_WAIT ( wait ) ;
struct drbd_conf * mdev = req - > w . mdev ;
struct drbd_interval * i ;
sector_t sector = req - > i . sector ;
int size = req - > i . size ;
i = drbd_find_overlap ( & mdev - > write_requests , sector , size ) ;
if ( ! i )
return ;
2011-01-28 17:53:51 +03:00
2012-03-26 22:12:24 +04:00
for ( ; ; ) {
prepare_to_wait ( & mdev - > misc_wait , & wait , TASK_UNINTERRUPTIBLE ) ;
2011-01-28 17:53:51 +03:00
i = drbd_find_overlap ( & mdev - > write_requests , sector , size ) ;
if ( ! i )
2012-03-26 22:12:24 +04:00
break ;
/* Indicate to wake up device->misc_wait on progress. */
i - > waiting = true ;
spin_unlock_irq ( & mdev - > tconn - > req_lock ) ;
schedule ( ) ;
spin_lock_irq ( & mdev - > tconn - > req_lock ) ;
2011-01-28 17:53:51 +03:00
}
2012-03-26 22:12:24 +04:00
finish_wait ( & mdev - > misc_wait , & wait ) ;
2011-01-28 17:53:51 +03:00
}
2012-03-29 19:04:14 +04:00
/* called within req_lock and rcu_read_lock() */
static bool conn_check_congested ( struct drbd_conf * mdev )
{
struct drbd_tconn * tconn = mdev - > tconn ;
struct net_conf * nc ;
bool congested = false ;
enum drbd_on_congestion on_congestion ;
nc = rcu_dereference ( tconn - > net_conf ) ;
on_congestion = nc ? nc - > on_congestion : OC_BLOCK ;
if ( on_congestion = = OC_BLOCK | |
tconn - > agreed_pro_version < 96 )
return false ;
if ( nc - > cong_fill & &
atomic_read ( & mdev - > ap_in_flight ) > = nc - > cong_fill ) {
dev_info ( DEV , " Congestion-fill threshold reached \n " ) ;
congested = true ;
}
if ( mdev - > act_log - > used > = nc - > cong_extents ) {
dev_info ( DEV , " Congestion-extents threshold reached \n " ) ;
congested = true ;
}
if ( congested ) {
if ( mdev - > tconn - > current_tle_writes )
/* start a new epoch for non-mirrored writes */
start_new_tl_epoch ( mdev - > tconn ) ;
if ( on_congestion = = OC_PULL_AHEAD )
_drbd_set_state ( _NS ( mdev , conn , C_AHEAD ) , 0 , NULL ) ;
else /*nc->on_congestion == OC_DISCONNECT */
_drbd_set_state ( _NS ( mdev , conn , C_DISCONNECTING ) , 0 , NULL ) ;
}
return congested ;
}
/* If this returns false, and req->private_bio is still set,
* this should be submitted locally .
*
* If it returns false , but req - > private_bio is not set ,
* we do not have access to good data : (
*
* Otherwise , this destroys req - > private_bio , if any ,
* and returns true .
*/
static bool do_remote_read ( struct drbd_request * req )
{
struct drbd_conf * mdev = req - > w . mdev ;
enum drbd_read_balancing rbm ;
if ( req - > private_bio ) {
if ( ! drbd_may_do_local_read ( mdev ,
req - > i . sector , req - > i . size ) ) {
bio_put ( req - > private_bio ) ;
req - > private_bio = NULL ;
put_ldev ( mdev ) ;
}
}
if ( mdev - > state . pdsk ! = D_UP_TO_DATE )
return false ;
2012-01-24 20:19:42 +04:00
if ( req - > private_bio = = NULL )
return true ;
2012-03-29 19:04:14 +04:00
/* TODO: improve read balancing decisions, take into account drbd
* protocol , pending requests etc . */
rcu_read_lock ( ) ;
rbm = rcu_dereference ( mdev - > ldev - > disk_conf ) - > read_balancing ;
rcu_read_unlock ( ) ;
if ( rbm = = RB_PREFER_LOCAL & & req - > private_bio )
return false ; /* submit locally */
if ( remote_due_to_read_balancing ( mdev , req - > i . sector , rbm ) ) {
if ( req - > private_bio ) {
bio_put ( req - > private_bio ) ;
req - > private_bio = NULL ;
put_ldev ( mdev ) ;
}
return true ;
}
return false ;
}
/* returns number of connections (== 1, for drbd 8.4)
* expected to actually write this data ,
* which does NOT include those that we are L_AHEAD for . */
static int drbd_process_write_request ( struct drbd_request * req )
{
struct drbd_conf * mdev = req - > w . mdev ;
int remote , send_oos ;
rcu_read_lock ( ) ;
remote = drbd_should_do_remote ( mdev - > state ) ;
if ( remote ) {
conn_check_congested ( mdev ) ;
remote = drbd_should_do_remote ( mdev - > state ) ;
}
send_oos = drbd_should_send_out_of_sync ( mdev - > state ) ;
rcu_read_unlock ( ) ;
if ( ! remote & & ! send_oos )
return 0 ;
D_ASSERT ( ! ( remote & & send_oos ) ) ;
if ( remote ) {
_req_mod ( req , TO_BE_SENT ) ;
_req_mod ( req , QUEUE_FOR_NET_WRITE ) ;
} else if ( drbd_set_out_of_sync ( mdev , req - > i . sector , req - > i . size ) )
_req_mod ( req , QUEUE_FOR_SEND_OOS ) ;
return remote ;
}
static void
drbd_submit_req_private_bio ( struct drbd_request * req )
{
struct drbd_conf * mdev = req - > w . mdev ;
struct bio * bio = req - > private_bio ;
const int rw = bio_rw ( bio ) ;
bio - > bi_bdev = mdev - > ldev - > backing_bdev ;
/* State may have changed since we grabbed our reference on the
* - > ldev member . Double check , and short - circuit to endio .
* In case the last activity log transaction failed to get on
* stable storage , and this is a WRITE , we may not even submit
* this bio . */
if ( get_ldev ( mdev ) ) {
if ( drbd_insert_fault ( mdev ,
rw = = WRITE ? DRBD_FAULT_DT_WR
: rw = = READ ? DRBD_FAULT_DT_RD
: DRBD_FAULT_DT_RA ) )
bio_endio ( bio , - EIO ) ;
else
generic_make_request ( bio ) ;
put_ldev ( mdev ) ;
} else
bio_endio ( bio , - EIO ) ;
}
2012-01-24 19:49:58 +04:00
void __drbd_make_request ( struct drbd_conf * mdev , struct bio * bio , unsigned long start_time )
2009-09-26 03:07:19 +04:00
{
const int rw = bio_rw ( bio ) ;
2012-03-29 19:04:14 +04:00
struct bio_and_error m = { NULL , } ;
2009-09-26 03:07:19 +04:00
struct drbd_request * req ;
2012-03-29 19:04:14 +04:00
bool no_remote = false ;
2009-09-26 03:07:19 +04:00
/* allocate outside of all locks; */
req = drbd_req_new ( mdev , bio ) ;
if ( ! req ) {
dec_ap_bio ( mdev ) ;
/* only pass the error to the upper layers.
* if user cannot handle io errors , that ' s not our business . */
dev_err ( DEV , " could not kmalloc() req \n " ) ;
bio_endio ( bio , - ENOMEM ) ;
2012-01-24 19:49:58 +04:00
return ;
2009-09-26 03:07:19 +04:00
}
2010-11-09 19:45:06 +03:00
req - > start_time = start_time ;
2009-09-26 03:07:19 +04:00
2012-03-29 19:04:14 +04:00
if ( ! get_ldev ( mdev ) ) {
bio_put ( req - > private_bio ) ;
2009-09-26 03:07:19 +04:00
req - > private_bio = NULL ;
}
/* For WRITES going to the local disk, grab a reference on the target
* extent . This waits for any resync activity in the corresponding
* resync extent to finish , and , if necessary , pulls in the target
* extent into the activity log , which involves further disk io because
* of transactional on - disk meta data updates . */
2012-03-29 19:04:14 +04:00
if ( rw = = WRITE & & req - > private_bio
& & ! test_bit ( AL_SUSPENDED , & mdev - > flags ) ) {
2010-08-31 14:00:50 +04:00
req - > rq_state | = RQ_IN_ACT_LOG ;
2011-03-31 17:18:56 +04:00
drbd_al_begin_io ( mdev , & req - > i ) ;
2010-08-31 14:00:50 +04:00
}
2009-09-26 03:07:19 +04:00
2011-01-19 16:16:30 +03:00
spin_lock_irq ( & mdev - > tconn - > req_lock ) ;
2011-01-28 17:53:51 +03:00
if ( rw = = WRITE ) {
2012-03-26 22:12:24 +04:00
/* This may temporarily give up the req_lock,
* but will re - aquire it before it returns here .
* Needs to be before the check on drbd_suspended ( ) */
complete_conflicting_writes ( req ) ;
2011-01-28 17:53:51 +03:00
}
2012-03-29 19:04:14 +04:00
/* no more giving up req_lock from now on! */
2010-10-27 16:33:00 +04:00
2012-03-29 19:04:14 +04:00
if ( drbd_suspended ( mdev ) ) {
/* push back and retry: */
req - > rq_state | = RQ_POSTPONED ;
if ( req - > private_bio ) {
bio_put ( req - > private_bio ) ;
req - > private_bio = NULL ;
2009-09-26 03:07:19 +04:00
}
2012-03-29 19:04:14 +04:00
goto out ;
2009-09-26 03:07:19 +04:00
}
/* Update disk stats */
_drbd_start_io_acct ( mdev , req , bio ) ;
2012-03-29 19:04:14 +04:00
/* We fail READ/READA early, if we can not serve it.
* We must do this before req is registered on any lists .
2012-01-24 20:19:42 +04:00
* Otherwise , drbd_req_complete ( ) will queue failed READ for retry . */
2012-03-29 19:04:14 +04:00
if ( rw ! = WRITE ) {
if ( ! do_remote_read ( req ) & & ! req - > private_bio )
goto nodata ;
}
2009-09-26 03:07:19 +04:00
2011-11-28 18:04:49 +04:00
/* which transfer log epoch does this belong to? */
req - > epoch = atomic_read ( & mdev - > tconn - > current_tle_nr ) ;
if ( rw = = WRITE )
mdev - > tconn - > current_tle_writes + + ;
list_add_tail ( & req - > tl_requests , & mdev - > tconn - > transfer_log ) ;
2010-05-27 17:07:43 +04:00
2012-03-29 19:04:14 +04:00
if ( rw = = WRITE ) {
if ( ! drbd_process_write_request ( req ) )
no_remote = true ;
} else {
/* We either have a private_bio, or we can read from remote.
* Otherwise we had done the goto nodata above . */
if ( req - > private_bio = = NULL ) {
_req_mod ( req , TO_BE_SENT ) ;
_req_mod ( req , QUEUE_FOR_NET_READ ) ;
} else
no_remote = true ;
2009-09-26 03:07:19 +04:00
}
2010-10-27 14:21:30 +04:00
2012-03-29 19:04:14 +04:00
if ( req - > private_bio ) {
/* needs to be marked within the same spinlock */
_req_mod ( req , TO_BE_SUBMITTED ) ;
/* but we need to give up the spinlock to submit */
spin_unlock_irq ( & mdev - > tconn - > req_lock ) ;
drbd_submit_req_private_bio ( req ) ;
2012-01-24 20:19:42 +04:00
spin_lock_irq ( & mdev - > tconn - > req_lock ) ;
2012-03-29 19:04:14 +04:00
} else if ( no_remote ) {
nodata :
if ( __ratelimit ( & drbd_ratelimit_state ) )
dev_err ( DEV , " IO ERROR: neither local nor remote disk \n " ) ;
/* A write may have been queued for send_oos, however.
2012-01-24 20:19:42 +04:00
* So we can not simply free it , we must go through drbd_req_put_completion_ref ( ) */
2010-10-27 14:21:30 +04:00
}
2012-03-29 19:04:14 +04:00
out :
2012-01-24 20:19:42 +04:00
if ( drbd_req_put_completion_ref ( req , & m , 1 ) )
kref_put ( & req - > kref , drbd_req_destroy ) ;
2011-01-19 16:16:30 +03:00
spin_unlock_irq ( & mdev - > tconn - > req_lock ) ;
2009-09-26 03:07:19 +04:00
2012-03-29 19:04:14 +04:00
if ( m . bio )
complete_master_bio ( mdev , & m ) ;
2012-01-24 19:49:58 +04:00
return ;
2009-09-26 03:07:19 +04:00
}
2010-12-13 19:48:19 +03:00
int drbd_make_request ( struct request_queue * q , struct bio * bio )
2009-09-26 03:07:19 +04:00
{
struct drbd_conf * mdev = ( struct drbd_conf * ) q - > queuedata ;
2010-11-09 19:45:06 +03:00
unsigned long start_time ;
2009-09-26 03:07:19 +04:00
2010-11-09 19:45:06 +03:00
start_time = jiffies ;
2009-09-26 03:07:19 +04:00
/*
* what we " blindly " assume :
*/
D_ASSERT ( bio - > bi_size > 0 ) ;
2011-02-21 14:41:39 +03:00
D_ASSERT ( IS_ALIGNED ( bio - > bi_size , 512 ) ) ;
2009-09-26 03:07:19 +04:00
2012-01-24 19:49:58 +04:00
inc_ap_bio ( mdev ) ;
__drbd_make_request ( mdev , bio , start_time ) ;
2011-12-20 14:49:58 +04:00
return 0 ;
2009-09-26 03:07:19 +04:00
}
2011-03-31 18:36:43 +04:00
/* This is called by bio_add_page().
*
* q - > max_hw_sectors and other global limits are already enforced there .
2009-09-26 03:07:19 +04:00
*
2011-03-31 18:36:43 +04:00
* We need to call down to our lower level device ,
* in case it has special restrictions .
*
* We also may need to enforce configured max - bio - bvecs limits .
2009-09-26 03:07:19 +04:00
*
* As long as the BIO is empty we have to allow at least one bvec ,
2011-03-31 18:36:43 +04:00
* regardless of size and offset , so no need to ask lower levels .
2009-09-26 03:07:19 +04:00
*/
int drbd_merge_bvec ( struct request_queue * q , struct bvec_merge_data * bvm , struct bio_vec * bvec )
{
struct drbd_conf * mdev = ( struct drbd_conf * ) q - > queuedata ;
unsigned int bio_size = bvm - > bi_size ;
2011-03-31 18:36:43 +04:00
int limit = DRBD_MAX_BIO_SIZE ;
int backing_limit ;
if ( bio_size & & get_ldev ( mdev ) ) {
2009-09-26 03:07:19 +04:00
struct request_queue * const b =
mdev - > ldev - > backing_bdev - > bd_disk - > queue ;
2010-05-14 21:16:41 +04:00
if ( b - > merge_bvec_fn ) {
2009-09-26 03:07:19 +04:00
backing_limit = b - > merge_bvec_fn ( b , bvm , bvec ) ;
limit = min ( limit , backing_limit ) ;
}
put_ldev ( mdev ) ;
}
return limit ;
}
2011-03-01 13:08:28 +03:00
2011-11-28 18:04:49 +04:00
struct drbd_request * find_oldest_request ( struct drbd_tconn * tconn )
{
/* Walk the transfer log,
* and find the oldest not yet completed request */
struct drbd_request * r ;
list_for_each_entry ( r , & tconn - > transfer_log , tl_requests ) {
2012-01-24 19:58:11 +04:00
if ( atomic_read ( & r - > completion_ref ) )
2011-11-28 18:04:49 +04:00
return r ;
}
return NULL ;
}
2011-03-01 13:08:28 +03:00
void request_timer_fn ( unsigned long data )
{
struct drbd_conf * mdev = ( struct drbd_conf * ) data ;
2011-03-01 13:08:28 +03:00
struct drbd_tconn * tconn = mdev - > tconn ;
2011-03-01 13:08:28 +03:00
struct drbd_request * req ; /* oldest request */
2011-04-19 19:10:19 +04:00
struct net_conf * nc ;
2011-07-15 15:53:06 +04:00
unsigned long ent = 0 , dt = 0 , et , nt ; /* effective timeout = ko_count * timeout */
2012-05-07 13:53:08 +04:00
unsigned long now ;
2011-04-19 19:10:19 +04:00
rcu_read_lock ( ) ;
nc = rcu_dereference ( tconn - > net_conf ) ;
2012-05-07 13:53:08 +04:00
if ( nc & & mdev - > state . conn > = C_WF_REPORT_PARAMS )
ent = nc - > timeout * HZ / 10 * nc - > ko_count ;
2011-07-05 17:38:59 +04:00
2012-05-07 13:53:08 +04:00
if ( get_ldev ( mdev ) ) { /* implicit state.disk >= D_INCONSISTENT */
2011-07-05 17:38:59 +04:00
dt = rcu_dereference ( mdev - > ldev - > disk_conf ) - > disk_timeout * HZ / 10 ;
put_ldev ( mdev ) ;
}
2011-04-19 19:10:19 +04:00
rcu_read_unlock ( ) ;
2011-03-01 13:08:28 +03:00
2011-07-05 17:38:59 +04:00
et = min_not_zero ( dt , ent ) ;
2012-05-07 13:53:08 +04:00
if ( ! et )
2011-03-01 13:08:28 +03:00
return ; /* Recurring timer stopped */
2012-05-07 13:53:08 +04:00
now = jiffies ;
2011-03-01 13:08:28 +03:00
spin_lock_irq ( & tconn - > req_lock ) ;
2011-11-28 18:04:49 +04:00
req = find_oldest_request ( tconn ) ;
if ( ! req ) {
2011-03-01 13:08:28 +03:00
spin_unlock_irq ( & tconn - > req_lock ) ;
2012-05-07 13:53:08 +04:00
mod_timer ( & mdev - > request_timer , now + et ) ;
2011-03-01 13:08:28 +03:00
return ;
}
2012-05-07 13:53:08 +04:00
/* The request is considered timed out, if
* - we have some effective timeout from the configuration ,
* with above state restrictions applied ,
* - the oldest request is waiting for a response from the network
* resp . the local disk ,
* - the oldest request is in fact older than the effective timeout ,
* - the connection was established ( resp . disk was attached )
* for longer than the timeout already .
* Note that for 32 bit jiffies and very stable connections / disks ,
* we may have a wrap around , which is catched by
* ! time_in_range ( now , last_ . . . _jif , last_ . . . _jif + timeout ) .
*
* Side effect : once per 32 bit wrap - around interval , which means every
* ~ 198 days with 250 HZ , we have a window where the timeout would need
* to expire twice ( worst case ) to become effective . Good enough .
*/
if ( ent & & req - > rq_state & RQ_NET_PENDING & &
time_after ( now , req - > start_time + ent ) & &
! time_in_range ( now , tconn - > last_reconnect_jif , tconn - > last_reconnect_jif + ent ) ) {
dev_warn ( DEV , " Remote failed to finish a request within ko-count * timeout \n " ) ;
_drbd_set_state ( _NS ( mdev , conn , C_TIMEOUT ) , CS_VERBOSE | CS_HARD , NULL ) ;
2011-07-05 17:38:59 +04:00
}
2012-05-07 13:53:08 +04:00
if ( dt & & req - > rq_state & RQ_LOCAL_PENDING & & req - > w . mdev = = mdev & &
time_after ( now , req - > start_time + dt ) & &
! time_in_range ( now , mdev - > last_reattach_jif , mdev - > last_reattach_jif + dt ) ) {
dev_warn ( DEV , " Local backing device failed to meet the disk-timeout \n " ) ;
__drbd_chk_io_error ( mdev , 1 ) ;
2011-03-01 13:08:28 +03:00
}
2012-05-07 13:53:08 +04:00
nt = ( time_after ( now , req - > start_time + et ) ? now : req - > start_time ) + et ;
2011-03-01 13:08:28 +03:00
spin_unlock_irq ( & tconn - > req_lock ) ;
2011-07-15 15:53:06 +04:00
mod_timer ( & mdev - > request_timer , nt ) ;
2011-03-01 13:08:28 +03:00
}