drbd: Runtime changeable wire protocol
The wire protocol is no longer a property that is negotiated between the two peers. Starting with protocol version 100, it is expressed with two bits (DP_SEND_WRITE_ACK and DP_SEND_RECEIVE_ACK) in each data packet; peers that agreed on an older protocol version still rely on the configured wire protocol. Therefore the primary node is free to change the wire protocol at any time without having to disconnect and reconnect. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
This commit is contained in:
parent
d3fcb4908d
commit
303d1448a0
@ -327,6 +327,8 @@ extern unsigned int drbd_header_size(struct drbd_tconn *tconn);
|
|||||||
#define DP_FUA 16 /* equals REQ_FUA */
|
#define DP_FUA 16 /* equals REQ_FUA */
|
||||||
#define DP_FLUSH 32 /* equals REQ_FLUSH */
|
#define DP_FLUSH 32 /* equals REQ_FLUSH */
|
||||||
#define DP_DISCARD 64 /* equals REQ_DISCARD */
|
#define DP_DISCARD 64 /* equals REQ_DISCARD */
|
||||||
|
#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
|
||||||
|
#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
|
||||||
|
|
||||||
struct p_data {
|
struct p_data {
|
||||||
u64 sector; /* 64 bits sector number */
|
u64 sector; /* 64 bits sector number */
|
||||||
@ -656,6 +658,9 @@ enum {
|
|||||||
|
|
||||||
/* Conflicting local requests need to be restarted after this request */
|
/* Conflicting local requests need to be restarted after this request */
|
||||||
__EE_RESTART_REQUESTS,
|
__EE_RESTART_REQUESTS,
|
||||||
|
|
||||||
|
/* The peer wants a write ACK for this (wire proto C) */
|
||||||
|
__EE_SEND_WRITE_ACK,
|
||||||
};
|
};
|
||||||
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
|
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
|
||||||
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
|
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
|
||||||
@ -663,6 +668,7 @@ enum {
|
|||||||
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
|
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
|
||||||
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
|
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
|
||||||
#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
|
#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
|
||||||
|
#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK)
|
||||||
|
|
||||||
/* flag bits per mdev */
|
/* flag bits per mdev */
|
||||||
enum {
|
enum {
|
||||||
|
@ -1681,6 +1681,12 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
|
|||||||
if (mdev->state.conn >= C_SYNC_SOURCE &&
|
if (mdev->state.conn >= C_SYNC_SOURCE &&
|
||||||
mdev->state.conn <= C_PAUSED_SYNC_T)
|
mdev->state.conn <= C_PAUSED_SYNC_T)
|
||||||
dp_flags |= DP_MAY_SET_IN_SYNC;
|
dp_flags |= DP_MAY_SET_IN_SYNC;
|
||||||
|
if (mdev->tconn->agreed_pro_version >= 100) {
|
||||||
|
if (req->rq_state & RQ_EXP_RECEIVE_ACK)
|
||||||
|
dp_flags |= DP_SEND_RECEIVE_ACK;
|
||||||
|
if (req->rq_state & RQ_EXP_WRITE_ACK)
|
||||||
|
dp_flags |= DP_SEND_WRITE_ACK;
|
||||||
|
}
|
||||||
p->dp_flags = cpu_to_be32(dp_flags);
|
p->dp_flags = cpu_to_be32(dp_flags);
|
||||||
if (dgs)
|
if (dgs)
|
||||||
drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, p + 1);
|
drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, p + 1);
|
||||||
@ -1697,7 +1703,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
|
|||||||
* out ok after sending on this side, but does not fit on the
|
* out ok after sending on this side, but does not fit on the
|
||||||
* receiving side, we sure have detected corruption elsewhere.
|
* receiving side, we sure have detected corruption elsewhere.
|
||||||
*/
|
*/
|
||||||
if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A || dgs)
|
if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs)
|
||||||
err = _drbd_send_bio(mdev, req->master_bio);
|
err = _drbd_send_bio(mdev, req->master_bio);
|
||||||
else
|
else
|
||||||
err = _drbd_send_zc_bio(mdev, req->master_bio);
|
err = _drbd_send_zc_bio(mdev, req->master_bio);
|
||||||
|
@ -1697,7 +1697,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
|
|||||||
sector_t sector = peer_req->i.sector;
|
sector_t sector = peer_req->i.sector;
|
||||||
int err = 0, pcmd;
|
int err = 0, pcmd;
|
||||||
|
|
||||||
if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_C) {
|
if (peer_req->flags & EE_SEND_WRITE_ACK) {
|
||||||
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
|
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
|
||||||
pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
|
pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
|
||||||
mdev->state.conn <= C_PAUSED_SYNC_T &&
|
mdev->state.conn <= C_PAUSED_SYNC_T &&
|
||||||
@ -2074,20 +2074,28 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
|
|||||||
list_add(&peer_req->w.list, &mdev->active_ee);
|
list_add(&peer_req->w.list, &mdev->active_ee);
|
||||||
spin_unlock_irq(&mdev->tconn->req_lock);
|
spin_unlock_irq(&mdev->tconn->req_lock);
|
||||||
|
|
||||||
switch (mdev->tconn->net_conf->wire_protocol) {
|
if (mdev->tconn->agreed_pro_version < 100) {
|
||||||
case DRBD_PROT_C:
|
switch (mdev->tconn->net_conf->wire_protocol) {
|
||||||
|
case DRBD_PROT_C:
|
||||||
|
dp_flags |= DP_SEND_WRITE_ACK;
|
||||||
|
break;
|
||||||
|
case DRBD_PROT_B:
|
||||||
|
dp_flags |= DP_SEND_RECEIVE_ACK;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dp_flags & DP_SEND_WRITE_ACK) {
|
||||||
|
peer_req->flags |= EE_SEND_WRITE_ACK;
|
||||||
inc_unacked(mdev);
|
inc_unacked(mdev);
|
||||||
/* corresponding dec_unacked() in e_end_block()
|
/* corresponding dec_unacked() in e_end_block()
|
||||||
* respective _drbd_clear_done_ee */
|
* respective _drbd_clear_done_ee */
|
||||||
break;
|
}
|
||||||
case DRBD_PROT_B:
|
|
||||||
|
if (dp_flags & DP_SEND_RECEIVE_ACK) {
|
||||||
/* I really don't like it that the receiver thread
|
/* I really don't like it that the receiver thread
|
||||||
* sends on the msock, but anyways */
|
* sends on the msock, but anyways */
|
||||||
drbd_send_ack(mdev, P_RECV_ACK, peer_req);
|
drbd_send_ack(mdev, P_RECV_ACK, peer_req);
|
||||||
break;
|
|
||||||
case DRBD_PROT_A:
|
|
||||||
/* nothing to do */
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mdev->state.pdsk < D_INCONSISTENT) {
|
if (mdev->state.pdsk < D_INCONSISTENT) {
|
||||||
@ -2932,7 +2940,7 @@ static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi)
|
|||||||
if (cf & CF_DRY_RUN)
|
if (cf & CF_DRY_RUN)
|
||||||
set_bit(CONN_DRY_RUN, &tconn->flags);
|
set_bit(CONN_DRY_RUN, &tconn->flags);
|
||||||
|
|
||||||
if (p_proto != tconn->net_conf->wire_protocol) {
|
if (p_proto != tconn->net_conf->wire_protocol && tconn->agreed_pro_version < 100) {
|
||||||
conn_err(tconn, "incompatible communication protocols\n");
|
conn_err(tconn, "incompatible communication protocols\n");
|
||||||
goto disconnect;
|
goto disconnect;
|
||||||
}
|
}
|
||||||
@ -4622,23 +4630,18 @@ static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi)
|
|||||||
}
|
}
|
||||||
switch (pi->cmd) {
|
switch (pi->cmd) {
|
||||||
case P_RS_WRITE_ACK:
|
case P_RS_WRITE_ACK:
|
||||||
D_ASSERT(mdev->tconn->net_conf->wire_protocol == DRBD_PROT_C);
|
|
||||||
what = WRITE_ACKED_BY_PEER_AND_SIS;
|
what = WRITE_ACKED_BY_PEER_AND_SIS;
|
||||||
break;
|
break;
|
||||||
case P_WRITE_ACK:
|
case P_WRITE_ACK:
|
||||||
D_ASSERT(mdev->tconn->net_conf->wire_protocol == DRBD_PROT_C);
|
|
||||||
what = WRITE_ACKED_BY_PEER;
|
what = WRITE_ACKED_BY_PEER;
|
||||||
break;
|
break;
|
||||||
case P_RECV_ACK:
|
case P_RECV_ACK:
|
||||||
D_ASSERT(mdev->tconn->net_conf->wire_protocol == DRBD_PROT_B);
|
|
||||||
what = RECV_ACKED_BY_PEER;
|
what = RECV_ACKED_BY_PEER;
|
||||||
break;
|
break;
|
||||||
case P_DISCARD_WRITE:
|
case P_DISCARD_WRITE:
|
||||||
D_ASSERT(mdev->tconn->net_conf->wire_protocol == DRBD_PROT_C);
|
|
||||||
what = DISCARD_WRITE;
|
what = DISCARD_WRITE;
|
||||||
break;
|
break;
|
||||||
case P_RETRY_WRITE:
|
case P_RETRY_WRITE:
|
||||||
D_ASSERT(mdev->tconn->net_conf->wire_protocol == DRBD_PROT_C);
|
|
||||||
what = POSTPONE_WRITE;
|
what = POSTPONE_WRITE;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
@ -4656,8 +4659,6 @@ static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi)
|
|||||||
struct p_block_ack *p = pi->data;
|
struct p_block_ack *p = pi->data;
|
||||||
sector_t sector = be64_to_cpu(p->sector);
|
sector_t sector = be64_to_cpu(p->sector);
|
||||||
int size = be32_to_cpu(p->blksize);
|
int size = be32_to_cpu(p->blksize);
|
||||||
bool missing_ok = tconn->net_conf->wire_protocol == DRBD_PROT_A ||
|
|
||||||
tconn->net_conf->wire_protocol == DRBD_PROT_B;
|
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
mdev = vnr_to_mdev(tconn, pi->vnr);
|
mdev = vnr_to_mdev(tconn, pi->vnr);
|
||||||
@ -4674,15 +4675,13 @@ static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi)
|
|||||||
|
|
||||||
err = validate_req_change_req_state(mdev, p->block_id, sector,
|
err = validate_req_change_req_state(mdev, p->block_id, sector,
|
||||||
&mdev->write_requests, __func__,
|
&mdev->write_requests, __func__,
|
||||||
NEG_ACKED, missing_ok);
|
NEG_ACKED, true);
|
||||||
if (err) {
|
if (err) {
|
||||||
/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
|
/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
|
||||||
The master bio might already be completed, therefore the
|
The master bio might already be completed, therefore the
|
||||||
request is no longer in the collision hash. */
|
request is no longer in the collision hash. */
|
||||||
/* In Protocol B we might already have got a P_RECV_ACK
|
/* In Protocol B we might already have got a P_RECV_ACK
|
||||||
but then get a P_NEG_ACK afterwards. */
|
but then get a P_NEG_ACK afterwards. */
|
||||||
if (!missing_ok)
|
|
||||||
return err;
|
|
||||||
drbd_set_out_of_sync(mdev, sector, size);
|
drbd_set_out_of_sync(mdev, sector, size);
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -323,7 +323,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
|
|||||||
struct bio_and_error *m)
|
struct bio_and_error *m)
|
||||||
{
|
{
|
||||||
struct drbd_conf *mdev = req->w.mdev;
|
struct drbd_conf *mdev = req->w.mdev;
|
||||||
int rv = 0;
|
int p, rv = 0;
|
||||||
|
|
||||||
if (m)
|
if (m)
|
||||||
m->bio = NULL;
|
m->bio = NULL;
|
||||||
@ -344,6 +344,10 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
|
|||||||
* and from w_read_retry_remote */
|
* and from w_read_retry_remote */
|
||||||
D_ASSERT(!(req->rq_state & RQ_NET_MASK));
|
D_ASSERT(!(req->rq_state & RQ_NET_MASK));
|
||||||
req->rq_state |= RQ_NET_PENDING;
|
req->rq_state |= RQ_NET_PENDING;
|
||||||
|
p = mdev->tconn->net_conf->wire_protocol;
|
||||||
|
req->rq_state |=
|
||||||
|
p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
|
||||||
|
p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
|
||||||
inc_ap_pending(mdev);
|
inc_ap_pending(mdev);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@ -500,7 +504,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
|
|||||||
atomic_add(req->i.size >> 9, &mdev->ap_in_flight);
|
atomic_add(req->i.size >> 9, &mdev->ap_in_flight);
|
||||||
|
|
||||||
if (bio_data_dir(req->master_bio) == WRITE &&
|
if (bio_data_dir(req->master_bio) == WRITE &&
|
||||||
mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A) {
|
!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
|
||||||
/* this is what is dangerous about protocol A:
|
/* this is what is dangerous about protocol A:
|
||||||
* pretend it was successfully written on the peer. */
|
* pretend it was successfully written on the peer. */
|
||||||
if (req->rq_state & RQ_NET_PENDING) {
|
if (req->rq_state & RQ_NET_PENDING) {
|
||||||
@ -550,6 +554,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
|
|||||||
req->rq_state |= RQ_NET_DONE;
|
req->rq_state |= RQ_NET_DONE;
|
||||||
/* fall through */
|
/* fall through */
|
||||||
case WRITE_ACKED_BY_PEER:
|
case WRITE_ACKED_BY_PEER:
|
||||||
|
D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
|
||||||
/* protocol C; successfully written on peer.
|
/* protocol C; successfully written on peer.
|
||||||
* Nothing to do here.
|
* Nothing to do here.
|
||||||
* We want to keep the tl in place for all protocols, to cater
|
* We want to keep the tl in place for all protocols, to cater
|
||||||
@ -560,11 +565,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
|
|||||||
* request could set NET_DONE right here, and not wait for the
|
* request could set NET_DONE right here, and not wait for the
|
||||||
* P_BARRIER_ACK, but that is an unnecessary optimization. */
|
* P_BARRIER_ACK, but that is an unnecessary optimization. */
|
||||||
|
|
||||||
|
goto ack_common;
|
||||||
/* this makes it effectively the same as for: */
|
/* this makes it effectively the same as for: */
|
||||||
case RECV_ACKED_BY_PEER:
|
case RECV_ACKED_BY_PEER:
|
||||||
|
D_ASSERT(req->rq_state & RQ_EXP_RECEIVE_ACK);
|
||||||
/* protocol B; pretends to be successfully written on peer.
|
/* protocol B; pretends to be successfully written on peer.
|
||||||
* see also notes above in HANDED_OVER_TO_NETWORK about
|
* see also notes above in HANDED_OVER_TO_NETWORK about
|
||||||
* protocol != C */
|
* protocol != C */
|
||||||
|
ack_common:
|
||||||
req->rq_state |= RQ_NET_OK;
|
req->rq_state |= RQ_NET_OK;
|
||||||
D_ASSERT(req->rq_state & RQ_NET_PENDING);
|
D_ASSERT(req->rq_state & RQ_NET_PENDING);
|
||||||
dec_ap_pending(mdev);
|
dec_ap_pending(mdev);
|
||||||
@ -574,8 +582,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case POSTPONE_WRITE:
|
case POSTPONE_WRITE:
|
||||||
/*
|
D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
|
||||||
* If this node has already detected the write conflict, the
|
/* If this node has already detected the write conflict, the
|
||||||
* worker will be waiting on misc_wait. Wake it up once this
|
* worker will be waiting on misc_wait. Wake it up once this
|
||||||
* request has completed locally.
|
* request has completed locally.
|
||||||
*/
|
*/
|
||||||
@ -646,7 +654,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
|
|||||||
}
|
}
|
||||||
if ((req->rq_state & RQ_NET_MASK) != 0) {
|
if ((req->rq_state & RQ_NET_MASK) != 0) {
|
||||||
req->rq_state |= RQ_NET_DONE;
|
req->rq_state |= RQ_NET_DONE;
|
||||||
if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A)
|
if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)))
|
||||||
atomic_sub(req->i.size>>9, &mdev->ap_in_flight);
|
atomic_sub(req->i.size>>9, &mdev->ap_in_flight);
|
||||||
}
|
}
|
||||||
_req_may_be_done(req, m); /* Allowed while state.susp */
|
_req_may_be_done(req, m); /* Allowed while state.susp */
|
||||||
|
@ -198,6 +198,12 @@ enum drbd_req_state_bits {
|
|||||||
|
|
||||||
/* The peer has sent a retry ACK */
|
/* The peer has sent a retry ACK */
|
||||||
__RQ_POSTPONED,
|
__RQ_POSTPONED,
|
||||||
|
|
||||||
|
/* We expect a receive ACK (wire proto B) */
|
||||||
|
__RQ_EXP_RECEIVE_ACK,
|
||||||
|
|
||||||
|
/* We expect a write ACK (wire proto C) */
|
||||||
|
__RQ_EXP_WRITE_ACK,
|
||||||
};
|
};
|
||||||
|
|
||||||
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
|
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
|
||||||
@ -219,6 +225,8 @@ enum drbd_req_state_bits {
|
|||||||
#define RQ_WRITE (1UL << __RQ_WRITE)
|
#define RQ_WRITE (1UL << __RQ_WRITE)
|
||||||
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
|
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
|
||||||
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
|
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
|
||||||
|
#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
|
||||||
|
#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
|
||||||
|
|
||||||
/* For waking up the frozen transfer log mod_req() has to return if the request
|
/* For waking up the frozen transfer log mod_req() has to return if the request
|
||||||
should be counted in the epoch object*/
|
should be counted in the epoch object*/
|
||||||
|
Loading…
Reference in New Issue
Block a user