ceph: unify cap flush and snapcap flush

This patch includes the following changes:
- Assign a flush tid to each snapcap flush.
- Remove the session's s_cap_snaps_flushing list. Add the inode to the
  session's s_cap_flushing list instead. The inode is removed from the
  list when it has no pending snapcap flush or cap flush.
- Make __kick_flushing_caps() re-send both snapcap flushes and cap
  flushes.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
This commit is contained in:
Yan, Zheng 2016-07-04 18:06:41 +08:00 committed by Ilya Dryomov
parent e4500b5e35
commit 0e29438789
5 changed files with 175 additions and 222 deletions

View File

@ -40,6 +40,7 @@
* cluster to release server state. * cluster to release server state.
*/ */
static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
/* /*
* Generate readable cap strings for debugging output. * Generate readable cap strings for debugging output.
@ -1217,6 +1218,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
return delayed; return delayed;
} }
static inline int __send_flush_snap(struct inode *inode,
struct ceph_mds_session *session,
struct ceph_cap_snap *capsnap,
u32 mseq, u64 oldest_flush_tid)
{
return send_cap_msg(session, ceph_vino(inode).ino, 0,
CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
capsnap->dirty, 0, capsnap->cap_flush.tid,
oldest_flush_tid, 0, mseq, capsnap->size, 0,
&capsnap->mtime, &capsnap->atime,
&capsnap->ctime, capsnap->time_warp_seq,
capsnap->uid, capsnap->gid, capsnap->mode,
capsnap->xattr_version, capsnap->xattr_blob,
capsnap->follows, capsnap->inline_data);
}
/* /*
* When a snapshot is taken, clients accumulate dirty metadata on * When a snapshot is taken, clients accumulate dirty metadata on
* inodes with capabilities in ceph_cap_snaps to describe the file * inodes with capabilities in ceph_cap_snaps to describe the file
@ -1224,14 +1241,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* asynchronously back to the MDS once sync writes complete and dirty * asynchronously back to the MDS once sync writes complete and dirty
* data is written out. * data is written out.
* *
* Unless @kick is true, skip cap_snaps that were already sent to
* the MDS (i.e., during this session).
*
* Called under i_ceph_lock. Takes s_mutex as needed. * Called under i_ceph_lock. Takes s_mutex as needed.
*/ */
void __ceph_flush_snaps(struct ceph_inode_info *ci, void __ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession, struct ceph_mds_session **psession)
int kick)
__releases(ci->i_ceph_lock) __releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock) __acquires(ci->i_ceph_lock)
{ {
@ -1242,6 +1255,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL; /* if session != NULL, we hold struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
session->s_mutex */ session->s_mutex */
u64 oldest_flush_tid;
u64 next_follows = 0; /* keep track of how far we've gotten through the u64 next_follows = 0; /* keep track of how far we've gotten through the
i_cap_snaps list, and skip these entries next time i_cap_snaps list, and skip these entries next time
around to avoid an infinite loop */ around to avoid an infinite loop */
@ -1272,7 +1286,7 @@ retry:
} }
/* only flush each capsnap once */ /* only flush each capsnap once */
if (!kick && !list_empty(&capsnap->flushing_item)) { if (capsnap->cap_flush.tid > 0) {
dout("already flushed %p, skipping\n", capsnap); dout("already flushed %p, skipping\n", capsnap);
continue; continue;
} }
@ -1282,8 +1296,6 @@ retry:
if (session && session->s_mds != mds) { if (session && session->s_mds != mds) {
dout("oops, wrong session %p mutex\n", session); dout("oops, wrong session %p mutex\n", session);
if (kick)
goto out;
mutex_unlock(&session->s_mutex); mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session); ceph_put_mds_session(session);
@ -1309,26 +1321,27 @@ retry:
} }
spin_lock(&mdsc->cap_dirty_lock); spin_lock(&mdsc->cap_dirty_lock);
capsnap->flush_tid = ++mdsc->last_cap_flush_tid; capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
list_add_tail(&capsnap->cap_flush.g_list,
&mdsc->cap_flush_list);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
if (list_empty(&ci->i_flushing_item)) {
list_add_tail(&ci->i_flushing_item,
&session->s_cap_flushing);
}
spin_unlock(&mdsc->cap_dirty_lock); spin_unlock(&mdsc->cap_dirty_lock);
list_add_tail(&capsnap->cap_flush.i_list,
&ci->i_cap_flush_list);
atomic_inc(&capsnap->nref); atomic_inc(&capsnap->nref);
if (list_empty(&capsnap->flushing_item))
list_add_tail(&capsnap->flushing_item,
&session->s_cap_snaps_flushing);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
inode, capsnap, capsnap->follows, capsnap->flush_tid); inode, capsnap, capsnap->follows, capsnap->cap_flush.tid);
send_cap_msg(session, ceph_vino(inode).ino, 0, __send_flush_snap(inode, session, capsnap, mseq,
CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, oldest_flush_tid);
capsnap->dirty, 0, capsnap->flush_tid, 0,
0, mseq, capsnap->size, 0,
&capsnap->mtime, &capsnap->atime,
&capsnap->ctime, capsnap->time_warp_seq,
capsnap->uid, capsnap->gid, capsnap->mode,
capsnap->xattr_version, capsnap->xattr_blob,
capsnap->follows, capsnap->inline_data);
next_follows = capsnap->follows + 1; next_follows = capsnap->follows + 1;
ceph_put_cap_snap(capsnap); ceph_put_cap_snap(capsnap);
@ -1354,7 +1367,7 @@ out:
static void ceph_flush_snaps(struct ceph_inode_info *ci) static void ceph_flush_snaps(struct ceph_inode_info *ci)
{ {
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
__ceph_flush_snaps(ci, NULL, 0); __ceph_flush_snaps(ci, NULL);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
} }
@ -1476,11 +1489,6 @@ static int __mark_caps_flushing(struct inode *inode,
if (list_empty(&ci->i_flushing_item)) { if (list_empty(&ci->i_flushing_item)) {
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++; mdsc->num_cap_flushing++;
dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
} else {
list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
dout(" inode %p now flushing (more) tid %llu\n",
inode, cf->tid);
} }
spin_unlock(&mdsc->cap_dirty_lock); spin_unlock(&mdsc->cap_dirty_lock);
@ -1556,7 +1564,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
/* flush snaps first time around only */ /* flush snaps first time around only */
if (!list_empty(&ci->i_cap_snaps)) if (!list_empty(&ci->i_cap_snaps))
__ceph_flush_snaps(ci, &session, 0); __ceph_flush_snaps(ci, &session);
goto retry_locked; goto retry_locked;
retry: retry:
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
@ -1997,80 +2005,74 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
return err; return err;
} }
/* static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
* After a recovering MDS goes active, we need to resend any caps struct ceph_mds_session *session,
* we were flushing. struct ceph_inode_info *ci,
* u64 oldest_flush_tid)
* Caller holds session->s_mutex. __releases(ci->i_ceph_lock)
*/ __acquires(ci->i_ceph_lock)
static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_cap_snap *capsnap;
dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
flushing_item) {
struct ceph_inode_info *ci = capsnap->ci;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (cap && cap->session == session) {
dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
cap, capsnap);
__ceph_flush_snaps(ci, &session, 1);
} else {
pr_err("%p auth cap %p not mds%d ???\n", inode,
cap, session->s_mds);
}
spin_unlock(&ci->i_ceph_lock);
}
}
static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_inode_info *ci)
{ {
struct inode *inode = &ci->vfs_inode; struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap; struct ceph_cap *cap;
struct ceph_cap_flush *cf; struct ceph_cap_flush *cf;
int delayed = 0; int ret;
u64 first_tid = 0; u64 first_tid = 0;
u64 oldest_flush_tid;
spin_lock(&mdsc->cap_dirty_lock);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
if (cf->tid < first_tid) if (cf->tid < first_tid)
continue; continue;
cap = ci->i_auth_cap; cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) { if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n", inode, pr_err("%p auth cap %p not mds%d ???\n",
cap, session->s_mds); inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
break; break;
} }
first_tid = cf->tid + 1; first_tid = cf->tid + 1;
dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, if (cf->caps) {
cap, cf->tid, ceph_cap_string(cf->caps)); dout("kick_flushing_caps %p cap %p tid %llu %s\n",
delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, inode, cap, cf->tid, ceph_cap_string(cf->caps));
__ceph_caps_used(ci), ci->i_ceph_flags |= CEPH_I_NODELAY;
__ceph_caps_wanted(ci), ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
cap->issued | cap->implemented, __ceph_caps_used(ci),
cf->caps, cf->tid, oldest_flush_tid); __ceph_caps_wanted(ci),
cap->issued | cap->implemented,
cf->caps, cf->tid, oldest_flush_tid);
if (ret) {
pr_err("kick_flushing_caps: error sending "
"cap flush, ino (%llx.%llx) "
"tid %llu flushing %s\n",
ceph_vinop(inode), cf->tid,
ceph_cap_string(cf->caps));
}
} else {
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
cap_flush);
dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
inode, capsnap, cf->tid,
ceph_cap_string(capsnap->dirty));
atomic_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
oldest_flush_tid);
if (ret < 0) {
pr_err("kick_flushing_caps: error sending "
"cap flushsnap, ino (%llx.%llx) "
"tid %llu follows %llu\n",
ceph_vinop(inode), cf->tid,
capsnap->follows);
}
ceph_put_cap_snap(capsnap);
}
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
} }
spin_unlock(&ci->i_ceph_lock);
return delayed;
} }
void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
@ -2078,8 +2080,14 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
{ {
struct ceph_inode_info *ci; struct ceph_inode_info *ci;
struct ceph_cap *cap; struct ceph_cap *cap;
u64 oldest_flush_tid;
dout("early_kick_flushing_caps mds%d\n", session->s_mds); dout("early_kick_flushing_caps mds%d\n", session->s_mds);
spin_lock(&mdsc->cap_dirty_lock);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap; cap = ci->i_auth_cap;
@ -2099,10 +2107,8 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
*/ */
if ((cap->issued & ci->i_flushing_caps) != if ((cap->issued & ci->i_flushing_caps) !=
ci->i_flushing_caps) { ci->i_flushing_caps) {
spin_unlock(&ci->i_ceph_lock); __kick_flushing_caps(mdsc, session, ci,
if (!__kick_flushing_caps(mdsc, session, ci)) oldest_flush_tid);
continue;
spin_lock(&ci->i_ceph_lock);
} }
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
@ -2113,50 +2119,43 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session) struct ceph_mds_session *session)
{ {
struct ceph_inode_info *ci; struct ceph_inode_info *ci;
u64 oldest_flush_tid;
kick_flushing_capsnaps(mdsc, session);
dout("kick_flushing_caps mds%d\n", session->s_mds); dout("kick_flushing_caps mds%d\n", session->s_mds);
spin_lock(&mdsc->cap_dirty_lock);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
int delayed = __kick_flushing_caps(mdsc, session, ci); spin_lock(&ci->i_ceph_lock);
if (delayed) { __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
spin_lock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
}
} }
} }
static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session, struct ceph_mds_session *session,
struct inode *inode) struct inode *inode)
__releases(ci->i_ceph_lock)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap; struct ceph_cap *cap;
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap; cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s\n", inode, dout("kick_flushing_inode_caps %p flushing %s\n", inode,
ceph_cap_string(ci->i_flushing_caps)); ceph_cap_string(ci->i_flushing_caps));
__ceph_flush_snaps(ci, &session, 1); if (!list_empty(&ci->i_cap_flush_list)) {
u64 oldest_flush_tid;
if (ci->i_flushing_caps) {
int delayed;
spin_lock(&mdsc->cap_dirty_lock); spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item, list_move_tail(&ci->i_flushing_item,
&cap->session->s_cap_flushing); &cap->session->s_cap_flushing);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock); spin_unlock(&mdsc->cap_dirty_lock);
__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
delayed = __kick_flushing_caps(mdsc, session, ci);
if (delayed) {
spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
}
} else { } else {
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
} }
@ -2487,12 +2486,11 @@ static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
{ {
if (!capsnap->need_flush && if (!capsnap->need_flush &&
!capsnap->writing && !capsnap->dirty_pages) { !capsnap->writing && !capsnap->dirty_pages) {
dout("dropping cap_snap %p follows %llu\n", dout("dropping cap_snap %p follows %llu\n",
capsnap, capsnap->follows); capsnap, capsnap->follows);
BUG_ON(capsnap->cap_flush.tid > 0);
ceph_put_snap_context(capsnap->context); ceph_put_snap_context(capsnap->context);
list_del(&capsnap->ci_item); list_del(&capsnap->ci_item);
list_del(&capsnap->flushing_item);
ceph_put_cap_snap(capsnap); ceph_put_cap_snap(capsnap);
return 1; return 1;
} }
@ -2891,13 +2889,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
fill_inline = true; fill_inline = true;
} }
spin_unlock(&ci->i_ceph_lock);
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
kick_flushing_inode_caps(mdsc, session, inode);
up_read(&mdsc->snap_rwsem);
if (newcaps & ~issued) if (newcaps & ~issued)
wake = true; wake = true;
kick_flushing_inode_caps(mdsc, session, inode);
up_read(&mdsc->snap_rwsem);
} else {
spin_unlock(&ci->i_ceph_lock);
} }
if (fill_inline) if (fill_inline)
@ -2951,6 +2949,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
if (cf->tid == flush_tid) if (cf->tid == flush_tid)
cleaned = cf->caps; cleaned = cf->caps;
if (cf->caps == 0) /* capsnap */
continue;
if (cf->tid <= flush_tid) { if (cf->tid <= flush_tid) {
list_del(&cf->i_list); list_del(&cf->i_list);
list_add_tail(&cf->i_list, &to_remove); list_add_tail(&cf->i_list, &to_remove);
@ -2985,13 +2985,16 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
} }
if (ci->i_flushing_caps == 0) { if (ci->i_flushing_caps == 0) {
list_del_init(&ci->i_flushing_item); if (list_empty(&ci->i_cap_flush_list)) {
if (!list_empty(&session->s_cap_flushing)) list_del_init(&ci->i_flushing_item);
dout(" mds%d still flushing cap on %p\n", if (!list_empty(&session->s_cap_flushing)) {
session->s_mds, dout(" mds%d still flushing cap on %p\n",
&list_entry(session->s_cap_flushing.next, session->s_mds,
struct ceph_inode_info, &list_first_entry(&session->s_cap_flushing,
i_flushing_item)->vfs_inode); struct ceph_inode_info,
i_flushing_item)->vfs_inode);
}
}
mdsc->num_cap_flushing--; mdsc->num_cap_flushing--;
dout(" inode %p now !flushing\n", inode); dout(" inode %p now !flushing\n", inode);
@ -3039,7 +3042,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows); u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap; struct ceph_cap_snap *capsnap;
int drop = 0; int flushed = 0;
dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
inode, ci, session->s_mds, follows); inode, ci, session->s_mds, follows);
@ -3047,30 +3050,47 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->follows == follows) { if (capsnap->follows == follows) {
if (capsnap->flush_tid != flush_tid) { if (capsnap->cap_flush.tid != flush_tid) {
dout(" cap_snap %p follows %lld tid %lld !=" dout(" cap_snap %p follows %lld tid %lld !="
" %lld\n", capsnap, follows, " %lld\n", capsnap, follows,
flush_tid, capsnap->flush_tid); flush_tid, capsnap->cap_flush.tid);
break; break;
} }
WARN_ON(capsnap->dirty_pages || capsnap->writing); flushed = 1;
dout(" removing %p cap_snap %p follows %lld\n",
inode, capsnap, follows);
ceph_put_snap_context(capsnap->context);
list_del(&capsnap->ci_item);
list_del(&capsnap->flushing_item);
ceph_put_cap_snap(capsnap);
wake_up_all(&mdsc->cap_flushing_wq);
drop = 1;
break; break;
} else { } else {
dout(" skipping cap_snap %p follows %lld\n", dout(" skipping cap_snap %p follows %lld\n",
capsnap, capsnap->follows); capsnap, capsnap->follows);
} }
} }
if (flushed) {
u64 oldest_flush_tid;
WARN_ON(capsnap->dirty_pages || capsnap->writing);
dout(" removing %p cap_snap %p follows %lld\n",
inode, capsnap, follows);
list_del(&capsnap->ci_item);
list_del(&capsnap->cap_flush.i_list);
spin_lock(&mdsc->cap_dirty_lock);
if (list_empty(&ci->i_cap_flush_list))
list_del_init(&ci->i_flushing_item);
list_del(&capsnap->cap_flush.g_list);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
if (oldest_flush_tid == 0 || oldest_flush_tid > flush_tid)
wake_up_all(&mdsc->cap_flushing_wq);
spin_unlock(&mdsc->cap_dirty_lock);
wake_up_all(&ci->i_cap_wq);
}
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (drop) if (flushed) {
ceph_put_snap_context(capsnap->context);
ceph_put_cap_snap(capsnap);
iput(inode); iput(inode);
}
} }
/* /*
@ -3175,7 +3195,8 @@ retry:
tcap->implemented |= issued; tcap->implemented |= issued;
if (cap == ci->i_auth_cap) if (cap == ci->i_auth_cap)
ci->i_auth_cap = tcap; ci->i_auth_cap = tcap;
if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { if (!list_empty(&ci->i_cap_flush_list) &&
ci->i_auth_cap == tcap) {
spin_lock(&mdsc->cap_dirty_lock); spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item, list_move_tail(&ci->i_flushing_item,
&tcap->session->s_cap_flushing); &tcap->session->s_cap_flushing);

View File

@ -472,7 +472,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_iterator = NULL; s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases); INIT_LIST_HEAD(&s->s_cap_releases);
INIT_LIST_HEAD(&s->s_cap_flushing); INIT_LIST_HEAD(&s->s_cap_flushing);
INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
dout("register_session mds%d\n", mds); dout("register_session mds%d\n", mds);
if (mds >= mdsc->max_sessions) { if (mds >= mdsc->max_sessions) {
@ -1479,21 +1478,6 @@ static int trim_caps(struct ceph_mds_client *mdsc,
return 0; return 0;
} }
static int check_capsnap_flush(struct ceph_inode_info *ci,
u64 want_snap_seq)
{
int ret = 1;
spin_lock(&ci->i_ceph_lock);
if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
struct ceph_cap_snap *capsnap =
list_first_entry(&ci->i_cap_snaps,
struct ceph_cap_snap, ci_item);
ret = capsnap->follows >= want_snap_seq;
}
spin_unlock(&ci->i_ceph_lock);
return ret;
}
static int check_caps_flush(struct ceph_mds_client *mdsc, static int check_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid) u64 want_flush_tid)
{ {
@ -1520,54 +1504,9 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
* returns true if we've flushed through want_flush_tid * returns true if we've flushed through want_flush_tid
*/ */
static void wait_caps_flush(struct ceph_mds_client *mdsc, static void wait_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid, u64 want_snap_seq) u64 want_flush_tid)
{ {
int mds; dout("check_caps_flush want %llu\n", want_flush_tid);
dout("check_caps_flush want %llu snap want %llu\n",
want_flush_tid, want_snap_seq);
mutex_lock(&mdsc->mutex);
for (mds = 0; mds < mdsc->max_sessions; ) {
struct ceph_mds_session *session = mdsc->sessions[mds];
struct inode *inode = NULL;
if (!session) {
mds++;
continue;
}
get_session(session);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
if (!list_empty(&session->s_cap_snaps_flushing)) {
struct ceph_cap_snap *capsnap =
list_first_entry(&session->s_cap_snaps_flushing,
struct ceph_cap_snap,
flushing_item);
struct ceph_inode_info *ci = capsnap->ci;
if (!check_capsnap_flush(ci, want_snap_seq)) {
dout("check_cap_flush still flushing snap %p "
"follows %lld <= %lld to mds%d\n",
&ci->vfs_inode, capsnap->follows,
want_snap_seq, mds);
inode = igrab(&ci->vfs_inode);
}
}
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
if (inode) {
wait_event(mdsc->cap_flushing_wq,
check_capsnap_flush(ceph_inode(inode),
want_snap_seq));
iput(inode);
} else {
mds++;
}
mutex_lock(&mdsc->mutex);
}
mutex_unlock(&mdsc->mutex);
wait_event(mdsc->cap_flushing_wq, wait_event(mdsc->cap_flushing_wq,
check_caps_flush(mdsc, want_flush_tid)); check_caps_flush(mdsc, want_flush_tid));
@ -3584,7 +3523,7 @@ restart:
void ceph_mdsc_sync(struct ceph_mds_client *mdsc) void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{ {
u64 want_tid, want_flush, want_snap; u64 want_tid, want_flush;
if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return; return;
@ -3599,15 +3538,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
want_flush = mdsc->last_cap_flush_tid; want_flush = mdsc->last_cap_flush_tid;
spin_unlock(&mdsc->cap_dirty_lock); spin_unlock(&mdsc->cap_dirty_lock);
down_read(&mdsc->snap_rwsem); dout("sync want tid %lld flush_seq %lld\n",
want_snap = mdsc->last_snap_seq; want_tid, want_flush);
up_read(&mdsc->snap_rwsem);
dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
want_tid, want_flush, want_snap);
wait_unsafe_requests(mdsc, want_tid); wait_unsafe_requests(mdsc, want_tid);
wait_caps_flush(mdsc, want_flush, want_snap); wait_caps_flush(mdsc, want_flush);
} }
/* /*

View File

@ -152,7 +152,6 @@ struct ceph_mds_session {
/* protected by mutex */ /* protected by mutex */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */
struct list_head s_cap_snaps_flushing;
unsigned long s_renew_requested; /* last time we sent a renew req */ unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq; u64 s_renew_seq;

View File

@ -520,9 +520,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
ihold(inode); ihold(inode);
atomic_set(&capsnap->nref, 1); atomic_set(&capsnap->nref, 1);
capsnap->ci = ci;
INIT_LIST_HEAD(&capsnap->ci_item); INIT_LIST_HEAD(&capsnap->ci_item);
INIT_LIST_HEAD(&capsnap->flushing_item);
capsnap->follows = old_snapc->seq; capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->issued = __ceph_caps_issued(ci, NULL);
@ -800,7 +798,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
ihold(inode); ihold(inode);
spin_unlock(&mdsc->snap_flush_lock); spin_unlock(&mdsc->snap_flush_lock);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
__ceph_flush_snaps(ci, &session, 0); __ceph_flush_snaps(ci, &session);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
iput(inode); iput(inode);
spin_lock(&mdsc->snap_flush_lock); spin_lock(&mdsc->snap_flush_lock);

View File

@ -147,6 +147,13 @@ struct ceph_cap {
#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ #define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ #define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
struct ceph_cap_flush {
u64 tid;
int caps; /* 0 means capsnap */
struct list_head g_list; // global
struct list_head i_list; // per inode
};
/* /*
* Snapped cap state that is pending flush to mds. When a snapshot occurs, * Snapped cap state that is pending flush to mds. When a snapshot occurs,
* we first complete any in-process sync writes and writeback any dirty * we first complete any in-process sync writes and writeback any dirty
@ -154,10 +161,11 @@ struct ceph_cap {
*/ */
struct ceph_cap_snap { struct ceph_cap_snap {
atomic_t nref; atomic_t nref;
struct ceph_inode_info *ci; struct list_head ci_item;
struct list_head ci_item, flushing_item;
u64 follows, flush_tid; struct ceph_cap_flush cap_flush;
u64 follows;
int issued, dirty; int issued, dirty;
struct ceph_snap_context *context; struct ceph_snap_context *context;
@ -186,13 +194,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
} }
} }
struct ceph_cap_flush {
u64 tid;
int caps;
struct list_head g_list; // global
struct list_head i_list; // per inode
};
/* /*
* The frag tree describes how a directory is fragmented, potentially across * The frag tree describes how a directory is fragmented, potentially across
* multiple metadata servers. It is also used to indicate points where * multiple metadata servers. It is also used to indicate points where
@ -888,8 +889,7 @@ extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc); struct ceph_snap_context *snapc);
extern void __ceph_flush_snaps(struct ceph_inode_info *ci, extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession, struct ceph_mds_session **psession);
int again);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session); struct ceph_mds_session *session);
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);