From affbc19a68f9966ad65a773db405f78e2bafc07b Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Tue, 5 May 2015 21:22:13 +0800
Subject: [PATCH] ceph: make sure syncfs flushes all cap snaps

Signed-off-by: Yan, Zheng
---
 fs/ceph/caps.c       | 18 ++++++----
 fs/ceph/mds_client.c | 86 +++++++++++++++++++++++++++++++-------------
 fs/ceph/mds_client.h |  1 +
 fs/ceph/snap.c       |  2 ++
 4 files changed, 76 insertions(+), 31 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 900c05fd77d8..bbd969e16a01 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1259,14 +1259,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  * asynchronously back to the MDS once sync writes complete and dirty
  * data is written out.
  *
- * Unless @again is true, skip cap_snaps that were already sent to
+ * Unless @kick is true, skip cap_snaps that were already sent to
  * the MDS (i.e., during this session).
  *
  * Called under i_ceph_lock.  Takes s_mutex as needed.
  */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
 			struct ceph_mds_session **psession,
-			int again)
+			int kick)
 		__releases(ci->i_ceph_lock)
 		__acquires(ci->i_ceph_lock)
 {
@@ -1307,7 +1307,7 @@ retry:
 		}
 
 		/* only flush each capsnap once */
-		if (!again && !list_empty(&capsnap->flushing_item)) {
+		if (!kick && !list_empty(&capsnap->flushing_item)) {
 			dout("already flushed %p, skipping\n", capsnap);
 			continue;
 		}
@@ -1317,6 +1317,9 @@ retry:
 
 		if (session && session->s_mds != mds) {
 			dout("oops, wrong session %p mutex\n", session);
+			if (kick)
+				goto out;
+
 			mutex_unlock(&session->s_mutex);
 			ceph_put_mds_session(session);
 			session = NULL;
@@ -1342,10 +1345,9 @@ retry:
 
 		capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
 		atomic_inc(&capsnap->nref);
-		if (!list_empty(&capsnap->flushing_item))
-			list_del_init(&capsnap->flushing_item);
-		list_add_tail(&capsnap->flushing_item,
-			      &session->s_cap_snaps_flushing);
+		if (list_empty(&capsnap->flushing_item))
+			list_add_tail(&capsnap->flushing_item,
+				      &session->s_cap_snaps_flushing);
 		spin_unlock(&ci->i_ceph_lock);
 
 		dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
@@ -2876,6 +2878,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 				     struct ceph_mds_session *session)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	u64 follows = le64_to_cpu(m->snap_follows);
 	struct ceph_cap_snap *capsnap;
 	int drop = 0;
@@ -2899,6 +2902,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 			list_del(&capsnap->ci_item);
 			list_del(&capsnap->flushing_item);
 			ceph_put_cap_snap(capsnap);
+			wake_up_all(&mdsc->cap_flushing_wq);
 			drop = 1;
 			break;
 		} else {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 88010f9a254d..2bb9264b9225 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1488,17 +1488,22 @@ out_unlocked:
 	return err;
 }
 
-static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
+static int check_cap_flush(struct ceph_inode_info *ci,
+			   u64 want_flush_seq, u64 want_snap_seq)
 {
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int ret;
+	int ret1 = 1, ret2 = 1;
 	spin_lock(&ci->i_ceph_lock);
-	if (ci->i_flushing_caps)
-		ret = ci->i_cap_flush_seq >= want_flush_seq;
-	else
-		ret = 1;
+	if (want_flush_seq > 0 && ci->i_flushing_caps)
+		ret1 = ci->i_cap_flush_seq >= want_flush_seq;
+
+	if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
+		struct ceph_cap_snap *capsnap =
+			list_first_entry(&ci->i_cap_snaps,
+					 struct ceph_cap_snap, ci_item);
+		ret2 = capsnap->follows >= want_snap_seq;
+	}
 	spin_unlock(&ci->i_ceph_lock);
-	return ret;
+	return ret1 && ret2;
 }
 
 /*
@@ -1506,44 +1511,71 @@ static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
  *
  * returns true if we've flushed through want_flush_seq
  */
-static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+static void wait_caps_flush(struct ceph_mds_client *mdsc,
+			    u64 want_flush_seq, u64 want_snap_seq)
 {
 	int mds;
 
 	dout("check_cap_flush want %lld\n", want_flush_seq);
 	mutex_lock(&mdsc->mutex);
-	for (mds = 0; mds < mdsc->max_sessions; mds++) {
+	for (mds = 0; mds < mdsc->max_sessions; ) {
 		struct ceph_mds_session *session = mdsc->sessions[mds];
-		struct inode *inode = NULL;
+		struct inode *inode1 = NULL, *inode2 = NULL;
 
-		if (!session)
+		if (!session) {
+			mds++;
 			continue;
+		}
 		get_session(session);
 		mutex_unlock(&mdsc->mutex);
 
 		mutex_lock(&session->s_mutex);
 		if (!list_empty(&session->s_cap_flushing)) {
 			struct ceph_inode_info *ci =
-				list_entry(session->s_cap_flushing.next,
-					   struct ceph_inode_info,
-					   i_flushing_item);
+				list_first_entry(&session->s_cap_flushing,
						 struct ceph_inode_info,
						 i_flushing_item);
 
-			if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
+			if (!check_cap_flush(ci, want_flush_seq, 0)) {
 				dout("check_cap_flush still flushing %p "
 				     "seq %lld <= %lld to mds%d\n",
 				     &ci->vfs_inode, ci->i_cap_flush_seq,
-				     want_flush_seq, session->s_mds);
-				inode = igrab(&ci->vfs_inode);
+				     want_flush_seq, mds);
+				inode1 = igrab(&ci->vfs_inode);
+			}
+		}
+		if (!list_empty(&session->s_cap_snaps_flushing)) {
+			struct ceph_cap_snap *capsnap =
+				list_first_entry(&session->s_cap_snaps_flushing,
+						 struct ceph_cap_snap,
+						 flushing_item);
+			struct ceph_inode_info *ci = capsnap->ci;
+			if (!check_cap_flush(ci, 0, want_snap_seq)) {
+				dout("check_cap_flush still flushing snap %p "
+				     "follows %lld <= %lld to mds%d\n",
+				     &ci->vfs_inode, capsnap->follows,
+				     want_snap_seq, mds);
+				inode2 = igrab(&ci->vfs_inode);
 			}
 		}
 		mutex_unlock(&session->s_mutex);
 		ceph_put_mds_session(session);
 
-		if (inode) {
+		if (inode1) {
 			wait_event(mdsc->cap_flushing_wq,
-				   check_cap_flush(inode, want_flush_seq));
-			iput(inode);
+				   check_cap_flush(ceph_inode(inode1),
+						   want_flush_seq, 0));
+			iput(inode1);
 		}
+		if (inode2) {
+			wait_event(mdsc->cap_flushing_wq,
+				   check_cap_flush(ceph_inode(inode2),
+						   0, want_snap_seq));
+			iput(inode2);
+		}
+
+		if (!inode1 && !inode2)
+			mds++;
 
 		mutex_lock(&mdsc->mutex);
 	}
@@ -3391,6 +3423,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	atomic_set(&mdsc->num_sessions, 0);
 	mdsc->max_sessions = 0;
 	mdsc->stopping = 0;
+	mdsc->last_snap_seq = 0;
 	init_rwsem(&mdsc->snap_rwsem);
 	mdsc->snap_realms = RB_ROOT;
 	INIT_LIST_HEAD(&mdsc->snap_empty);
@@ -3517,7 +3550,7 @@ restart:
 
 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
-	u64 want_tid, want_flush;
+	u64 want_tid, want_flush, want_snap;
 
 	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
 		return;
@@ -3532,10 +3565,15 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 	want_flush = mdsc->cap_flush_seq;
 	spin_unlock(&mdsc->cap_dirty_lock);
 
-	dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+	down_read(&mdsc->snap_rwsem);
+	want_snap = mdsc->last_snap_seq;
+	up_read(&mdsc->snap_rwsem);
+
+	dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
+	     want_tid, want_flush, want_snap);
 
 	wait_unsafe_requests(mdsc, want_tid);
-	wait_caps_flush(mdsc, want_flush);
+	wait_caps_flush(mdsc, want_flush, want_snap);
 }
 
 /*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d474141c034a..bf24d88cfeb2 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -290,6 +290,7 @@ struct ceph_mds_client {
 	 * references (implying they contain no inodes with caps) that
 	 * should be destroyed.
 	 */
+	u64			last_snap_seq;
 	struct rw_semaphore     snap_rwsem;
 	struct rb_root          snap_realms;
 	struct list_head        snap_empty;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ba708017d60b..233d906aec02 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -730,6 +730,8 @@ more:
 
 		/* queue realm for cap_snap creation */
 		list_add(&realm->dirty_item, &dirty_realms);
+		if (realm->seq > mdsc->last_snap_seq)
+			mdsc->last_snap_seq = realm->seq;
 
 		invalidate = 1;
 	} else if (!realm->cached_context) {
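
Note, separate from the patch itself: the heart of the change is the reworked
check_cap_flush() predicate, which syncfs now evaluates for both dirty caps and
pending cap snaps. The standalone C model below restates that predicate outside
the kernel so the two-sided condition is easy to see. struct inode_flush_state
and its fields are simplified stand-ins for ceph_inode_info (not real kernel
types), and passing 0 for either "want" value skips that half of the check,
mirroring the patched function; this is an illustrative sketch, not kernel code.

/*
 * Standalone model of the check_cap_flush() predicate added by this patch.
 * The struct is a simplified stand-in for ceph_inode_info and only keeps
 * the fields the predicate looks at.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct inode_flush_state {
	bool     flushing_caps;       /* models i_flushing_caps != 0          */
	uint64_t cap_flush_seq;       /* models ci->i_cap_flush_seq           */
	bool     has_cap_snaps;       /* models !list_empty(&ci->i_cap_snaps) */
	uint64_t oldest_snap_follows; /* follows of the oldest pending snap   */
};

/* true once both dirty caps and cap snaps are flushed far enough */
static bool check_cap_flush_model(const struct inode_flush_state *st,
				  uint64_t want_flush_seq,
				  uint64_t want_snap_seq)
{
	bool ret1 = true, ret2 = true;

	if (want_flush_seq > 0 && st->flushing_caps)
		ret1 = st->cap_flush_seq >= want_flush_seq;

	if (want_snap_seq > 0 && st->has_cap_snaps)
		ret2 = st->oldest_snap_follows >= want_snap_seq;

	return ret1 && ret2;
}

int main(void)
{
	/* cap flushing has caught up (seq 10), but a cap_snap from an older
	 * snapshot (follows 4 < want 5) is still pending */
	struct inode_flush_state st = {
		.flushing_caps = true, .cap_flush_seq = 10,
		.has_cap_snaps = true, .oldest_snap_follows = 4,
	};

	printf("caps only : %d\n", check_cap_flush_model(&st, 10, 0)); /* 1 */
	printf("caps+snaps: %d\n", check_cap_flush_model(&st, 10, 5)); /* 0 */

	/* as if a FLUSHSNAP ack removed the old cap_snap */
	st.oldest_snap_follows = 5;
	printf("after ack : %d\n", check_cap_flush_model(&st, 10, 5)); /* 1 */
	return 0;
}

Built with any C99 compiler, this prints 1, 0, 1: dirty caps alone satisfy the
old check, the pending cap_snap keeps the new combined check false (so syncfs
keeps waiting), and the ack that removes it lets the waiter proceed, which is
why handle_cap_flushsnap_ack() now calls wake_up_all() on cap_flushing_wq.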