diff --git a/source3/include/proto.h b/source3/include/proto.h index 1a1f8bef69e..ceea97bf56c 100644 --- a/source3/include/proto.h +++ b/source3/include/proto.h @@ -6899,7 +6899,15 @@ void release_file_oplock(files_struct *fsp); bool remove_oplock(files_struct *fsp); bool downgrade_oplock(files_struct *fsp); void reply_to_oplock_break_requests(files_struct *fsp); -void release_level_2_oplocks_on_change(files_struct *fsp); +void process_oplock_async_level2_break_message(struct messaging_context *msg_ctx, + void *private_data, + uint32_t msg_type, + struct server_id src, + DATA_BLOB *data); +void contend_level2_oplocks_begin(files_struct *fsp, + enum level2_contention_type type); +void contend_level2_oplocks_end(files_struct *fsp, + enum level2_contention_type type); void share_mode_entry_to_message(char *msg, const struct share_mode_entry *e); void message_to_share_mode_entry(struct share_mode_entry *e, char *msg); bool init_oplocks(struct messaging_context *msg_ctx); diff --git a/source3/include/smb.h b/source3/include/smb.h index 96cd3b7d853..557489f211e 100644 --- a/source3/include/smb.h +++ b/source3/include/smb.h @@ -1695,6 +1695,16 @@ struct kernel_oplocks { void *private_data; }; +enum level2_contention_type { + LEVEL2_CONTEND_ALLOC_SHRINK, + LEVEL2_CONTEND_ALLOC_GROW, + LEVEL2_CONTEND_SET_FILE_LEN, + LEVEL2_CONTEND_FILL_SPARSE, + LEVEL2_CONTEND_WRITE, + LEVEL2_CONTEND_WINDOWS_BRL, + LEVEL2_CONTEND_POSIX_BRL +}; + /* if a kernel does support oplocks then a structure of the following typee is used to describe how to interact with the kernel */ struct kernel_oplocks_ops { @@ -1702,6 +1712,10 @@ struct kernel_oplocks_ops { files_struct *fsp, int oplock_type); void (*release_oplock)(struct kernel_oplocks *ctx, files_struct *fsp, int oplock_type); + void (*contend_level2_oplocks_begin)(files_struct *fsp, + enum level2_contention_type type); + void (*contend_level2_oplocks_end)(files_struct *fsp, + enum level2_contention_type type); }; #include "smb_macros.h" diff --git a/source3/lib/dummysmbd.c b/source3/lib/dummysmbd.c index 5c624bdebfb..a41e6dc033a 100644 --- a/source3/lib/dummysmbd.c +++ b/source3/lib/dummysmbd.c @@ -66,3 +66,22 @@ struct messaging_context *smbd_messaging_context(void) { return NULL; } + +/** + * The following two functions need to be called from inside the low-level BRL + * code for oplocks correctness in smbd. Since other utility binaries also + * link in some of the brl code directly, these dummy functions are necessary + * to avoid needing to link in the oplocks code and its dependencies to all of + * the utility binaries. + */ +void contend_level2_oplocks_begin(files_struct *fsp, + enum level2_contention_type type) +{ + return; +} + +void contend_level2_oplocks_end(files_struct *fsp, + enum level2_contention_type type) +{ + return; +} diff --git a/source3/locking/brlock.c b/source3/locking/brlock.c index 032aaa56b61..76496d11fb3 100644 --- a/source3/locking/brlock.c +++ b/source3/locking/brlock.c @@ -311,6 +311,7 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck, unsigned int i; files_struct *fsp = br_lck->fsp; struct lock_struct *locks = br_lck->lock_data; + NTSTATUS status; for (i=0; i < br_lck->num_locks; i++) { /* Do any Windows or POSIX locks conflict ? */ @@ -327,6 +328,10 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck, #endif } + if (!IS_PENDING_LOCK(plock->lock_type)) { + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_WINDOWS_BRL); + } + /* We can get the Windows lock, now see if it needs to be mapped into a lower level POSIX one, and if so can we get it ? */ @@ -346,9 +351,11 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck, plock->context.smbpid = 0xFFFFFFFF; if (errno_ret == EACCES || errno_ret == EAGAIN) { - return NT_STATUS_FILE_LOCK_CONFLICT; + status = NT_STATUS_FILE_LOCK_CONFLICT; + goto fail; } else { - return map_nt_error_from_unix(errno); + status = map_nt_error_from_unix(errno); + goto fail; } } } @@ -356,7 +363,8 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck, /* no conflicts - add it to the list of locks */ locks = (struct lock_struct *)SMB_REALLOC(locks, (br_lck->num_locks + 1) * sizeof(*locks)); if (!locks) { - return NT_STATUS_NO_MEMORY; + status = NT_STATUS_NO_MEMORY; + goto fail; } memcpy(&locks[br_lck->num_locks], plock, sizeof(struct lock_struct)); @@ -365,6 +373,11 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck, br_lck->modified = True; return NT_STATUS_OK; + fail: + if (!IS_PENDING_LOCK(plock->lock_type)) { + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_WINDOWS_BRL); + } + return status; } /**************************************************************************** @@ -587,11 +600,13 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, struct byte_range_lock *br_lck, struct lock_struct *plock) { - unsigned int i, count; + unsigned int i, count, posix_count; struct lock_struct *locks = br_lck->lock_data; struct lock_struct *tp; bool lock_was_added = False; bool signal_pending_read = False; + bool break_oplocks = false; + NTSTATUS status; /* No zero-zero locks for POSIX. */ if (plock->start == 0 && plock->size == 0) { @@ -613,7 +628,7 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, return NT_STATUS_NO_MEMORY; } - count = 0; + count = posix_count = 0; for (i=0; i < br_lck->num_locks; i++) { struct lock_struct *curr_lock = &locks[i]; @@ -637,6 +652,8 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, memcpy(&tp[count], curr_lock, sizeof(struct lock_struct)); count++; } else { + unsigned int tmp_count; + /* POSIX conflict semantics are different. */ if (brl_conflict_posix(curr_lock, plock)) { /* Can't block ourselves with POSIX locks. */ @@ -648,10 +665,27 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, } /* Work out overlaps. */ - count += brlock_posix_split_merge(&tp[count], curr_lock, plock, &lock_was_added); + tmp_count += brlock_posix_split_merge(&tp[count], curr_lock, plock, &lock_was_added); + posix_count += tmp_count; + count += tmp_count; } } + /* + * Break oplocks while we hold a brl. Since lock() and unlock() calls + * are not symetric with POSIX semantics, we cannot guarantee our + * contend_level2_oplocks_begin/end calls will be acquired and + * released one-for-one as with Windows semantics. Therefore we only + * call contend_level2_oplocks_begin if this is the first POSIX brl on + * the file. + */ + break_oplocks = (!IS_PENDING_LOCK(plock->lock_type) && + posix_count == 0); + if (break_oplocks) { + contend_level2_oplocks_begin(br_lck->fsp, + LEVEL2_CONTEND_POSIX_BRL); + } + if (!lock_was_added) { memcpy(&tp[count], plock, sizeof(struct lock_struct)); count++; @@ -679,10 +713,12 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, if (errno_ret == EACCES || errno_ret == EAGAIN) { SAFE_FREE(tp); - return NT_STATUS_FILE_LOCK_CONFLICT; + status = NT_STATUS_FILE_LOCK_CONFLICT; + goto fail; } else { SAFE_FREE(tp); - return map_nt_error_from_unix(errno); + status = map_nt_error_from_unix(errno); + goto fail; } } } @@ -690,7 +726,8 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, /* Realloc so we don't leak entries per lock call. */ tp = (struct lock_struct *)SMB_REALLOC(tp, count * sizeof(*locks)); if (!tp) { - return NT_STATUS_NO_MEMORY; + status = NT_STATUS_NO_MEMORY; + goto fail; } br_lck->num_locks = count; SAFE_FREE(br_lck->lock_data); @@ -723,6 +760,12 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, } return NT_STATUS_OK; + fail: + if (break_oplocks) { + contend_level2_oplocks_end(br_lck->fsp, + LEVEL2_CONTEND_POSIX_BRL); + } + return status; } /**************************************************************************** @@ -881,6 +924,7 @@ static bool brl_unlock_windows(struct messaging_context *msg_ctx, } } + contend_level2_oplocks_end(br_lck->fsp, LEVEL2_CONTEND_WINDOWS_BRL); return True; } @@ -892,7 +936,7 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, struct byte_range_lock *br_lck, const struct lock_struct *plock) { - unsigned int i, j, count; + unsigned int i, j, count, posix_count; struct lock_struct *tp; struct lock_struct *locks = br_lck->lock_data; bool overlap_found = False; @@ -919,7 +963,7 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, return False; } - count = 0; + count = posix_count = 0; for (i = 0; i < br_lck->num_locks; i++) { struct lock_struct *lock = &locks[i]; struct lock_struct tmp_lock[3]; @@ -946,6 +990,7 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, /* No change in this lock. */ memcpy(&tp[count], &tmp_lock[0], sizeof(struct lock_struct)); count++; + posix_count++; } else { SMB_ASSERT(tmp_lock[0].lock_type == UNLOCK_LOCK); overlap_found = True; @@ -970,6 +1015,7 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, } } count++; + posix_count++; continue; } else { /* tmp_count == 3 - (we split a lock range in two). */ @@ -979,8 +1025,10 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, memcpy(&tp[count], &tmp_lock[0], sizeof(struct lock_struct)); count++; + posix_count++; memcpy(&tp[count], &tmp_lock[2], sizeof(struct lock_struct)); count++; + posix_count++; overlap_found = True; /* Optimisation... */ /* We know we're finished here as we can't overlap any @@ -1024,6 +1072,11 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, tp = NULL; } + if (posix_count == 0) { + contend_level2_oplocks_end(br_lck->fsp, + LEVEL2_CONTEND_POSIX_BRL); + } + br_lck->num_locks = count; SAFE_FREE(br_lck->lock_data); locks = tp; @@ -1276,6 +1329,7 @@ void brl_close_fnum(struct messaging_context *msg_ctx, struct lock_struct *locks = br_lck->lock_data; struct server_id pid = procid_self(); bool unlock_individually = False; + bool posix_level2_contention_ended = false; if(lp_posix_locking(fsp->conn->params)) { @@ -1346,8 +1400,17 @@ void brl_close_fnum(struct messaging_context *msg_ctx, if ((lock->lock_flav == WINDOWS_LOCK) && (lock->fnum == fnum)) { del_this_lock = True; num_deleted_windows_locks++; + contend_level2_oplocks_end(br_lck->fsp, + LEVEL2_CONTEND_WINDOWS_BRL); } else if (lock->lock_flav == POSIX_LOCK) { del_this_lock = True; + + /* Only end level2 contention once for posix */ + if (!posix_level2_contention_ended) { + posix_level2_contention_ended = true; + contend_level2_oplocks_end(br_lck->fsp, + LEVEL2_CONTEND_POSIX_BRL); + } } } diff --git a/source3/smbd/aio.c b/source3/smbd/aio.c index 75ce07d41a9..6b19e098e5c 100644 --- a/source3/smbd/aio.c +++ b/source3/smbd/aio.c @@ -291,7 +291,9 @@ bool schedule_aio_write_and_X(connection_struct *conn, aio_ex->req = talloc_move(aio_ex, &req); - release_level_2_oplocks_on_change(fsp); + /* This should actually be improved to span the write. */ + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_WRITE); + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_WRITE); if (!write_through && !lp_syncalways(SNUM(fsp->conn)) && fsp->aio_write_behind) { diff --git a/source3/smbd/fileio.c b/source3/smbd/fileio.c index 3e3f0943b3d..a9a97a2d14a 100644 --- a/source3/smbd/fileio.c +++ b/source3/smbd/fileio.c @@ -318,7 +318,9 @@ ssize_t write_file(struct smb_request *req, * the shared memory area whilst doing this. */ - release_level_2_oplocks_on_change(fsp); + /* This should actually be improved to span the write. */ + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_WRITE); + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_WRITE); #ifdef WITH_PROFILE if (profile_p && profile_p->writecache_total_writes % 500 == 0) { diff --git a/source3/smbd/oplock.c b/source3/smbd/oplock.c index 0945ac66776..4c84fd41ede 100644 --- a/source3/smbd/oplock.c +++ b/source3/smbd/oplock.c @@ -329,7 +329,7 @@ static void add_oplock_timeout_handler(files_struct *fsp) the client for LEVEL2. *******************************************************************/ -static void process_oplock_async_level2_break_message(struct messaging_context *msg_ctx, +void process_oplock_async_level2_break_message(struct messaging_context *msg_ctx, void *private_data, uint32_t msg_type, struct server_id src, @@ -699,7 +699,8 @@ static void process_open_retry_message(struct messaging_context *msg_ctx, none. ****************************************************************************/ -void release_level_2_oplocks_on_change(files_struct *fsp) +static void contend_level2_oplocks_begin_default(files_struct *fsp, + enum level2_contention_type type) { int i; struct share_mode_lock *lck; @@ -799,6 +800,26 @@ void release_level_2_oplocks_on_change(files_struct *fsp) TALLOC_FREE(lck); } +void contend_level2_oplocks_begin(files_struct *fsp, + enum level2_contention_type type) +{ + if (koplocks && koplocks->ops->contend_level2_oplocks_begin) { + koplocks->ops->contend_level2_oplocks_begin(fsp, type); + return; + } + + contend_level2_oplocks_begin_default(fsp, type); +} + +void contend_level2_oplocks_end(files_struct *fsp, + enum level2_contention_type type) +{ + /* Only kernel oplocks implement this so far */ + if (koplocks && koplocks->ops->contend_level2_oplocks_end) { + koplocks->ops->contend_level2_oplocks_end(fsp, type); + } +} + /**************************************************************************** Linearize a share mode entry struct to an internal oplock break message. ****************************************************************************/ diff --git a/source3/smbd/oplock_irix.c b/source3/smbd/oplock_irix.c index 23b2fa90813..bbc9132a08f 100644 --- a/source3/smbd/oplock_irix.c +++ b/source3/smbd/oplock_irix.c @@ -270,8 +270,10 @@ static void irix_oplocks_read_fde_handler(struct event_context *ev, ****************************************************************************/ static const struct kernel_oplocks_ops irix_koplocks = { - .set_oplock = irix_set_kernel_oplock, - .release_oplock = irix_release_kernel_oplock, + .set_oplock = irix_set_kernel_oplock, + .release_oplock = irix_release_kernel_oplock, + .contend_level2_oplocks_begin = NULL, + .contend_level2_oplocks_end = NULL, }; struct kernel_oplocks *irix_init_kernel_oplocks(TALLOC_CTX *mem_ctx) diff --git a/source3/smbd/oplock_linux.c b/source3/smbd/oplock_linux.c index 5840ff0851c..273fbfdc01e 100644 --- a/source3/smbd/oplock_linux.c +++ b/source3/smbd/oplock_linux.c @@ -179,8 +179,10 @@ static bool linux_oplocks_available(void) ****************************************************************************/ static const struct kernel_oplocks_ops linux_koplocks = { - .set_oplock = linux_set_kernel_oplock, - .release_oplock = linux_release_kernel_oplock, + .set_oplock = linux_set_kernel_oplock, + .release_oplock = linux_release_kernel_oplock, + .contend_level2_oplocks_begin = NULL, + .contend_level2_oplocks_end = NULL, }; struct kernel_oplocks *linux_init_kernel_oplocks(TALLOC_CTX *mem_ctx) diff --git a/source3/smbd/reply.c b/source3/smbd/reply.c index 60e2e5dc7a5..151f9d0827d 100644 --- a/source3/smbd/reply.c +++ b/source3/smbd/reply.c @@ -3010,8 +3010,6 @@ void reply_lockread(struct smb_request *req) return; } - release_level_2_oplocks_on_change(fsp); - numtoread = SVAL(req->vwv+1, 0); startpos = IVAL_TO_SMB_OFF_T(req->vwv+2, 0); @@ -4497,8 +4495,6 @@ void reply_lock(struct smb_request *req) return; } - release_level_2_oplocks_on_change(fsp); - count = (uint64_t)IVAL(req->vwv+1, 0); offset = (uint64_t)IVAL(req->vwv+3, 0); @@ -6870,13 +6866,6 @@ void reply_lockingX(struct smb_request *req) } } - /* - * We do this check *after* we have checked this is not a oplock break - * response message. JRA. - */ - - release_level_2_oplocks_on_change(fsp); - if (req->buflen < (num_ulocks + num_locks) * (large_file_format ? 20 : 10)) { reply_nterror(req, NT_STATUS_INVALID_PARAMETER); diff --git a/source3/smbd/vfs.c b/source3/smbd/vfs.c index 515e557f67e..8d82ca550c5 100644 --- a/source3/smbd/vfs.c +++ b/source3/smbd/vfs.c @@ -514,8 +514,6 @@ int vfs_allocate_file_space(files_struct *fsp, uint64_t len) uint64_t space_avail; uint64_t bsize,dfree,dsize; - release_level_2_oplocks_on_change(fsp); - /* * Actually try and commit the space on disk.... */ @@ -541,15 +539,23 @@ int vfs_allocate_file_space(files_struct *fsp, uint64_t len) DEBUG(10,("vfs_allocate_file_space: file %s, shrink. Current size %.0f\n", fsp->fsp_name, (double)st.st_size )); + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_ALLOC_SHRINK); + flush_write_cache(fsp, SIZECHANGE_FLUSH); if ((ret = SMB_VFS_FTRUNCATE(fsp, (SMB_OFF_T)len)) != -1) { set_filelen_write_cache(fsp, len); } + + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_ALLOC_SHRINK); + return ret; } /* Grow - we need to test if we have enough space. */ + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_ALLOC_GROW); + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_ALLOC_GROW); + if (!lp_strict_allocate(SNUM(fsp->conn))) return 0; @@ -581,7 +587,8 @@ int vfs_set_filelen(files_struct *fsp, SMB_OFF_T len) { int ret; - release_level_2_oplocks_on_change(fsp); + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_SET_FILE_LEN); + DEBUG(10,("vfs_set_filelen: ftruncate %s to len %.0f\n", fsp->fsp_name, (double)len)); flush_write_cache(fsp, SIZECHANGE_FLUSH); if ((ret = SMB_VFS_FTRUNCATE(fsp, len)) != -1) { @@ -592,6 +599,8 @@ int vfs_set_filelen(files_struct *fsp, SMB_OFF_T len) fsp->fsp_name); } + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_SET_FILE_LEN); + return ret; } @@ -613,7 +622,6 @@ int vfs_fill_sparse(files_struct *fsp, SMB_OFF_T len) size_t num_to_write; ssize_t pwrite_ret; - release_level_2_oplocks_on_change(fsp); ret = SMB_VFS_FSTAT(fsp, &st); if (ret == -1) { return ret; @@ -626,13 +634,16 @@ int vfs_fill_sparse(files_struct *fsp, SMB_OFF_T len) DEBUG(10,("vfs_fill_sparse: write zeros in file %s from len %.0f to len %.0f (%.0f bytes)\n", fsp->fsp_name, (double)st.st_size, (double)len, (double)(len - st.st_size))); + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_FILL_SPARSE); + flush_write_cache(fsp, SIZECHANGE_FLUSH); if (!sparse_buf) { sparse_buf = SMB_CALLOC_ARRAY(char, SPARSE_BUF_WRITE_SIZE); if (!sparse_buf) { errno = ENOMEM; - return -1; + ret = -1; + goto out; } } @@ -647,17 +658,23 @@ int vfs_fill_sparse(files_struct *fsp, SMB_OFF_T len) if (pwrite_ret == -1) { DEBUG(10,("vfs_fill_sparse: SMB_VFS_PWRITE for file %s failed with error %s\n", fsp->fsp_name, strerror(errno) )); - return -1; + ret = -1; + goto out; } if (pwrite_ret == 0) { - return 0; + ret = 0; + goto out; } total += pwrite_ret; } set_filelen_write_cache(fsp, len); - return 0; + + ret = 0; + out: + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_FILL_SPARSE); + return ret; } /****************************************************************************