1
0
mirror of https://github.com/samba-team/samba.git synced 2025-01-08 21:18:16 +03:00

lib/tsocket: avoid endless cpu-spinning in tstream_bsd_fde_handler()

There were some reports that strace output an LDAP server socket is in
CLOSE_WAIT state, returning EAGAIN for writev over and over (after a call to
epoll() each time).

In the tstream_bsd code the problem happens when we have a pending
writev_send, while there's no readv_send pending. In that case
we still ask for TEVENT_FD_READ in order to notice connection errors
early, so we try to call writev even if the socket doesn't report TEVENT_FD_WRITE.
And there are situations where we do that over and over again.

It happens like this with a Linux kernel:

    tcp_fin() has this:
        struct tcp_sock *tp = tcp_sk(sk);

        inet_csk_schedule_ack(sk);

        sk->sk_shutdown |= RCV_SHUTDOWN;
        sock_set_flag(sk, SOCK_DONE);

        switch (sk->sk_state) {
        case TCP_SYN_RECV:
        case TCP_ESTABLISHED:
                /* Move to CLOSE_WAIT */
                tcp_set_state(sk, TCP_CLOSE_WAIT);
                inet_csk_enter_pingpong_mode(sk);
                break;

It means RCV_SHUTDOWN gets set as well as TCP_CLOSE_WAIT, but
sk->sk_err is not changed to indicate an error.

    tcp_sendmsg_locked has this:
    ...
        err = -EPIPE;
        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                goto do_error;

        while (msg_data_left(msg)) {
                int copy = 0;

                skb = tcp_write_queue_tail(sk);
                if (skb)
                        copy = size_goal - skb->len;

                if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
                        bool first_skb;

    new_segment:
                        if (!sk_stream_memory_free(sk))
                                goto wait_for_space;

    ...

    wait_for_space:
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                if (copied)
                        tcp_push(sk, flags & ~MSG_MORE, mss_now,
                                 TCP_NAGLE_PUSH, size_goal);

                err = sk_stream_wait_memory(sk, &timeo);
                if (err != 0)
                        goto do_error;

It means if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) doesn't
hit as we only have RCV_SHUTDOWN and sk_stream_wait_memory returns
-EAGAIN.

    tcp_poll has this:

        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

So we'll get EPOLLIN | EPOLLRDNORM | EPOLLRDHUP triggering
TEVENT_FD_READ and writev/sendmsg keeps getting EAGAIN.

So we need to always clear TEVENT_FD_READ if we don't
have readable handler in order to avoid burning cpu.
But we turn it on again after a timeout of 1 second
in order to monitor the error state of the connection.

And now that our tsocket_bsd_error() helper checks for POLLRDHUP,
we can check if the socket is in an error state before calling the
writable handler when TEVENT_FD_READ was reported.
Only on error we'll call the writable handler, which will pick
the error without calling writev().

BUG: https://bugzilla.samba.org/show_bug.cgi?id=15202

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Reviewed-by: Ralph Boehme <slow@samba.org>
This commit is contained in:
Stefan Metzmacher 2022-10-12 17:26:16 +02:00
parent 4c7e2b9b60
commit e232ba946f
2 changed files with 116 additions and 11 deletions

View File

@ -1754,6 +1754,9 @@ struct tstream_bsd {
void (*readable_handler)(void *private_data);
void *writeable_private;
void (*writeable_handler)(void *private_data);
struct tevent_context *error_ctx;
struct tevent_timer *error_timer;
};
bool tstream_bsd_optimize_readv(struct tstream_context *stream,
@ -1775,6 +1778,28 @@ bool tstream_bsd_optimize_readv(struct tstream_context *stream,
return old;
}
static void tstream_bsd_error_timer(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval current_time,
void *private_data)
{
struct tstream_bsd *bsds =
talloc_get_type(private_data,
struct tstream_bsd);
TALLOC_FREE(bsds->error_timer);
/*
* Turn on TEVENT_FD_READABLE() again
* if we have a writeable_handler that
* wants to monitor the connection
* for errors.
*/
if (bsds->writeable_handler != NULL) {
TEVENT_FD_READABLE(bsds->fde);
}
}
static void tstream_bsd_fde_handler(struct tevent_context *ev,
struct tevent_fd *fde,
uint16_t flags,
@ -1789,11 +1814,74 @@ static void tstream_bsd_fde_handler(struct tevent_context *ev,
}
if (flags & TEVENT_FD_READ) {
if (!bsds->readable_handler) {
if (bsds->writeable_handler) {
struct timeval recheck_time;
/*
* In order to avoid cpu-spinning
* we no longer want to get TEVENT_FD_READ
*/
TEVENT_FD_NOT_READABLE(bsds->fde);
if (!bsds->writeable_handler) {
return;
}
/*
* If we have a writeable handler we
* want that to report connection errors
* early.
*
* So we check if the socket is in an
* error state.
*/
if (bsds->error == 0) {
int ret = tsocket_bsd_error(bsds->fd);
if (ret == -1) {
bsds->error = errno;
}
}
if (bsds->error != 0) {
/*
* Let the writeable handler report the error
*/
bsds->writeable_handler(bsds->writeable_private);
return;
}
/*
* Here we called TEVENT_FD_NOT_READABLE() without
* calling into the writeable handler.
*
* So we may have to wait for the kernels tcp stack
* to report TEVENT_FD_WRITE in order to let
* make progress and turn on TEVENT_FD_READABLE()
* again.
*
* As a fallback we use a timer that turns on
* TEVENT_FD_READABLE() again after a timeout of
* 1 second.
*/
if (bsds->error_timer != NULL) {
return;
}
recheck_time = timeval_current_ofs(1, 0);
bsds->error_timer = tevent_add_timer(bsds->error_ctx,
bsds,
recheck_time,
tstream_bsd_error_timer,
bsds);
if (bsds->error_timer == NULL) {
bsds->error = ENOMEM;
/*
* Let the writeable handler report the error
*/
bsds->writeable_handler(bsds->writeable_private);
return;
}
TEVENT_FD_NOT_READABLE(bsds->fde);
return;
}
bsds->readable_handler(bsds->readable_private);
@ -1848,6 +1936,8 @@ static int tstream_bsd_set_readable_handler(struct tstream_bsd *bsds,
TEVENT_FD_READABLE(bsds->fde);
}
TALLOC_FREE(bsds->error_timer);
bsds->readable_handler = handler;
bsds->readable_private = private_data;
@ -1870,7 +1960,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
bsds->writeable_handler = NULL;
bsds->writeable_private = NULL;
TEVENT_FD_NOT_WRITEABLE(bsds->fde);
TALLOC_FREE(bsds->error_timer);
bsds->error_ctx = NULL;
return 0;
}
@ -1882,6 +1973,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
}
bsds->event_ptr = NULL;
TALLOC_FREE(bsds->fde);
TALLOC_FREE(bsds->error_timer);
bsds->error_ctx = NULL;
}
if (tevent_fd_get_flags(bsds->fde) == 0) {
@ -1907,6 +2000,7 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
bsds->writeable_handler = handler;
bsds->writeable_private = private_data;
bsds->error_ctx = ev;
return 0;
}
@ -2212,7 +2306,14 @@ static void tstream_bsd_writev_handler(void *private_data)
}
err = tsocket_bsd_error_from_errno(ret, errno, &retry);
if (retry) {
/* retry later */
/*
* retry later...
*
* make sure we also wait readable again
* in order to notice errors early
*/
TEVENT_FD_READABLE(bsds->fde);
TALLOC_FREE(bsds->error_timer);
return;
}
if (err != 0) {
@ -2238,7 +2339,13 @@ static void tstream_bsd_writev_handler(void *private_data)
}
if (state->count > 0) {
/* we have more to read */
/*
* we have more to write
*
* make sure we also wait readable again
* in order to notice errors early
*/
TEVENT_FD_READABLE(bsds->fde);
return;
}
@ -2286,6 +2393,8 @@ static struct tevent_req *tstream_bsd_disconnect_send(TALLOC_CTX *mem_ctx,
goto post;
}
TALLOC_FREE(bsds->error_timer);
bsds->error_ctx = NULL;
TALLOC_FREE(bsds->fde);
ret = close(bsds->fd);
bsds->fd = -1;
@ -2328,6 +2437,8 @@ static const struct tstream_context_ops tstream_bsd_ops = {
static int tstream_bsd_destructor(struct tstream_bsd *bsds)
{
TALLOC_FREE(bsds->error_timer);
bsds->error_ctx = NULL;
TALLOC_FREE(bsds->fde);
if (bsds->fd != -1) {
close(bsds->fd);

View File

@ -1,6 +0,0 @@
^samba.unittests.tsocket_tstream.test_tstream_disconnected_tcp_client_spin
^samba.unittests.tsocket_tstream.test_tstream_disconnected_unix_client_spin
^samba.unittests.tsocket_tstream.test_tstream_more_tcp_client_spin
^samba.unittests.tsocket_tstream.test_tstream_more_unix_client_spin
^samba.unittests.tsocket_tstream.test_tstream_more_disconnect_tcp_client_spin
^samba.unittests.tsocket_tstream.test_tstream_more_disconnect_unix_client_spin