5
0
mirror of git://git.proxmox.com/git/qemu-server.git synced 2024-12-22 13:34:06 +03:00

qmeventd: add last-ditch effort SIGKILL cleanup

'alarm' is used to schedule an additionaly cleanup round 5 seconds after
sending SIGTERM via terminate_client. This then sends SIGKILL via a
pidfd (if supported by the kernel) or directly via kill, making sure
that the QEMU process is *really* dead and won't be left behind in an
undetermined state.

This shouldn't be an issue under normal circumstances, but can help
avoid dead processes lying around if QEMU hangs after SIGTERM.

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
This commit is contained in:
Stefan Reiter 2020-10-19 14:18:37 +02:00 committed by Thomas Lamprecht
parent 3ff8500175
commit 4c500f1696
2 changed files with 112 additions and 1 deletions

View File

@ -63,6 +63,8 @@ static int verbose = 0;
static int epoll_fd = 0; static int epoll_fd = 0;
static const char *progname; static const char *progname;
GHashTable *vm_clients; // key=vmid (freed on remove), value=*Client (free manually) GHashTable *vm_clients; // key=vmid (freed on remove), value=*Client (free manually)
GSList *forced_cleanups;
volatile sig_atomic_t alarm_triggered = 0;
/* /*
* Helper functions * Helper functions
@ -468,8 +470,39 @@ terminate_client(struct Client *client)
client->state = STATE_TERMINATING; client->state = STATE_TERMINATING;
// open a pidfd before kill for later cleanup
int pidfd = pidfd_open(client->pid, 0);
if (pidfd < 0) {
switch (errno) {
case ESRCH:
// process already dead for some reason, cleanup done
VERBOSE_PRINT("%s: failed to open pidfd, process already dead (pid %d)\n",
client->qemu.vmid, client->pid);
return;
// otherwise fall back to just using the PID directly, but don't
// print if we only failed because we're running on an older kernel
case ENOSYS:
break;
default:
perror("failed to open QEMU pidfd for cleanup");
break;
}
}
int err = kill(client->pid, SIGTERM); int err = kill(client->pid, SIGTERM);
log_neg(err, "kill"); log_neg(err, "kill");
struct CleanupData *data_ptr = malloc(sizeof(struct CleanupData));
struct CleanupData data = {
.pid = client->pid,
.pidfd = pidfd
};
*data_ptr = data;
forced_cleanups = g_slist_prepend(forced_cleanups, (void *)data_ptr);
// resets any other alarms, but will fire eventually and cleanup all
alarm(5);
} }
void void
@ -545,6 +578,55 @@ handle_client(struct Client *client)
} }
/*
* SIGALRM and cleanup handling
*
* terminate_client will set an alarm for 5 seconds and add its client's PID to
* the forced_cleanups list - when the timer expires, we iterate the list and
* attempt to issue SIGKILL to all processes which haven't yet stopped.
*/
static void
alarm_handler(__attribute__((unused)) int signum)
{
alarm_triggered = 1;
}
static void
sigkill(void *ptr, __attribute__((unused)) void *unused)
{
struct CleanupData data = *((struct CleanupData *)ptr);
int err;
if (data.pidfd > 0) {
err = pidfd_send_signal(data.pidfd, SIGKILL, NULL, 0);
} else {
err = kill(data.pid, SIGKILL);
}
if (err < 0) {
if (errno != ESRCH) {
fprintf(stderr, "SIGKILL cleanup of pid '%d' failed - %s\n",
data.pid, strerror(errno));
}
} else {
fprintf(stderr, "cleanup failed, terminating pid '%d' with SIGKILL\n",
data.pid);
}
}
static void
handle_forced_cleanup()
{
if (alarm_triggered) {
alarm_triggered = 0;
g_slist_foreach(forced_cleanups, sigkill, NULL);
g_slist_free_full(forced_cleanups, free);
forced_cleanups = NULL;
}
}
int int
main(int argc, char *argv[]) main(int argc, char *argv[])
{ {
@ -577,6 +659,7 @@ main(int argc, char *argv[])
} }
signal(SIGCHLD, SIG_IGN); signal(SIGCHLD, SIG_IGN);
signal(SIGALRM, alarm_handler);
socket_path = argv[optind]; socket_path = argv[optind];
@ -612,7 +695,7 @@ main(int argc, char *argv[])
for(;;) { for(;;) {
nevents = epoll_wait(epoll_fd, events, 1, -1); nevents = epoll_wait(epoll_fd, events, 1, -1);
if (nevents < 0 && errno == EINTR) { if (nevents < 0 && errno == EINTR) {
// signal happened, try again handle_forced_cleanup();
continue; continue;
} }
bail_neg(nevents, "epoll_wait"); bail_neg(nevents, "epoll_wait");
@ -630,5 +713,7 @@ main(int argc, char *argv[])
handle_client((struct Client *)events[n].data.ptr); handle_client((struct Client *)events[n].data.ptr);
} }
} }
handle_forced_cleanup();
} }
} }

View File

@ -21,6 +21,15 @@
Author: Dominik Csapak <d.csapak@proxmox.com> Author: Dominik Csapak <d.csapak@proxmox.com>
*/ */
#include <sys/syscall.h>
#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434
#endif
#ifndef __NR_pidfd_send_signal
#define __NR_pidfd_send_signal 424
#endif
#define VERBOSE_PRINT(...) do { if (verbose) { printf(__VA_ARGS__); } } while (0) #define VERBOSE_PRINT(...) do { if (verbose) { printf(__VA_ARGS__); } } while (0)
static inline void log_neg(int errval, const char *msg) static inline void log_neg(int errval, const char *msg)
@ -38,6 +47,18 @@ static inline void bail_neg(int errval, const char *msg)
} }
} }
static inline int
pidfd_open(pid_t pid, unsigned int flags)
{
return syscall(__NR_pidfd_open, pid, flags);
}
static inline int
pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags)
{
return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
}
typedef enum { typedef enum {
CLIENT_NONE, CLIENT_NONE,
CLIENT_QEMU, CLIENT_QEMU,
@ -77,6 +98,11 @@ struct Client {
} vzdump; } vzdump;
}; };
struct CleanupData {
pid_t pid;
int pidfd;
};
void handle_qmp_handshake(struct Client *client); void handle_qmp_handshake(struct Client *client);
void handle_qmp_event(struct Client *client, struct json_object *obj); void handle_qmp_event(struct Client *client, struct json_object *obj);
void handle_qmp_return(struct Client *client, struct json_object *data, bool error); void handle_qmp_return(struct Client *client, struct json_object *data, bool error);