1
0
mirror of https://github.com/samba-team/samba.git synced 2025-03-24 10:50:22 +03:00

r14898: This change is an attempt to improve the quality of the information that

is produced when a process exits abnormally.

First, we coalesce the core dumping code so that we greatly improve our
odds of being able to produce a core file, even in the case of a memory
fault. I've removed duplicates of dump_core() and split it in two to
reduce the amount of work needed to actually do the dump.

Second, we refactor the exit_server code path to always log an explanation
and a stack trace. My goal is to always produce enough log information
for us to be able to explain any server exit, though there is a risk
that this could produce too much log information on a flaky network.

Finally, smbcontrol has gained a smbd fault injection operation to test
the changes above. This is only enabled for developer builds.
(This used to be commit 56bc02d64498eb3faf89f0c5452b9299daea8e95)
This commit is contained in:
James Peach 2006-04-04 00:27:50 +00:00 committed by Gerald (Jerry) Carter
parent f5e7376bca
commit 4fa5559800
14 changed files with 284 additions and 205 deletions

View File

@ -1470,6 +1470,8 @@ AC_LIBTESTFUNC(sec, bigcrypt)
AC_LIBTESTFUNC(security, getprpwnam)
AC_LIBTESTFUNC(sec, getprpwnam)
AC_CHECK_FUNCS(strsignal)
############################################
# Check if we have libattr
case "$host_os" in

View File

@ -208,4 +208,8 @@ extern BOOL *DEBUGLEVEL_CLASS_ISSET;
DEBUGLEVEL_CLASS[ DBGC_ALL ] >= (level)) ) \
&& (dbgtext body) )
/* Print a separator to the debug log. */
#define DEBUGSEP(level)\
DEBUG((level),("===============================================================\n"))
#endif

View File

@ -1562,5 +1562,9 @@ LDAP *ldap_open_with_timeout(const char *server, int port, unsigned int to);
#endif
void smb_panic( const char *why ) NORETURN_ATTRIBUTE ;
void exit_server(const char *reason) NORETURN_ATTRIBUTE ;
void dump_core(void) NORETURN_ATTRIBUTE ;
void exit_server(const char *const reason) NORETURN_ATTRIBUTE ;
void exit_server_cleanly(void) NORETURN_ATTRIBUTE ;
void exit_server_fault(void) NORETURN_ATTRIBUTE ;
#endif /* _INCLUDES_H */

View File

@ -69,6 +69,7 @@
#define MSG_SMB_OPEN_RETRY 3009
#define MSG_SMB_KERNEL_BREAK 3010
#define MSG_SMB_FILE_RENAME 3011
#define MSG_SMB_INJECT_FAULT 3012
/* winbind messages */
#define MSG_WINBIND_FINISHED 4001

View File

@ -20,7 +20,12 @@
#include "includes.h"
#ifdef HAVE_SYS_PRCTL_H
#include <sys/prctl.h>
#endif
static void (*cont_fn)(void *);
static pstring corepath;
/*******************************************************************
report a fault
@ -33,11 +38,11 @@ static void fault_report(int sig)
counter++;
DEBUG(0,("===============================================================\n"));
DEBUGSEP(0);
DEBUG(0,("INTERNAL ERROR: Signal %d in pid %d (%s)",sig,(int)sys_getpid(),SAMBA_VERSION_STRING));
DEBUG(0,("\nPlease read the Trouble-Shooting section of the Samba3-HOWTO\n"));
DEBUG(0,("\nFrom: http://www.samba.org/samba/docs/Samba3-HOWTO.pdf\n"));
DEBUG(0,("===============================================================\n"));
DEBUGSEP(0);
smb_panic("internal error");
@ -82,3 +87,91 @@ void fault_setup(void (*fn)(void *))
CatchSignal(SIGABRT,SIGNAL_CAST sig_fault);
#endif
}
/*******************************************************************
 Make all the preparations to safely dump a core file.

 Derives the core directory <log directory>/cores/<progname>, creates
 it with owner-only permissions and stores the path in corepath, which
 dump_core() later changes into. Also raises the soft core size limit
 to at least 16MB (never above the hard limit) and, where prctl()
 supports it, marks the process as dumpable.

 progname must not be NULL; it names the per-program subdirectory.
********************************************************************/
void dump_core_setup(const char *progname)
{
	pstring logbase;
	char * end;

	if (lp_logfile() && *lp_logfile()) {
		snprintf(logbase, sizeof(logbase), "%s", lp_logfile());
		/* Strip the file name, keeping only the directory part. */
		if ((end = strrchr_m(logbase, '/'))) {
			*end = '\0';
		}
	} else {
		/* We will end up here if the log file is given on the command
		 * line by the -l option but the "log file" option is not set
		 * in smb.conf.
		 */
		snprintf(logbase, sizeof(logbase), "%s", dyn_LOGFILEBASE);
	}

	SMB_ASSERT(progname != NULL);

	/* Create the directories one level at a time. Failures are
	 * deliberately ignored here: dump_core() refuses to dump if it
	 * cannot chdir() into corepath.
	 */
	snprintf(corepath, sizeof(corepath), "%s/cores", logbase);
	mkdir(corepath,0700);

	snprintf(corepath, sizeof(corepath), "%s/cores/%s",
		logbase, progname);
	mkdir(corepath,0700);

	sys_chown(corepath,getuid(),getgid());
	chmod(corepath,0700);

#ifdef HAVE_GETRLIMIT
#ifdef RLIMIT_CORE
	{
		struct rlimit rlp;

		/* Only touch the limits if we could actually read them,
		 * otherwise rlp would be used uninitialised.
		 */
		if (getrlimit(RLIMIT_CORE, &rlp) == 0) {
			rlp.rlim_cur = MAX(16*1024*1024,rlp.rlim_cur);

			/* setrlimit() rejects a soft limit that exceeds the
			 * hard limit, so settle for the hard limit instead
			 * of failing and leaving the soft limit untouched.
			 */
			if (rlp.rlim_max != RLIM_INFINITY &&
			    rlp.rlim_cur > rlp.rlim_max) {
				rlp.rlim_cur = rlp.rlim_max;
			}

			setrlimit(RLIMIT_CORE, &rlp);

			/* Re-read so the log shows what is really in effect. */
			if (getrlimit(RLIMIT_CORE, &rlp) == 0) {
				DEBUG(3,("Maximum core file size limits now %d(soft) %d(hard)\n",
					(int)rlp.rlim_cur,(int)rlp.rlim_max));
			}
		}
	}
#endif
#endif

#if defined(HAVE_PRCTL) && defined(PR_SET_DUMPABLE)
	/* On Linux we lose the ability to dump core when we change our user
	 * ID. We know how to dump core safely, so let's make sure we have our
	 * dumpable flag set.
	 */
	prctl(PR_SET_DUMPABLE, 1);
#endif

	/* FIXME: if we have a core-plus-pid facility, configurably set
	 * this up here.
	 */
}
/*******************************************************************
 Dump a core file and terminate the process. Does not return.

 If a core directory has been recorded in corepath, change into it
 first; if that fails the process exits with status 1 instead of
 dumping core somewhere else. Otherwise the debug log is flushed, any
 SIGABRT handler is reset to the default and abort() is called.
********************************************************************/
void dump_core(void)
{
	if (*corepath != '\0') {
		/* The chdir might fail if we dump core before we finish
		 * processing the config file.
		 */
		if (chdir(corepath) != 0) {
			/* Terminate the first part of the message so it does
			 * not run straight into the continuation text.
			 */
			DEBUG(0, ("unable to change to %s\n", corepath));
			DEBUGADD(0, ("refusing to dump core\n"));
			exit(1);
		}

		DEBUG(0,("dumping core in %s\n", corepath));
	}

	/* Files created from here on get owner-only permissions. */
	umask(~(0700));
	dbgflush();

	/* Ensure we don't have a signal handler for abort. */
#ifdef SIGABRT
	CatchSignal(SIGABRT,SIGNAL_CAST SIG_DFL);
#endif

	abort();
}

View File

@ -1545,19 +1545,10 @@ gid_t nametogid(const char *name)
Something really nasty happened - panic !
********************************************************************/
#ifdef HAVE_LIBEXC_H
#include <libexc.h>
#endif
static void smb_panic2(const char *why, BOOL decrement_pid_count )
void smb_panic(const char *const why)
{
char *cmd;
int result;
#ifdef HAVE_BACKTRACE_SYMBOLS
void *backtrace_stack[BACKTRACE_STACK_SIZE];
size_t backtrace_size;
char **backtrace_strings;
#endif
#ifdef DEVELOPER
{
@ -1570,9 +1561,12 @@ static void smb_panic2(const char *why, BOOL decrement_pid_count )
}
#endif
DEBUG(0,("PANIC (pid %llu): %s\n",
(unsigned long long)sys_getpid(), why));
log_stack_trace();
/* only smbd needs to decrement the smbd counter in connections.tdb */
if ( decrement_pid_count )
decrement_smbd_process_count();
decrement_smbd_process_count();
cmd = lp_panic_action();
if (cmd && *cmd) {
@ -1586,9 +1580,27 @@ static void smb_panic2(const char *why, BOOL decrement_pid_count )
DEBUG(0, ("smb_panic(): action returned status %d\n",
WEXITSTATUS(result)));
}
DEBUG(0,("PANIC: %s\n", why));
dump_core();
}
/*******************************************************************
Print a backtrace of the stack to the debug log. This function
DELIBERATELY LEAKS MEMORY. The expectation is that you should
exit shortly after calling it.
********************************************************************/
#ifdef HAVE_LIBEXC_H
#include <libexc.h>
#endif
void log_stack_trace(void)
{
#ifdef HAVE_BACKTRACE_SYMBOLS
void *backtrace_stack[BACKTRACE_STACK_SIZE];
size_t backtrace_size;
char **backtrace_strings;
/* get the backtrace (stack frames) */
backtrace_size = backtrace(backtrace_stack,BACKTRACE_STACK_SIZE);
backtrace_strings = backtrace_symbols(backtrace_stack, backtrace_size);
@ -1607,16 +1619,14 @@ static void smb_panic2(const char *why, BOOL decrement_pid_count )
#elif HAVE_LIBEXC
#define NAMESIZE 32 /* Arbitrary */
/* The IRIX libexc library provides an API for unwinding the stack. See
* libexc(3) for details. Apparently trace_back_stack leaks memory, but
* since we are about to abort anyway, it hardly matters.
*
* Note that if we panicked due to a SIGSEGV or SIGBUS (or similar) this
* will fail with a nasty message upon failing to open the /proc entry.
*/
{
#define NAMESIZE 32 /* Arbitrary */
__uint64_t addrs[BACKTRACE_STACK_SIZE];
char * names[BACKTRACE_STACK_SIZE];
char namebuf[BACKTRACE_STACK_SIZE * NAMESIZE];
@ -1646,24 +1656,9 @@ static void smb_panic2(const char *why, BOOL decrement_pid_count )
}
}
#undef NAMESIZE
#else
DEBUG(0, ("unable to produce a stack trace on this platform\n"));
#endif
dbgflush();
#ifdef SIGABRT
CatchSignal(SIGABRT,SIGNAL_CAST SIG_DFL);
#endif
abort();
}
/*******************************************************************
wrapper for smb_panic2()
********************************************************************/
void smb_panic( const char *why )
{
smb_panic2( why, True );
/* Notreached. */
abort();
}
/*******************************************************************

View File

@ -106,46 +106,6 @@ static void sig_hup(int sig)
sys_select_signal(SIGHUP);
}
#if DUMP_CORE
/**************************************************************************** **
Prepare to dump a core file - carefully!

Builds "<directory of the log file>/corefiles", makes it an owner-only
directory, changes into it, raises the soft core size limit to at least
4MB and then calls abort().

Returns False if the directory cannot be entered. The trailing
return( True ) follows abort() and so is never reached.
**************************************************************************** */
static BOOL dump_core(void)
{
char *p;
pstring dname;
/* Start from the log file path and cut it at the last '/' to get its
 * directory. */
pstrcpy( dname, lp_logfile() );
if ((p=strrchr_m(dname,'/')))
*p=0;
pstrcat( dname, "/corefiles" );
/* Results of mkdir/chown/chmod are not checked; the chdir() below is the
 * only failure that is acted on. */
mkdir( dname, 0700 );
sys_chown( dname, getuid(), getgid() );
chmod( dname, 0700 );
if ( chdir(dname) )
return( False );
umask( ~(0700) );
#ifdef HAVE_GETRLIMIT
#ifdef RLIMIT_CORE
{
struct rlimit rlp;
getrlimit( RLIMIT_CORE, &rlp );
/* Never lower an existing soft limit, only raise it to 4MB. */
rlp.rlim_cur = MAX( 4*1024*1024, rlp.rlim_cur );
setrlimit( RLIMIT_CORE, &rlp );
/* Read the limits back so the log shows the values now reported. */
getrlimit( RLIMIT_CORE, &rlp );
DEBUG( 3, ( "Core limits now %d %d\n", (int)rlp.rlim_cur, (int)rlp.rlim_max ) );
}
#endif
#endif
DEBUG(0,("Dumping core in %s\n",dname));
abort();
return( True );
}
#endif
/**************************************************************************** **
Possibly continue after a fault.
**************************************************************************** */
@ -692,6 +652,7 @@ static BOOL open_sockets(BOOL isdaemon, int port)
}
fault_setup((void (*)(void *))fault_continue );
dump_core_setup("nmbd");
/* POSIX demands that signals are inherited. If the invoking process has
* these signals masked, we will have problems, as we won't receive them. */

View File

@ -56,46 +56,6 @@ static BOOL reload_services_file(void)
}
#if DUMP_CORE
/**************************************************************************** **
Prepare to dump a core file - carefully!

Builds "<directory of the log file>/corefiles", makes it an owner-only
directory, changes into it, raises the soft core size limit to at least
4MB and then calls abort().

Returns False if the directory cannot be entered. The trailing
return( True ) follows abort() and so is never reached.
**************************************************************************** */
static BOOL dump_core(void)
{
char *p;
pstring dname;
/* Start from the log file path and cut it at the last '/' to get its
 * directory. */
pstrcpy( dname, lp_logfile() );
if ((p=strrchr(dname,'/')))
*p=0;
pstrcat( dname, "/corefiles" );
/* Results of mkdir/chown/chmod are not checked; the chdir() below is the
 * only failure that is acted on. */
mkdir( dname, 0700 );
sys_chown( dname, getuid(), getgid() );
chmod( dname, 0700 );
if ( chdir(dname) )
return( False );
umask( ~(0700) );
#ifdef HAVE_GETRLIMIT
#ifdef RLIMIT_CORE
{
struct rlimit rlp;
getrlimit( RLIMIT_CORE, &rlp );
/* Never lower an existing soft limit, only raise it to 4MB. */
rlp.rlim_cur = MAX( 4*1024*1024, rlp.rlim_cur );
setrlimit( RLIMIT_CORE, &rlp );
/* Read the limits back so the log shows the values now reported. */
getrlimit( RLIMIT_CORE, &rlp );
DEBUG( 3, ( "Core limits now %d %d\n", (int)rlp.rlim_cur, (int)rlp.rlim_max ) );
}
#endif
#endif
DEBUG(0,("Dumping core in %s\n",dname));
abort();
return( True );
} /* dump_core */
#endif
/**************************************************************************** **
Handle a fault..
**************************************************************************** */
@ -933,6 +893,7 @@ int main(int argc, char **argv)
CatchSignal(SIGUSR2, SIG_IGN);
fault_setup((void (*)(void *))fault_quit );
dump_core_setup("winbindd");
load_case_tables();

View File

@ -1400,7 +1400,7 @@ void start_background_queue(void)
/* check for some essential signals first */
if (got_sig_term) {
exit_server("Caught TERM signal");
exit_server_cleanly();
}
if (reload_after_sighup) {

View File

@ -724,7 +724,7 @@ static void defer_open(struct share_mode_lock *lck,
if (procid_is_me(&e->pid) && (e->op_mid == mid)) {
DEBUG(0, ("Trying to defer an already deferred "
"request: mid=%d, exiting\n", mid));
exit_server("exiting");
exit_server("attempt to defer a deferred request");
}
}
@ -738,7 +738,7 @@ static void defer_open(struct share_mode_lock *lck,
if (!push_deferred_smb_message(mid, request_time, timeout,
(char *)state, sizeof(*state))) {
exit_server("push_deferred_smb_message failed\n");
exit_server("push_deferred_smb_message failed");
}
add_deferred_open(lck, mid, request_time, state->dev, state->inode);

View File

@ -303,7 +303,7 @@ static void async_processing(fd_set *pfds)
process_aio_queue();
if (got_sig_term) {
exit_server("Caught TERM signal");
exit_server_cleanly();
}
/* check for async change notify events */

View File

@ -22,10 +22,6 @@
#include "includes.h"
#ifdef HAVE_SYS_PRCTL_H
#include <sys/prctl.h>
#endif
static int am_parent = 1;
/* the last message that was processed */
@ -156,9 +152,41 @@ static BOOL open_sockets_inetd(void)
static void msg_exit_server(int msg_type, struct process_id src,
void *buf, size_t len)
{
exit_server("Got a SHUTDOWN message");
DEBUG(3, ("got a SHUTDOWN message\n"));
exit_server_cleanly();
}
#ifdef DEVELOPER
/* Message handler for MSG_SMB_INJECT_FAULT (developer builds only).
 *
 * The message body must be exactly one int. The value -1 requests an
 * abnormal exit through exit_server(); any other value is treated as a
 * signal number and delivered to this process with kill().
 * Requests with the wrong payload size are logged and ignored.
 */
static void msg_inject_fault(int msg_type, struct process_id src,
			     void *buf, size_t len)
{
	const unsigned long long requester = (unsigned long long)src.pid;
	int signum;

	/* Reject anything that is not a single int payload. */
	if (len != sizeof(int)) {
		DEBUG(0, ("Process %llu sent bogus signal injection request\n",
			  requester));
		return;
	}

	signum = *(int *)buf;

	/* -1 is the marker for "fail internally" rather than a real signal. */
	if (-1 == signum) {
		exit_server("internal error injected");
		return;
	}

#if HAVE_STRSIGNAL
	DEBUG(0, ("Process %llu requested injection of signal %d (%s)\n",
		  requester, signum, strsignal(signum)));
#else
	DEBUG(0, ("Process %llu requested injection of signal %d\n",
		  requester, signum));
#endif

	/* Deliver the requested signal to ourselves. */
	kill(sys_getpid(), signum);
}
#endif /* DEVELOPER */
/****************************************************************************
Have we reached the process limit ?
@ -345,6 +373,10 @@ static BOOL open_sockets_smbd(BOOL is_daemon, BOOL interactive, const char *smb_
message_register(MSG_SMB_FILE_RENAME, msg_file_was_renamed);
message_register(MSG_SMB_CONF_UPDATED, smb_conf_updated);
#ifdef DEVELOPER
message_register(MSG_SMB_INJECT_FAULT, msg_inject_fault);
#endif
/* now accept incoming connections - forking a new process
for each incoming connection */
DEBUG(2,("waiting for a connection\n"));
@ -365,7 +397,7 @@ static BOOL open_sockets_smbd(BOOL is_daemon, BOOL interactive, const char *smb_
if (num == -1 && errno == EINTR) {
if (got_sig_term) {
exit_server("Caught TERM signal");
exit_server_cleanly();
}
/* check for sighup processing */
@ -568,60 +600,18 @@ BOOL reload_services(BOOL test)
return(ret);
}
#if DUMP_CORE
static void dump_core(void) NORETURN_ATTRIBUTE ;
/*******************************************************************
prepare to dump a core file - carefully!

Builds "<directory of the log file>/corefiles", makes it an owner-only
directory, changes into it, raises the soft core size limit to at least
4MB, resets any SIGABRT handler and then calls abort(). If the directory
cannot be entered, abort() is called straight away. Never returns.
********************************************************************/
static void dump_core(void)
{
char *p;
pstring dname;
/* Start from the log file path and cut it at the last '/' to get its
 * directory. */
pstrcpy(dname,lp_logfile());
if ((p=strrchr_m(dname,'/'))) *p=0;
pstrcat(dname,"/corefiles");
/* Results of mkdir/chown/chmod are not checked; the chdir() below is the
 * only failure that is acted on. */
mkdir(dname,0700);
sys_chown(dname,getuid(),getgid());
chmod(dname,0700);
if (chdir(dname)) {
abort();
}
umask(~(0700));
#ifdef HAVE_GETRLIMIT
#ifdef RLIMIT_CORE
{
struct rlimit rlp;
getrlimit(RLIMIT_CORE, &rlp);
/* Never lower an existing soft limit, only raise it to 4MB. */
rlp.rlim_cur = MAX(4*1024*1024,rlp.rlim_cur);
setrlimit(RLIMIT_CORE, &rlp);
/* Read the limits back so the log shows the values now reported. */
getrlimit(RLIMIT_CORE, &rlp);
DEBUG(3,("Core limits now %d %d\n",
(int)rlp.rlim_cur,(int)rlp.rlim_max));
}
#endif
#endif
DEBUG(0,("Dumping core in %s\n", dname));
/* Ensure we don't have a signal handler for abort. */
#ifdef SIGABRT
CatchSignal(SIGABRT,SIGNAL_CAST SIG_DFL);
#endif
abort();
}
#endif
/****************************************************************************
Exit the server.
****************************************************************************/
void exit_server(const char *reason)
/* Reasons for shutting down a server process. */
enum server_exit_reason { SERVER_EXIT_NORMAL, SERVER_EXIT_ABNORMAL };
static void exit_server_common(enum server_exit_reason how,
const char *const reason) NORETURN_ATTRIBUTE;
static void exit_server_common(enum server_exit_reason how,
const char *const reason)
{
static int firsttime=1;
@ -630,7 +620,6 @@ static void dump_core(void)
firsttime = 0;
change_to_root_user();
DEBUG(2,("Closing connections\n"));
if (negprot_global_auth_context) {
(negprot_global_auth_context->free)(&negprot_global_auth_context);
@ -654,27 +643,54 @@ static void dump_core(void)
}
#endif
if (!reason) {
int oldlevel = DEBUGLEVEL;
char *last_inbuf = get_InBuffer();
DEBUGLEVEL = 10;
DEBUG(0,("Last message was %s\n",smb_fn_name(last_message)));
if (last_inbuf)
show_msg(last_inbuf);
DEBUGLEVEL = oldlevel;
DEBUG(0,("===============================================================\n"));
#if DUMP_CORE
dump_core();
#endif
}
locking_end();
printing_end();
DEBUG(3,("Server exit (%s)\n", (reason ? reason : "")));
if (how != SERVER_EXIT_NORMAL) {
int oldlevel = DEBUGLEVEL;
char *last_inbuf = get_InBuffer();
DEBUGLEVEL = 10;
DEBUGSEP(0);
DEBUG(0,("Abnormal server exit: %s\n",
reason ? reason : "no explanation provided"));
DEBUGSEP(0);
log_stack_trace();
if (last_inbuf) {
DEBUG(0,("Last message was %s\n", LAST_MESSAGE()));
show_msg(last_inbuf);
}
DEBUGLEVEL = oldlevel;
#if DUMP_CORE
dump_core();
#endif
} else {
DEBUG(3,("Server exit (%s)\n",
(reason ? reason : "normal exit")));
}
exit(0);
}
/* Shut the server down abnormally.
 * Forwards the caller's explanation to exit_server_common() with
 * SERVER_EXIT_ABNORMAL, the non-normal exit reason.
 */
void exit_server(const char *const explanation)
{
exit_server_common(SERVER_EXIT_ABNORMAL, explanation);
}
/* Shut the server down normally.
 * Calls exit_server_common() with SERVER_EXIT_NORMAL and no reason string.
 */
void exit_server_cleanly(void)
{
exit_server_common(SERVER_EXIT_NORMAL, NULL);
}
/* Argument-less wrapper that exits abnormally via exit_server() with the
 * fixed explanation "critical server fault".
 * NOTE(review): appears to be the callback handed to fault_setup() in
 * smbd's main() - confirm against the caller.
 */
void exit_server_fault(void)
{
exit_server("critical server fault");
}
/****************************************************************************
Initialise connect, service and file structs.
****************************************************************************/
@ -795,7 +811,9 @@ void build_options(BOOL screen);
gain_root_privilege();
gain_root_group_privilege();
fault_setup((void (*)(void *))exit_server);
fault_setup((void (*)(void *))exit_server_fault);
dump_core_setup("smbd");
CatchSignal(SIGTERM , SIGNAL_CAST sig_term);
CatchSignal(SIGHUP,SIGNAL_CAST sig_hup);
@ -948,14 +966,6 @@ void build_options(BOOL screen);
* everything after this point is run after the fork()
*/
#if defined(HAVE_PRCTL) && defined(PR_SET_DUMPABLE)
/* On Linux we lose the ability to dump core when we change our user
* ID. We know how to dump core safely, so let's make sure we have our
* dumpable flag set.
*/
prctl(PR_SET_DUMPABLE, 1);
#endif
/* Initialise the password backend before the global_sam_sid
to ensure that we fetch from ldap before we make a domain sid up */
@ -1002,9 +1012,9 @@ void build_options(BOOL screen);
message_register(MSG_SMB_FORCE_TDIS, msg_force_tdis);
smbd_process();
namecache_shutdown();
exit_server("normal exit");
exit_server_cleanly();
return(0);
}

View File

@ -410,6 +410,11 @@ void exit_server(const char *reason)
exit(0);
}
/* Clean-exit entry point for this program: simply forwards to the
 * exit_server() defined in this file with the reason "normal exit".
 */
void exit_server_cleanly(void)
{
exit_server("normal exit");
}
static int server_fd = -1;
int last_message = -1;

View File

@ -131,6 +131,47 @@ static BOOL do_debug(const struct process_id pid,
pid, MSG_DEBUG, argv[1], strlen(argv[1]) + 1, False);
}
/* Inject a fault (fatal signal) into a running smbd.
 *
 * Expects exactly one argument after the command name: one of
 * "bus", "hup", "term", "segv" or "internal". The first four map to
 * the corresponding signal number; "internal" is sent as -1 to ask
 * for an unclean exit without a signal. The value is sent to pid as a
 * MSG_SMB_INJECT_FAULT message.
 *
 * Returns False on a usage error, on an unknown fault name, or when
 * built without DEVELOPER; otherwise returns the result of
 * send_message().
 */

static BOOL do_inject_fault(const struct process_id pid,
			    const int argc, const char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "Usage: smbcontrol <dest> inject "
			"<bus|hup|term|internal|segv>\n");
		return False;
	}

#ifndef DEVELOPER
	/* Keep the trailing space on the first literal: adjacent string
	 * literals are concatenated with nothing in between.
	 */
	fprintf(stderr, "Fault injection is only available in "
		"developer builds\n");
	return False;
#else /* DEVELOPER */
	{
		int sig = 0;

		if (strcmp(argv[1], "bus") == 0) {
			sig = SIGBUS;
		} else if (strcmp(argv[1], "hup") == 0) {
			sig = SIGHUP;
		} else if (strcmp(argv[1], "term") == 0) {
			sig = SIGTERM;
		} else if (strcmp(argv[1], "segv") == 0) {
			sig = SIGSEGV;
		} else if (strcmp(argv[1], "internal") == 0) {
			/* Force an internal error, ie. an unclean exit. */
			sig = -1;
		} else {
			fprintf(stderr, "Unknown signal name '%s'\n", argv[1]);
			return False;
		}

		return send_message(pid, MSG_SMB_INJECT_FAULT,
				    &sig, sizeof(int), False);
	}
#endif /* DEVELOPER */
}
/* Force a browser election */
static BOOL do_election(const struct process_id pid,
@ -756,6 +797,8 @@ static const struct {
"Force a browse election" },
{ "ping", do_ping, "Elicit a response" },
{ "profile", do_profile, "" },
{ "inject", do_inject_fault,
"Inject a fatal signal into a running smbd"},
{ "profilelevel", do_profilelevel, "" },
{ "debuglevel", do_debuglevel, "Display current debuglevels" },
{ "printnotify", do_printnotify, "Send a print notify message" },