diff --git a/arch/Kconfig b/arch/Kconfig index 9f2f3060a6ab..93404c802d29 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -510,6 +510,15 @@ config MMU_LAZY_TLB_SHOOTDOWN config ARCH_HAVE_NMI_SAFE_CMPXCHG bool +config ARCH_HAVE_EXTRA_ELF_NOTES + bool + help + An architecture should select this in order to enable adding an + arch-specific ELF note section to core files. It must provide two + functions: elf_coredump_extra_notes_size() and + elf_coredump_extra_notes_write() which are invoked by the ELF core + dumper. + config ARCH_HAS_NMI_SAFE_THIS_CPU_OPS bool diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1c4be3373686..c45fa9d7fb76 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -156,6 +156,7 @@ config PPC select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_HAVE_EXTRA_ELF_NOTES if SPU_BASE select ARCH_KEEP_MEMBLOCK select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index 79f1c480b5eb..bb4b94444d3e 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -127,8 +127,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, /* Notes used in ET_CORE. Note name is "SPU//". */ #define NT_SPU 1 -#define ARCH_HAVE_EXTRA_ELF_NOTES - #endif /* CONFIG_SPU_BASE */ #ifdef CONFIG_PPC64 diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5397b552fbeb..b5a25ee49eea 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1262,6 +1262,9 @@ out_free_interp: if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && elf_ex->e_type == ET_DYN && !interpreter) { mm->brk = mm->start_brk = ELF_ET_DYN_BASE; + } else { + /* Otherwise leave a gap between .bss and brk. */ + mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; } mm->brk = mm->start_brk = arch_randomize_brk(mm); @@ -1564,7 +1567,6 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); } -#define MAX_FILE_NOTE_SIZE (4*1024*1024) /* * Format of NT_FILE note: * @@ -1592,8 +1594,12 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm names_ofs = (2 + 3 * count) * sizeof(data[0]); alloc: - if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ + /* paranoia check */ + if (size >= core_file_note_size_limit) { + pr_warn_once("coredump Note size too large: %u (does kernel.core_file_note_size_limit sysctl need adjustment?\n", + size); return -EINVAL; + } size = round_up(size, PAGE_SIZE); /* * "size" can be 0 here legitimately. diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 3314249e8674..b799701454a9 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -505,8 +505,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, char *k_platform, *k_base_platform; char __user *u_platform, *u_base_platform, *p; int loop; - int nr; /* reset for each csp adjustment */ unsigned long flags = 0; + int ei_index; + elf_addr_t *elf_info; #ifdef CONFIG_MMU /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions @@ -601,44 +602,24 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, csp -= sp & 15UL; sp -= sp & 15UL; - /* put the ELF interpreter info on the stack */ -#define NEW_AUX_ENT(id, val) \ - do { \ - struct { unsigned long _id, _val; } __user *ent, v; \ - \ - ent = (void __user *) csp; \ - v._id = (id); \ - v._val = (val); \ - if (copy_to_user(ent + nr, &v, sizeof(v))) \ - return -EFAULT; \ - nr++; \ + /* Create the ELF interpreter info */ + elf_info = (elf_addr_t *)mm->saved_auxv; + /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ +#define NEW_AUX_ENT(id, val) \ + do { \ + *elf_info++ = id; \ + *elf_info++ = val; \ } while (0) - nr = 0; - csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_NULL, 0); - if (k_platform) { - nr = 0; - csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_PLATFORM, - (elf_addr_t) (unsigned long) u_platform); - } - - if (k_base_platform) { - nr = 0; - csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_BASE_PLATFORM, - (elf_addr_t) (unsigned long) u_base_platform); - } - - if (bprm->have_execfd) { - nr = 0; - csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_EXECFD, bprm->execfd); - } - - nr = 0; - csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); +#ifdef ARCH_DLINFO + /* + * ARCH_DLINFO must come first so PPC can do its special alignment of + * AUXV. + * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in + * ARCH_DLINFO changes + */ + ARCH_DLINFO; +#endif NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); @@ -659,17 +640,29 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_EXECFN, bprm->exec); - -#ifdef ARCH_DLINFO - nr = 0; - csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long); - - /* ARCH_DLINFO must come last so platform specific code can enforce - * special alignment requirements on the AUXV if necessary (eg. PPC). - */ - ARCH_DLINFO; -#endif + if (k_platform) + NEW_AUX_ENT(AT_PLATFORM, + (elf_addr_t)(unsigned long)u_platform); + if (k_base_platform) + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t)(unsigned long)u_base_platform); + if (bprm->have_execfd) + NEW_AUX_ENT(AT_EXECFD, bprm->execfd); #undef NEW_AUX_ENT + /* AT_NULL is zero; clear the rest too */ + memset(elf_info, 0, (char *)mm->saved_auxv + + sizeof(mm->saved_auxv) - (char *)elf_info); + + /* And advance past the AT_NULL entry. */ + elf_info += 2; + + ei_index = elf_info - (elf_addr_t *)mm->saved_auxv; + csp -= ei_index * sizeof(elf_addr_t); + + /* Put the elf_info on the stack in the right place. */ + if (copy_to_user((void __user *)csp, mm->saved_auxv, + ei_index * sizeof(elf_addr_t))) + return -EFAULT; /* allocate room for argv[] and envv[] */ csp -= (bprm->envc + 1) * sizeof(elf_caddr_t); diff --git a/fs/coredump.c b/fs/coredump.c index be6403b4b14b..317065e3eb9b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -56,10 +56,15 @@ static bool dump_vma_snapshot(struct coredump_params *cprm); static void free_vma_snapshot(struct coredump_params *cprm); +#define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024) +/* Define a reasonable max cap */ +#define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024) + static int core_uses_pid; static unsigned int core_pipe_limit; static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; +unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; struct core_name { char *corename; @@ -998,6 +1003,9 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, return error; } +static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT; +static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX; + static struct ctl_table coredump_sysctls[] = { { .procname = "core_uses_pid", @@ -1020,6 +1028,15 @@ static struct ctl_table coredump_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "core_file_note_size_limit", + .data = &core_file_note_size_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (unsigned int *)&core_file_note_size_min, + .extra2 = (unsigned int *)&core_file_note_size_max, + }, }; static int __init init_fs_coredump_sysctls(void) diff --git a/fs/exec.c b/fs/exec.c index cf1df7f16e55..b3c40fbb325f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1267,6 +1267,14 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) return retval; + /* + * This tracepoint marks the point before flushing the old exec where + * the current task is still unchanged, but errors are fatal (point of + * no return). The later "sched_process_exec" tracepoint is called after + * the current task has successfully switched to the new exec. + */ + trace_sched_prepare_exec(current, bprm); + /* * Ensure all future errors are fatal. */ diff --git a/include/linux/coredump.h b/include/linux/coredump.h index d3eba4360150..0904ba010341 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -30,6 +30,8 @@ struct coredump_params { struct core_vma_metadata *vma_meta; }; +extern unsigned int core_file_note_size_limit; + /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. diff --git a/include/linux/elf.h b/include/linux/elf.h index c9a46c4e183b..5c402788da19 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -65,7 +65,7 @@ extern Elf64_Dyn _DYNAMIC []; struct file; struct coredump_params; -#ifndef ARCH_HAVE_EXTRA_ELF_NOTES +#ifndef CONFIG_ARCH_HAVE_EXTRA_ELF_NOTES static inline int elf_coredump_extra_notes_size(void) { return 0; } static inline int elf_coredump_extra_notes_write(struct coredump_params *cprm) { return 0; } #else diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index dbb01b4b7451..226f47c6939c 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -420,6 +420,41 @@ TRACE_EVENT(sched_process_exec, __entry->pid, __entry->old_pid) ); +/** + * sched_prepare_exec - called before setting up new exec + * @task: pointer to the current task + * @bprm: pointer to linux_binprm used for new exec + * + * Called before flushing the old exec, where @task is still unchanged, but at + * the point of no return during switching to the new exec. At the point it is + * called the exec will either succeed, or on failure terminate the task. Also + * see the "sched_process_exec" tracepoint, which is called right after @task + * has successfully switched to the new exec. + */ +TRACE_EVENT(sched_prepare_exec, + + TP_PROTO(struct task_struct *task, struct linux_binprm *bprm), + + TP_ARGS(task, bprm), + + TP_STRUCT__entry( + __string( interp, bprm->interp ) + __string( filename, bprm->filename ) + __field( pid_t, pid ) + __string( comm, task->comm ) + ), + + TP_fast_assign( + __assign_str(interp, bprm->interp); + __assign_str(filename, bprm->filename); + __entry->pid = task->pid; + __assign_str(comm, task->comm); + ), + + TP_printk("interp=%s filename=%s pid=%d comm=%s", + __get_str(interp), __get_str(filename), + __entry->pid, __get_str(comm)) +); #ifdef CONFIG_SCHEDSTATS #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT