diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h
index 8a2d0f72b324..a10a498eede1 100644
--- a/arch/ia64/include/asm/ptrace.h
+++ b/arch/ia64/include/asm/ptrace.h
@@ -134,9 +134,9 @@ static inline long regs_return_value(struct pt_regs *regs)
   extern void ia64_decrement_ip (struct pt_regs *pt);

   extern void ia64_ptrace_stop(void);
-  #define arch_ptrace_stop(code, info) \
+  #define arch_ptrace_stop() \
 	ia64_ptrace_stop()
-  #define arch_ptrace_stop_needed(code, info) \
+  #define arch_ptrace_stop_needed() \
 	(!test_thread_flag(TIF_RESTORE_RSE))

   extern void ptrace_attach_sync_user_rbs (struct task_struct *);
diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h
index 71dd82b43cc5..d1419e669027 100644
--- a/arch/sparc/include/asm/ptrace.h
+++ b/arch/sparc/include/asm/ptrace.h
@@ -26,12 +26,12 @@ static inline bool pt_regs_clear_syscall(struct pt_regs *regs)
 	return (regs->tstate &= ~TSTATE_SYSCALL);
 }

-#define arch_ptrace_stop_needed(exit_code, info) \
+#define arch_ptrace_stop_needed() \
 ({	flush_user_windows(); \
 	get_thread_wsaved() != 0; \
 })

-#define arch_ptrace_stop(exit_code, info) \
+#define arch_ptrace_stop() \
 	synchronize_user_stack()

 #define current_pt_regs() \
@@ -129,12 +129,12 @@ static inline bool pt_regs_clear_syscall(struct pt_regs *regs)
 	return (regs->psr &= ~PSR_SYSCALL);
 }

-#define arch_ptrace_stop_needed(exit_code, info) \
+#define arch_ptrace_stop_needed() \
 ({	flush_user_windows(); \
 	current_thread_info()->w_saved != 0; \
 })

-#define arch_ptrace_stop(exit_code, info) \
+#define arch_ptrace_stop() \
 	synchronize_user_stack()

 #define current_pt_regs() \
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a813b70f594e..fa582748be41 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1834,7 +1834,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	/*
 	 * Allocate a structure for each thread.
 	 */
-	for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+	for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
 		t = kzalloc(offsetof(struct elf_thread_core_info,
 				     notes[info->thread_notes]),
 			    GFP_KERNEL);
@@ -2024,7 +2024,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	if (!elf_note_info_init(info))
 		return 0;

-	for (ct = current->mm->core_state->dumper.next;
+	for (ct = current->signal->core_state->dumper.next;
 	     ct; ct = ct->next) {
 		ets = kzalloc(sizeof(*ets), GFP_KERNEL);
 		if (!ets)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 6d8fd6030cbb..c6f588dc4a9d 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1494,7 +1494,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
 		goto end_coredump;

-	for (ct = current->mm->core_state->dumper.next;
+	for (ct = current->signal->core_state->dumper.next;
 	     ct; ct = ct->next) {
 		tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
 					     ct->task, &thread_status_size);
diff --git a/fs/coredump.c b/fs/coredump.c
index 3224dee44d30..a6b3c196cdef 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -359,7 +359,7 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)

 	for_each_thread(start, t) {
 		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-		if (t != current && t->mm) {
+		if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
 			nr++;
@@ -369,99 +369,34 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
 	return nr;
 }

-static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+static int zap_threads(struct task_struct *tsk,
 			struct core_state *core_state, int exit_code)
 {
-	struct task_struct *g, *p;
-	unsigned long flags;
 	int nr = -EAGAIN;

 	spin_lock_irq(&tsk->sighand->siglock);
 	if (!signal_group_exit(tsk->signal)) {
-		mm->core_state = core_state;
+		tsk->signal->core_state = core_state;
 		tsk->signal->group_exit_task = tsk;
 		nr = zap_process(tsk, exit_code, 0);
 		clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
+		tsk->flags |= PF_DUMPCORE;
+		atomic_set(&core_state->nr_threads, nr);
 	}
 	spin_unlock_irq(&tsk->sighand->siglock);
-	if (unlikely(nr < 0))
-		return nr;
-
-	tsk->flags |= PF_DUMPCORE;
-	if (atomic_read(&mm->mm_users) == nr + 1)
-		goto done;
-	/*
-	 * We should find and kill all tasks which use this mm, and we should
-	 * count them correctly into ->nr_threads. We don't take tasklist
-	 * lock, but this is safe wrt:
-	 *
-	 * fork:
-	 *	None of sub-threads can fork after zap_process(leader). All
-	 *	processes which were created before this point should be
-	 *	visible to zap_threads() because copy_process() adds the new
-	 *	process to the tail of init_task.tasks list, and lock/unlock
-	 *	of ->siglock provides a memory barrier.
-	 *
-	 * do_exit:
-	 *	The caller holds mm->mmap_lock. This means that the task which
-	 *	uses this mm can't pass exit_mm(), so it can't exit or clear
-	 *	its ->mm.
-	 *
-	 * de_thread:
-	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
-	 *	we must see either old or new leader, this does not matter.
-	 *	However, it can change p->sighand, so lock_task_sighand(p)
-	 *	must be used. Since p->mm != NULL and we hold ->mmap_lock
-	 *	it can't fail.
-	 *
-	 *	Note also that "g" can be the old leader with ->mm == NULL
-	 *	and already unhashed and thus removed from ->thread_group.
-	 *	This is OK, __unhash_process()->list_del_rcu() does not
-	 *	clear the ->next pointer, we will find the new leader via
-	 *	next_thread().
-	 */
-	rcu_read_lock();
-	for_each_process(g) {
-		if (g == tsk->group_leader)
-			continue;
-		if (g->flags & PF_KTHREAD)
-			continue;
-
-		for_each_thread(g, p) {
-			if (unlikely(!p->mm))
-				continue;
-			if (unlikely(p->mm == mm)) {
-				lock_task_sighand(p, &flags);
-				nr += zap_process(p, exit_code,
-							SIGNAL_GROUP_EXIT);
-				unlock_task_sighand(p, &flags);
-			}
-			break;
-		}
-	}
-	rcu_read_unlock();
-done:
-	atomic_set(&core_state->nr_threads, nr);
 	return nr;
 }

 static int coredump_wait(int exit_code, struct core_state *core_state)
 {
 	struct task_struct *tsk = current;
-	struct mm_struct *mm = tsk->mm;
 	int core_waiters = -EBUSY;

 	init_completion(&core_state->startup);
 	core_state->dumper.task = tsk;
 	core_state->dumper.next = NULL;

-	if (mmap_write_lock_killable(mm))
-		return -EINTR;
-
-	if (!mm->core_state)
-		core_waiters = zap_threads(tsk, mm, core_state, exit_code);
-	mmap_write_unlock(mm);
-
+	core_waiters = zap_threads(tsk, core_state, exit_code);
 	if (core_waiters > 0) {
 		struct core_thread *ptr;

@@ -483,7 +418,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
 	return core_waiters;
 }

-static void coredump_finish(struct mm_struct *mm, bool core_dumped)
+static void coredump_finish(bool core_dumped)
 {
 	struct core_thread *curr, *next;
 	struct task_struct *task;
@@ -493,22 +428,21 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
 		current->signal->group_exit_code |= 0x80;
 	current->signal->group_exit_task = NULL;
 	current->signal->flags = SIGNAL_GROUP_EXIT;
+	next = current->signal->core_state->dumper.next;
+	current->signal->core_state = NULL;
 	spin_unlock_irq(&current->sighand->siglock);

-	next = mm->core_state->dumper.next;
 	while ((curr = next) != NULL) {
 		next = curr->next;
 		task = curr->task;
 		/*
-		 * see exit_mm(), curr->task must not see
+		 * see coredump_task_exit(), curr->task must not see
 		 * ->task == NULL before we read ->next.
 		 */
 		smp_mb();
 		curr->task = NULL;
 		wake_up_process(task);
 	}
-
-	mm->core_state = NULL;
 }

 static bool dump_interrupted(void)
@@ -839,7 +773,7 @@ fail_dropcount:
 fail_unlock:
 	kfree(argv);
 	kfree(cn.corename);
-	coredump_finish(mm, core_dumped);
+	coredump_finish(core_dumped);
 	revert_creds(old_cred);
 fail_creds:
 	put_cred(cred);
diff --git a/fs/exec.c b/fs/exec.c
index a098c133d8d7..b6079f1a098e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -987,16 +987,14 @@ static int exec_mmap(struct mm_struct *mm)

 	if (old_mm) {
 		/*
-		 * Make sure that if there is a core dump in progress
-		 * for the old mm, we get out and die instead of going
-		 * through with the exec. We must hold mmap_lock around
-		 * checking core_state and changing tsk->mm.
+		 * If there is a pending fatal signal perhaps a signal
+		 * whose default action is to create a coredump get
+		 * out and die instead of going through with the exec.
 		 */
-		mmap_read_lock(old_mm);
-		if (unlikely(old_mm->core_state)) {
-			mmap_read_unlock(old_mm);
+		ret = mmap_read_lock_killable(old_mm);
+		if (ret) {
 			up_write(&tsk->signal->exec_update_lock);
-			return -EINTR;
+			return ret;
 		}
 	}

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 77cf4187adec..ff869a66b34e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -408,9 +408,9 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 		   cpumask_pr_args(&task->cpus_mask));
 }

-static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
+static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
 {
-	seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
+	seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
 	seq_putc(m, '\n');
 }

@@ -436,7 +436,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,

 	if (mm) {
 		task_mem(m, mm);
-		task_core_dumping(m, mm);
+		task_core_dumping(m, task);
 		task_thp_status(m, mm);
 		mmput(mm);
 	}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8f3131477ec6..f7326c8704bb 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -454,17 +454,6 @@ struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 } __randomize_layout;

-struct core_thread {
-	struct task_struct *task;
-	struct core_thread *next;
-};
-
-struct core_state {
-	atomic_t nr_threads;
-	struct core_thread dumper;
-	struct completion startup;
-};
-
 struct kioctx_table;
 struct mm_struct {
 	struct {
@@ -585,8 +574,6 @@ struct mm_struct {

 		unsigned long flags; /* Must use atomic bitops to access */

-		struct core_state *core_state; /* coredumping support */
-
 #ifdef CONFIG_AIO
 		spinlock_t			ioctx_lock;
 		struct kioctx_table __rcu	*ioctx_table;
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index b5ebf6c01292..8aee2945ff08 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -362,29 +362,25 @@ static inline void user_single_step_report(struct pt_regs *regs)
 #ifndef arch_ptrace_stop_needed
 /**
  * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called
- * @code:	current->exit_code value ptrace will stop with
- * @info:	siginfo_t pointer (or %NULL) for signal ptrace will stop with
  *
  * This is called with the siglock held, to decide whether or not it's
- * necessary to release the siglock and call arch_ptrace_stop() with the
- * same @code and @info arguments.  It can be defined to a constant if
- * arch_ptrace_stop() is never required, or always is.  On machines where
- * this makes sense, it should be defined to a quick test to optimize out
- * calling arch_ptrace_stop() when it would be superfluous.  For example,
- * if the thread has not been back to user mode since the last stop, the
- * thread state might indicate that nothing needs to be done.
+ * necessary to release the siglock and call arch_ptrace_stop().  It can be
+ * defined to a constant if arch_ptrace_stop() is never required, or always
+ * is.  On machines where this makes sense, it should be defined to a quick
+ * test to optimize out calling arch_ptrace_stop() when it would be
+ * superfluous.  For example, if the thread has not been back to user mode
+ * since the last stop, the thread state might indicate that nothing needs
+ * to be done.
 *
 * This is guaranteed to be invoked once before a task stops for ptrace and
 * may include arch-specific operations necessary prior to a ptrace stop.
 */
-#define arch_ptrace_stop_needed(code, info)	(0)
+#define arch_ptrace_stop_needed()	(0)
 #endif

 #ifndef arch_ptrace_stop
 /**
  * arch_ptrace_stop - Do machine-specific work before stopping for ptrace
- * @code:	current->exit_code value ptrace will stop with
- * @info:	siginfo_t pointer (or %NULL) for signal ptrace will stop with
  *
  * This is called with no locks held when arch_ptrace_stop_needed() has
  * just returned nonzero.  It is allowed to block, e.g. for user memory
@@ -394,7 +390,7 @@ static inline void user_single_step_report(struct pt_regs *regs)
  * we only do it when the arch requires it for this particular stop, as
  * indicated by arch_ptrace_stop_needed().
  */
-#define arch_ptrace_stop(code, info)		do { } while (0)
+#define arch_ptrace_stop()		do { } while (0)
 #endif

 #ifndef current_pt_regs
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6f6f8f340a0f..78c351e35fec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1661,6 +1661,7 @@ extern struct pid *cad_pid;
 #define PF_VCPU			0x00000001	/* I'm a virtual CPU */
 #define PF_IDLE			0x00000002	/* I am an IDLE thread */
 #define PF_EXITING		0x00000004	/* Getting shut down */
+#define PF_POSTCOREDUMP		0x00000008	/* Coredumps should ignore this task */
 #define PF_IO_WORKER		0x00000010	/* Task is an IO worker */
 #define PF_WQ_WORKER		0x00000020	/* I'm a workqueue worker */
 #define PF_FORKNOEXEC		0x00000040	/* Forked but didn't exec */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e5f4ce622ee6..a8fe2a593a3a 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -72,6 +72,17 @@ struct multiprocess_signals {
 	struct hlist_node node;
 };

+struct core_thread {
+	struct task_struct *task;
+	struct core_thread *next;
+};
+
+struct core_state {
+	atomic_t nr_threads;
+	struct core_thread dumper;
+	struct completion startup;
+};
+
 /*
  * NOTE! "signal_struct" does not have its own
  * locking, because a shared signal_struct always
@@ -110,6 +121,8 @@ struct signal_struct {
 	int			group_stop_count;
 	unsigned int		flags; /* see SIGNAL_* flags below */

+	struct core_state *core_state; /* coredumping support */
+
 	/*
 	 * PR_SET_CHILD_SUBREAPER marks a process, like a service
 	 * manager, to re-parent orphan (double-forking) child processes
diff --git a/kernel/exit.c b/kernel/exit.c
index 50f1692c732d..f702a6a63686 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -340,6 +340,46 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 	}
 }

+static void coredump_task_exit(struct task_struct *tsk)
+{
+	struct core_state *core_state;
+
+	/*
+	 * Serialize with any possible pending coredump.
+	 * We must hold siglock around checking core_state
+	 * and setting PF_POSTCOREDUMP.  The core-inducing thread
+	 * will increment ->nr_threads for each thread in the
+	 * group without PF_POSTCOREDUMP set.
+	 */
+	spin_lock_irq(&tsk->sighand->siglock);
+	tsk->flags |= PF_POSTCOREDUMP;
+	core_state = tsk->signal->core_state;
+	spin_unlock_irq(&tsk->sighand->siglock);
+	if (core_state) {
+		struct core_thread self;
+
+		self.task = current;
+		if (self.task->flags & PF_SIGNALED)
+			self.next = xchg(&core_state->dumper.next, &self);
+		else
+			self.task = NULL;
+		/*
+		 * Implies mb(), the result of xchg() must be visible
+		 * to core_state->dumper.
+		 */
+		if (atomic_dec_and_test(&core_state->nr_threads))
+			complete(&core_state->startup);
+
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!self.task) /* see coredump_finish() */
+				break;
+			freezable_schedule();
+		}
+		__set_current_state(TASK_RUNNING);
+	}
+}
+
 #ifdef CONFIG_MEMCG
 /*
  * A task is exiting.   If it owned this mm, find a new owner for the mm.
@@ -435,47 +475,12 @@ assign_new_owner:
 static void exit_mm(void)
 {
 	struct mm_struct *mm = current->mm;
-	struct core_state *core_state;

 	exit_mm_release(current, mm);
 	if (!mm)
 		return;
 	sync_mm_rss(mm);
-	/*
-	 * Serialize with any possible pending coredump.
-	 * We must hold mmap_lock around checking core_state
-	 * and clearing tsk->mm.  The core-inducing thread
-	 * will increment ->nr_threads for each thread in the
-	 * group with ->mm != NULL.
-	 */
 	mmap_read_lock(mm);
-	core_state = mm->core_state;
-	if (core_state) {
-		struct core_thread self;
-
-		mmap_read_unlock(mm);
-
-		self.task = current;
-		if (self.task->flags & PF_SIGNALED)
-			self.next = xchg(&core_state->dumper.next, &self);
-		else
-			self.task = NULL;
-		/*
-		 * Implies mb(), the result of xchg() must be visible
-		 * to core_state->dumper.
-		 */
-		if (atomic_dec_and_test(&core_state->nr_threads))
-			complete(&core_state->startup);
-
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (!self.task) /* see coredump_finish() */
-				break;
-			freezable_schedule();
-		}
-		__set_current_state(TASK_RUNNING);
-		mmap_read_lock(mm);
-	}
 	mmgrab(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
@@ -763,7 +768,7 @@ void __noreturn do_exit(long code)
 	profile_task_exit(tsk);
 	kcov_task_exit(tsk);

+	coredump_task_exit(tsk);
 	ptrace_event(PTRACE_EVENT_EXIT, code);

 	validate_creds_for_do_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e9feeef555e..3f112b11a9ad 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1043,7 +1043,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
-	mm->core_state = NULL;
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
@@ -1391,8 +1390,7 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 	 * purposes.
 	 */
 	if (tsk->clear_child_tid) {
-		if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
-		    atomic_read(&mm->mm_users) > 1) {
+		if (atomic_read(&mm->mm_users) > 1) {
 			/*
 			 * We don't check the error code - if userspace has
 			 * not set up a proper pointer then tough luck.
diff --git a/kernel/signal.c b/kernel/signal.c
index e99aff33ff14..6f3476dc7873 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2145,40 +2145,6 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	spin_unlock_irqrestore(&sighand->siglock, flags);
 }

-static inline bool may_ptrace_stop(void)
-{
-	if (!likely(current->ptrace))
-		return false;
-	/*
-	 * Are we in the middle of do_coredump?
-	 * If so and our tracer is also part of the coredump stopping
-	 * is a deadlock situation, and pointless because our tracer
-	 * is dead so don't allow us to stop.
-	 * If SIGKILL was already sent before the caller unlocked
-	 * ->siglock we must see ->core_state != NULL. Otherwise it
-	 * is safe to enter schedule().
-	 *
-	 * This is almost outdated, a task with the pending SIGKILL can't
-	 * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
-	 * after SIGKILL was already dequeued.
-	 */
-	if (unlikely(current->mm->core_state) &&
-	    unlikely(current->mm == current->parent->mm))
-		return false;
-
-	return true;
-}
-
-/*
- * Return non-zero if there is a SIGKILL that should be waking us up.
- * Called with the siglock held.
- */
-static bool sigkill_pending(struct task_struct *tsk)
-{
-	return sigismember(&tsk->pending.signal, SIGKILL) ||
-	       sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
-}
-
 /*
  * This must be called with current->sighand->siglock held.
  *
@@ -2196,7 +2162,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 {
 	bool gstop_done = false;

-	if (arch_ptrace_stop_needed(exit_code, info)) {
+	if (arch_ptrace_stop_needed()) {
 		/*
 		 * The arch code has something special to do before a
 		 * ptrace stop.  This is allowed to block, e.g. for faults
@@ -2204,17 +2170,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 		 * calling arch_ptrace_stop, so we must release it now.
 		 * To preserve proper semantics, we must do this before
 		 * any signal bookkeeping like checking group_stop_count.
-		 * Meanwhile, a SIGKILL could come in before we retake the
-		 * siglock.  That must prevent us from sleeping in TASK_TRACED.
-		 * So after regaining the lock, we must check for SIGKILL.
 		 */
 		spin_unlock_irq(&current->sighand->siglock);
-		arch_ptrace_stop(exit_code, info);
+		arch_ptrace_stop();
 		spin_lock_irq(&current->sighand->siglock);
-		if (sigkill_pending(current))
-			return;
 	}

+	/*
+	 * schedule() will not sleep if there is a pending signal that
+	 * can awaken the task.
+	 */
 	set_special_state(TASK_TRACED);

 	/*
@@ -2260,7 +2225,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t

 	spin_unlock_irq(&current->sighand->siglock);
 	read_lock(&tasklist_lock);
-	if (may_ptrace_stop()) {
+	if (likely(current->ptrace)) {
 		/*
 		 * Notify parents of the stop.
 		 *
diff --git a/mm/debug.c b/mm/debug.c
index 714be101dec9..d0020fc58202 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -216,7 +216,7 @@ void dump_mm(const struct mm_struct *mm)
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
 		"start_brk %lx brk %lx start_stack %lx\n"
 		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
-		"binfmt %px flags %lx core_state %px\n"
+		"binfmt %px flags %lx\n"
 #ifdef CONFIG_AIO
 		"ioctx_table %px\n"
 #endif
@@ -248,7 +248,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->start_code, mm->end_code, mm->start_data, mm->end_data,
 		mm->start_brk, mm->brk, mm->start_stack,
 		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
-		mm->binfmt, mm->flags, mm->core_state,
+		mm->binfmt, mm->flags,
 #ifdef CONFIG_AIO
 		mm->ioctx_table,
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 989f35a2bbb1..50b984d048ce 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -787,9 +787,9 @@ static inline bool __task_will_free_mem(struct task_struct *task)
 	struct signal_struct *sig = task->signal;

 	/*
-	 * A coredumping process may sleep for an extended period in exit_mm(),
-	 * so the oom killer cannot assume that the process will promptly exit
-	 * and release memory.
+	 * A coredumping process may sleep for an extended period in
+	 * coredump_task_exit(), so the oom killer cannot assume that
+	 * the process will promptly exit and release memory.
 	 */
 	if (sig->flags & SIGNAL_GROUP_COREDUMP)
 		return false;
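
The handshake this patch moves into coredump_task_exit(), coredump_wait() and coredump_finish() can be modelled outside the kernel. The sketch below is illustrative only and not part of the patch: the dumper publishes a core_state and waits for "startup"; every other exiting thread links itself onto the dumper's list, decrements nr_threads (the last one to arrive completes startup), and then parks until the dumper clears its entry and wakes it. A mutex and condition variables stand in for the kernel's xchg(), completion and scheduler wake-up, the PF_SIGNALED special case is omitted, and all names (exiting_thread, parked, NR_OTHER_THREADS) are invented for the example.

/* Illustrative user-space model of the coredump handshake; not kernel code. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_OTHER_THREADS 4

struct core_thread {
	bool parked;			/* stands in for curr->task != NULL */
	struct core_thread *next;
};

struct core_state {
	atomic_int nr_threads;		/* threads the dumper still waits for */
	struct core_thread *dumper_next;/* head of the waiter list (dumper.next) */
	pthread_mutex_t lock;
	pthread_cond_t startup;		/* complete(&core_state->startup) */
	pthread_cond_t finish;		/* wake_up_process() in coredump_finish() */
	bool started;
};

static struct core_state cs = {
	.nr_threads	= NR_OTHER_THREADS,
	.lock		= PTHREAD_MUTEX_INITIALIZER,
	.startup	= PTHREAD_COND_INITIALIZER,
	.finish		= PTHREAD_COND_INITIALIZER,
};

/* Roughly coredump_task_exit(): register, report in, then park. */
static void *exiting_thread(void *arg)
{
	struct core_thread *self = arg;

	pthread_mutex_lock(&cs.lock);
	self->parked = true;
	/* like xchg(&core_state->dumper.next, &self) */
	self->next = cs.dumper_next;
	cs.dumper_next = self;
	/* like atomic_dec_and_test(&core_state->nr_threads) -> complete() */
	if (atomic_fetch_sub(&cs.nr_threads, 1) == 1) {
		cs.started = true;
		pthread_cond_signal(&cs.startup);
	}
	/* like the freezable_schedule() loop: sleep until the dumper clears us */
	while (self->parked)
		pthread_cond_wait(&cs.finish, &cs.lock);
	pthread_mutex_unlock(&cs.lock);
	return NULL;
}

int main(void)
{
	pthread_t tids[NR_OTHER_THREADS];
	struct core_thread waiters[NR_OTHER_THREADS] = { 0 };

	for (int i = 0; i < NR_OTHER_THREADS; i++)
		pthread_create(&tids[i], NULL, exiting_thread, &waiters[i]);

	/* coredump_wait(): block until every other thread has checked in. */
	pthread_mutex_lock(&cs.lock);
	while (!cs.started)
		pthread_cond_wait(&cs.startup, &cs.lock);
	pthread_mutex_unlock(&cs.lock);
	printf("dumper: %d threads quiescent, writing core...\n", NR_OTHER_THREADS);

	/* coredump_finish(): walk the list, release each waiter, wake them all. */
	pthread_mutex_lock(&cs.lock);
	for (struct core_thread *c = cs.dumper_next; c; c = c->next)
		c->parked = false;
	pthread_cond_broadcast(&cs.finish);
	pthread_mutex_unlock(&cs.lock);

	for (int i = 0; i < NR_OTHER_THREADS; i++)
		pthread_join(tids[i], NULL);
	return 0;
}

Build with "cc -pthread" and run; the dumper line prints only after all four simulated threads have decremented nr_threads, mirroring why the patch can count and quiesce threads under siglock alone instead of taking mmap_lock.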