time, signal: Protect resource use statistics with seqlock
Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock.

The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all).

Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems.

This can be fixed by using a seqlock around the events that gather and propagate statistics. As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions.

In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime().

This way the statistics reporting code can run lockless.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Daeseok Youn <daeseok.youn@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guillaume Morin <guillaume@morinfr.org>
Cc: Ionut Alexa <ionut.m.alexa@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Michal Schmidt <mschmidt@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: umgwanakikbuti@gmail.com
Cc: fweisbec@gmail.com
Cc: srao@redhat.com
Cc: lwoodman@redhat.com
Cc: atheurer@redhat.com
Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
		
				
					committed by
					
						 Ingo Molnar
						Ingo Molnar
					
				
			
			
				
	
			
			
			
						parent
						
							90ed9cbe76
						
					
				
				
					commit
					e78c349679
				
			| @@ -645,6 +645,7 @@ struct signal_struct { | ||||
| 	 * Live threads maintain their own counters and add to these | ||||
| 	 * in __exit_signal, except for the group leader. | ||||
| 	 */ | ||||
| 	seqlock_t stats_lock; | ||||
| 	cputime_t utime, stime, cutime, cstime; | ||||
| 	cputime_t gtime; | ||||
| 	cputime_t cgtime; | ||||
|   | ||||
| @@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk) | ||||
| 	 * the signal_struct. | ||||
| 	 */ | ||||
| 	task_cputime(tsk, &utime, &stime); | ||||
| 	write_seqlock(&sig->stats_lock); | ||||
| 	sig->utime += utime; | ||||
| 	sig->stime += stime; | ||||
| 	sig->gtime += task_gtime(tsk); | ||||
| @@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk) | ||||
| 	sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | ||||
| 	sig->nr_threads--; | ||||
| 	__unhash_process(tsk, group_dead); | ||||
| 	write_sequnlock(&sig->stats_lock); | ||||
|  | ||||
| 	/* | ||||
| 	 * Do this under ->siglock, we can race with another thread | ||||
| @@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | ||||
| 		spin_lock_irq(&p->real_parent->sighand->siglock); | ||||
| 		psig = p->real_parent->signal; | ||||
| 		sig = p->signal; | ||||
| 		write_seqlock(&psig->stats_lock); | ||||
| 		psig->cutime += tgutime + sig->cutime; | ||||
| 		psig->cstime += tgstime + sig->cstime; | ||||
| 		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; | ||||
| @@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | ||||
| 			psig->cmaxrss = maxrss; | ||||
| 		task_io_accounting_add(&psig->ioac, &p->ioac); | ||||
| 		task_io_accounting_add(&psig->ioac, &sig->ioac); | ||||
| 		write_sequnlock(&psig->stats_lock); | ||||
| 		spin_unlock_irq(&p->real_parent->sighand->siglock); | ||||
| 	} | ||||
|  | ||||
|   | ||||
| @@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | ||||
| 	sig->curr_target = tsk; | ||||
| 	init_sigpending(&sig->shared_pending); | ||||
| 	INIT_LIST_HEAD(&sig->posix_timers); | ||||
| 	seqlock_init(&sig->stats_lock); | ||||
|  | ||||
| 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||||
| 	sig->real_timer.function = it_real_fn; | ||||
|   | ||||
| @@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | ||||
| 	struct signal_struct *sig = tsk->signal; | ||||
| 	cputime_t utime, stime; | ||||
| 	struct task_struct *t; | ||||
|  | ||||
| 	times->utime = sig->utime; | ||||
| 	times->stime = sig->stime; | ||||
| 	times->sum_exec_runtime = sig->sum_sched_runtime; | ||||
| 	unsigned int seq, nextseq; | ||||
|  | ||||
| 	rcu_read_lock(); | ||||
| 	for_each_thread(tsk, t) { | ||||
| 		task_cputime(t, &utime, &stime); | ||||
| 		times->utime += utime; | ||||
| 		times->stime += stime; | ||||
| 		times->sum_exec_runtime += task_sched_runtime(t); | ||||
| 	} | ||||
| 	/* Attempt a lockless read on the first round. */ | ||||
| 	nextseq = 0; | ||||
| 	do { | ||||
| 		seq = nextseq; | ||||
| 		read_seqbegin_or_lock(&sig->stats_lock, &seq); | ||||
| 		times->utime = sig->utime; | ||||
| 		times->stime = sig->stime; | ||||
| 		times->sum_exec_runtime = sig->sum_sched_runtime; | ||||
|  | ||||
| 		for_each_thread(tsk, t) { | ||||
| 			task_cputime(t, &utime, &stime); | ||||
| 			times->utime += utime; | ||||
| 			times->stime += stime; | ||||
| 			times->sum_exec_runtime += task_sched_runtime(t); | ||||
| 		} | ||||
| 		/* If lockless access failed, take the lock. */ | ||||
| 		nextseq = 1; | ||||
| 	} while (need_seqretry(&sig->stats_lock, seq)); | ||||
| 	done_seqretry(&sig->stats_lock, seq); | ||||
| 	rcu_read_unlock(); | ||||
| } | ||||
|  | ||||
| @@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||||
| 	cputime_adjust(&cputime, &p->prev_cputime, ut, st); | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * Must be called with siglock held. | ||||
|  */ | ||||
| void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||||
| { | ||||
| 	struct task_cputime cputime; | ||||
|   | ||||
| @@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms) | ||||
| { | ||||
| 	cputime_t tgutime, tgstime, cutime, cstime; | ||||
|  | ||||
| 	spin_lock_irq(&current->sighand->siglock); | ||||
| 	thread_group_cputime_adjusted(current, &tgutime, &tgstime); | ||||
| 	cutime = current->signal->cutime; | ||||
| 	cstime = current->signal->cstime; | ||||
| 	spin_unlock_irq(&current->sighand->siglock); | ||||
| 	tms->tms_utime = cputime_to_clock_t(tgutime); | ||||
| 	tms->tms_stime = cputime_to_clock_t(tgstime); | ||||
| 	tms->tms_cutime = cputime_to_clock_t(cutime); | ||||
|   | ||||
| @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, | ||||
| 		if (same_thread_group(tsk, current)) | ||||
| 			err = cpu_clock_sample(which_clock, tsk, &rtn); | ||||
| 	} else { | ||||
| 		unsigned long flags; | ||||
| 		struct sighand_struct *sighand; | ||||
|  | ||||
| 		/* | ||||
| 		 * while_each_thread() is not yet entirely RCU safe, | ||||
| 		 * keep locking the group while sampling process | ||||
| 		 * clock for now. | ||||
| 		 */ | ||||
| 		sighand = lock_task_sighand(tsk, &flags); | ||||
| 		if (!sighand) | ||||
| 			return err; | ||||
|  | ||||
| 		if (tsk == current || thread_group_leader(tsk)) | ||||
| 			err = cpu_clock_sample_group(which_clock, tsk, &rtn); | ||||
|  | ||||
| 		unlock_task_sighand(tsk, &flags); | ||||
| 	} | ||||
|  | ||||
| 	if (!err) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user