linux-stable/include/linux/completion.h
Boqun Feng ec81048cc3 sched/completion: Avoid unnecessary stack allocation for COMPLETION_INITIALIZER_ONSTACK()
In theory, COMPLETION_INITIALIZER_ONSTACK() should never affect the
stack allocation of the caller. However, on some compilers, a temporary
structure was allocated for the return value of
COMPLETION_INITIALIZER_ONSTACK().

For example in write_journal() with LOCKDEP_COMPLETIONS=y (GCC is 7.1.1):

	io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp);
	    2462:       e8 00 00 00 00          callq  2467 <write_journal+0x47>
	    2467:       48 8d 85 80 fd ff ff    lea    -0x280(%rbp),%rax
	    246e:       48 c7 c6 00 00 00 00    mov    $0x0,%rsi
	    2475:       48 c7 c2 00 00 00 00    mov    $0x0,%rdx
		x->done = 0;
	    247c:       c7 85 90 fd ff ff 00    movl   $0x0,-0x270(%rbp)
	    2483:       00 00 00
		init_waitqueue_head(&x->wait);
	    2486:       48 8d 78 18             lea    0x18(%rax),%rdi
	    248a:       e8 00 00 00 00          callq  248f <write_journal+0x6f>
		if (commit_start + commit_sections <= ic->journal_sections) {
	    248f:       41 8b 87 a8 00 00 00    mov    0xa8(%r15),%eax
		io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp);
	    2496:       48 8d bd e8 f9 ff ff    lea    -0x618(%rbp),%rdi
	    249d:       48 8d b5 90 fd ff ff    lea    -0x270(%rbp),%rsi
	    24a4:       b9 17 00 00 00          mov    $0x17,%ecx
	    24a9:       f3 48 a5                rep movsq %ds:(%rsi),%es:(%rdi)
		if (commit_start + commit_sections <= ic->journal_sections) {
	    24ac:       41 39 c6                cmp    %eax,%r14d
		io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp);
	    24af:       48 8d bd 90 fd ff ff    lea    -0x270(%rbp),%rdi
	    24b6:       48 8d b5 e8 f9 ff ff    lea    -0x618(%rbp),%rsi
	    24bd:       b9 17 00 00 00          mov    $0x17,%ecx
	    24c2:       f3 48 a5                rep movsq %ds:(%rsi),%es:(%rdi)

We can obviously see the temporary structure allocated, and the compiler
also does two meaningless memcpy with "rep movsq".

And according to:

	https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html#Statement-Exprs

The return value of a statement expression is returned by value, so the
temporary variable is created in COMPLETION_INITIALIZER_ONSTACK(), and
that's why the temporary structures are allocted.

To fix this, make the brace block in COMPLETION_INITIALIZER_ONSTACK()
return a pointer and dereference it outside the block rather than return
the whole structure, in this way, we are able to teach the compiler not
to do the unnecessary stack allocation.

This could also reduce the stack size even if !LOCKDEP, for example in
write_journal(), compiled with gcc 7.1.1, the result of command:

	 objdump -d drivers/md/dm-integrity.o | ./scripts/checkstack.pl x86

before:

	0x0000246a write_journal [dm-integrity.o]:              696

after:

	0x00002b7a write_journal [dm-integrity.o]:              296

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: walken@google.com
Cc: willy@infradead.org
Link: http://lkml.kernel.org/r/20170823152542.5150-3-boqun.feng@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-08-29 15:14:38 +02:00

152 lines
4.7 KiB
C

#ifndef __LINUX_COMPLETION_H
#define __LINUX_COMPLETION_H
/*
* (C) Copyright 2001 Linus Torvalds
*
* Atomic wait-for-completion handler data structures.
* See kernel/sched/completion.c for details.
*/
#include <linux/wait.h>
#ifdef CONFIG_LOCKDEP_COMPLETIONS
#include <linux/lockdep.h>
#endif
/*
* struct completion - structure used to maintain state for a "completion"
*
* This is the opaque structure used to maintain the state for a "completion".
* Completions currently use a FIFO to queue threads that have to wait for
* the "completion" event.
*
* See also: complete(), wait_for_completion() (and friends _timeout,
* _interruptible, _interruptible_timeout, and _killable), init_completion(),
* reinit_completion(), and macros DECLARE_COMPLETION(),
* DECLARE_COMPLETION_ONSTACK().
*/
struct completion {
unsigned int done;
wait_queue_head_t wait;
#ifdef CONFIG_LOCKDEP_COMPLETIONS
struct lockdep_map_cross map;
#endif
};
#ifdef CONFIG_LOCKDEP_COMPLETIONS
static inline void complete_acquire(struct completion *x)
{
lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_);
}
static inline void complete_release(struct completion *x)
{
lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_);
}
static inline void complete_release_commit(struct completion *x)
{
lock_commit_crosslock((struct lockdep_map *)&x->map);
}
#define init_completion(x) \
do { \
static struct lock_class_key __key; \
lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \
"(complete)" #x, \
&__key, 0); \
__init_completion(x); \
} while (0)
#else
#define init_completion(x) __init_completion(x)
static inline void complete_acquire(struct completion *x) {}
static inline void complete_release(struct completion *x) {}
static inline void complete_release_commit(struct completion *x) {}
#endif
#ifdef CONFIG_LOCKDEP_COMPLETIONS
#define COMPLETION_INITIALIZER(work) \
{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
#else
#define COMPLETION_INITIALIZER(work) \
{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
#endif
#define COMPLETION_INITIALIZER_ONSTACK(work) \
(*({ init_completion(&work); &work; }))
/**
* DECLARE_COMPLETION - declare and initialize a completion structure
* @work: identifier for the completion structure
*
* This macro declares and initializes a completion structure. Generally used
* for static declarations. You should use the _ONSTACK variant for automatic
* variables.
*/
#define DECLARE_COMPLETION(work) \
struct completion work = COMPLETION_INITIALIZER(work)
/*
* Lockdep needs to run a non-constant initializer for on-stack
* completions - so we use the _ONSTACK() variant for those that
* are on the kernel stack:
*/
/**
* DECLARE_COMPLETION_ONSTACK - declare and initialize a completion structure
* @work: identifier for the completion structure
*
* This macro declares and initializes a completion structure on the kernel
* stack.
*/
#ifdef CONFIG_LOCKDEP
# define DECLARE_COMPLETION_ONSTACK(work) \
struct completion work = COMPLETION_INITIALIZER_ONSTACK(work)
#else
# define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work)
#endif
/**
* init_completion - Initialize a dynamically allocated completion
* @x: pointer to completion structure that is to be initialized
*
* This inline function will initialize a dynamically created completion
* structure.
*/
static inline void __init_completion(struct completion *x)
{
x->done = 0;
init_waitqueue_head(&x->wait);
}
/**
* reinit_completion - reinitialize a completion structure
* @x: pointer to completion structure that is to be reinitialized
*
* This inline function should be used to reinitialize a completion structure so it can
* be reused. This is especially important after complete_all() is used.
*/
static inline void reinit_completion(struct completion *x)
{
x->done = 0;
}
extern void wait_for_completion(struct completion *);
extern void wait_for_completion_io(struct completion *);
extern int wait_for_completion_interruptible(struct completion *x);
extern int wait_for_completion_killable(struct completion *x);
extern unsigned long wait_for_completion_timeout(struct completion *x,
unsigned long timeout);
extern unsigned long wait_for_completion_io_timeout(struct completion *x,
unsigned long timeout);
extern long wait_for_completion_interruptible_timeout(
struct completion *x, unsigned long timeout);
extern long wait_for_completion_killable_timeout(
struct completion *x, unsigned long timeout);
extern bool try_wait_for_completion(struct completion *x);
extern bool completion_done(struct completion *x);
extern void complete(struct completion *);
extern void complete_all(struct completion *);
#endif