From 8f553c498e1772cccb39a114da4a498d22992758 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:12 +0200 Subject: [PATCH 01/37] cpu/hotplug: Provide cpus_read|write_[un]lock() The counting 'rwsem' hackery of get|put_online_cpus() is going to be replaced by percpu rwsem. Rename the functions to make it clear that it's locking and not some refcount style interface. These new functions will be used for the preparatory patches which make the code ready for the percpu rwsem conversion. Rename all instances in the cpu hotplug code while at it. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.080397752@linutronix.de --- include/linux/cpu.h | 32 ++++++++++++++++++-------------- kernel/cpu.c | 36 ++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f92081234afd..055876003914 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -99,26 +99,30 @@ static inline void cpu_maps_update_done(void) extern struct bus_type cpu_subsys; #ifdef CONFIG_HOTPLUG_CPU -/* Stop CPUs going up and down. */ - -extern void cpu_hotplug_begin(void); -extern void cpu_hotplug_done(void); -extern void get_online_cpus(void); -extern void put_online_cpus(void); +extern void cpus_write_lock(void); +extern void cpus_write_unlock(void); +extern void cpus_read_lock(void); +extern void cpus_read_unlock(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); -#else /* CONFIG_HOTPLUG_CPU */ +#else /* CONFIG_HOTPLUG_CPU */ -static inline void cpu_hotplug_begin(void) {} -static inline void cpu_hotplug_done(void) {} -#define get_online_cpus() do { } while (0) -#define put_online_cpus() do { } while (0) -#define cpu_hotplug_disable() do { } while (0) -#define cpu_hotplug_enable() do { } while (0) -#endif /* CONFIG_HOTPLUG_CPU */ +static inline void cpus_write_lock(void) { } +static inline void cpus_write_unlock(void) { } +static inline void cpus_read_lock(void) { } +static inline void cpus_read_unlock(void) { } +static inline void cpu_hotplug_disable(void) { } +static inline void cpu_hotplug_enable(void) { } +#endif /* !CONFIG_HOTPLUG_CPU */ + +/* Wrappers which go away once all code is converted */ +static inline void cpu_hotplug_begin(void) { cpus_write_lock(); } +static inline void cpu_hotplug_done(void) { cpus_write_unlock(); } +static inline void get_online_cpus(void) { cpus_read_lock(); } +static inline void put_online_cpus(void) { cpus_read_unlock(); } #ifdef CONFIG_PM_SLEEP_SMP extern int freeze_secondary_cpus(int primary); diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ae6fbe5b5cf..d3221ae5b474 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -235,7 +235,7 @@ static struct { #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) -void get_online_cpus(void) +void cpus_read_lock(void) { might_sleep(); if (cpu_hotplug.active_writer == current) @@ -245,9 +245,9 @@ void get_online_cpus(void) atomic_inc(&cpu_hotplug.refcount); mutex_unlock(&cpu_hotplug.lock); } -EXPORT_SYMBOL_GPL(get_online_cpus); +EXPORT_SYMBOL_GPL(cpus_read_lock); -void put_online_cpus(void) +void cpus_read_unlock(void) { int refcount; @@ -264,7 +264,7 @@ void put_online_cpus(void) cpuhp_lock_release(); } -EXPORT_SYMBOL_GPL(put_online_cpus); 
+EXPORT_SYMBOL_GPL(cpus_read_unlock); /* * This ensures that the hotplug operation can begin only when the @@ -288,7 +288,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); * get_online_cpus() not an api which is called all that often. * */ -void cpu_hotplug_begin(void) +void cpus_write_lock(void) { DEFINE_WAIT(wait); @@ -306,7 +306,7 @@ void cpu_hotplug_begin(void) finish_wait(&cpu_hotplug.wq, &wait); } -void cpu_hotplug_done(void) +void cpus_write_unlock(void) { cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); @@ -773,7 +773,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (!cpu_present(cpu)) return -EINVAL; - cpu_hotplug_begin(); + cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; @@ -811,7 +811,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, } out: - cpu_hotplug_done(); + cpus_write_unlock(); return ret; } @@ -893,7 +893,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) struct task_struct *idle; int ret = 0; - cpu_hotplug_begin(); + cpus_write_lock(); if (!cpu_present(cpu)) { ret = -EINVAL; @@ -941,7 +941,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, target); out: - cpu_hotplug_done(); + cpus_write_unlock(); return ret; } @@ -1424,7 +1424,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, if (sp->multi_instance == false) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) @@ -1453,7 +1453,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); @@ -1486,7 +1486,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, if (cpuhp_cb_check(state) || !name) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, @@ -1522,7 +1522,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, } out: mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the * dynamically allocated state in case of success. 
@@ -1544,7 +1544,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, if (!sp->multi_instance) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !cpuhp_get_teardown_cb(state)) @@ -1565,7 +1565,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, remove: hlist_del(node); mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -1587,7 +1587,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) BUG_ON(cpuhp_cb_check(state)); - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { @@ -1615,7 +1615,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); From ade3f680a76b474d9f5375a9b1d100ee787bf469 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:13 +0200 Subject: [PATCH 02/37] cpu/hotplug: Provide lockdep_assert_cpus_held() Provide a stub function which can be used in places where existing get_online_cpus() calls are moved to call sites. This stub is going to be filled by the final conversion of the hotplug locking mechanism to a percpu rwsem. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.161282442@linutronix.de --- include/linux/cpu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 055876003914..af4d660798e5 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -103,6 +103,7 @@ extern void cpus_write_lock(void); extern void cpus_write_unlock(void); extern void cpus_read_lock(void); extern void cpus_read_unlock(void); +static inline void lockdep_assert_cpus_held(void) { } extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); @@ -114,6 +115,7 @@ static inline void cpus_write_lock(void) { } static inline void cpus_write_unlock(void) { } static inline void cpus_read_lock(void) { } static inline void cpus_read_unlock(void) { } +static inline void lockdep_assert_cpus_held(void) { } static inline void cpu_hotplug_disable(void) { } static inline void cpu_hotplug_enable(void) { } #endif /* !CONFIG_HOTPLUG_CPU */ From 71def423fe3da0d40ad3427a4cd5f9edc53bff67 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:14 +0200 Subject: [PATCH 03/37] cpu/hotplug: Provide cpuhp_setup/remove_state[_nocalls]_cpuslocked() Some call sites of cpuhp_setup/remove_state[_nocalls]() are within a cpus_read locked region. cpuhp_setup/remove_state[_nocalls]() call cpus_read_lock() as well, which is possible in the current implementation but prevents converting the hotplug locking to a percpu rwsem. Provide locked versions of the interfaces to avoid nested calls to cpus_read_lock(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. 
McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.239600868@linutronix.de --- include/linux/cpuhotplug.h | 29 +++++++++++++++++++++++ kernel/cpu.c | 47 +++++++++++++++++++++++++++++--------- 2 files changed, 65 insertions(+), 11 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0f2a80377520..4fac564dde70 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -153,6 +153,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance); +int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, + bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance); /** * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks * @state: The state for which the calls are installed @@ -171,6 +176,15 @@ static inline int cpuhp_setup_state(enum cpuhp_state state, return __cpuhp_setup_state(state, name, true, startup, teardown, false); } +static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state_cpuslocked(state, name, true, startup, + teardown, false); +} + /** * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the * callbacks @@ -191,6 +205,15 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, false); } +static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state_cpuslocked(state, name, false, startup, + teardown, false); +} + /** * cpuhp_setup_state_multi - Add callbacks for multi state * @state: The state for which the calls are installed @@ -250,6 +273,7 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, } void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); +void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke); /** * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown @@ -273,6 +297,11 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) __cpuhp_remove_state(state, false); } +static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state) +{ + __cpuhp_remove_state_cpuslocked(state, false); +} + /** * cpuhp_remove_multi_state - Remove hotplug multi state callback * @state: The state for which the calls are removed diff --git a/kernel/cpu.c b/kernel/cpu.c index d3221ae5b474..dc27c5a28153 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1459,7 +1459,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** - * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state * @state: The state to setup * @invoke: If true, the startup function is invoked for cpus where * cpu state >= @state @@ -1468,25 +1468,27 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * @multi_instance: State is set up for multiple instances which get * added afterwards. * + * The caller needs to hold cpus read locked while calling this function. 
* Returns: * On success: * Positive state number if @state is CPUHP_AP_ONLINE_DYN * 0 for all other states * On failure: proper (negative) error code */ -int __cpuhp_setup_state(enum cpuhp_state state, - const char *name, bool invoke, - int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu), - bool multi_instance) +int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) { int cpu, ret = 0; bool dynstate; + lockdep_assert_cpus_held(); + if (cpuhp_cb_check(state) || !name) return -EINVAL; - cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, @@ -1522,7 +1524,6 @@ int __cpuhp_setup_state(enum cpuhp_state state, } out: mutex_unlock(&cpuhp_state_mutex); - cpus_read_unlock(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the * dynamically allocated state in case of success. @@ -1531,6 +1532,22 @@ int __cpuhp_setup_state(enum cpuhp_state state, return state; return ret; } +EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked); + +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) +{ + int ret; + + cpus_read_lock(); + ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup, + teardown, multi_instance); + cpus_read_unlock(); + return ret; +} EXPORT_SYMBOL(__cpuhp_setup_state); int __cpuhp_state_remove_instance(enum cpuhp_state state, @@ -1572,22 +1589,23 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** - * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state * @state: The state to remove * @invoke: If true, the teardown function is invoked for cpus where * cpu state >= @state * + * The caller needs to hold cpus read locked while calling this function. * The teardown callback is currently not allowed to fail. Think * about module removal! */ -void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); - cpus_read_lock(); + lockdep_assert_cpus_held(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { @@ -1615,6 +1633,13 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); +} +EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked); + +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + cpus_read_lock(); + __cpuhp_remove_state_cpuslocked(state, invoke); cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); From 9805c6733349ea3ccd22cf75b8ebaabb5290e310 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:15 +0200 Subject: [PATCH 04/37] cpu/hotplug: Add __cpuhp_state_add_instance_cpuslocked() Add cpuslocked() variants for the multi instance registration so this can be called from a cpus_read_lock() protected region. Signed-off-by: Thomas Gleixner Tested-by: Paul E. 
McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.321782217@linutronix.de --- include/linux/cpuhotplug.h | 9 +++++++++ kernel/cpu.c | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4fac564dde70..df3d2719a796 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -240,6 +240,8 @@ static inline int cpuhp_setup_state_multi(enum cpuhp_state state, int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke); +int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, + struct hlist_node *node, bool invoke); /** * cpuhp_state_add_instance - Add an instance for a state and invoke startup @@ -272,6 +274,13 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, return __cpuhp_state_add_instance(state, node, false); } +static inline int +cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state, + struct hlist_node *node) +{ + return __cpuhp_state_add_instance_cpuslocked(state, node, false); +} + void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke); diff --git a/kernel/cpu.c b/kernel/cpu.c index dc27c5a28153..e4389ac55b65 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1413,18 +1413,20 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, } } -int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, - bool invoke) +int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, + struct hlist_node *node, + bool invoke) { struct cpuhp_step *sp; int cpu; int ret; + lockdep_assert_cpus_held(); + sp = cpuhp_get_step(state); if (sp->multi_instance == false) return -EINVAL; - cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) @@ -1453,6 +1455,16 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); + return ret; +} + +int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, + bool invoke) +{ + int ret; + + cpus_read_lock(); + ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke); cpus_read_unlock(); return ret; } From fe5595c074005bd94f0c7d1644175941149f6768 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:16 +0200 Subject: [PATCH 05/37] stop_machine: Provide stop_machine_cpuslocked() Some call sites of stop_machine() are within a get_online_cpus() protected region. stop_machine() calls get_online_cpus() as well, which is possible in the current implementation but prevents converting the hotplug locking to a percpu rwsem. Provide stop_machine_cpuslocked() to avoid nested calls to get_online_cpus(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Paul E. 
McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.400700852@linutronix.de --- include/linux/stop_machine.h | 26 +++++++++++++++++++++++--- kernel/stop_machine.c | 11 +++++++---- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 3cc9632dcc2a..3d60275e3ba9 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -116,15 +116,29 @@ static inline int try_stop_cpus(const struct cpumask *cpumask, * @fn() runs. * * This can be thought of as a very heavy write lock, equivalent to - * grabbing every spinlock in the kernel. */ + * grabbing every spinlock in the kernel. + * + * Protects against CPU hotplug. + */ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); +/** + * stop_machine_cpuslocked: freeze the machine on all CPUs and run this function + * @fn: the function to run + * @data: the data ptr for the @fn() + * @cpus: the cpus to run the @fn() on (NULL = any online cpu) + * + * Same as above. Must be called from with in a cpus_read_lock() protected + * region. Avoids nested calls to cpus_read_lock(). + */ +int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); + int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); #else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -static inline int stop_machine(cpu_stop_fn_t fn, void *data, - const struct cpumask *cpus) +static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) { unsigned long flags; int ret; @@ -134,6 +148,12 @@ static inline int stop_machine(cpu_stop_fn_t fn, void *data, return ret; } +static inline int stop_machine(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) +{ + return stop_machine_cpuslocked(fn, data, cpus); +} + static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1eb82661ecdb..b7591261652d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -552,7 +552,8 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) +int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, @@ -561,6 +562,8 @@ static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cp .active_cpus = cpus, }; + lockdep_assert_cpus_held(); + if (!stop_machine_initialized) { /* * Handle the case where stop_machine() is called @@ -590,9 +593,9 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) int ret; /* No CPUs can come up or down during this. */ - get_online_cpus(); - ret = __stop_machine(fn, data, cpus); - put_online_cpus(); + cpus_read_lock(); + ret = stop_machine_cpuslocked(fn, data, cpus); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); From 9596695ee1e7eedd743c43811fe68299eb005b5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:17 +0200 Subject: [PATCH 06/37] padata: Make padata_alloc() static No users outside of padata.c Signed-off-by: Thomas Gleixner Tested-by: Paul E. 
McKenney Acked-by: Ingo Molnar Cc: Steffen Klassert Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/20170524081547.491457256@linutronix.de --- include/linux/padata.h | 3 --- kernel/padata.c | 32 ++++++++++++++++---------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/include/linux/padata.h b/include/linux/padata.h index 0f9e567d5e15..2f9c1f93b1ce 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -166,9 +166,6 @@ struct padata_instance { extern struct padata_instance *padata_alloc_possible( struct workqueue_struct *wq); -extern struct padata_instance *padata_alloc(struct workqueue_struct *wq, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask); extern void padata_free(struct padata_instance *pinst); extern int padata_do_parallel(struct padata_instance *pinst, struct padata_priv *padata, int cb_cpu); diff --git a/kernel/padata.c b/kernel/padata.c index ac8f1e524836..0c708f648853 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -933,19 +933,6 @@ static struct kobj_type padata_attr_type = { .release = padata_sysfs_release, }; -/** - * padata_alloc_possible - Allocate and initialize padata instance. - * Use the cpu_possible_mask for serial and - * parallel workers. - * - * @wq: workqueue to use for the allocated padata instance - */ -struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) -{ - return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); -} -EXPORT_SYMBOL(padata_alloc_possible); - /** * padata_alloc - allocate and initialize a padata instance and specify * cpumasks for serial and parallel workers. @@ -954,9 +941,9 @@ EXPORT_SYMBOL(padata_alloc_possible); * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization */ -struct padata_instance *padata_alloc(struct workqueue_struct *wq, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +static struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { struct padata_instance *pinst; struct parallel_data *pd = NULL; @@ -1010,6 +997,19 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, return NULL; } +/** + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + */ +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + /** * padata_free - free a padata instance * From c5a81c8ff816d89941fe86961b286765d6ca2f5f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:18 +0200 Subject: [PATCH 07/37] padata: Avoid nested calls to cpus_read_lock() in pcrypt_init_padata() pcrypt_init_padata() cpus_read_lock() padata_alloc_possible() padata_alloc() cpus_read_lock() The nested call to cpus_read_lock() works with the current implementation, but prevents the conversion to a percpu rwsem. The other caller of padata_alloc_possible() is pcrypt_init_padata() which calls from a cpus_read_lock() protected region as well. Remove the cpus_read_lock() call in padata_alloc() and document the calling convention. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. 
McKenney Acked-by: Ingo Molnar Cc: Steffen Klassert Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/20170524081547.571278910@linutronix.de --- kernel/padata.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/padata.c b/kernel/padata.c index 0c708f648853..868f947166d7 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -940,6 +940,8 @@ static struct kobj_type padata_attr_type = { * @wq: workqueue to use for the allocated padata instance * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization + * + * Must be called from a cpus_read_lock() protected region */ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, const struct cpumask *pcpumask, @@ -952,7 +954,6 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, if (!pinst) goto err; - get_online_cpus(); if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) goto err_free_inst; if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { @@ -976,14 +977,12 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, pinst->flags = 0; - put_online_cpus(); - BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); #endif return pinst; @@ -992,7 +991,6 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, free_cpumask_var(pinst->cpumask.cbcpu); err_free_inst: kfree(pinst); - put_online_cpus(); err: return NULL; } @@ -1003,9 +1001,12 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, * parallel workers. * * @wq: workqueue to use for the allocated padata instance + * + * Must be called from a cpus_read_lock() protected region */ struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) { + lockdep_assert_cpus_held(); return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); } EXPORT_SYMBOL(padata_alloc_possible); From 547efeadd42a3c75e41e33c0637cba100fc18289 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:19 +0200 Subject: [PATCH 08/37] x86/mtrr: Remove get_online_cpus() from mtrr_save_state() mtrr_save_state() is invoked from native_cpu_up() which is in the context of a CPU hotplug operation and therefor calling get_online_cpus() is pointless. While this works in the current get_online_cpus() implementation it prevents from converting the hotplug locking to percpu rwsems. Remove it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. 
McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.651378834@linutronix.de --- arch/x86/kernel/cpu/mtrr/main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 2bce84d91c2b..c5bb63be4ba1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -807,10 +807,8 @@ void mtrr_save_state(void) if (!mtrr_enabled()) return; - get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); - put_online_cpus(); } void set_mtrr_aps_delayed_init(void) From a92551e41d5a7b563ae440496bc5ca19d205231d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:20 +0200 Subject: [PATCH 09/37] cpufreq: Use cpuhp_setup_state_nocalls_cpuslocked() cpufreq holds get_online_cpus() while invoking cpuhp_setup_state_nocalls() to make subsys_interface_register() and the registration of hotplug calls atomic versus cpu hotplug. cpuhp_setup_state_nocalls() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use cpuhp_setup/remove_state_nocalls_cpuslocked() to avoid the nested call. Convert *_online_cpus() to the new interfaces while at it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Acked-by: "Rafael J. Wysocki" Acked-by: Viresh Kumar Cc: linux-pm@vger.kernel.org Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.731628408@linutronix.de --- drivers/cpufreq/cpufreq.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0e3f6496524d..6001369f9aeb 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -887,7 +887,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, struct freq_attr *fattr = to_attr(attr); ssize_t ret = -EINVAL; - get_online_cpus(); + cpus_read_lock(); if (cpu_online(policy->cpu)) { down_write(&policy->rwsem); @@ -895,7 +895,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, up_write(&policy->rwsem); } - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -2441,7 +2441,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) pr_debug("trying to register driver %s\n", driver_data->name); /* Protect against concurrent CPU online/offline. 
*/ - get_online_cpus(); + cpus_read_lock(); write_lock_irqsave(&cpufreq_driver_lock, flags); if (cpufreq_driver) { @@ -2473,9 +2473,10 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) goto err_if_unreg; } - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "cpufreq:online", - cpuhp_cpufreq_online, - cpuhp_cpufreq_offline); + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "cpufreq:online", + cpuhp_cpufreq_online, + cpuhp_cpufreq_offline); if (ret < 0) goto err_if_unreg; hp_online = ret; @@ -2493,7 +2494,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) cpufreq_driver = NULL; write_unlock_irqrestore(&cpufreq_driver_lock, flags); out: - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(cpufreq_register_driver); @@ -2516,17 +2517,17 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) pr_debug("unregistering driver %s\n", driver->name); /* Protect against concurrent cpu hotplug */ - get_online_cpus(); + cpus_read_lock(); subsys_interface_unregister(&cpufreq_interface); remove_boost_sysfs_file(); - cpuhp_remove_state_nocalls(hp_online); + cpuhp_remove_state_nocalls_cpuslocked(hp_online); write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver = NULL; write_unlock_irqrestore(&cpufreq_driver_lock, flags); - put_online_cpus(); + cpus_read_unlock(); return 0; } From 419af25fa4d0974fd758a668c08c369c19392a47 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:21 +0200 Subject: [PATCH 10/37] KVM/PPC/Book3S HV: Use cpuhp_setup_state_nocalls_cpuslocked() kvmppc_alloc_host_rm_ops() holds get_online_cpus() while invoking cpuhp_setup_state_nocalls(). cpuhp_setup_state_nocalls() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use cpuhp_setup_state_nocalls_cpuslocked() to avoid the nested call. Convert *_online_cpus() to the new interfaces while at it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Paul E. 
McKenney Cc: kvm@vger.kernel.org Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Steven Rostedt Cc: kvm-ppc@vger.kernel.org Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Cc: Alexander Graf Link: http://lkml.kernel.org/r/20170524081547.809616236@linutronix.de --- arch/powerpc/kvm/book3s_hv.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 42b7a4fd57d9..48a6bd160011 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3317,7 +3317,7 @@ void kvmppc_alloc_host_rm_ops(void) return; } - get_online_cpus(); + cpus_read_lock(); for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { if (!cpu_online(cpu)) @@ -3339,17 +3339,17 @@ void kvmppc_alloc_host_rm_ops(void) l_ops = (unsigned long) ops; if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { - put_online_cpus(); + cpus_read_unlock(); kfree(ops->rm_core); kfree(ops); return; } - cpuhp_setup_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE, - "ppc/kvm_book3s:prepare", - kvmppc_set_host_core, - kvmppc_clear_host_core); - put_online_cpus(); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE, + "ppc/kvm_book3s:prepare", + kvmppc_set_host_core, + kvmppc_clear_host_core); + cpus_read_unlock(); } void kvmppc_free_host_rm_ops(void) From e560c89c8ac0baadf0da351f602c599016568fc7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:22 +0200 Subject: [PATCH 11/37] hwtracing/coresight-etm3x: Use cpuhp_setup_state_nocalls_cpuslocked() etm_probe() holds get_online_cpus() while invoking cpuhp_setup_state_nocalls(). cpuhp_setup_state_nocalls() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use cpuhp_setup_state_nocalls_cpuslocked() to avoid the nested call. Convert *_online_cpus() to the new interfaces while at it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Mathieu Poirier Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20170524081547.889092478@linutronix.de --- drivers/hwtracing/coresight/coresight-etm3x.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c index a51b6b64ecdf..93ee8fc539be 100644 --- a/drivers/hwtracing/coresight/coresight-etm3x.c +++ b/drivers/hwtracing/coresight/coresight-etm3x.c @@ -587,7 +587,7 @@ static void etm_disable_sysfs(struct coresight_device *csdev) * after cpu online mask indicates the cpu is offline but before the * DYING hotplug callback is serviced by the ETM driver. */ - get_online_cpus(); + cpus_read_lock(); spin_lock(&drvdata->spinlock); /* @@ -597,7 +597,7 @@ static void etm_disable_sysfs(struct coresight_device *csdev) smp_call_function_single(drvdata->cpu, etm_disable_hw, drvdata, 1); spin_unlock(&drvdata->spinlock); - put_online_cpus(); + cpus_read_unlock(); dev_info(drvdata->dev, "ETM tracing disabled\n"); } @@ -795,7 +795,7 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id) drvdata->cpu = pdata ? 
pdata->cpu : 0; - get_online_cpus(); + cpus_read_lock(); etmdrvdata[drvdata->cpu] = drvdata; if (smp_call_function_single(drvdata->cpu, @@ -803,17 +803,17 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id) dev_err(dev, "ETM arch init failed\n"); if (!etm_count++) { - cpuhp_setup_state_nocalls(CPUHP_AP_ARM_CORESIGHT_STARTING, - "arm/coresight:starting", - etm_starting_cpu, etm_dying_cpu); - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "arm/coresight:online", - etm_online_cpu, NULL); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ARM_CORESIGHT_STARTING, + "arm/coresight:starting", + etm_starting_cpu, etm_dying_cpu); + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "arm/coresight:online", + etm_online_cpu, NULL); if (ret < 0) goto err_arch_supported; hp_online = ret; } - put_online_cpus(); + cpus_read_unlock(); if (etm_arch_supported(drvdata->arch) == false) { ret = -EINVAL; From e9f5d63f84febb7e9dfe4e0dc696adf88053fbf2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:23 +0200 Subject: [PATCH 12/37] hwtracing/coresight-etm4x: Use cpuhp_setup_state_nocalls_cpuslocked() etm_probe4() holds get_online_cpus() while invoking cpuhp_setup_state_nocalls(). cpuhp_setup_state_nocalls() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use cpuhp_setup_state_nocalls_cpuslocked() to avoid the nested call. Convert *_online_cpus() to the new interfaces while at it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Mathieu Poirier Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20170524081547.983493849@linutronix.de --- drivers/hwtracing/coresight/coresight-etm4x.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c index d1340fb4e457..532adc9dd32a 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x.c +++ b/drivers/hwtracing/coresight/coresight-etm4x.c @@ -371,7 +371,7 @@ static void etm4_disable_sysfs(struct coresight_device *csdev) * after cpu online mask indicates the cpu is offline but before the * DYING hotplug callback is serviced by the ETM driver. */ - get_online_cpus(); + cpus_read_lock(); spin_lock(&drvdata->spinlock); /* @@ -381,7 +381,7 @@ static void etm4_disable_sysfs(struct coresight_device *csdev) smp_call_function_single(drvdata->cpu, etm4_disable_hw, drvdata, 1); spin_unlock(&drvdata->spinlock); - put_online_cpus(); + cpus_read_unlock(); dev_info(drvdata->dev, "ETM tracing disabled\n"); } @@ -982,7 +982,7 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id) drvdata->cpu = pdata ? 
pdata->cpu : 0; - get_online_cpus(); + cpus_read_lock(); etmdrvdata[drvdata->cpu] = drvdata; if (smp_call_function_single(drvdata->cpu, @@ -990,18 +990,18 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id) dev_err(dev, "ETM arch init failed\n"); if (!etm4_count++) { - cpuhp_setup_state_nocalls(CPUHP_AP_ARM_CORESIGHT_STARTING, - "arm/coresight4:starting", - etm4_starting_cpu, etm4_dying_cpu); - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "arm/coresight4:online", - etm4_online_cpu, NULL); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ARM_CORESIGHT_STARTING, + "arm/coresight4:starting", + etm4_starting_cpu, etm4_dying_cpu); + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "arm/coresight4:online", + etm4_online_cpu, NULL); if (ret < 0) goto err_arch_supported; hp_online = ret; } - put_online_cpus(); + cpus_read_unlock(); if (etm4_arch_supported(drvdata->arch) == false) { ret = -EINVAL; From 04b247c2ebdd6ba1c46c7c22546229a89760b43a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:24 +0200 Subject: [PATCH 13/37] perf/x86/intel/cqm: Use cpuhp_setup_state_cpuslocked() intel_cqm_init() holds get_online_cpus() while registerring the hotplug callbacks. cpuhp_setup_state() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use cpuhp_setup_state_cpuslocked() to avoid the nested call. Convert *_online_cpus() to the new interfaces while at it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Paul E. McKenney Cc: Fenghua Yu Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081548.075604046@linutronix.de --- arch/x86/events/intel/cqm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 8c00dc09a5d2..2521f771f2f5 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -1682,7 +1682,7 @@ static int __init intel_cqm_init(void) * * Also, check that the scales match on all cpus. */ - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -1746,14 +1746,14 @@ static int __init intel_cqm_init(void) * Setup the hot cpu notifier once we are sure cqm * is enabled to avoid notifier leak. */ - cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING, - "perf/x86/cqm:starting", - intel_cqm_cpu_starting, NULL); - cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "perf/x86/cqm:online", - NULL, intel_cqm_cpu_exit); - + cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, + "perf/x86/cqm:starting", + intel_cqm_cpu_starting, NULL); + cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, + "perf/x86/cqm:online", + NULL, intel_cqm_cpu_exit); out: - put_online_cpus(); + cpus_read_unlock(); if (ret) { kfree(str); From fe2a5cd8aa038e2b02fda983afc2083e94c04b4f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:25 +0200 Subject: [PATCH 14/37] ARM/hw_breakpoint: Use cpuhp_setup_state_cpuslocked() arch_hw_breakpoint_init() holds get_online_cpus() while registerring the hotplug callbacks. cpuhp_setup_state() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use cpuhp_setup_state_cpuslocked() to avoid the nested call. Convert *_online_cpus() to the new interfaces while at it. 
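
For reference, the conversion pattern applied by the last few patches can be summarized by the following sketch. All names here (the state string, the callback, the init function) are illustrative only and not taken from any of the drivers above; it assumes <linux/cpu.h> and <linux/cpuhotplug.h> are included:

	static enum cpuhp_state example_hp_online;

	static int example_online_cpu(unsigned int cpu)
	{
		/* per-CPU setup work goes here */
		return 0;
	}

	static int __init example_init(void)
	{
		int ret;

		cpus_read_lock();	/* was: get_online_cpus() */
		ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN,
							   "example:online",
							   example_online_cpu,
							   NULL);
		if (ret > 0)
			example_hp_online = ret;
		cpus_read_unlock();	/* was: put_online_cpus() */
		return ret < 0 ? ret : 0;
	}

The hotplug lock is still taken by the caller, but only once; the _cpuslocked() variant only asserts via lockdep_assert_cpus_held() that the lock is held instead of acquiring it again.
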
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Mark Rutland Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Will Deacon Cc: Steven Rostedt Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20170524081548.170940729@linutronix.de --- arch/arm/kernel/hw_breakpoint.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index be3b3fbd382f..63cb4c7c6593 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -1090,7 +1090,7 @@ static int __init arch_hw_breakpoint_init(void) * driven low on this core and there isn't an architected way to * determine that. */ - get_online_cpus(); + cpus_read_lock(); register_undef_hook(&debug_reg_hook); /* @@ -1098,15 +1098,16 @@ static int __init arch_hw_breakpoint_init(void) * assume that a halting debugger will leave the world in a nice state * for us. */ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm/hw_breakpoint:online", - dbg_reset_online, NULL); + ret = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "arm/hw_breakpoint:online", + dbg_reset_online, NULL); unregister_undef_hook(&debug_reg_hook); if (WARN_ON(ret < 0) || !cpumask_empty(&debug_err_mask)) { core_num_brps = 0; core_num_wrps = 0; if (ret > 0) cpuhp_remove_state_nocalls(ret); - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -1124,7 +1125,7 @@ static int __init arch_hw_breakpoint_init(void) TRAP_HWBKPT, "watchpoint debug exception"); hook_ifault_code(FAULT_CODE_DEBUG, hw_breakpoint_pending, SIGTRAP, TRAP_HWBKPT, "breakpoint debug exception"); - put_online_cpus(); + cpus_read_unlock(); /* Register PM notifiers. */ pm_init(); From 2337e879e8805a630b418f3e73a98084d4724b83 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:26 +0200 Subject: [PATCH 15/37] s390/kernel: Use stop_machine_cpuslocked() stp_work_fn() holds get_online_cpus() while invoking stop_machine(). stop_machine() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use stop_machine_cpuslocked() to avoid the nested call. Convert *_online_cpus() to the new interfaces while at it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Heiko Carstens Cc: Paul E. 
McKenney Cc: linux-s390@vger.kernel.org Cc: Peter Zijlstra Cc: Steven Rostedt Cc: David Hildenbrand Cc: Martin Schwidefsky Link: http://lkml.kernel.org/r/20170524081548.250203087@linutronix.de --- arch/s390/kernel/time.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index c3a52f9a69a0..192efdfac918 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -636,10 +636,10 @@ static void stp_work_fn(struct work_struct *work) goto out_unlock; memset(&stp_sync, 0, sizeof(stp_sync)); - get_online_cpus(); + cpus_read_lock(); atomic_set(&stp_sync.cpus, num_online_cpus() - 1); - stop_machine(stp_sync_clock, &stp_sync, cpu_online_mask); - put_online_cpus(); + stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); + cpus_read_unlock(); if (!check_sync_clock()) /* From f9a69931c3959940538884d5962b770c3db75df5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:27 +0200 Subject: [PATCH 16/37] powerpc/powernv: Use stop_machine_cpuslocked() set_subcores_per_core() holds get_online_cpus() while invoking stop_machine(). stop_machine() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use stop_machine_cpuslocked() to avoid the nested call. Convert *_online_cpus() to the new interfaces while at it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Steven Rostedt Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170524081548.331016542@linutronix.de --- arch/powerpc/platforms/powernv/subcore.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 0babef11136f..e6230f104dd9 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -348,7 +348,7 @@ static int set_subcores_per_core(int new_mode) state->master = 0; } - get_online_cpus(); + cpus_read_lock(); /* This cpu will update the globals before exiting stop machine */ this_cpu_ptr(&split_state)->master = 1; @@ -356,9 +356,10 @@ static int set_subcores_per_core(int new_mode) /* Ensure state is consistent before we call the other cpus */ mb(); - stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask); + stop_machine_cpuslocked(cpu_update_split_mode, &new_mode, + cpu_online_mask); - put_online_cpus(); + cpus_read_unlock(); return 0; } From 210e21331fc3a396af640cec652be769d146e49f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:28 +0200 Subject: [PATCH 17/37] cpu/hotplug: Use stop_machine_cpuslocked() in takedown_cpu() takedown_cpu() is a cpu hotplug function invoking stop_machine(). The cpu hotplug machinery holds the hotplug lock for write. stop_machine() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use stop_machine_cpuslocked() to avoid the nested call. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. 
McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081548.423292433@linutronix.de --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index e4389ac55b65..142d889d9f69 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -701,7 +701,7 @@ static int takedown_cpu(unsigned int cpu) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); + err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU refused to die */ irq_unlock_sparse(); From 27d3b157fee0bad264eb745d5c547e2e0676f1a2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:29 +0200 Subject: [PATCH 18/37] x86/perf: Drop EXPORT of perf_check_microcode The only caller is the microcode update, which cannot be modular. Drop the export. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Borislav Petkov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20170524081548.515204988@linutronix.de --- arch/x86/events/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 580b60f5ac83..ac650d57ebf7 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2224,7 +2224,6 @@ void perf_check_microcode(void) if (x86_pmu.check_microcode) x86_pmu.check_microcode(); } -EXPORT_SYMBOL_GPL(perf_check_microcode); static struct pmu pmu = { .pmu_enable = x86_pmu_enable, From 1ba143a5216fb148211160a0ecc1f8d3f92f06bb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:30 +0200 Subject: [PATCH 19/37] perf/x86/intel: Drop get_online_cpus() in intel_snb_check_microcode() If intel_snb_check_microcode() is invoked via microcode_init -> perf_check_microcode -> intel_snb_check_microcode then get_online_cpus() is invoked nested. This works with the current implementation of get_online_cpus() but prevents converting it to a percpu rwsem. intel_snb_check_microcode() is also invoked from intel_sandybridge_quirk() unprotected. Drop get_online_cpus() from intel_snb_check_microcode() and add it to intel_sandybridge_quirk() so both call sites are protected. Convert *_online_cpus() to the new interfaces while at it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Borislav Petkov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20170524081548.594862191@linutronix.de --- arch/x86/events/intel/core.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a6d91d4e37a1..b9174aacf42f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3410,12 +3410,10 @@ static void intel_snb_check_microcode(void) int pebs_broken = 0; int cpu; - get_online_cpus(); for_each_online_cpu(cpu) { if ((pebs_broken = intel_snb_pebs_broken(cpu))) break; } - put_online_cpus(); if (pebs_broken == x86_pmu.pebs_broken) return; @@ -3488,7 +3486,9 @@ static bool check_msr(unsigned long msr, u64 mask) static __init void intel_sandybridge_quirk(void) { x86_pmu.check_microcode = intel_snb_check_microcode; + cpus_read_lock(); intel_snb_check_microcode(); + cpus_read_unlock(); } static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { @@ -4112,13 +4112,12 @@ static __init int fixup_ht_bug(void) lockup_detector_resume(); - get_online_cpus(); + cpus_read_lock(); - for_each_online_cpu(c) { + for_each_online_cpu(c) free_excl_cntrs(c); - } - put_online_cpus(); + cpus_read_unlock(); pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); return 0; } From 1ddd45f8d76f0c15ec4e44073eeaaee6a806ee81 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:31 +0200 Subject: [PATCH 20/37] PCI: Use cpu_hotplug_disable() instead of get_online_cpus() Converting the hotplug locking, i.e. get_online_cpus(), to a percpu rwsem unearthed a circular lock dependency which was hidden from lockdep due to the lockdep annotation of get_online_cpus() which prevents lockdep from creating full dependency chains. There are several variants of this. And example is: Chain exists of: cpu_hotplug_lock.rw_sem --> drm_global_mutex --> &item->mutex CPU0 CPU1 ---- ---- lock(&item->mutex); lock(drm_global_mutex); lock(&item->mutex); lock(cpu_hotplug_lock.rw_sem); because there are dependencies through workqueues. The call chain is: get_online_cpus apply_workqueue_attrs __alloc_workqueue_key ttm_mem_global_init ast_ttm_mem_global_init drm_global_item_ref ast_mm_init ast_driver_load drm_dev_register drm_get_pci_dev ast_pci_probe local_pci_probe work_for_cpu_fn process_one_work worker_thread This is not a problem of get_online_cpus() recursion, it's a possible deadlock undetected by lockdep so far. The cure is to use cpu_hotplug_disable() instead of get_online_cpus() to protect the PCI probing. There is a side effect to this: cpu_hotplug_disable() makes a concurrent cpu hotplug attempt via the sysfs interfaces fail with -EBUSY, but PCI probing usually happens during the boot process where no interaction is possible. Any later invocations are infrequent enough and concurrent hotplug attempts are so unlikely that the danger of user space visible regressions is very close to zero. Anyway, thats preferrable over a real deadlock. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Bjorn Helgaas Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: linux-pci@vger.kernel.org Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081548.691198590@linutronix.de --- drivers/pci/pci-driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 192e7b681b96..5bf92fd983e5 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -349,13 +349,13 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, if (node >= 0 && node != numa_node_id()) { int cpu; - get_online_cpus(); + cpu_hotplug_disable(); cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else error = local_pci_probe(&ddi); - put_online_cpus(); + cpu_hotplug_enable(); } else error = local_pci_probe(&ddi); From 0b2c2a71e6f07fb67e6f72817d39910f64d2e258 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:32 +0200 Subject: [PATCH 21/37] PCI: Replace the racy recursion prevention pci_call_probe() can called recursively when a physcial function is probed and the probing creates virtual functions, which are populated via pci_bus_add_device() which in turn can end up calling pci_call_probe() again. The code has an interesting way to prevent recursing into the workqueue code. That's accomplished by a check whether the current task runs already on the numa node which is associated with the device. While that works to prevent the recursion into the workqueue code, it's racy versus normal execution as there is no guarantee that the node does not vanish after the check. There is another issue with this code. It dereferences cpumask_of_node() unconditionally without checking whether the node is available. Make the detection reliable by: - Mark a probed device as 'is_probed' in pci_call_probe() - Check in pci_call_probe for a virtual function. If it's a virtual function and the associated physical function device is marked 'is_probed' then this is a recursive call, so the call can be invoked in the calling context. - Add a check whether the node is online before dereferencing it. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Bjorn Helgaas Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: linux-pci@vger.kernel.org Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081548.771457199@linutronix.de --- drivers/pci/pci-driver.c | 47 +++++++++++++++++++++------------------- include/linux/pci.h | 1 + 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 5bf92fd983e5..fe6be6382505 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -320,10 +320,19 @@ static long local_pci_probe(void *_ddi) return 0; } +static bool pci_physfn_is_probed(struct pci_dev *dev) +{ +#ifdef CONFIG_PCI_IOV + return dev->is_virtfn && dev->physfn->is_probed; +#else + return false; +#endif +} + static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, const struct pci_device_id *id) { - int error, node; + int error, node, cpu; struct drv_dev_and_id ddi = { drv, dev, id }; /* @@ -332,33 +341,27 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, * on the right node. */ node = dev_to_node(&dev->dev); + dev->is_probed = 1; + + cpu_hotplug_disable(); /* - * On NUMA systems, we are likely to call a PF probe function using - * work_on_cpu(). 
If that probe calls pci_enable_sriov() (which - * adds the VF devices via pci_bus_add_device()), we may re-enter - * this function to call the VF probe function. Calling - * work_on_cpu() again will cause a lockdep warning. Since VFs are - * always on the same node as the PF, we can work around this by - * avoiding work_on_cpu() when we're already on the correct node. - * - * Preemption is enabled, so it's theoretically unsafe to use - * numa_node_id(), but even if we run the probe function on the - * wrong node, it should be functionally correct. + * Prevent nesting work_on_cpu() for the case where a Virtual Function + * device is probed from work_on_cpu() of the Physical device. */ - if (node >= 0 && node != numa_node_id()) { - int cpu; - - cpu_hotplug_disable(); + if (node < 0 || node >= MAX_NUMNODES || !node_online(node) || + pci_physfn_is_probed(dev)) + cpu = nr_cpu_ids; + else cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); - if (cpu < nr_cpu_ids) - error = work_on_cpu(cpu, local_pci_probe, &ddi); - else - error = local_pci_probe(&ddi); - cpu_hotplug_enable(); - } else + + if (cpu < nr_cpu_ids) + error = work_on_cpu(cpu, local_pci_probe, &ddi); + else error = local_pci_probe(&ddi); + dev->is_probed = 0; + cpu_hotplug_enable(); return error; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 33c2b0b77429..5026f2ae86db 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -371,6 +371,7 @@ struct pci_dev { unsigned int irq_managed:1; unsigned int has_secondary_link:1; unsigned int non_compliant_bars:1; /* broken BARs; ignore them */ + unsigned int is_probed:1; /* device probing in progress */ pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ From fdaf0a51bad496289356d11d796095a293794b5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:33 +0200 Subject: [PATCH 22/37] ACPI/processor: Use cpu_hotplug_disable() instead of get_online_cpus() Converting the hotplug locking, i.e. get_online_cpus(), to a percpu rwsem unearthed a circular lock dependency which was hidden from lockdep due to the lockdep annotation of get_online_cpus() which prevents lockdep from creating full dependency chains. CPU0 CPU1 ---- ---- lock((&wfc.work)); lock(cpu_hotplug_lock.rw_sem); lock((&wfc.work)); lock(cpu_hotplug_lock.rw_sem); This dependency is established via acpi_processor_start() which calls into the work queue code. And the work queue code establishes the reverse dependency. This is not a problem of get_online_cpus() recursion, it's a possible deadlock undetected by lockdep so far. The cure is to use cpu_hotplug_disable() instead of get_online_cpus() to protect the probing from acpi_processor_start(). There is a side effect to this: cpu_hotplug_disable() makes a concurrent cpu hotplug attempt via the sysfs interfaces fail with -EBUSY, but that probing usually happens during the boot process where no interaction is possible. Any later invocations are infrequent enough and concurrent hotplug attempts are so unlikely that the danger of user space visible regressions is very close to zero. Anyway, thats preferrable over a real deadlock. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Rafael J. Wysocki Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: linux-acpi@vger.kernel.org Cc: Len Brown Link: http://lkml.kernel.org/r/20170524081548.851588594@linutronix.de --- drivers/acpi/processor_driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 8697a82bd465..591d1dd3f04e 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -268,9 +268,9 @@ static int acpi_processor_start(struct device *dev) return -ENODEV; /* Protect against concurrent CPU hotplug operations */ - get_online_cpus(); + cpu_hotplug_disable(); ret = __acpi_processor_start(device); - put_online_cpus(); + cpu_hotplug_enable(); return ret; } From a63fbed776c7124ce9f606234267c3c095b2680e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:34 +0200 Subject: [PATCH 23/37] perf/tracing/cpuhotplug: Fix locking order perf, tracing, kprobes and jump_labels have a gazillion of ways to create dependency lock chains. Some of those involve nested invocations of get_online_cpus(). The conversion of the hotplug locking to a percpu rwsem requires to avoid such nested calls. sys_perf_event_open() protects most of the syscall logic against cpu hotplug. This causes nested calls and lock inversions versus ftrace and kprobes in various interesting ways. It's impossible to move the hotplug locking to the outer end of all call chains in the involved facilities, so the hotplug protection in sys_perf_event_open() needs to be solved differently. Introduce 'pmus_mutex' which protects a perf private online cpumask. This mutex is taken when the mask is updated in the cpu hotplug callbacks and can be taken in sys_perf_event_open() to protect the swhash setup/teardown code and when the final judgement about a valid event has to be made. [ tglx: Produced changelog and fixed the swhash interaction ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Paul E. McKenney Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Masami Hiramatsu Link: http://lkml.kernel.org/r/20170524081548.930941109@linutronix.de --- include/linux/perf_event.h | 2 + kernel/events/core.c | 106 ++++++++++++++++++++++++++----------- 2 files changed, 78 insertions(+), 30 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 24a635887f28..7d6aa29094b2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -801,6 +801,8 @@ struct perf_cpu_context { struct list_head sched_cb_entry; int sched_cb_usage; + + int online; }; struct perf_output_handle { diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..b97cda4d1777 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -389,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; +static cpumask_var_t perf_online_mask; /* * perf event paranoia level: @@ -3812,14 +3813,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. 
- */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); @@ -7703,7 +7696,8 @@ static int swevent_hlist_get_cpu(int cpu) int err = 0; mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { + if (!swevent_hlist_deref(swhash) && + cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); @@ -7724,7 +7718,7 @@ static int swevent_hlist_get(void) { int err, cpu, failed_cpu; - get_online_cpus(); + mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { @@ -7732,8 +7726,7 @@ static int swevent_hlist_get(void) goto fail; } } - put_online_cpus(); - + mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { @@ -7741,8 +7734,7 @@ static int swevent_hlist_get(void) break; swevent_hlist_put_cpu(cpu); } - - put_online_cpus(); + mutex_unlock(&pmus_lock); return err; } @@ -8920,7 +8912,7 @@ perf_event_mux_interval_ms_store(struct device *dev, pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); @@ -8929,7 +8921,7 @@ perf_event_mux_interval_ms_store(struct device *dev, cpu_function_call(cpu, (remote_function_f)perf_mux_hrtimer_restart, cpuctx); } - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; @@ -9059,6 +9051,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.pmu = pmu; + cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); __perf_mux_hrtimer_init(cpuctx, cpu); } @@ -9882,12 +9875,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } - get_online_cpus(); - if (task) { err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (err) - goto err_cpus; + goto err_cred; /* * Reuse ptrace permission checks for now. @@ -10073,6 +10064,23 @@ SYSCALL_DEFINE5(perf_event_open, goto err_locked; } + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). + */ + struct perf_cpu_context *cpuctx = + container_of(ctx, struct perf_cpu_context, ctx); + + if (!cpuctx->online) { + err = -ENODEV; + goto err_locked; + } + } + + /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. @@ -10162,8 +10170,6 @@ SYSCALL_DEFINE5(perf_event_open, put_task_struct(task); } - put_online_cpus(); - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -10197,8 +10203,6 @@ SYSCALL_DEFINE5(perf_event_open, err_cred: if (task) mutex_unlock(&task->signal->cred_guard_mutex); -err_cpus: - put_online_cpus(); err_task: if (task) put_task_struct(task); @@ -10253,6 +10257,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
+ */ + struct perf_cpu_context *cpuctx = + container_of(ctx, struct perf_cpu_context, ctx); + if (!cpuctx->online) { + err = -ENODEV; + goto err_unlock; + } + } + if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_unlock; @@ -10920,6 +10939,8 @@ static void __init perf_event_init_all_cpus(void) struct swevent_htable *swhash; int cpu; + zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); @@ -10935,7 +10956,7 @@ static void __init perf_event_init_all_cpus(void) } } -int perf_event_init_cpu(unsigned int cpu) +void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -10948,7 +10969,6 @@ int perf_event_init_cpu(unsigned int cpu) rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); - return 0; } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE @@ -10966,19 +10986,22 @@ static void __perf_event_exit_context(void *__info) static void perf_event_exit_cpu_context(int cpu) { + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; struct pmu *pmu; - int idx; - idx = srcu_read_lock(&pmus_srcu); - list_for_each_entry_rcu(pmu, &pmus, entry) { - ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + mutex_lock(&pmus_lock); + list_for_each_entry(pmu, &pmus, entry) { + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; mutex_unlock(&ctx->mutex); } - srcu_read_unlock(&pmus_srcu, idx); + cpumask_clear_cpu(cpu, perf_online_mask); + mutex_unlock(&pmus_lock); } #else @@ -10986,6 +11009,29 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +int perf_event_init_cpu(unsigned int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + + perf_swevent_init_cpu(cpu); + + mutex_lock(&pmus_lock); + cpumask_set_cpu(cpu, perf_online_mask); + list_for_each_entry(pmu, &pmus, entry) { + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + cpuctx->online = 1; + mutex_unlock(&ctx->mutex); + } + mutex_unlock(&pmus_lock); + + return 0; +} + int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); From f2545b2d4ce13e068897ef60ae64dffe215f4152 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:35 +0200 Subject: [PATCH 24/37] jump_label: Reorder hotplug lock and jump_label_lock The conversion of the hotplug locking to a percpu rwsem unearthed lock ordering issues all over the place. The jump_label code has two issues: 1) Nested get_online_cpus() invocations 2) Ordering problems vs. the cpus rwsem and the jump_label_mutex To cure these, the following lock order has been established: cpus_rwsem -> jump_label_lock -> text_mutex Even if not all architectures need protection against CPU hotplug, taking cpus_rwsem before jump_label_lock is now mandatory in code paths which actually modify code and therefore need text_mutex protection. Move the get_online_cpus() invocations into the core jump label code and establish the proper lock order where required. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: "David S. Miller" Cc: Paul E.
McKenney Cc: Chris Metcalf Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Jason Baron Cc: Ralf Baechle Link: http://lkml.kernel.org/r/20170524081549.025830817@linutronix.de --- arch/mips/kernel/jump_label.c | 2 -- arch/sparc/kernel/jump_label.c | 2 -- arch/tile/kernel/jump_label.c | 2 -- arch/x86/kernel/jump_label.c | 2 -- kernel/jump_label.c | 20 ++++++++++++++------ 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c index 3e586daa3a32..32e3168316cd 100644 --- a/arch/mips/kernel/jump_label.c +++ b/arch/mips/kernel/jump_label.c @@ -58,7 +58,6 @@ void arch_jump_label_transform(struct jump_entry *e, insn.word = 0; /* nop */ } - get_online_cpus(); mutex_lock(&text_mutex); if (IS_ENABLED(CONFIG_CPU_MICROMIPS)) { insn_p->halfword[0] = insn.word >> 16; @@ -70,7 +69,6 @@ void arch_jump_label_transform(struct jump_entry *e, (unsigned long)insn_p + sizeof(*insn_p)); mutex_unlock(&text_mutex); - put_online_cpus(); } #endif /* HAVE_JUMP_LABEL */ diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c index 07933b9e9ce0..93adde1ac166 100644 --- a/arch/sparc/kernel/jump_label.c +++ b/arch/sparc/kernel/jump_label.c @@ -41,12 +41,10 @@ void arch_jump_label_transform(struct jump_entry *entry, val = 0x01000000; } - get_online_cpus(); mutex_lock(&text_mutex); *insn = val; flushi(insn); mutex_unlock(&text_mutex); - put_online_cpus(); } #endif diff --git a/arch/tile/kernel/jump_label.c b/arch/tile/kernel/jump_label.c index 07802d586988..93931a46625b 100644 --- a/arch/tile/kernel/jump_label.c +++ b/arch/tile/kernel/jump_label.c @@ -45,14 +45,12 @@ static void __jump_label_transform(struct jump_entry *e, void arch_jump_label_transform(struct jump_entry *e, enum jump_label_type type) { - get_online_cpus(); mutex_lock(&text_mutex); __jump_label_transform(e, type); flush_icache_range(e->code, e->code + sizeof(tilegx_bundle_bits)); mutex_unlock(&text_mutex); - put_online_cpus(); } __init_or_module void arch_jump_label_transform_static(struct jump_entry *e, diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index c37bd0f39c70..ab4f491da2a9 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -105,11 +105,9 @@ static void __jump_label_transform(struct jump_entry *entry, void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - get_online_cpus(); mutex_lock(&text_mutex); __jump_label_transform(entry, type, NULL, 0); mutex_unlock(&text_mutex); - put_online_cpus(); } static enum { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 6c9cb208ac48..d11c506a6ac3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL @@ -124,6 +125,7 @@ void static_key_slow_inc(struct static_key *key) return; } + cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); @@ -133,12 +135,14 @@ void static_key_slow_inc(struct static_key *key) atomic_inc(&key->enabled); } jump_label_unlock(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + cpus_read_lock(); /* * The negative count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a @@ -149,6 +153,7 @@ static void __static_key_slow_dec(struct static_key *key, if (!atomic_dec_and_mutex_lock(&key->enabled, 
&jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); + cpus_read_unlock(); return; } @@ -159,6 +164,7 @@ static void __static_key_slow_dec(struct static_key *key, jump_label_update(key); } jump_label_unlock(); + cpus_read_unlock(); } static void jump_label_update_timeout(struct work_struct *work) @@ -334,6 +340,7 @@ void __init jump_label_init(void) if (static_key_initialized) return; + cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); @@ -353,6 +360,7 @@ void __init jump_label_init(void) } static_key_initialized = true; jump_label_unlock(); + cpus_read_unlock(); } #ifdef CONFIG_MODULES @@ -590,28 +598,28 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, struct module *mod = data; int ret = 0; + cpus_read_lock(); + jump_label_lock(); + switch (val) { case MODULE_STATE_COMING: - jump_label_lock(); ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } - jump_label_unlock(); break; case MODULE_STATE_GOING: - jump_label_lock(); jump_label_del_module(mod); - jump_label_unlock(); break; case MODULE_STATE_LIVE: - jump_label_lock(); jump_label_invalidate_module_init(mod); - jump_label_unlock(); break; } + jump_label_unlock(); + cpus_read_unlock(); + return notifier_from_errno(ret); } From 2d1e38f56622b9bb5af85be63c1052c056f5c677 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:36 +0200 Subject: [PATCH 25/37] kprobes: Cure hotplug lock ordering issues Converting the cpu hotplug locking to a percpu rwsem unearthed hidden lock ordering problems. There is a wide range of locks involved in this: kprobe_mutex, jump_label_mutex, ftrace_lock, text_mutex, event_mutex, module_mutex, func_hash->regex_lock and a gazillion of lock order permutations with nested get_online_cpus() calls. Some of those permutations are potential deadlocks even with the current nesting hotplug locking scheme, but they can't be discovered by lockdep. The conversion of the hotplug locking to a percpu rwsem requires to prevent nested locking, so it's required to take the hotplug rwsem early in the call chain and establish a proper lock order. After quite some analysis and going down the wrong road severa times the following lock order has been chosen: kprobe_mutex -> cpus_rwsem -> jump_label_mutex -> text_mutex For kprobes which hook on an ftrace function trace point, it's required to drop cpus_rwsem before calling into the ftrace code to avoid a deadlock on the func_hash->regex_lock. [ Steven: Ftrace interaction fixes ] Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Masami Hiramatsu Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Sebastian Siewior Link: http://lkml.kernel.org/r/20170524081549.104864779@linutronix.de --- kernel/kprobes.c | 59 ++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d2d3a568e4e..9f6056749a28 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -483,11 +483,6 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); */ static void do_optimize_kprobes(void) { - /* Optimization never be done when disarmed */ - if (kprobes_all_disarmed || !kprobes_allow_optimization || - list_empty(&optimizing_list)) - return; - /* * The optimization/unoptimization refers online_cpus via * stop_machine() and cpu-hotplug modifies online_cpus. @@ -495,14 +490,19 @@ static void do_optimize_kprobes(void) * This combination can cause a deadlock (cpu-hotplug try to lock * text_mutex but stop_machine can not be done because online_cpus * has been changed) - * To avoid this deadlock, we need to call get_online_cpus() + * To avoid this deadlock, caller must have locked cpu hotplug * for preventing cpu-hotplug outside of text_mutex locking. */ - get_online_cpus(); + lockdep_assert_cpus_held(); + + /* Optimization never be done when disarmed */ + if (kprobes_all_disarmed || !kprobes_allow_optimization || + list_empty(&optimizing_list)) + return; + mutex_lock(&text_mutex); arch_optimize_kprobes(&optimizing_list); mutex_unlock(&text_mutex); - put_online_cpus(); } /* @@ -513,12 +513,13 @@ static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; + /* See comment in do_optimize_kprobes() */ + lockdep_assert_cpus_held(); + /* Unoptimization must be done anytime */ if (list_empty(&unoptimizing_list)) return; - /* Ditto to do_optimize_kprobes */ - get_online_cpus(); mutex_lock(&text_mutex); arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ @@ -537,7 +538,6 @@ static void do_unoptimize_kprobes(void) list_del_init(&op->list); } mutex_unlock(&text_mutex); - put_online_cpus(); } /* Reclaim all kprobes on the free_list */ @@ -562,6 +562,7 @@ static void kick_kprobe_optimizer(void) static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); + cpus_read_lock(); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -587,6 +588,7 @@ static void kprobe_optimizer(struct work_struct *work) do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); + cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ @@ -650,9 +652,8 @@ static void optimize_kprobe(struct kprobe *p) /* Short cut to direct unoptimizing */ static void force_unoptimize_kprobe(struct optimized_kprobe *op) { - get_online_cpus(); + lockdep_assert_cpus_held(); arch_unoptimize_kprobe(op); - put_online_cpus(); if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); } @@ -791,6 +792,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) return; /* For preparing optimization, jump_label_text_reserved() is called */ + cpus_read_lock(); jump_label_lock(); mutex_lock(&text_mutex); @@ -812,6 +814,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) out: mutex_unlock(&text_mutex); jump_label_unlock(); + cpus_read_unlock(); } #ifdef CONFIG_SYSCTL @@ -826,6 +829,7 @@ static void optimize_all_kprobes(void) if (kprobes_allow_optimization) goto out; + cpus_read_lock(); kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -833,6 +837,7 @@ 
static void optimize_all_kprobes(void) if (!kprobe_disabled(p)) optimize_kprobe(p); } + cpus_read_unlock(); printk(KERN_INFO "Kprobes globally optimized\n"); out: mutex_unlock(&kprobe_mutex); @@ -851,6 +856,7 @@ static void unoptimize_all_kprobes(void) return; } + cpus_read_lock(); kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -859,6 +865,7 @@ static void unoptimize_all_kprobes(void) unoptimize_kprobe(p, false); } } + cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Wait for unoptimizing completion */ @@ -1010,14 +1017,11 @@ static void arm_kprobe(struct kprobe *kp) arm_kprobe_ftrace(kp); return; } - /* - * Here, since __arm_kprobe() doesn't use stop_machine(), - * this doesn't cause deadlock on text_mutex. So, we don't - * need get_online_cpus(). - */ + cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); + cpus_read_unlock(); } /* Disarm a kprobe with text_mutex */ @@ -1027,10 +1031,12 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt) disarm_kprobe_ftrace(kp); return; } - /* Ditto */ + + cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); + cpus_read_unlock(); } /* @@ -1298,13 +1304,10 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) int ret = 0; struct kprobe *ap = orig_p; + cpus_read_lock(); + /* For preparing optimization, jump_label_text_reserved() is called */ jump_label_lock(); - /* - * Get online CPUs to avoid text_mutex deadlock.with stop machine, - * which is invoked by unoptimize_kprobe() in add_new_kprobe() - */ - get_online_cpus(); mutex_lock(&text_mutex); if (!kprobe_aggrprobe(orig_p)) { @@ -1352,8 +1355,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) out: mutex_unlock(&text_mutex); - put_online_cpus(); jump_label_unlock(); + cpus_read_unlock(); if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; @@ -1555,9 +1558,12 @@ int register_kprobe(struct kprobe *p) goto out; } - mutex_lock(&text_mutex); /* Avoiding text modification */ + cpus_read_lock(); + /* Prevent text modification */ + mutex_lock(&text_mutex); ret = prepare_kprobe(p); mutex_unlock(&text_mutex); + cpus_read_unlock(); if (ret) goto out; @@ -1570,7 +1576,6 @@ int register_kprobe(struct kprobe *p) /* Try to optimize kprobe */ try_to_optimize_kprobe(p); - out: mutex_unlock(&kprobe_mutex); From c23a465625e287c4deba0fdf5e8adc59cfd2a0b7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:37 +0200 Subject: [PATCH 26/37] arm64: Prevent cpu hotplug rwsem recursion The text patching functions which are invoked from jump_label and kprobes code are protected against cpu hotplug at the call sites. Use stop_machine_cpuslocked() to avoid recursion on the cpu hotplug rwsem. stop_machine_cpuslocked() contains a lockdep assertion to catch any unprotected callers. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Paul E. 
McKenney Cc: Mark Rutland Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Sebastian Siewior Cc: Will Deacon Cc: Steven Rostedt Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20170524081549.197070135@linutronix.de --- arch/arm64/include/asm/insn.h | 1 - arch/arm64/kernel/insn.c | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 29cb2ca756f6..4214c38d016b 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -433,7 +433,6 @@ u32 aarch64_set_branch_offset(u32 insn, s32 offset); bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn); int aarch64_insn_patch_text_nosync(void *addr, u32 insn); -int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt); int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); s32 aarch64_insn_adrp_get_offset(u32 insn); diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index b884a926a632..cd872133e88e 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -255,6 +255,7 @@ static int __kprobes aarch64_insn_patch_text_cb(void *arg) return ret; } +static int __kprobes aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt) { struct aarch64_insn_patch patch = { @@ -267,8 +268,8 @@ int __kprobes aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt) if (cnt <= 0) return -EINVAL; - return stop_machine(aarch64_insn_patch_text_cb, &patch, - cpu_online_mask); + return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, + cpu_online_mask); } int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) From 9489cc8f370be811f7e741a772bcce88b712272d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:38 +0200 Subject: [PATCH 27/37] arm: Prevent hotplug rwsem recursion The text patching functions which are invoked from jump_label and kprobes code are protected against cpu hotplug at the call sites. Use stop_machine_cpuslocked() to avoid recursion on the cpu hotplug rwsem. stop_machine_cpuslocked() contains a lockdep assertion to catch any unprotected callers. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20170524081549.275871311@linutronix.de --- arch/arm/kernel/patch.c | 2 +- arch/arm/probes/kprobes/core.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c index 020560b2dcb7..a1a34722c655 100644 --- a/arch/arm/kernel/patch.c +++ b/arch/arm/kernel/patch.c @@ -124,5 +124,5 @@ void __kprobes patch_text(void *addr, unsigned int insn) .insn = insn, }; - stop_machine(patch_text_stop_machine, &patch, NULL); + stop_machine_cpuslocked(patch_text_stop_machine, &patch, NULL); } diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index ad1f4e6a9e33..52d1cd14fda4 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -182,7 +182,8 @@ void __kprobes kprobes_remove_breakpoint(void *addr, unsigned int insn) .addr = addr, .insn = insn, }; - stop_machine(__kprobes_remove_breakpoint, &p, cpu_online_mask); + stop_machine_cpuslocked(__kprobes_remove_breakpoint, &p, + cpu_online_mask); } void __kprobes arch_disarm_kprobe(struct kprobe *p) From 5d5dbc4ef27e72104dea6102e4d1a1bf5a8ed971 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:39 +0200 Subject: [PATCH 28/37] s390: Prevent hotplug rwsem recursion The text patching functions which are invoked from jump_label and kprobes code are protected against cpu hotplug at the call sites. Use stop_machine_cpuslocked() to avoid recursion on the cpu hotplug rwsem. stop_machine_cpuslocked() contains a lockdep assertion to catch any unprotected callers. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Heiko Carstens Cc: Paul E. 
McKenney Cc: linux-s390@vger.kernel.org Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Martin Schwidefsky Link: http://lkml.kernel.org/r/20170524081549.354513406@linutronix.de --- arch/s390/kernel/jump_label.c | 2 +- arch/s390/kernel/kprobes.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 6aa630a8d24f..262506cee4c3 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -93,7 +93,7 @@ void arch_jump_label_transform(struct jump_entry *entry, args.entry = entry; args.type = type; - stop_machine(__sm_arch_jump_label_transform, &args, NULL); + stop_machine_cpuslocked(__sm_arch_jump_label_transform, &args, NULL); } void arch_jump_label_transform_static(struct jump_entry *entry, diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 3d6a99746454..6842e4501e2e 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -196,7 +196,7 @@ void arch_arm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; - stop_machine(swap_instruction, &args, NULL); + stop_machine_cpuslocked(swap_instruction, &args, NULL); } NOKPROBE_SYMBOL(arch_arm_kprobe); @@ -204,7 +204,7 @@ void arch_disarm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; - stop_machine(swap_instruction, &args, NULL); + stop_machine_cpuslocked(swap_instruction, &args, NULL); } NOKPROBE_SYMBOL(arch_disarm_kprobe); From fc8dffd379ca5620664336eb895a426b42847558 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:40 +0200 Subject: [PATCH 29/37] cpu/hotplug: Convert hotplug locking to percpu rwsem There are no more (known) nested calls to get_online_cpus() and all observed lock ordering problems have been addressed. Replace the magic nested 'rwsem' hackery with a percpu-rwsem. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081549.447014063@linutronix.de --- include/linux/cpu.h | 2 +- kernel/cpu.c | 107 ++++++-------------------------------------- 2 files changed, 14 insertions(+), 95 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index af4d660798e5..ca73bc1563f4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -103,7 +103,7 @@ extern void cpus_write_lock(void); extern void cpus_write_unlock(void); extern void cpus_read_lock(void); extern void cpus_read_unlock(void); -static inline void lockdep_assert_cpus_held(void) { } +extern void lockdep_assert_cpus_held(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); diff --git a/kernel/cpu.c b/kernel/cpu.c index 142d889d9f69..66836216ebae 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -196,121 +197,41 @@ void cpu_maps_update_done(void) mutex_unlock(&cpu_add_remove_lock); } -/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. +/* + * If set, cpu_up and cpu_down will return -EBUSY and do nothing. 
* Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; #ifdef CONFIG_HOTPLUG_CPU -static struct { - struct task_struct *active_writer; - /* wait queue to wake up the active_writer */ - wait_queue_head_t wq; - /* verifies that no writer will get active while readers are active */ - struct mutex lock; - /* - * Also blocks the new readers during - * an ongoing cpu hotplug operation. - */ - atomic_t refcount; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} cpu_hotplug = { - .active_writer = NULL, - .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), - .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), -#ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), -#endif -}; - -/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ -#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) -#define cpuhp_lock_acquire_tryread() \ - lock_map_acquire_tryread(&cpu_hotplug.dep_map) -#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) -#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) - +DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); void cpus_read_lock(void) { - might_sleep(); - if (cpu_hotplug.active_writer == current) - return; - cpuhp_lock_acquire_read(); - mutex_lock(&cpu_hotplug.lock); - atomic_inc(&cpu_hotplug.refcount); - mutex_unlock(&cpu_hotplug.lock); + percpu_down_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_lock); void cpus_read_unlock(void) { - int refcount; - - if (cpu_hotplug.active_writer == current) - return; - - refcount = atomic_dec_return(&cpu_hotplug.refcount); - if (WARN_ON(refcount < 0)) /* try to fix things up */ - atomic_inc(&cpu_hotplug.refcount); - - if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) - wake_up(&cpu_hotplug.wq); - - cpuhp_lock_release(); - + percpu_up_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_unlock); -/* - * This ensures that the hotplug operation can begin only when the - * refcount goes to zero. - * - * Note that during a cpu-hotplug operation, the new readers, if any, - * will be blocked by the cpu_hotplug.lock - * - * Since cpu_hotplug_begin() is always called after invoking - * cpu_maps_update_begin(), we can be sure that only one writer is active. - * - * Note that theoretically, there is a possibility of a livelock: - * - Refcount goes to zero, last reader wakes up the sleeping - * writer. - * - Last reader unlocks the cpu_hotplug.lock. - * - A new reader arrives at this moment, bumps up the refcount. - * - The writer acquires the cpu_hotplug.lock finds the refcount - * non zero and goes to sleep again. - * - * However, this is very difficult to achieve in practice since - * get_online_cpus() not an api which is called all that often. 
- * - */ void cpus_write_lock(void) { - DEFINE_WAIT(wait); - - cpu_hotplug.active_writer = current; - cpuhp_lock_acquire(); - - for (;;) { - mutex_lock(&cpu_hotplug.lock); - prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); - if (likely(!atomic_read(&cpu_hotplug.refcount))) - break; - mutex_unlock(&cpu_hotplug.lock); - schedule(); - } - finish_wait(&cpu_hotplug.wq, &wait); + percpu_down_write(&cpu_hotplug_lock); } void cpus_write_unlock(void) { - cpu_hotplug.active_writer = NULL; - mutex_unlock(&cpu_hotplug.lock); - cpuhp_lock_release(); + percpu_up_write(&cpu_hotplug_lock); +} + +void lockdep_assert_cpus_held(void) +{ + percpu_rwsem_assert_held(&cpu_hotplug_lock); } /* @@ -344,8 +265,6 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -/* Notifier wrappers for transitioning to state machine */ - static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); From 62ec05dd71b19f5be890a1992227cc7b2ac0adc4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:41 +0200 Subject: [PATCH 30/37] sched: Provide is_percpu_thread() helper Provide a helper function for checking whether current task is a per cpu thread. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081549.541649540@linutronix.de --- include/linux/sched.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b69fc650201..3dfa5f99d6ee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1265,6 +1265,16 @@ extern struct pid *cad_pid; #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) +static inline bool is_percpu_thread(void) +{ +#ifdef CONFIG_SMP + return (current->flags & PF_NO_SETAFFINITY) && + (current->nr_cpus_allowed == 1); +#else + return true; +#endif +} + /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ From 0266d81e9bf5cc1fe6405c0523dfa015fe55aae1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:42 +0200 Subject: [PATCH 31/37] acpi/processor: Prevent cpu hotplug deadlock With the enhanced CPU hotplug lockdep coverage the following lockdep splat happens: ====================================================== WARNING: possible circular locking dependency detected 4.12.0-rc2+ #84 Tainted: G W ------------------------------------------------------ cpuhp/1/15 is trying to acquire lock: flush_work+0x39/0x2f0 but task is already holding lock: cpuhp_thread_fun+0x30/0x160 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (cpuhp_state){+.+.+.}: lock_acquire+0xb4/0x200 cpuhp_kick_ap_work+0x72/0x330 _cpu_down+0x8b/0x100 do_cpu_down+0x3e/0x60 cpu_down+0x10/0x20 cpu_subsys_offline+0x14/0x20 device_offline+0x88/0xb0 online_store+0x4c/0xa0 dev_attr_store+0x18/0x30 sysfs_kf_write+0x45/0x60 kernfs_fop_write+0x156/0x1e0 __vfs_write+0x37/0x160 vfs_write+0xca/0x1c0 SyS_write+0x58/0xc0 entry_SYSCALL_64_fastpath+0x23/0xc2 -> #1 (cpu_hotplug_lock.rw_sem){++++++}: lock_acquire+0xb4/0x200 cpus_read_lock+0x3d/0xb0 apply_workqueue_attrs+0x17/0x50 __alloc_workqueue_key+0x1e1/0x530 scsi_host_alloc+0x373/0x480 [scsi_mod] ata_scsi_add_hosts+0xcb/0x130 [libata] ata_host_register+0x11a/0x2c0 [libata] ata_host_activate+0xf0/0x150 [libata] ahci_host_activate+0x13e/0x170 [libahci] ahci_init_one+0xa3a/0xd3f [ahci] local_pci_probe+0x45/0xa0 work_for_cpu_fn+0x14/0x20 process_one_work+0x1f9/0x690 worker_thread+0x200/0x3d0 kthread+0x138/0x170 ret_from_fork+0x31/0x40 -> #0 ((&wfc.work)){+.+.+.}: __lock_acquire+0x11e1/0x13e0 lock_acquire+0xb4/0x200 flush_work+0x5c/0x2f0 work_on_cpu+0xa1/0xd0 acpi_processor_get_throttling+0x3d/0x50 acpi_processor_reevaluate_tstate+0x2c/0x50 acpi_soft_cpu_online+0x69/0xd0 cpuhp_invoke_callback+0xb4/0x8b0 cpuhp_up_callbacks+0x36/0xc0 cpuhp_thread_fun+0x14e/0x160 smpboot_thread_fn+0x1e8/0x300 kthread+0x138/0x170 ret_from_fork+0x31/0x40 other info that might help us debug this: Chain exists of: (&wfc.work) --> cpu_hotplug_lock.rw_sem --> cpuhp_state Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(cpuhp_state); lock(cpu_hotplug_lock.rw_sem); lock(cpuhp_state); lock((&wfc.work)); *** DEADLOCK *** 1 lock held by cpuhp/1/15: cpuhp_thread_fun+0x30/0x160 stack backtrace: CPU: 1 PID: 15 Comm: cpuhp/1 Tainted: G W 4.12.0-rc2+ #84 Hardware name: Supermicro SYS-4048B-TR4FT/X10QBi, BIOS 1.1a 07/29/2015 Call Trace: dump_stack+0x85/0xc4 print_circular_bug+0x209/0x217 __lock_acquire+0x11e1/0x13e0 lock_acquire+0xb4/0x200 ? lock_acquire+0xb4/0x200 ? flush_work+0x39/0x2f0 ? acpi_processor_start+0x50/0x50 flush_work+0x5c/0x2f0 ? flush_work+0x39/0x2f0 ? acpi_processor_start+0x50/0x50 ? mark_held_locks+0x6d/0x90 ? queue_work_on+0x56/0x90 ? trace_hardirqs_on_caller+0x154/0x1c0 ? trace_hardirqs_on+0xd/0x10 ? acpi_processor_start+0x50/0x50 work_on_cpu+0xa1/0xd0 ? find_worker_executing_work+0x50/0x50 ? acpi_processor_power_exit+0x70/0x70 acpi_processor_get_throttling+0x3d/0x50 acpi_processor_reevaluate_tstate+0x2c/0x50 acpi_soft_cpu_online+0x69/0xd0 cpuhp_invoke_callback+0xb4/0x8b0 ? lock_acquire+0xb4/0x200 ? padata_replace+0x120/0x120 cpuhp_up_callbacks+0x36/0xc0 cpuhp_thread_fun+0x14e/0x160 smpboot_thread_fn+0x1e8/0x300 kthread+0x138/0x170 ? sort_range+0x30/0x30 ? kthread_create_on_node+0x70/0x70 ret_from_fork+0x31/0x40 The problem is that the work is scheduled on the current CPU from the hotplug thread associated with that CPU. It's not required to invoke these functions via the workqueue because the hotplug thread runs on the target CPU already. Check whether current is a per cpu thread pinned on the target CPU and invoke the function directly to avoid the workqueue. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt Cc: linux-acpi@vger.kernel.org Cc: Len Brown Link: http://lkml.kernel.org/r/20170524081549.620489733@linutronix.de --- drivers/acpi/processor_throttling.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index 3de34633f7f9..7f9aff4b8d62 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -909,6 +909,13 @@ static long __acpi_processor_get_throttling(void *data) return pr->throttling.acpi_processor_get_throttling(pr); } +static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct) +{ + if (direct || (is_percpu_thread() && cpu == smp_processor_id())) + return fn(arg); + return work_on_cpu(cpu, fn, arg); +} + static int acpi_processor_get_throttling(struct acpi_processor *pr) { if (!pr) @@ -926,7 +933,7 @@ static int acpi_processor_get_throttling(struct acpi_processor *pr) if (!cpu_online(pr->id)) return -ENODEV; - return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr); + return call_on_cpu(pr->id, __acpi_processor_get_throttling, pr, false); } static int acpi_processor_get_fadt_info(struct acpi_processor *pr) @@ -1076,13 +1083,6 @@ static long acpi_processor_throttling_fn(void *data) arg->target_state, arg->force); } -static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct) -{ - if (direct) - return fn(arg); - return work_on_cpu(cpu, fn, arg); -} - static int __acpi_processor_set_throttling(struct acpi_processor *pr, int state, bool force, bool direct) { From 49dfe2a6779717d9c18395684ee31bdc98b22e53 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:43 +0200 Subject: [PATCH 32/37] cpuhotplug: Link lock stacks for hotplug callbacks The CPU hotplug callbacks are not covered by lockdep versus the cpu hotplug rwsem. CPU0 CPU1 cpuhp_setup_state(STATE, startup, teardown); cpus_read_lock(); invoke_callback_on_ap(); kick_hotplug_thread(ap); wait_for_completion(); hotplug_thread_fn() lock(m); do_stuff(); unlock(m); Lockdep does not know about this dependency and will not trigger on the following code sequence: lock(m); cpus_read_lock(); Add a lockdep map and connect the initiators lock chain with the hotplug thread lock chain, so potential deadlocks can be detected. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081549.709375845@linutronix.de --- kernel/cpu.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 66836216ebae..7435ffc6163b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -66,6 +66,12 @@ struct cpuhp_cpu_state { static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) +static struct lock_class_key cpuhp_state_key; +static struct lockdep_map cpuhp_state_lock_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +#endif + /** * cpuhp_step - Hotplug state machine step * @name: Name of the step @@ -403,6 +409,7 @@ static void cpuhp_thread_fun(unsigned int cpu) st->should_run = false; + lock_map_acquire(&cpuhp_state_lock_map); /* Single callback invocation for [un]install ? 
*/ if (st->single) { if (st->cb_state < CPUHP_AP_ONLINE) { @@ -429,6 +436,7 @@ static void cpuhp_thread_fun(unsigned int cpu) else if (st->state > st->target) ret = cpuhp_ap_offline(cpu, st); } + lock_map_release(&cpuhp_state_lock_map); st->result = ret; complete(&st->done); } @@ -443,6 +451,9 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!cpu_online(cpu)) return 0; + lock_map_acquire(&cpuhp_state_lock_map); + lock_map_release(&cpuhp_state_lock_map); + /* * If we are up and running, use the hotplug thread. For early calls * we invoke the thread function directly. @@ -486,6 +497,8 @@ static int cpuhp_kick_ap_work(unsigned int cpu) enum cpuhp_state state = st->state; trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + lock_map_acquire(&cpuhp_state_lock_map); + lock_map_release(&cpuhp_state_lock_map); __cpuhp_kick_ap_work(st); wait_for_completion(&st->done); trace_cpuhp_exit(cpu, st->state, state, st->result); From e5aeee51f6b4fb22e851105ee6d8ad211c40a214 Mon Sep 17 00:00:00 2001 From: Alexander Levin Date: Sat, 3 Jun 2017 03:39:13 +0000 Subject: [PATCH 33/37] perf/core: Don't release cred_guard_mutex if not taken If we failed to acquire task's cred_guard_mutex we shouldn't proceed to release it in the error path. Fixes: a63fbed776c ("perf/tracing/cpuhotplug: Fix locking order") Signed-off-by: Alexander Levin Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: mhiramat@kernel.org Cc: paulmck@linux.vnet.ibm.com Cc: bigeasy@linutronix.de Link: http://lkml.kernel.org/r/20170603033903.12056-1-alexander.levin@verizon.com Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index b97cda4d1777..1f1b8cdaca2d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9878,7 +9878,7 @@ SYSCALL_DEFINE5(perf_event_open, if (task) { err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (err) - goto err_cred; + goto err_task; /* * Reuse ptrace permission checks for now. From 57de72125d34f83bfd39615fcc3cc25ca3b9c0ec Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 8 Jun 2017 10:55:33 +0200 Subject: [PATCH 34/37] cpu/hotplug: Remove unused check_for_tasks() function clang -Wunused-function found one remaining function that was apparently meant to be removed in a recent code cleanup: kernel/cpu.c:565:20: warning: unused function 'check_for_tasks' [-Wunused-function] Sebastian explained: The function became unused unintentionally, but there is already a failure check, when a task cannot be removed from the outgoing cpu in the scheduler code, so bringing it back is not really giving any extra value. 
Fixes: 530e9b76ae8f ("cpu/hotplug: Remove obsolete cpu hotplug register/unregister functions") Signed-off-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Boris Ostrovsky Cc: Anna-Maria Gleixner Link: http://lkml.kernel.org/r/20170608085544.2257132-1-arnd@arndb.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 7435ffc6163b..d0f5f54aa087 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -562,30 +562,6 @@ void clear_tasks_mm_cpumask(int cpu) rcu_read_unlock(); } -static inline void check_for_tasks(int dead_cpu) -{ - struct task_struct *g, *p; - - read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - if (!p->on_rq) - continue; - /* - * We do the check with unlocked task_rq(p)->lock. - * Order the reading to do not warn about a task, - * which was running on this cpu in the past, and - * it's just been woken on another cpu. - */ - rmb(); - if (task_cpu(p) != dead_cpu) - continue; - - pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", - p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); - } - read_unlock(&tasklist_lock); -} - /* Take this CPU down. */ static int take_cpu_down(void *_param) { From 1b3b22507e0d45dedc6a54b26d56e0b8c4d36875 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 16 Jun 2017 01:22:38 -0700 Subject: [PATCH 35/37] ARM/hw_breakpoint: Fix possible recursive locking for arch_hw_breakpoint_init Recent change to use cpuhp_setup_state_cpuslocked() with commit fe2a5cd8aa03 ("ARM/hw_breakpoint: Use cpuhp_setup_state_cpuslocked()") missed to change the related paired cpuhp_remove_state_nocalls_cpuslocked(). Now if arch_hw_breakpoint_init() fails, we get "WARNING: possible recursive locking detected" on the exit path. Fixes: fe2a5cd8aa03 ("ARM/hw_breakpoint: Use cpuhp_setup_state_cpuslocked()") Signed-off-by: Tony Lindgren Acked-by: Sebastian Andrzej Siewior Cc: Mark Rutland Cc: linux-omap@vger.kernel.org Cc: Peter Zijlstra Cc: Will Deacon Cc: Steven Rostedt Cc: Russell King Cc: "Paul E . McKenney" Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20170616082238.15553-1-tony@atomide.com Signed-off-by: Thomas Gleixner --- arch/arm/kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 63cb4c7c6593..af2a7f1e3103 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -1106,7 +1106,7 @@ static int __init arch_hw_breakpoint_init(void) core_num_brps = 0; core_num_wrps = 0; if (ret > 0) - cpuhp_remove_state_nocalls(ret); + cpuhp_remove_state_nocalls_cpuslocked(ret); cpus_read_unlock(); return 0; } From 3e401f7a2e5199151f735aee6a5c6b4776e6a35e Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 20 Jun 2017 19:08:30 -0300 Subject: [PATCH 36/37] powerpc: Only obtain cpu_hotplug_lock if called by rtasd Calling arch_update_cpu_topology from a CPU hotplug state machine callback hits a deadlock because the function tries to get a read lock on cpu_hotplug_lock while the state machine still holds a write lock on it. Since all callers of arch_update_cpu_topology except rtasd already hold cpu_hotplug_lock, this patch changes the function to use stop_machine_cpuslocked and creates a separate function for rtasd which still tries to obtain the lock. 
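The resulting calling convention is roughly the following (an illustrative sketch only; do_topology_update() and topology_update() are placeholder names standing in for the powerpc numa_update_cpu_topology() code shown in the diff below):

    /* stop_machine() callback which rewrites the cpu <-> node maps */
    static int do_topology_update(void *data)
    {
            /* ... update node maps and sysfs entries ... */
            return 0;
    }

    /* Callers state whether they already hold cpu_hotplug_lock */
    static int topology_update(bool cpus_locked)
    {
            if (cpus_locked)
                    return stop_machine_cpuslocked(do_topology_update, NULL,
                                                   cpu_online_mask);
            return stop_machine(do_topology_update, NULL, cpu_online_mask);
    }

    /* Only invoked from contexts which already hold cpu_hotplug_lock */
    int arch_update_cpu_topology(void)
    {
            lockdep_assert_cpus_held();
            return topology_update(true);
    }

The rtasd path keeps calling the worker with cpus_locked == false, so that path still acquires the lock via regular stop_machine().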
Michael Bringmann investigated the bug and provided a detailed analysis of the deadlock on this previous RFC for an alternate solution: Signed-off-by: Thiago Jung Bauermann Signed-off-by: Thomas Gleixner Acked-by: Michael Ellerman Cc: John Allen Cc: Michael Bringmann Cc: Nathan Fontenot Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1497996510-4032-1-git-send-email-bauerman@linux.vnet.ibm.com Link: https://patchwork.ozlabs.org/patch/771293/ --- arch/powerpc/include/asm/topology.h | 6 ++++++ arch/powerpc/kernel/rtasd.c | 2 +- arch/powerpc/mm/numa.c | 22 +++++++++++++++++++--- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 8b3b46b7b0f2..a2d36b7703ae 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -43,6 +43,7 @@ extern void __init dump_numa_cpu_topology(void); extern int sysfs_add_device_to_node(struct device *dev, int nid); extern void sysfs_remove_device_from_node(struct device *dev, int nid); +extern int numa_update_cpu_topology(bool cpus_locked); #else @@ -57,6 +58,11 @@ static inline void sysfs_remove_device_from_node(struct device *dev, int nid) { } + +static inline int numa_update_cpu_topology(bool cpus_locked) +{ + return 0; +} #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 3650732639ed..0f0b1b2f3b60 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -283,7 +283,7 @@ static void prrn_work_fn(struct work_struct *work) * the RTAS event. */ pseries_devicetree_update(-prrn_update_scope); - arch_update_cpu_topology(); + numa_update_cpu_topology(false); } static DECLARE_WORK(prrn_work, prrn_work_fn); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 371792e4418f..b95c584ce19d 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1311,8 +1311,10 @@ static int update_lookup_table(void *data) /* * Update the node maps and sysfs entries for each cpu whose home node * has changed. Returns 1 when the topology has changed, and 0 otherwise. + * + * cpus_locked says whether we already hold cpu_hotplug_lock. */ -int arch_update_cpu_topology(void) +int numa_update_cpu_topology(bool cpus_locked) { unsigned int cpu, sibling, changed = 0; struct topology_update_data *updates, *ud; @@ -1400,15 +1402,23 @@ int arch_update_cpu_topology(void) if (!cpumask_weight(&updated_cpus)) goto out; - stop_machine(update_cpu_topology, &updates[0], &updated_cpus); + if (cpus_locked) + stop_machine_cpuslocked(update_cpu_topology, &updates[0], + &updated_cpus); + else + stop_machine(update_cpu_topology, &updates[0], &updated_cpus); /* * Update the numa-cpu lookup table with the new mappings, even for * offline CPUs. It is best to perform this update from the stop- * machine context. 
*/ - stop_machine(update_lookup_table, &updates[0], + if (cpus_locked) + stop_machine_cpuslocked(update_lookup_table, &updates[0], cpumask_of(raw_smp_processor_id())); + else + stop_machine(update_lookup_table, &updates[0], + cpumask_of(raw_smp_processor_id())); for (ud = &updates[0]; ud; ud = ud->next) { unregister_cpu_under_node(ud->cpu, ud->old_nid); @@ -1426,6 +1436,12 @@ int arch_update_cpu_topology(void) return changed; } +int arch_update_cpu_topology(void) +{ + lockdep_assert_cpus_held(); + return numa_update_cpu_topology(true); +} + static void topology_work_fn(struct work_struct *work) { rebuild_sched_domains(); From 993647a293814dd47ae41d38657fda6e4ab04e33 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 29 Jun 2017 17:40:47 +0530 Subject: [PATCH 37/37] cpu/hotplug: Constify attribute_group structures attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const: File size before: text data bss dec hex filename 12582 15361 20 27963 6d3b kernel/cpu.o File size After adding 'const': text data bss dec hex filename 12710 15265 20 27995 6d5b kernel/cpu.o Signed-off-by: Arvind Yadav Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: anna-maria@linutronix.de Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: rcochran@linutronix.de Link: http://lkml.kernel.org/r/f9079e94e12b36d245e7adbf67d312bc5d0250c6.1498737970.git.arvind.yadav.cs@gmail.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index d0f5f54aa087..b69c0588f8c9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1629,7 +1629,7 @@ static struct attribute *cpuhp_cpu_attrs[] = { NULL }; -static struct attribute_group cpuhp_cpu_attr_group = { +static const struct attribute_group cpuhp_cpu_attr_group = { .attrs = cpuhp_cpu_attrs, .name = "hotplug", NULL @@ -1661,7 +1661,7 @@ static struct attribute *cpuhp_cpu_root_attrs[] = { NULL }; -static struct attribute_group cpuhp_cpu_root_attr_group = { +static const struct attribute_group cpuhp_cpu_root_attr_group = { .attrs = cpuhp_cpu_root_attrs, .name = "hotplug", NULL