linux-stable/lib/percpu_counter.c
Jens Axboe 60b0ea120c percpu_counter: fix bad counter state during suspend
I got a bug report yesterday from Laszlo Ersek <lersek@xxxxxxxxxx>, in
which he states that his kvm instance fails to suspend. Laszlo
bisected it down to this commit:

commit 1cf7e9c68f
Author: Jens Axboe <axboe@xxxxxxxxx>
Date: Fri Nov 1 10:52:52 2013 -0600

virtio_blk: blk-mq support

where virtio-blk is converted to use the blk-mq infrastructure. After
digging a bit, it became clear that the issue was with the queue drain.
blk-mq tracks queue usage in a percpu counter, which is incremented on
request alloc and decremented when the request is freed. The initial
hunt was for an inconsistency in blk-mq, but everything seemed fine. In
fact, the counter only returned crazy values when suspend was in
progress. When a CPU is unplugged, the percpu counter code merges that CPU's
state with the general state. blk-mq takes care to register a hotcpu
notifier with the appropriate priority, so we know it runs after the
percpu counter notifier. However, the percpu counter notifier only
merges the state when the CPU is fully gone. This leaves a state
transition where the CPU going away is no longer in the online mask, yet
it still holds private values. This means that in this state,
percpu_counter_sum() returns invalid results, and the suspend then hangs
waiting for abs(dead-cpu-value) requests to complete which of course
will never happen.

Fix this by clearing the state earlier, so we never have a case where
the CPU isn't in online mask but still holds private state. This bug has
been there since forever; I guess we don't have a lot of users where
percpu counters need to be reliable during the suspend cycle.

Reported-by: <lersek@redhat.com>
Tested-by: Laszlo Ersek <lersek@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2014-04-07 08:17:10 -06:00

225 lines
5.3 KiB
C

/*
* Fast batching percpu counters.
*/
#include <linux/percpu_counter.h>
#include <linux/notifier.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/debugobjects.h>
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Global list of all initialised counters. __percpu_counter_init() adds a
 * counter here, percpu_counter_destroy() removes it, and the CPU hotplug
 * callback walks the list. percpu_counters_lock is held around each of
 * those list operations.
 */
static LIST_HEAD(percpu_counters);
static DEFINE_SPINLOCK(percpu_counters_lock);
#endif
#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER
/* Defined below; the fixup handler needs to refer to it. */
static struct debug_obj_descr percpu_counter_debug_descr;

/*
 * debugobjects "fixup_free" handler for percpu counters.
 *
 * @addr:  the struct percpu_counter being tracked
 * @state: the debugobjects state the counter was found in
 *
 * When the counter is still in ODEBUG_STATE_ACTIVE it is destroyed and its
 * debug object released, and 1 is returned. For every other state nothing
 * is done and 0 is returned.
 */
static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
	struct percpu_counter *fbc = addr;

	if (state != ODEBUG_STATE_ACTIVE)
		return 0;

	percpu_counter_destroy(fbc);
	debug_object_free(fbc, &percpu_counter_debug_descr);
	return 1;
}
/*
 * debugobjects descriptor for percpu counters: names the object type and
 * installs percpu_counter_fixup_free() as its fixup_free handler.
 */
static struct debug_obj_descr percpu_counter_debug_descr = {
.name = "percpu_counter",
.fixup_free = percpu_counter_fixup_free,
};
/*
 * Register @fbc with debugobjects and mark it active. Called once the
 * counter has been successfully initialised.
 */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{
	struct debug_obj_descr *descr = &percpu_counter_debug_descr;

	debug_object_init(fbc, descr);
	debug_object_activate(fbc, descr);
}
/*
 * Mark @fbc inactive in debugobjects and drop its tracking entry. Called
 * when the counter is being destroyed.
 */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{
	struct debug_obj_descr *descr = &percpu_counter_debug_descr;

	debug_object_deactivate(fbc, descr);
	debug_object_free(fbc, descr);
}
#else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
/* No-op stub used when CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER is not set. */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{ }
/* No-op stub used when CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER is not set. */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{ }
#endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
/*
 * Force the counter to @amount.
 *
 * Under fbc->lock (interrupts disabled), the per-cpu slot of every possible
 * CPU is cleared and the shared count is then set to @amount.
 */
void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
	unsigned long irq_state;
	int i;

	raw_spin_lock_irqsave(&fbc->lock, irq_state);
	for_each_possible_cpu(i)
		*per_cpu_ptr(fbc->counters, i) = 0;
	fbc->count = amount;
	raw_spin_unlock_irqrestore(&fbc->lock, irq_state);
}
EXPORT_SYMBOL(percpu_counter_set);
/*
 * Add @amount to the counter, batching updates per CPU.
 *
 * @fbc:    counter to update
 * @amount: signed delta to apply
 * @batch:  threshold for the local delta; once the local value plus @amount
 *          reaches +@batch or -@batch, the whole local total is folded into
 *          the shared fbc->count under fbc->lock
 *
 * Below the threshold only this CPU's slot is touched and the lock is not
 * taken. Preemption is disabled for the duration of the update.
 */
void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
{
	s64 pending;
	s64 total;

	preempt_disable();
	pending = __this_cpu_read(*fbc->counters);
	total = pending + amount;
	if (total < batch && total > -batch) {
		/* Still within the batch window: stay on the local slot. */
		this_cpu_add(*fbc->counters, amount);
	} else {
		unsigned long irq_state;

		raw_spin_lock_irqsave(&fbc->lock, irq_state);
		fbc->count += total;
		/*
		 * Remove only the amount that was read and folded above,
		 * rather than storing zero - presumably so that an update
		 * landing on this slot after the read is not discarded
		 * (NOTE(review): confirm).
		 */
		__this_cpu_sub(*fbc->counters, pending);
		raw_spin_unlock_irqrestore(&fbc->lock, irq_state);
	}
	preempt_enable();
}
EXPORT_SYMBOL(__percpu_counter_add);
/*
* Add up all the per-cpu counts, return the result. This is a more accurate
* but much slower version of percpu_counter_read_positive()
*/
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
s64 ret;
int cpu;
unsigned long flags;
raw_spin_lock_irqsave(&fbc->lock, flags);
ret = fbc->count;
for_each_online_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
ret += *pcount;
}
raw_spin_unlock_irqrestore(&fbc->lock, flags);
return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);
/*
 * Initialise @fbc with a starting value of @amount.
 *
 * @key is the lockdep class assigned to the counter's lock.
 *
 * Returns 0 on success, or -ENOMEM when the per-cpu storage cannot be
 * allocated. On failure the counter is neither debug-activated nor added to
 * the hotplug list, and fbc->counters is left NULL.
 */
int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
struct lock_class_key *key)
{
raw_spin_lock_init(&fbc->lock);
lockdep_set_class(&fbc->lock, key);
fbc->count = amount;
fbc->counters = alloc_percpu(s32);
if (!fbc->counters)
return -ENOMEM;
debug_percpu_counter_activate(fbc);
#ifdef CONFIG_HOTPLUG_CPU
/* Make the counter visible to the CPU hotplug callback. */
INIT_LIST_HEAD(&fbc->list);
spin_lock(&percpu_counters_lock);
list_add(&fbc->list, &percpu_counters);
spin_unlock(&percpu_counters_lock);
#endif
return 0;
}
EXPORT_SYMBOL(__percpu_counter_init);
/*
 * Tear down @fbc: deactivate its debug object, unlink it from the hotplug
 * list (when CONFIG_HOTPLUG_CPU is set) and free the per-cpu storage.
 *
 * A counter whose ->counters pointer is NULL is left alone. The pointer is
 * reset to NULL after freeing, so calling this again on the same counter
 * returns immediately.
 */
void percpu_counter_destroy(struct percpu_counter *fbc)
{
if (!fbc->counters)
return;
debug_percpu_counter_deactivate(fbc);
#ifdef CONFIG_HOTPLUG_CPU
spin_lock(&percpu_counters_lock);
list_del(&fbc->list);
spin_unlock(&percpu_counters_lock);
#endif
free_percpu(fbc->counters);
fbc->counters = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy);
/*
 * Global batch size. Starts at 32 and is recomputed by
 * compute_batch_value() as max(32, 2 * number of online CPUs).
 * percpu_counter_compare() uses it to bound how far the rough count may be
 * from the precise one.
 */
int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);
/*
 * Recompute percpu_counter_batch from the current number of online CPUs:
 * twice the CPU count, but never less than 32.
 */
static void compute_batch_value(void)
{
	int online = num_online_cpus();
	int scaled = online * 2;

	percpu_counter_batch = max(scaled, 32);
}
/*
 * CPU hotplug notifier.
 *
 * @nb:     notifier block (unused)
 * @action: hotplug event being reported
 * @hcpu:   CPU number the event refers to, passed as a pointer-sized value
 *
 * With CONFIG_HOTPLUG_CPU the batch value is recomputed on every event.
 * For CPU_DEAD and CPU_DEAD_FROZEN, each registered counter additionally
 * has the dead CPU's private delta folded into its shared count and the
 * per-cpu slot cleared. Always returns NOTIFY_OK.
 */
static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
					unsigned long action, void *hcpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	unsigned int dead_cpu = (unsigned long)hcpu;
	struct percpu_counter *counter;

	compute_batch_value();

	switch (action) {
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		break;
	default:
		return NOTIFY_OK;
	}

	spin_lock(&percpu_counters_lock);
	list_for_each_entry(counter, &percpu_counters, list) {
		unsigned long irq_state;
		s32 *slot;

		raw_spin_lock_irqsave(&counter->lock, irq_state);
		slot = per_cpu_ptr(counter->counters, dead_cpu);
		counter->count += *slot;
		*slot = 0;
		raw_spin_unlock_irqrestore(&counter->lock, irq_state);
	}
	spin_unlock(&percpu_counters_lock);
#endif
	return NOTIFY_OK;
}
/*
 * Compare counter against given value.
 * Return 1 if greater, 0 if equal and -1 if less
 *
 * The cheap percpu_counter_read() value is tried first. If it differs from
 * @rhs by more than percpu_counter_batch * num_online_cpus(), the rough
 * value is treated as decisive; otherwise the precise (and slower)
 * percpu_counter_sum() is used.
 *
 * The distance is compared as an s64 directly rather than through abs(),
 * whose result is only as wide as long - too narrow for an s64 difference
 * on 32-bit builds, where the truncation could pick the wrong branch.
 */
int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
	s64 count;
	s64 delta;
	s64 slack;

	count = percpu_counter_read(fbc);
	delta = count - rhs;
	slack = (s64)percpu_counter_batch * num_online_cpus();

	/* Check to see if rough count will be sufficient for comparison */
	if (delta > slack)
		return 1;
	if (delta < -slack)
		return -1;

	/* Need to use precise count */
	count = percpu_counter_sum(fbc);
	if (count > rhs)
		return 1;
	else if (count < rhs)
		return -1;
	else
		return 0;
}
EXPORT_SYMBOL(percpu_counter_compare);
/*
 * Boot-time setup: compute the initial batch value, then register
 * percpu_counter_hotcpu_callback() for CPU hotplug events. The second
 * argument to hotcpu_notifier() (0) is presumably the notifier priority -
 * NOTE(review): confirm against the macro definition. Always returns 0.
 */
static int __init percpu_counter_startup(void)
{
compute_batch_value();
hotcpu_notifier(percpu_counter_hotcpu_callback, 0);
return 0;
}
module_init(percpu_counter_startup);