linux-stable/arch/powerpc/lib/qspinlock.c

// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <asm/qspinlock.h>
#include <asm/paravirt.h>

#define MAX_NODES	4

struct qnode {
	struct qnode	*next;
	struct qspinlock *lock;
	u8		locked; /* 1 if lock acquired */
};

struct qnodes {
	int		count;
	struct qnode nodes[MAX_NODES];
};

/* Tuning parameters */
static int steal_spins __read_mostly = (1 << 5);
static bool maybe_stealers __read_mostly = true;
static int head_spins __read_mostly = (1 << 8);

static bool pv_yield_owner __read_mostly = true;
static bool pv_yield_allow_steal __read_mostly = false;
static bool pv_yield_prev __read_mostly = true;

static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);

static __always_inline int get_steal_spins(bool paravirt)
{
	return steal_spins;
}

static __always_inline int get_head_spins(bool paravirt)
{
	return head_spins;
}

static inline u32 encode_tail_cpu(int cpu)
{
	return (cpu + 1) << _Q_TAIL_CPU_OFFSET;
}

static inline int decode_tail_cpu(u32 val)
{
	return (val >> _Q_TAIL_CPU_OFFSET) - 1;
}

static inline int get_owner_cpu(u32 val)
{
	return (val & _Q_OWNER_CPU_MASK) >> _Q_OWNER_CPU_OFFSET;
}

/*
 * Try to acquire the lock if it was not already locked. If the tail matches
 * mytail then clear it, otherwise leave it unchnaged. Return previous value.
 *
 * This is used by the head of the queue to acquire the lock and clean up
 * its tail if it was the last one queued.
 */
static __always_inline u32 trylock_clean_tail(struct qspinlock *lock, u32 tail)
{
	u32 newval = queued_spin_encode_locked_val();
	u32 prev, tmp;

	asm volatile(
"1:	lwarx	%0,0,%2,%7	# trylock_clean_tail			\n"
	/* This test is necessary if there could be stealers */
"	andi.	%1,%0,%5						\n"
"	bne	3f							\n"
	/* Test whether the lock tail == mytail */
"	and	%1,%0,%6						\n"
"	cmpw	0,%1,%3							\n"
	/* Merge the new locked value */
"	or	%1,%1,%4						\n"
"	bne	2f							\n"
	/* If the lock tail matched, then clear it, otherwise leave it. */
"	andc	%1,%1,%6						\n"
"2:	stwcx.	%1,0,%2							\n"
"	bne-	1b							\n"
"\t"	PPC_ACQUIRE_BARRIER "						\n"
"3:									\n"
	: "=&r" (prev), "=&r" (tmp)
	: "r" (&lock->val), "r"(tail), "r" (newval),
	  "i" (_Q_LOCKED_VAL),
	  "r" (_Q_TAIL_CPU_MASK),
	  "i" (IS_ENABLED(CONFIG_PPC64))
	: "cr0", "memory");

	return prev;
}

/*
 * Publish our tail, replacing previous tail. Return previous value.
 *
 * This provides a release barrier for publishing node, this pairs with the
 * acquire barrier in get_tail_qnode() when the next CPU finds this tail
 * value.
 */
static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
{
	u32 prev, tmp;

	asm volatile(
"\t"	PPC_RELEASE_BARRIER "						\n"
"1:	lwarx	%0,0,%2		# publish_tail_cpu			\n"
"	andc	%1,%0,%4						\n"
"	or	%1,%1,%3						\n"
"	stwcx.	%1,0,%2							\n"
"	bne-	1b							\n"
	: "=&r" (prev), "=&r"(tmp)
	: "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK)
	: "cr0", "memory");

	return prev;
}

static __always_inline u32 set_mustq(struct qspinlock *lock)
{
	u32 prev;

	asm volatile(
"1:	lwarx	%0,0,%1		# set_mustq				\n"
"	or	%0,%0,%2						\n"
"	stwcx.	%0,0,%1							\n"
"	bne-	1b							\n"
	: "=&r" (prev)
	: "r" (&lock->val), "r" (_Q_MUST_Q_VAL)
	: "cr0", "memory");

	return prev;
}

static __always_inline u32 clear_mustq(struct qspinlock *lock)
{
	u32 prev;

	asm volatile(
"1:	lwarx	%0,0,%1		# clear_mustq				\n"
"	andc	%0,%0,%2						\n"
"	stwcx.	%0,0,%1							\n"
"	bne-	1b							\n"
	: "=&r" (prev)
	: "r" (&lock->val), "r" (_Q_MUST_Q_VAL)
	: "cr0", "memory");

	return prev;
}

static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
{
	int cpu = decode_tail_cpu(val);
	struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu);
	int idx;

	/*
	 * After publishing the new tail and finding a previous tail in the
	 * previous val (which is the control dependency), this barrier
	 * orders the release barrier in publish_tail_cpu performed by the
	 * last CPU, with subsequently looking at its qnode structures
	 * after the barrier.
	 */
	smp_acquire__after_ctrl_dep();

	for (idx = 0; idx < MAX_NODES; idx++) {
		struct qnode *qnode = &qnodesp->nodes[idx];
		if (qnode->lock == lock)
			return qnode;
	}

	BUG();
}

static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt, bool mustq)
{
	int owner;
	u32 yield_count;

	BUG_ON(!(val & _Q_LOCKED_VAL));

	if (!paravirt)
		goto relax;

	if (!pv_yield_owner)
		goto relax;

	owner = get_owner_cpu(val);
	yield_count = yield_count_of(owner);

	if ((yield_count & 1) == 0)
		goto relax; /* owner vcpu is running */

	/*
	 * Read the lock word after sampling the yield count. On the other side
	 * there may a wmb because the yield count update is done by the
	 * hypervisor preemption and the value update by the OS, however this
	 * ordering might reduce the chance of out of order accesses and
	 * improve the heuristic.
	 */
	smp_rmb();

	if (READ_ONCE(lock->val) == val) {
		if (mustq)
			clear_mustq(lock);
		yield_to_preempted(owner, yield_count);
		if (mustq)
			set_mustq(lock);
		/* Don't relax if we yielded. Maybe we should? */
		return;
	}
relax:
	cpu_relax();
}

static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt)
{
	__yield_to_locked_owner(lock, val, paravirt, false);
}

static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt)
{
	bool mustq = false;

	if ((val & _Q_MUST_Q_VAL) && pv_yield_allow_steal)
		mustq = true;

	__yield_to_locked_owner(lock, val, paravirt, mustq);
}

static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt)
{
	int prev_cpu = decode_tail_cpu(val);
	u32 yield_count;

	if (!paravirt)
		goto relax;

	if (!pv_yield_prev)
		goto relax;

	yield_count = yield_count_of(prev_cpu);
	if ((yield_count & 1) == 0)
		goto relax; /* owner vcpu is running */

	smp_rmb(); /* See __yield_to_locked_owner comment */

	if (!node->locked) {
		yield_to_preempted(prev_cpu, yield_count);
		return;
	}

relax:
	cpu_relax();
}


static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt)
{
	int iters = 0;

	if (!steal_spins)
		return false;

	/* Attempt to steal the lock */
	do {
		u32 val = READ_ONCE(lock->val);

		if (val & _Q_MUST_Q_VAL)
			break;

		if (unlikely(!(val & _Q_LOCKED_VAL))) {
			if (__queued_spin_trylock_steal(lock))
				return true;
		} else {
			yield_to_locked_owner(lock, val, paravirt);
		}

		iters++;
	} while (iters < get_steal_spins(paravirt));

	return false;
}

static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt)
{
	struct qnodes *qnodesp;
	struct qnode *next, *node;
	u32 val, old, tail;
	bool mustq = false;
	int idx;
	int iters = 0;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	qnodesp = this_cpu_ptr(&qnodes);
	if (unlikely(qnodesp->count >= MAX_NODES)) {
		while (!queued_spin_trylock(lock))
			cpu_relax();
		return;
	}

	idx = qnodesp->count++;
	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();
	node = &qnodesp->nodes[idx];
	node->next = NULL;
	node->lock = lock;
	node->locked = 0;

	tail = encode_tail_cpu(smp_processor_id());

	old = publish_tail_cpu(lock, tail);

	/*
	 * If there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_CPU_MASK) {
		struct qnode *prev = get_tail_qnode(lock, old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		/* Wait for mcs node lock to be released */
		while (!node->locked)
			yield_to_prev(lock, node, old, paravirt);

		smp_rmb(); /* acquire barrier for the mcs lock */
	}

again:
	/* We're at the head of the waitqueue, wait for the lock. */
	for (;;) {
		val = READ_ONCE(lock->val);
		if (!(val & _Q_LOCKED_VAL))
			break;

		yield_head_to_locked_owner(lock, val, paravirt);
		if (!maybe_stealers)
			continue;
		iters++;

		if (!mustq && iters >= get_head_spins(paravirt)) {
			mustq = true;
			set_mustq(lock);
			val |= _Q_MUST_Q_VAL;
		}
	}

	/* If we're the last queued, must clean up the tail. */
	old = trylock_clean_tail(lock, tail);
	if (unlikely(old & _Q_LOCKED_VAL)) {
		BUG_ON(!maybe_stealers);
		goto again; /* Can only be true if maybe_stealers. */
	}

	if ((old & _Q_TAIL_CPU_MASK) == tail)
		goto release; /* We were the tail, no next. */

	/* There is a next, must wait for node->next != NULL (MCS protocol) */
	while (!(next = READ_ONCE(node->next)))
		cpu_relax();

	/*
	 * Unlock the next mcs waiter node. Release barrier is not required
	 * here because the acquirer is only accessing the lock word, and
	 * the acquire barrier we took the lock with orders that update vs
	 * this store to locked. The corresponding barrier is the smp_rmb()
	 * acquire barrier for mcs lock, above.
	 */
	WRITE_ONCE(next->locked, 1);

release:
	qnodesp->count--; /* release the node */
}

void queued_spin_lock_slowpath(struct qspinlock *lock)
{
	/*
	 * This looks funny, but it induces the compiler to inline both
	 * sides of the branch rather than share code as when the condition
	 * is passed as the paravirt argument to the functions.
	 */
	if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) {
		if (try_to_steal_lock(lock, true))
			return;
		queued_spin_lock_mcs_queue(lock, true);
	} else {
		if (try_to_steal_lock(lock, false))
			return;
		queued_spin_lock_mcs_queue(lock, false);
	}
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);

#ifdef CONFIG_PARAVIRT_SPINLOCKS
void pv_spinlocks_init(void)
{
}
#endif

#include <linux/debugfs.h>
static int steal_spins_set(void *data, u64 val)
{
	static DEFINE_MUTEX(lock);

	/*
	 * The lock slow path has a !maybe_stealers case that can assume
	 * the head of queue will not see concurrent waiters. That waiter
	 * is unsafe in the presence of stealers, so must keep them away
	 * from one another.
	 */

	mutex_lock(&lock);
	if (val && !steal_spins) {
		maybe_stealers = true;
		/* wait for queue head waiter to go away */
		synchronize_rcu();
		steal_spins = val;
	} else if (!val && steal_spins) {
		steal_spins = val;
		/* wait for all possible stealers to go away */
		synchronize_rcu();
		maybe_stealers = false;
	} else {
		steal_spins = val;
	}
	mutex_unlock(&lock);

	return 0;
}

static int steal_spins_get(void *data, u64 *val)
{
	*val = steal_spins;

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, "%llu\n");

static int head_spins_set(void *data, u64 val)
{
	head_spins = val;

	return 0;
}

static int head_spins_get(void *data, u64 *val)
{
	*val = head_spins;

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fops_head_spins, head_spins_get, head_spins_set, "%llu\n");

static int pv_yield_owner_set(void *data, u64 val)
{
	pv_yield_owner = !!val;

	return 0;
}

static int pv_yield_owner_get(void *data, u64 *val)
{
	*val = pv_yield_owner;

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, pv_yield_owner_set, "%llu\n");

static int pv_yield_allow_steal_set(void *data, u64 val)
{
	pv_yield_allow_steal = !!val;

	return 0;
}

static int pv_yield_allow_steal_get(void *data, u64 *val)
{
	*val = pv_yield_allow_steal;

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_allow_steal, pv_yield_allow_steal_get, pv_yield_allow_steal_set, "%llu\n");

static int pv_yield_prev_set(void *data, u64 val)
{
	pv_yield_prev = !!val;

	return 0;
}

static int pv_yield_prev_get(void *data, u64 *val)
{
	*val = pv_yield_prev;

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");

static __init int spinlock_debugfs_init(void)
{
	debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins);
	debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins);
	if (is_shared_processor()) {
		debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
		debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
		debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
	}

	return 0;
}
device_initcall(spinlock_debugfs_init);