linux-stable/include/linux/types.h
Thomas Gleixner ee1ee6db07 atomics: Provide rcuref - scalable reference counting
atomic_t based reference counting, including refcount_t, uses
atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is
implemented with a atomic_try_cmpxchg() loop. High contention of the
reference count leads to retry loops and scales badly. There is nothing to
improve on this implementation as the semantics have to be preserved.

Provide rcuref as a scalable alternative solution which is suitable for RCU
managed objects. Similar to refcount_t it comes with overflow and underflow
detection and mitigation.

rcuref treats the underlying atomic_t as an unsigned integer and partitions
this space into zones:

  0x00000000 - 0x7FFFFFFF	valid zone (1 .. (INT_MAX + 1) references)
  0x80000000 - 0xBFFFFFFF	saturation zone
  0xC0000000 - 0xFFFFFFFE	dead zone
  0xFFFFFFFF   			no reference

rcuref_get() unconditionally increments the reference count with
atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the
reference count with atomic_add_negative_release().

This unconditional increment avoids the inc_not_zero() problem, but
requires a more complex implementation on the put() side when the count
drops from 0 to -1.

When this transition is detected then it is attempted to mark the reference
count dead, by setting it to the midpoint of the dead zone with a single
atomic_cmpxchg_release() operation. This operation can fail due to a
concurrent rcuref_get() elevating the reference count from -1 to 0 again.

If the unconditional increment in rcuref_get() hits a reference count which
is marked dead (or saturated) it will detect it after the fact and bring
back the reference count to the midpoint of the respective zone. The zones
provide enough tolerance which makes it practically impossible to escape
from a zone.

The racy implementation of rcuref_put() requires to protect rcuref_put()
against a grace period ending in order to prevent a subtle use after
free. As RCU is the only mechanism which allows to protect against that, it
is not possible to fully replace the atomic_inc_not_zero() based
implementation of refcount_t with this scheme.

The final drop is slightly more expensive than the atomic_dec_return()
counterpart, but that's not the case which this is optimized for. The
optimization is on the high frequeunt get()/put() pairs and their
scalability.

The performance of an uncontended rcuref_get()/put() pair where the put()
is not dropping the last reference is still on par with the plain atomic
operations, while at the same time providing overflow and underflow
detection and mitigation.

The performance of rcuref compared to plain atomic_inc_not_zero() and
atomic_dec_return() based reference counting under contention:

 -  Micro benchmark: All CPUs running a increment/decrement loop on an
    elevated reference count, which means the 0 to -1 transition never
    happens.

    The performance gain depends on microarchitecture and the number of
    CPUs and has been observed in the range of 1.3X to 4.7X

 - Conversion of dst_entry::__refcnt to rcuref and testing with the
    localhost memtier/memcached benchmark. That benchmark shows the
    reference count contention prominently.

    The performance gain depends on microarchitecture and the number of
    CPUs and has been observed in the range of 1.1X to 2.6X over the
    previous fix for the false sharing issue vs. struct
    dst_entry::__refcnt.

    When memtier is run over a real 1Gb network connection, there is a
    small gain on top of the false sharing fix. The two changes combined
    result in a 2%-5% total gain for that networked test.

Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de
2023-03-28 10:39:29 +02:00

242 lines
5.8 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TYPES_H
#define _LINUX_TYPES_H
#define __EXPORTED_HEADERS__
#include <uapi/linux/types.h>
#ifndef __ASSEMBLY__
#define DECLARE_BITMAP(name,bits) \
unsigned long name[BITS_TO_LONGS(bits)]
typedef u32 __kernel_dev_t;
typedef __kernel_fd_set fd_set;
typedef __kernel_dev_t dev_t;
typedef __kernel_ulong_t ino_t;
typedef __kernel_mode_t mode_t;
typedef unsigned short umode_t;
typedef u32 nlink_t;
typedef __kernel_off_t off_t;
typedef __kernel_pid_t pid_t;
typedef __kernel_daddr_t daddr_t;
typedef __kernel_key_t key_t;
typedef __kernel_suseconds_t suseconds_t;
typedef __kernel_timer_t timer_t;
typedef __kernel_clockid_t clockid_t;
typedef __kernel_mqd_t mqd_t;
typedef _Bool bool;
typedef __kernel_uid32_t uid_t;
typedef __kernel_gid32_t gid_t;
typedef __kernel_uid16_t uid16_t;
typedef __kernel_gid16_t gid16_t;
typedef unsigned long uintptr_t;
#ifdef CONFIG_HAVE_UID16
/* This is defined by include/asm-{arch}/posix_types.h */
typedef __kernel_old_uid_t old_uid_t;
typedef __kernel_old_gid_t old_gid_t;
#endif /* CONFIG_UID16 */
#if defined(__GNUC__)
typedef __kernel_loff_t loff_t;
#endif
/*
* The following typedefs are also protected by individual ifdefs for
* historical reasons:
*/
#ifndef _SIZE_T
#define _SIZE_T
typedef __kernel_size_t size_t;
#endif
#ifndef _SSIZE_T
#define _SSIZE_T
typedef __kernel_ssize_t ssize_t;
#endif
#ifndef _PTRDIFF_T
#define _PTRDIFF_T
typedef __kernel_ptrdiff_t ptrdiff_t;
#endif
#ifndef _CLOCK_T
#define _CLOCK_T
typedef __kernel_clock_t clock_t;
#endif
#ifndef _CADDR_T
#define _CADDR_T
typedef __kernel_caddr_t caddr_t;
#endif
/* bsd */
typedef unsigned char u_char;
typedef unsigned short u_short;
typedef unsigned int u_int;
typedef unsigned long u_long;
/* sysv */
typedef unsigned char unchar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long ulong;
#ifndef __BIT_TYPES_DEFINED__
#define __BIT_TYPES_DEFINED__
typedef u8 u_int8_t;
typedef s8 int8_t;
typedef u16 u_int16_t;
typedef s16 int16_t;
typedef u32 u_int32_t;
typedef s32 int32_t;
#endif /* !(__BIT_TYPES_DEFINED__) */
typedef u8 uint8_t;
typedef u16 uint16_t;
typedef u32 uint32_t;
#if defined(__GNUC__)
typedef u64 uint64_t;
typedef u64 u_int64_t;
typedef s64 int64_t;
#endif
/* this is a special 64bit data type that is 8-byte aligned */
#define aligned_u64 __aligned_u64
#define aligned_be64 __aligned_be64
#define aligned_le64 __aligned_le64
/**
* The type used for indexing onto a disc or disc partition.
*
* Linux always considers sectors to be 512 bytes long independently
* of the devices real block size.
*
* blkcnt_t is the type of the inode's block count.
*/
typedef u64 sector_t;
typedef u64 blkcnt_t;
/*
* The type of an index into the pagecache.
*/
#define pgoff_t unsigned long
/*
* A dma_addr_t can hold any valid DMA address, i.e., any address returned
* by the DMA API.
*
* If the DMA API only uses 32-bit addresses, dma_addr_t need only be 32
* bits wide. Bus addresses, e.g., PCI BARs, may be wider than 32 bits,
* but drivers do memory-mapped I/O to ioremapped kernel virtual addresses,
* so they don't care about the size of the actual bus addresses.
*/
#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
typedef u64 dma_addr_t;
#else
typedef u32 dma_addr_t;
#endif
typedef unsigned int __bitwise gfp_t;
typedef unsigned int __bitwise slab_flags_t;
typedef unsigned int __bitwise fmode_t;
#ifdef CONFIG_PHYS_ADDR_T_64BIT
typedef u64 phys_addr_t;
#else
typedef u32 phys_addr_t;
#endif
typedef phys_addr_t resource_size_t;
/*
* This type is the placeholder for a hardware interrupt number. It has to be
* big enough to enclose whatever representation is used by a given platform.
*/
typedef unsigned long irq_hw_number_t;
typedef struct {
int counter;
} atomic_t;
#define ATOMIC_INIT(i) { (i) }
#ifdef CONFIG_64BIT
typedef struct {
s64 counter;
} atomic64_t;
#endif
typedef struct {
atomic_t refcnt;
} rcuref_t;
#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i - 1) }
struct list_head {
struct list_head *next, *prev;
};
struct hlist_head {
struct hlist_node *first;
};
struct hlist_node {
struct hlist_node *next, **pprev;
};
struct ustat {
__kernel_daddr_t f_tfree;
#ifdef CONFIG_ARCH_32BIT_USTAT_F_TINODE
unsigned int f_tinode;
#else
unsigned long f_tinode;
#endif
char f_fname[6];
char f_fpack[6];
};
/**
* struct callback_head - callback structure for use with RCU and task_work
* @next: next update requests in a list
* @func: actual update function to call after the grace period.
*
* The struct is aligned to size of pointer. On most architectures it happens
* naturally due ABI requirements, but some architectures (like CRIS) have
* weird ABI and we need to ask it explicitly.
*
* The alignment is required to guarantee that bit 0 of @next will be
* clear under normal conditions -- as long as we use call_rcu() or
* call_srcu() to queue the callback.
*
* This guarantee is important for few reasons:
* - future call_rcu_lazy() will make use of lower bits in the pointer;
* - the structure shares storage space in struct page with @compound_head,
* which encode PageTail() in bit 0. The guarantee is needed to avoid
* false-positive PageTail().
*/
struct callback_head {
struct callback_head *next;
void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head
typedef void (*rcu_callback_t)(struct rcu_head *head);
typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);
typedef void (*swap_r_func_t)(void *a, void *b, int size, const void *priv);
typedef void (*swap_func_t)(void *a, void *b, int size);
typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv);
typedef int (*cmp_func_t)(const void *a, const void *b);
#endif /* __ASSEMBLY__ */
#endif /* _LINUX_TYPES_H */