mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-09-26 12:26:11 +00:00
ee1ee6db07
atomic_t based reference counting, including refcount_t, uses atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is implemented with an atomic_try_cmpxchg() loop. High contention of the reference count leads to retry loops and scales badly. There is nothing to improve on this implementation as the semantics have to be preserved. Provide rcuref as a scalable alternative solution which is suitable for RCU managed objects. Similar to refcount_t it comes with overflow and underflow detection and mitigation. rcuref treats the underlying atomic_t as an unsigned integer and partitions this space into zones:

  0x00000000 - 0x7FFFFFFF   valid zone (1 .. (INT_MAX + 1) references)
  0x80000000 - 0xBFFFFFFF   saturation zone
  0xC0000000 - 0xFFFFFFFE   dead zone
  0xFFFFFFFF                no reference

rcuref_get() unconditionally increments the reference count with atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the reference count with atomic_add_negative_release(). This unconditional increment avoids the inc_not_zero() problem, but requires a more complex implementation on the put() side when the count drops from 0 to -1. When this transition is detected, an attempt is made to mark the reference count dead by setting it to the midpoint of the dead zone with a single atomic_cmpxchg_release() operation. This operation can fail due to a concurrent rcuref_get() elevating the reference count from -1 to 0 again. If the unconditional increment in rcuref_get() hits a reference count which is marked dead (or saturated) it will detect it after the fact and bring back the reference count to the midpoint of the respective zone. The zones provide enough tolerance which makes it practically impossible to escape from a zone. The racy implementation of rcuref_put() requires protecting rcuref_put() against a grace period ending in order to prevent a subtle use after free.
As RCU is the only mechanism which allows protecting against that, it is not possible to fully replace the atomic_inc_not_zero() based implementation of refcount_t with this scheme. The final drop is slightly more expensive than the atomic_dec_return() counterpart, but that's not the case which this is optimized for. The optimization is on the highly frequent get()/put() pairs and their scalability. The performance of an uncontended rcuref_get()/put() pair where the put() is not dropping the last reference is still on par with the plain atomic operations, while at the same time providing overflow and underflow detection and mitigation. The performance of rcuref compared to plain atomic_inc_not_zero() and atomic_dec_return() based reference counting under contention:

 - Micro benchmark: All CPUs running an increment/decrement loop on an elevated reference count, which means the 0 to -1 transition never happens. The performance gain depends on microarchitecture and the number of CPUs and has been observed in the range of 1.3X to 4.7X

 - Conversion of dst_entry::__refcnt to rcuref and testing with the localhost memtier/memcached benchmark. That benchmark shows the reference count contention prominently. The performance gain depends on microarchitecture and the number of CPUs and has been observed in the range of 1.1X to 2.6X over the previous fix for the false sharing issue vs. struct dst_entry::__refcnt. When memtier is run over a real 1Gb network connection, there is a small gain on top of the false sharing fix. The two changes combined result in a 2%-5% total gain for that networked test.

Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de
242 lines
5.8 KiB
C
242 lines
5.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_TYPES_H
|
|
#define _LINUX_TYPES_H
|
|
|
|
#define __EXPORTED_HEADERS__
|
|
#include <uapi/linux/types.h>
|
|
|
|
#ifndef __ASSEMBLY__
|
|
|
|
/*
 * DECLARE_BITMAP - declare an array of unsigned long called @name with
 * BITS_TO_LONGS(@bits) elements.
 *
 * BITS_TO_LONGS() is not defined in this header; it has to be in scope
 * wherever this macro is expanded.
 */
#define DECLARE_BITMAP(name,bits) \
	unsigned long name[BITS_TO_LONGS(bits)]
|
|
|
|
typedef u32 __kernel_dev_t;
|
|
|
|
typedef __kernel_fd_set fd_set;
|
|
typedef __kernel_dev_t dev_t;
|
|
typedef __kernel_ulong_t ino_t;
|
|
typedef __kernel_mode_t mode_t;
|
|
typedef unsigned short umode_t;
|
|
typedef u32 nlink_t;
|
|
typedef __kernel_off_t off_t;
|
|
typedef __kernel_pid_t pid_t;
|
|
typedef __kernel_daddr_t daddr_t;
|
|
typedef __kernel_key_t key_t;
|
|
typedef __kernel_suseconds_t suseconds_t;
|
|
typedef __kernel_timer_t timer_t;
|
|
typedef __kernel_clockid_t clockid_t;
|
|
typedef __kernel_mqd_t mqd_t;
|
|
|
|
typedef _Bool bool;
|
|
|
|
typedef __kernel_uid32_t uid_t;
|
|
typedef __kernel_gid32_t gid_t;
|
|
typedef __kernel_uid16_t uid16_t;
|
|
typedef __kernel_gid16_t gid16_t;
|
|
|
|
typedef unsigned long uintptr_t;
|
|
|
|
#ifdef CONFIG_HAVE_UID16
|
|
/* This is defined by include/asm-{arch}/posix_types.h */
|
|
typedef __kernel_old_uid_t old_uid_t;
|
|
typedef __kernel_old_gid_t old_gid_t;
|
|
#endif /* CONFIG_UID16 */
|
|
|
|
#if defined(__GNUC__)
|
|
typedef __kernel_loff_t loff_t;
|
|
#endif
|
|
|
|
/*
|
|
* The following typedefs are also protected by individual ifdefs for
|
|
* historical reasons:
|
|
*/
|
|
#ifndef _SIZE_T
|
|
#define _SIZE_T
|
|
typedef __kernel_size_t size_t;
|
|
#endif
|
|
|
|
#ifndef _SSIZE_T
|
|
#define _SSIZE_T
|
|
typedef __kernel_ssize_t ssize_t;
|
|
#endif
|
|
|
|
#ifndef _PTRDIFF_T
|
|
#define _PTRDIFF_T
|
|
typedef __kernel_ptrdiff_t ptrdiff_t;
|
|
#endif
|
|
|
|
#ifndef _CLOCK_T
|
|
#define _CLOCK_T
|
|
typedef __kernel_clock_t clock_t;
|
|
#endif
|
|
|
|
#ifndef _CADDR_T
|
|
#define _CADDR_T
|
|
typedef __kernel_caddr_t caddr_t;
|
|
#endif
|
|
|
|
/* bsd: u_-prefixed names for the unsigned builtin integer types */
typedef unsigned char		u_char;
typedef unsigned short		u_short;
typedef unsigned int		u_int;
typedef unsigned long		u_long;

/* sysv: abbreviated names for the same unsigned builtin integer types */
typedef unsigned char		unchar;
typedef unsigned short		ushort;
typedef unsigned int		uint;
typedef unsigned long		ulong;
|
|
|
|
#ifndef __BIT_TYPES_DEFINED__
|
|
#define __BIT_TYPES_DEFINED__
|
|
|
|
typedef u8 u_int8_t;
|
|
typedef s8 int8_t;
|
|
typedef u16 u_int16_t;
|
|
typedef s16 int16_t;
|
|
typedef u32 u_int32_t;
|
|
typedef s32 int32_t;
|
|
|
|
#endif /* !(__BIT_TYPES_DEFINED__) */
|
|
|
|
typedef u8 uint8_t;
|
|
typedef u16 uint16_t;
|
|
typedef u32 uint32_t;
|
|
|
|
#if defined(__GNUC__)
|
|
typedef u64 uint64_t;
|
|
typedef u64 u_int64_t;
|
|
typedef s64 int64_t;
|
|
#endif
|
|
|
|
/*
 * this is a special 64bit data type that is 8-byte aligned
 *
 * The un-prefixed names simply expand to the __aligned_* names, which are
 * not defined in this header.
 */
#define aligned_u64		__aligned_u64
#define aligned_be64		__aligned_be64
#define aligned_le64		__aligned_le64
|
|
|
|
/**
|
|
* The type used for indexing onto a disc or disc partition.
|
|
*
|
|
* Linux always considers sectors to be 512 bytes long independently
|
|
* of the devices real block size.
|
|
*
|
|
* blkcnt_t is the type of the inode's block count.
|
|
*/
|
|
typedef u64 sector_t;
|
|
typedef u64 blkcnt_t;
|
|
|
|
/*
 * The type of an index into the pagecache.
 *
 * Note that this is a preprocessor macro expanding to "unsigned long",
 * not a typedef.
 */
#define pgoff_t unsigned long
|
|
|
|
/*
|
|
* A dma_addr_t can hold any valid DMA address, i.e., any address returned
|
|
* by the DMA API.
|
|
*
|
|
* If the DMA API only uses 32-bit addresses, dma_addr_t need only be 32
|
|
* bits wide. Bus addresses, e.g., PCI BARs, may be wider than 32 bits,
|
|
* but drivers do memory-mapped I/O to ioremapped kernel virtual addresses,
|
|
* so they don't care about the size of the actual bus addresses.
|
|
*/
|
|
#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
|
|
typedef u64 dma_addr_t;
|
|
#else
|
|
typedef u32 dma_addr_t;
|
|
#endif
|
|
|
|
typedef unsigned int __bitwise gfp_t;
|
|
typedef unsigned int __bitwise slab_flags_t;
|
|
typedef unsigned int __bitwise fmode_t;
|
|
|
|
#ifdef CONFIG_PHYS_ADDR_T_64BIT
|
|
typedef u64 phys_addr_t;
|
|
#else
|
|
typedef u32 phys_addr_t;
|
|
#endif
|
|
|
|
typedef phys_addr_t resource_size_t;
|
|
|
|
/*
 * This type is the placeholder for a hardware interrupt number. It has to be
 * big enough to enclose whatever representation is used by a given platform.
 */
typedef unsigned long irq_hw_number_t;
|
|
|
|
/* atomic_t wraps a single int counter. */
typedef struct {
	int counter;
} atomic_t;

/* Brace initializer for an atomic_t whose counter starts at @i. */
#define ATOMIC_INIT(i) { (i) }

/* The 64-bit counterpart is only available when CONFIG_64BIT is defined. */
#ifdef CONFIG_64BIT
typedef struct {
	s64 counter;
} atomic64_t;
#endif

/* rcuref_t wraps an atomic_t reference count. */
typedef struct {
	atomic_t refcnt;
} rcuref_t;

/*
 * Brace initializer for an rcuref_t holding @i references.
 *
 * The stored counter is one less than @i, so a stored value of 0 stands
 * for a single reference.  @i is parenthesized so that arguments containing
 * operators of lower precedence than '-' (e.g. "a << b" or "c ? x : y")
 * are evaluated as a unit before the subtraction.
 */
#define RCUREF_INIT(i)	{ .refcnt = ATOMIC_INIT((i) - 1) }
|
|
|
|
/* Link node holding pointers to the next and previous list_head. */
struct list_head {
	struct list_head *next, *prev;
};

/* Head of an hlist: a single pointer to the first node. */
struct hlist_head {
	struct hlist_node *first;
};

/*
 * Node of an hlist: @next points to the following node, @pprev is a
 * pointer to a pointer to an hlist_node (a "struct hlist_node **").
 */
struct hlist_node {
	struct hlist_node *next, **pprev;
};
|
|
|
|
struct ustat {
|
|
__kernel_daddr_t f_tfree;
|
|
#ifdef CONFIG_ARCH_32BIT_USTAT_F_TINODE
|
|
unsigned int f_tinode;
|
|
#else
|
|
unsigned long f_tinode;
|
|
#endif
|
|
char f_fname[6];
|
|
char f_fpack[6];
|
|
};
|
|
|
|
/**
 * struct callback_head - callback structure for use with RCU and task_work
 * @next: next update requests in a list
 * @func: actual update function to call after the grace period.
 *
 * The struct is aligned to size of pointer. On most architectures it happens
 * naturally due to ABI requirements, but some architectures (like CRIS) have
 * weird ABI and we need to ask it explicitly.
 *
 * The alignment is required to guarantee that bit 0 of @next will be
 * clear under normal conditions -- as long as we use call_rcu() or
 * call_srcu() to queue the callback.
 *
 * This guarantee is important for a few reasons:
 *  - future call_rcu_lazy() will make use of lower bits in the pointer;
 *  - the structure shares storage space in struct page with @compound_head,
 *    which encodes PageTail() in bit 0. The guarantee is needed to avoid
 *    false-positive PageTail().
 */
struct callback_head {
	struct callback_head *next;
	void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));
/* rcu_head is a preprocessor alias: "struct rcu_head" is struct callback_head. */
#define rcu_head callback_head
|
|
|
|
/* Callback taking the rcu_head it was registered with. */
typedef void (*rcu_callback_t)(struct rcu_head *head);
/* Function that accepts an rcu_head together with an rcu_callback_t. */
typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);

/*
 * Swap callbacks operating on two untyped objects of @size bytes; the _r
 * variant receives an additional opaque @priv pointer.
 */
typedef void (*swap_r_func_t)(void *a, void *b, int size, const void *priv);
typedef void (*swap_func_t)(void *a, void *b, int size);

/*
 * Comparison callbacks on two untyped objects, returning an int; the _r
 * variant receives an additional opaque @priv pointer.
 */
typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv);
typedef int (*cmp_func_t)(const void *a, const void *b);
|
|
|
|
#endif /* __ASSEMBLY__ */
|
|
#endif /* _LINUX_TYPES_H */
|