linux-stable/mm/percpu-stats.c
Roman Gushchin f183324133 percpu: implement partial chunk depopulation
From Roman ("percpu: partial chunk depopulation"):
In our [Facebook] production experience the percpu memory allocator is
sometimes struggling with returning the memory to the system. A typical
example is a creation of several thousands memory cgroups (each has
several chunks of the percpu data used for vmstats, vmevents,
ref counters etc). Deletion and complete releasing of these cgroups
doesn't always lead to a shrinkage of the percpu memory, so that
sometimes there are several GB's of memory wasted.

The underlying problem is the fragmentation: to release an underlying
chunk all percpu allocations should be released first. The percpu
allocator tends to top up chunks to improve the utilization. It means
new small-ish allocations (e.g. percpu ref counters) are placed onto
almost filled old-ish chunks, effectively pinning them in memory.

This patchset solves this problem by implementing a partial depopulation
of percpu chunks: chunks with many empty pages are being asynchronously
depopulated and the pages are returned to the system.

To illustrate the problem the following script can be used:
--

cd /sys/fs/cgroup

mkdir percpu_test
echo "+memory" > percpu_test/cgroup.subtree_control

cat /proc/meminfo | grep Percpu

for i in `seq 1 1000`; do
    mkdir percpu_test/cg_"${i}"
    for j in `seq 1 10`; do
	mkdir percpu_test/cg_"${i}"_"${j}"
    done
done

cat /proc/meminfo | grep Percpu

for i in `seq 1 1000`; do
    for j in `seq 1 10`; do
	rmdir percpu_test/cg_"${i}"_"${j}"
    done
done

sleep 10

cat /proc/meminfo | grep Percpu

for i in `seq 1 1000`; do
    rmdir percpu_test/cg_"${i}"
done

rmdir percpu_test
--

It creates 11000 memory cgroups and removes every 10 out of 11.
It prints the initial size of the percpu memory, the size after
creating all cgroups and the size after deleting most of them.

Results:
  vanilla:
    ./percpu_test.sh
    Percpu:             7488 kB
    Percpu:           481152 kB
    Percpu:           481152 kB

  with this patchset applied:
    ./percpu_test.sh
    Percpu:             7488 kB
    Percpu:           481408 kB
    Percpu:           135552 kB

The total size of the percpu memory was reduced by more than 3.5 times.

This patch:

This patch implements partial depopulation of percpu chunks.

As of now, a chunk can be depopulated only as a part of the final
destruction, if there are no more outstanding allocations. However
to minimize a memory waste it might be useful to depopulate a
partially filed chunk, if a small number of outstanding allocations
prevents the chunk from being fully reclaimed.

This patch implements the following depopulation process: it scans
over the chunk pages, looks for a range of empty and populated pages
and performs the depopulation. To avoid races with new allocations,
the chunk is previously isolated. After the depopulation the chunk is
sidelined to a special list or freed. New allocations prefer using
active chunks to sidelined chunks. If a sidelined chunk is used, it is
reintegrated to the active lists.

The depopulation is scheduled on the free path if the chunk is all of
the following:
  1) has more than 1/4 of total pages free and populated
  2) the system has enough free percpu pages aside of this chunk
  3) isn't the reserved chunk
  4) isn't the first chunk
If it's already depopulated but got free populated pages, it's a good
target too. The chunk is moved to a special slot,
pcpu_to_depopulate_slot, chunk->isolated is set, and the balance work
item is scheduled. On isolation, these pages are removed from the
pcpu_nr_empty_pop_pages. It is constantly replaced to the
to_depopulate_slot when it meets these qualifications.

pcpu_reclaim_populated() iterates over the to_depopulate_slot until it
becomes empty. The depopulation is performed in the reverse direction to
keep populated pages close to the beginning. Depopulated chunks are
sidelined to preferentially avoid them for new allocations. When no
active chunk can suffice a new allocation, sidelined chunks are first
checked before creating a new chunk.

Signed-off-by: Roman Gushchin <guro@fb.com>
Co-developed-by: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
Tested-by: Pratik Sampat <psampat@linux.ibm.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
2021-04-21 18:17:40 +00:00

251 lines
6.3 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu-debug.c
*
* Copyright (C) 2017 Facebook Inc.
* Copyright (C) 2017 Dennis Zhou <dennis@kernel.org>
*
* Prints statistics about the percpu allocator and backing chunks.
*/
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/sort.h>
#include <linux/vmalloc.h>
#include "percpu-internal.h"
#define P(X, Y) \
seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y)
struct percpu_stats pcpu_stats;
struct pcpu_alloc_info pcpu_stats_ai;
static int cmpint(const void *a, const void *b)
{
return *(int *)a - *(int *)b;
}
/*
* Iterates over all chunks to find the max nr_alloc entries.
*/
static int find_max_nr_alloc(void)
{
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
enum pcpu_chunk_type type;
max_nr_alloc = 0;
for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
for (slot = 0; slot < pcpu_nr_slots; slot++)
list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
list)
max_nr_alloc = max(max_nr_alloc,
chunk->nr_alloc);
return max_nr_alloc;
}
/*
* Prints out chunk state. Fragmentation is considered between
* the beginning of the chunk to the last allocation.
*
* All statistics are in bytes unless stated otherwise.
*/
static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
int *buffer)
{
struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int i, last_alloc, as_len, start, end;
int *alloc_sizes, *p;
/* statistics */
int sum_frag = 0, max_frag = 0;
int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
alloc_sizes = buffer;
/*
* find_last_bit returns the start value if nothing found.
* Therefore, we must determine if it is a failure of find_last_bit
* and set the appropriate value.
*/
last_alloc = find_last_bit(chunk->alloc_map,
pcpu_chunk_map_bits(chunk) -
chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1);
last_alloc = test_bit(last_alloc, chunk->alloc_map) ?
last_alloc + 1 : 0;
as_len = 0;
start = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
/*
* If a bit is set in the allocation map, the bound_map identifies
* where the allocation ends. If the allocation is not set, the
* bound_map does not identify free areas as it is only kept accurate
* on allocation, not free.
*
* Positive values are allocations and negative values are free
* fragments.
*/
while (start < last_alloc) {
if (test_bit(start, chunk->alloc_map)) {
end = find_next_bit(chunk->bound_map, last_alloc,
start + 1);
alloc_sizes[as_len] = 1;
} else {
end = find_next_bit(chunk->alloc_map, last_alloc,
start + 1);
alloc_sizes[as_len] = -1;
}
alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE;
start = end;
}
/*
* The negative values are free fragments and thus sorting gives the
* free fragments at the beginning in largest first order.
*/
if (as_len > 0) {
sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL);
/* iterate through the unallocated fragments */
for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
sum_frag -= *p;
max_frag = max(max_frag, -1 * (*p));
}
cur_min_alloc = alloc_sizes[i];
cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2];
cur_max_alloc = alloc_sizes[as_len - 1];
}
P("nr_alloc", chunk->nr_alloc);
P("max_alloc_size", chunk->max_alloc_size);
P("empty_pop_pages", chunk->nr_empty_pop_pages);
P("first_bit", chunk_md->first_free);
P("free_bytes", chunk->free_bytes);
P("contig_bytes", chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
P("sum_frag", sum_frag);
P("max_frag", max_frag);
P("cur_min_alloc", cur_min_alloc);
P("cur_med_alloc", cur_med_alloc);
P("cur_max_alloc", cur_max_alloc);
#ifdef CONFIG_MEMCG_KMEM
P("memcg_aware", pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)));
#endif
seq_putc(m, '\n');
}
static int percpu_stats_show(struct seq_file *m, void *v)
{
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
int *buffer;
enum pcpu_chunk_type type;
int nr_empty_pop_pages;
alloc_buffer:
spin_lock_irq(&pcpu_lock);
max_nr_alloc = find_max_nr_alloc();
spin_unlock_irq(&pcpu_lock);
/* there can be at most this many free and allocated fragments */
buffer = vmalloc(array_size(sizeof(int), (2 * max_nr_alloc + 1)));
if (!buffer)
return -ENOMEM;
spin_lock_irq(&pcpu_lock);
/* if the buffer allocated earlier is too small */
if (max_nr_alloc < find_max_nr_alloc()) {
spin_unlock_irq(&pcpu_lock);
vfree(buffer);
goto alloc_buffer;
}
nr_empty_pop_pages = 0;
for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
nr_empty_pop_pages += pcpu_nr_empty_pop_pages[type];
#define PL(X) \
seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
seq_printf(m,
"Percpu Memory Statistics\n"
"Allocation Info:\n"
"----------------------------------------\n");
PL(unit_size);
PL(static_size);
PL(reserved_size);
PL(dyn_size);
PL(atom_size);
PL(alloc_size);
seq_putc(m, '\n');
#undef PL
#define PU(X) \
seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X)
seq_printf(m,
"Global Stats:\n"
"----------------------------------------\n");
PU(nr_alloc);
PU(nr_dealloc);
PU(nr_cur_alloc);
PU(nr_max_alloc);
PU(nr_chunks);
PU(nr_max_chunks);
PU(min_alloc_size);
PU(max_alloc_size);
P("empty_pop_pages", nr_empty_pop_pages);
seq_putc(m, '\n');
#undef PU
seq_printf(m,
"Per Chunk Stats:\n"
"----------------------------------------\n");
if (pcpu_reserved_chunk) {
seq_puts(m, "Chunk: <- Reserved Chunk\n");
chunk_map_stats(m, pcpu_reserved_chunk, buffer);
}
for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
for (slot = 0; slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
list) {
if (chunk == pcpu_first_chunk)
seq_puts(m, "Chunk: <- First Chunk\n");
else if (slot == pcpu_to_depopulate_slot)
seq_puts(m, "Chunk (to_depopulate)\n");
else if (slot == pcpu_sidelined_slot)
seq_puts(m, "Chunk (sidelined):\n");
else
seq_puts(m, "Chunk:\n");
chunk_map_stats(m, chunk, buffer);
}
}
}
spin_unlock_irq(&pcpu_lock);
vfree(buffer);
return 0;
}
DEFINE_SHOW_ATTRIBUTE(percpu_stats);
static int __init init_percpu_stats_debugfs(void)
{
debugfs_create_file("percpu_stats", 0444, NULL, NULL,
&percpu_stats_fops);
return 0;
}
late_initcall(init_percpu_stats_debugfs);