3c7be18ac9
Percpu memory is becoming more and more widely used by various subsystems, and the total amount of memory controlled by the percpu allocator can make up a good part of the total memory. As an example, bpf maps can consume a lot of percpu memory, and they are created by a user. Also, some cgroup internals (e.g. memory controller statistics) can be quite large. On a machine with many CPUs and a large number of cgroups they can consume hundreds of megabytes. So the lack of memcg accounting is creating a breach in the memory isolation. Similar to the slab memory, percpu memory should be accounted by default. To implement the percpu accounting it's possible to take the slab memory accounting as a model to follow. Let's introduce two types of percpu chunks: root and memcg. What makes memcg chunks different is an additional space allocated to store memcg membership information. If __GFP_ACCOUNT is passed on allocation, a memcg chunk should be used. If it's possible to charge the corresponding size to the target memory cgroup, allocation is performed, and the memcg ownership data is recorded. System-wide allocations are performed using root chunks, so there is no additional memory overhead. To implement a fast reparenting of percpu memory on memcg removal, we don't store mem_cgroup pointers directly: instead we use the obj_cgroup API, introduced for slab accounting. 
[akpm@linux-foundation.org: fix CONFIG_MEMCG_KMEM=n build errors and warning] [akpm@linux-foundation.org: move unreachable code, per Roman] [cuibixuan@huawei.com: mm/percpu: fix 'defined but not used' warning] Link: http://lkml.kernel.org/r/6d41b939-a741-b521-a7a2-e7296ec16219@huawei.com Signed-off-by: Roman Gushchin <guro@fb.com> Signed-off-by: Bixuan Cui <cuibixuan@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Shakeel Butt <shakeelb@google.com> Acked-by: Dennis Zhou <dennis@kernel.org> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Tejun Heo <tj@kernel.org> Cc: Tobin C. Harding <tobin@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Waiman Long <longman@redhat.com> Cc: Bixuan Cui <cuibixuan@huawei.com> Cc: Michal Koutný <mkoutny@suse.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Link: http://lkml.kernel.org/r/20200623184515.4132564-3-guro@fb.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
245 lines
6.0 KiB
C
245 lines
6.0 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* mm/percpu-debug.c
|
|
*
|
|
* Copyright (C) 2017 Facebook Inc.
|
|
* Copyright (C) 2017 Dennis Zhou <dennis@kernel.org>
|
|
*
|
|
* Prints statistics about the percpu allocator and backing chunks.
|
|
*/
|
|
#include <linux/debugfs.h>
|
|
#include <linux/list.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/sort.h>
|
|
#include <linux/vmalloc.h>
|
|
|
|
#include "percpu-internal.h"
|
|
|
|
#define P(X, Y) \
|
|
seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y)
|
|
|
|
struct percpu_stats pcpu_stats;
|
|
struct pcpu_alloc_info pcpu_stats_ai;
|
|
|
|
/*
 * Three-way comparison of two ints, used as the sort() comparator so that
 * values end up in ascending order.
 *
 * @a: pointer to the first int
 * @b: pointer to the second int
 *
 * Returns a negative value, zero or a positive value when *a is less than,
 * equal to or greater than *b.  The result is built from comparisons rather
 * than from a subtraction so that the sign stays correct even when the
 * difference of the two values would not fit in an int.
 */
static int cmpint(const void *a, const void *b)
{
	const int lhs = *(const int *)a;
	const int rhs = *(const int *)b;

	return (lhs > rhs) - (lhs < rhs);
}
|
|
|
|
/*
|
|
* Iterates over all chunks to find the max nr_alloc entries.
|
|
*/
|
|
static int find_max_nr_alloc(void)
|
|
{
|
|
struct pcpu_chunk *chunk;
|
|
int slot, max_nr_alloc;
|
|
enum pcpu_chunk_type type;
|
|
|
|
max_nr_alloc = 0;
|
|
for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
|
|
for (slot = 0; slot < pcpu_nr_slots; slot++)
|
|
list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
|
|
list)
|
|
max_nr_alloc = max(max_nr_alloc,
|
|
chunk->nr_alloc);
|
|
|
|
return max_nr_alloc;
|
|
}
|
|
|
|
/*
|
|
* Prints out chunk state. Fragmentation is considered between
|
|
* the beginning of the chunk to the last allocation.
|
|
*
|
|
* All statistics are in bytes unless stated otherwise.
|
|
*/
|
|
static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
|
|
int *buffer)
|
|
{
|
|
struct pcpu_block_md *chunk_md = &chunk->chunk_md;
|
|
int i, last_alloc, as_len, start, end;
|
|
int *alloc_sizes, *p;
|
|
/* statistics */
|
|
int sum_frag = 0, max_frag = 0;
|
|
int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
|
|
|
|
alloc_sizes = buffer;
|
|
|
|
/*
|
|
* find_last_bit returns the start value if nothing found.
|
|
* Therefore, we must determine if it is a failure of find_last_bit
|
|
* and set the appropriate value.
|
|
*/
|
|
last_alloc = find_last_bit(chunk->alloc_map,
|
|
pcpu_chunk_map_bits(chunk) -
|
|
chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1);
|
|
last_alloc = test_bit(last_alloc, chunk->alloc_map) ?
|
|
last_alloc + 1 : 0;
|
|
|
|
as_len = 0;
|
|
start = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
|
|
|
|
/*
|
|
* If a bit is set in the allocation map, the bound_map identifies
|
|
* where the allocation ends. If the allocation is not set, the
|
|
* bound_map does not identify free areas as it is only kept accurate
|
|
* on allocation, not free.
|
|
*
|
|
* Positive values are allocations and negative values are free
|
|
* fragments.
|
|
*/
|
|
while (start < last_alloc) {
|
|
if (test_bit(start, chunk->alloc_map)) {
|
|
end = find_next_bit(chunk->bound_map, last_alloc,
|
|
start + 1);
|
|
alloc_sizes[as_len] = 1;
|
|
} else {
|
|
end = find_next_bit(chunk->alloc_map, last_alloc,
|
|
start + 1);
|
|
alloc_sizes[as_len] = -1;
|
|
}
|
|
|
|
alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE;
|
|
|
|
start = end;
|
|
}
|
|
|
|
/*
|
|
* The negative values are free fragments and thus sorting gives the
|
|
* free fragments at the beginning in largest first order.
|
|
*/
|
|
if (as_len > 0) {
|
|
sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL);
|
|
|
|
/* iterate through the unallocated fragments */
|
|
for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
|
|
sum_frag -= *p;
|
|
max_frag = max(max_frag, -1 * (*p));
|
|
}
|
|
|
|
cur_min_alloc = alloc_sizes[i];
|
|
cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2];
|
|
cur_max_alloc = alloc_sizes[as_len - 1];
|
|
}
|
|
|
|
P("nr_alloc", chunk->nr_alloc);
|
|
P("max_alloc_size", chunk->max_alloc_size);
|
|
P("empty_pop_pages", chunk->nr_empty_pop_pages);
|
|
P("first_bit", chunk_md->first_free);
|
|
P("free_bytes", chunk->free_bytes);
|
|
P("contig_bytes", chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
|
|
P("sum_frag", sum_frag);
|
|
P("max_frag", max_frag);
|
|
P("cur_min_alloc", cur_min_alloc);
|
|
P("cur_med_alloc", cur_med_alloc);
|
|
P("cur_max_alloc", cur_max_alloc);
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
P("memcg_aware", pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)));
|
|
#endif
|
|
seq_putc(m, '\n');
|
|
}
|
|
|
|
static int percpu_stats_show(struct seq_file *m, void *v)
|
|
{
|
|
struct pcpu_chunk *chunk;
|
|
int slot, max_nr_alloc;
|
|
int *buffer;
|
|
enum pcpu_chunk_type type;
|
|
|
|
alloc_buffer:
|
|
spin_lock_irq(&pcpu_lock);
|
|
max_nr_alloc = find_max_nr_alloc();
|
|
spin_unlock_irq(&pcpu_lock);
|
|
|
|
/* there can be at most this many free and allocated fragments */
|
|
buffer = vmalloc(array_size(sizeof(int), (2 * max_nr_alloc + 1)));
|
|
if (!buffer)
|
|
return -ENOMEM;
|
|
|
|
spin_lock_irq(&pcpu_lock);
|
|
|
|
/* if the buffer allocated earlier is too small */
|
|
if (max_nr_alloc < find_max_nr_alloc()) {
|
|
spin_unlock_irq(&pcpu_lock);
|
|
vfree(buffer);
|
|
goto alloc_buffer;
|
|
}
|
|
|
|
#define PL(X) \
|
|
seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
|
|
|
|
seq_printf(m,
|
|
"Percpu Memory Statistics\n"
|
|
"Allocation Info:\n"
|
|
"----------------------------------------\n");
|
|
PL(unit_size);
|
|
PL(static_size);
|
|
PL(reserved_size);
|
|
PL(dyn_size);
|
|
PL(atom_size);
|
|
PL(alloc_size);
|
|
seq_putc(m, '\n');
|
|
|
|
#undef PL
|
|
|
|
#define PU(X) \
|
|
seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X)
|
|
|
|
seq_printf(m,
|
|
"Global Stats:\n"
|
|
"----------------------------------------\n");
|
|
PU(nr_alloc);
|
|
PU(nr_dealloc);
|
|
PU(nr_cur_alloc);
|
|
PU(nr_max_alloc);
|
|
PU(nr_chunks);
|
|
PU(nr_max_chunks);
|
|
PU(min_alloc_size);
|
|
PU(max_alloc_size);
|
|
P("empty_pop_pages", pcpu_nr_empty_pop_pages);
|
|
seq_putc(m, '\n');
|
|
|
|
#undef PU
|
|
|
|
seq_printf(m,
|
|
"Per Chunk Stats:\n"
|
|
"----------------------------------------\n");
|
|
|
|
if (pcpu_reserved_chunk) {
|
|
seq_puts(m, "Chunk: <- Reserved Chunk\n");
|
|
chunk_map_stats(m, pcpu_reserved_chunk, buffer);
|
|
}
|
|
|
|
for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
|
|
for (slot = 0; slot < pcpu_nr_slots; slot++) {
|
|
list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
|
|
list) {
|
|
if (chunk == pcpu_first_chunk) {
|
|
seq_puts(m, "Chunk: <- First Chunk\n");
|
|
chunk_map_stats(m, chunk, buffer);
|
|
} else {
|
|
seq_puts(m, "Chunk:\n");
|
|
chunk_map_stats(m, chunk, buffer);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
spin_unlock_irq(&pcpu_lock);
|
|
|
|
vfree(buffer);
|
|
|
|
return 0;
|
|
}
|
|
DEFINE_SHOW_ATTRIBUTE(percpu_stats);
|
|
|
|
/*
 * Registers the "percpu_stats" debugfs file with mode 0444 and
 * percpu_stats_fops as its file operations.  No parent dentry and no
 * private data are passed.  Always returns 0.
 */
static int __init init_percpu_stats_debugfs(void)
{
	debugfs_create_file("percpu_stats", 0444, NULL, NULL, &percpu_stats_fops);
	return 0;
}

late_initcall(init_percpu_stats_debugfs);
|