linux-stable/include/linux/psi_types.h
Brian Chen cb0e52b774 psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
We've noticed cases where tasks in a cgroup are stalled on memory but
there is little memory FULL pressure since tasks stay on the runqueue
in reclaim.

A simple example involves a single threaded program that keeps leaking
and touching large amounts of memory. It runs in a cgroup with swap
enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though
there is significant CPU pressure and memory SOME, there is barely any
memory FULL since the task enters reclaim and stays on the runqueue.
However, this memory-bound task is effectively stalled on memory and
we expect memory FULL to match memory SOME in this scenario.
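A minimal sketch of such a leaker (hypothetical, not taken from the
patch) that could be run inside a cgroup configured as above:

        /* leak.c: single-threaded; each chunk is touched so the
         * resident set keeps growing and memory.high forces reclaim */
        #include <stdlib.h>
        #include <string.h>

        int main(void)
        {
                for (;;) {
                        char *p = malloc(1 << 20);      /* 1M chunk */
                        if (p)
                                memset(p, 1, 1 << 20);  /* touch, never free */
                }
        }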

The code is confused about memstall && running, thinking there is a
stalled task and a productive task when there's only one task: a
reclaimer that's counted as both. To fix this, we redefine the
condition for PSI_MEM_FULL to check that all running tasks are in an
active memstall instead of checking that there are no running tasks.

        case PSI_MEM_FULL:
-               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+               return unlikely(tasks[NR_MEMSTALL] &&
+                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);

This will capture reclaimers. It will also capture tasks that called
psi_memstall_enter() and are about to sleep, but this should be
negligible noise.
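
For context, this is roughly where the check lives in test_state() in
kernel/sched/psi.c (a simplified sketch; the other resource cases are
elided):

        static bool test_state(unsigned int *tasks, enum psi_states state)
        {
                switch (state) {
                case PSI_MEM_SOME:
                        return unlikely(tasks[NR_MEMSTALL]);
                case PSI_MEM_FULL:
                        return unlikely(tasks[NR_MEMSTALL] &&
                                tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
                /* ... IO, CPU and NONIDLE cases ... */
                default:
                        return false;
                }
        }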

Signed-off-by: Brian Chen <brianchen118@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com
2021-11-17 14:49:00 +01:00

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PSI_TYPES_H
#define _LINUX_PSI_TYPES_H

#include <linux/kthread.h>
#include <linux/seqlock.h>
#include <linux/types.h>
#include <linux/kref.h>
#include <linux/wait.h>

#ifdef CONFIG_PSI

/* Tracked task states */
enum psi_task_count {
	NR_IOWAIT,
	NR_MEMSTALL,
	NR_RUNNING,
	/*
	 * This can't have values other than 0 or 1 and could be
	 * implemented as a bit flag. But for now we still have room
	 * in the first cacheline of psi_group_cpu, and this way we
	 * don't have to special case any state tracking for it.
	 */
	NR_ONCPU,
	/*
	 * For IO and CPU stalls the presence of running/oncpu tasks
	 * in the domain means a partial rather than a full stall.
	 * For memory it's not so simple because of page reclaimers:
	 * they are running/oncpu while representing a stall. To tell
	 * whether a domain has productivity left or not, we need to
	 * distinguish between regular running (i.e. productive)
	 * threads and memstall ones.
	 */
	NR_MEMSTALL_RUNNING,
	NR_PSI_TASK_COUNTS = 5,
};

/* Task state bitmasks */
#define TSK_IOWAIT	(1 << NR_IOWAIT)
#define TSK_MEMSTALL	(1 << NR_MEMSTALL)
#define TSK_RUNNING	(1 << NR_RUNNING)
#define TSK_ONCPU	(1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
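
/*
 * Example (cf. the commit message above): a task that keeps running
 * while in reclaim carries TSK_MEMSTALL | TSK_RUNNING |
 * TSK_MEMSTALL_RUNNING, so it contributes to NR_MEMSTALL, NR_RUNNING
 * and NR_MEMSTALL_RUNNING at once; PSI_MEM_FULL then holds when every
 * runnable task in the group is a memstall one.
 */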

/* Resources that workloads could be stalled on */
enum psi_res {
	PSI_IO,
	PSI_MEM,
	PSI_CPU,
	NR_PSI_RESOURCES = 3,
};

/*
 * Pressure states for each resource:
 *
 * SOME: Stalled tasks & working tasks
 * FULL: Stalled tasks & no working tasks
 */
enum psi_states {
	PSI_IO_SOME,
	PSI_IO_FULL,
	PSI_MEM_SOME,
	PSI_MEM_FULL,
	PSI_CPU_SOME,
	PSI_CPU_FULL,
	/* Only per-CPU, to weigh the CPU in the global average: */
	PSI_NONIDLE,
	NR_PSI_STATES = 7,
};

enum psi_aggregators {
	PSI_AVGS = 0,
	PSI_POLL,
	NR_PSI_AGGREGATORS,
};

struct psi_group_cpu {
	/* 1st cacheline updated by the scheduler */

	/* Aggregator needs to know of concurrent changes */
	seqcount_t seq ____cacheline_aligned_in_smp;

	/* States of the tasks belonging to this group */
	unsigned int tasks[NR_PSI_TASK_COUNTS];

	/* Aggregate pressure state derived from the tasks */
	u32 state_mask;

	/* Period time sampling buckets for each state of interest (ns) */
	u32 times[NR_PSI_STATES];

	/* Time of last task change in this group (rq_clock) */
	u64 state_start;

	/* 2nd cacheline updated by the aggregator */

	/* Delta detection against the sampling buckets */
	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES]
			____cacheline_aligned_in_smp;
};
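
/*
 * Illustrative update pattern (simplified from psi_group_change() in
 * kernel/sched/psi.c): the scheduler brackets task-state updates with
 * the seqcount so the aggregator can retry until it reads a consistent
 * snapshot:
 *
 *	write_seqcount_begin(&groupc->seq);
 *	groupc->tasks[] and groupc->state_mask updated, times recorded;
 *	write_seqcount_end(&groupc->seq);
 */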

/* PSI growth tracking window */
struct psi_window {
	/* Window size in ns */
	u64 size;

	/* Start time of the current window in ns */
	u64 start_time;

	/* Value at the start of the window */
	u64 start_value;

	/* Value growth in the previous window */
	u64 prev_growth;
};
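
/*
 * Illustrative sketch of how a window is evaluated (simplified from
 * window_update() in kernel/sched/psi.c): growth in the current window
 * plus a pro-rated share of the previous window approximates stall
 * growth over the last win->size nanoseconds:
 *
 *	growth = value - win->start_value;
 *	remaining = win->size - (now - win->start_time);
 *	growth += div64_u64(win->prev_growth * remaining, win->size);
 */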

struct psi_trigger {
	/* PSI state being monitored by the trigger */
	enum psi_states state;

	/* User-specified threshold in ns */
	u64 threshold;

	/* List node inside triggers list */
	struct list_head node;

	/* Backpointer needed during trigger destruction */
	struct psi_group *group;

	/* Wait queue for polling */
	wait_queue_head_t event_wait;

	/* Pending event flag */
	int event;

	/* Tracking window */
	struct psi_window win;

	/*
	 * Time last event was generated. Used for rate-limiting
	 * events to one per window
	 */
	u64 last_event_time;

	/* Refcounting to prevent premature destruction */
	struct kref refcount;
};
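
/*
 * Illustrative userspace usage (per Documentation/accounting/psi.rst):
 * writing e.g. "full 150000 1000000" to /proc/pressure/memory arms a
 * trigger for 150ms of FULL stall within a 1s window; the fd then
 * signals POLLPRI when the threshold is crossed.
 */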

struct psi_group {
	/* Protects data used by the aggregator */
	struct mutex avgs_lock;

	/* Per-cpu task state & time tracking */
	struct psi_group_cpu __percpu *pcpu;

	/* Running pressure averages */
	u64 avg_total[NR_PSI_STATES - 1];
	u64 avg_last_update;
	u64 avg_next_update;

	/* Aggregator work control */
	struct delayed_work avgs_work;

	/* Total stall times and sampled pressure averages */
	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
	unsigned long avg[NR_PSI_STATES - 1][3];

	/* Monitor work control */
	struct task_struct __rcu *poll_task;
	struct timer_list poll_timer;
	wait_queue_head_t poll_wait;
	atomic_t poll_wakeup;

	/* Protects data used by the monitor */
	struct mutex trigger_lock;

	/* Configured polling triggers */
	struct list_head triggers;
	u32 nr_triggers[NR_PSI_STATES - 1];
	u32 poll_states;
	u64 poll_min_period;

	/* Total stall times at the start of monitor activation */
	u64 polling_total[NR_PSI_STATES - 1];
	u64 polling_next_update;
	u64 polling_until;
};

#else /* CONFIG_PSI */

struct psi_group { };

#endif /* CONFIG_PSI */

#endif /* _LINUX_PSI_TYPES_H */