psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
author Brian Chen <brianchen118@gmail.com>
Wed, 10 Nov 2021 21:33:12 +0000 (21:33 +0000)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 27 Jan 2022 10:04:27 +0000 (11:04 +0100)
[ Upstream commit cb0e52b7748737b2cf6481fdd9b920ce7e1ebbdf ]

We've noticed cases where tasks in a cgroup are stalled on memory but
there is little memory FULL pressure since tasks stay on the runqueue
in reclaim.

A simple example involves a single threaded program that keeps leaking
and touching large amounts of memory. It runs in a cgroup with swap
enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though
there is significant CPU pressure and memory SOME, there is barely any
memory FULL since the task enters reclaim and stays on the runqueue.
However, this memory-bound task is effectively stalled on memory and
we expect memory FULL to match memory SOME in this scenario.

The code is confused about memstall && running, thinking there is a
stalled task and a productive task when there's only one task: a
reclaimer that's counted as both. To fix this, we redefine the
condition for PSI_MEM_FULL to check that all running tasks are in an
active memstall instead of checking that there are no running tasks.

        case PSI_MEM_FULL:
-               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+               return unlikely(tasks[NR_MEMSTALL] &&
+                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);

This will capture reclaimers. It will also capture tasks that called
psi_memstall_enter() and are about to sleep, but this should be
negligible noise.

Signed-off-by: Brian Chen <brianchen118@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
include/linux/psi_types.h
kernel/sched/psi.c
kernel/sched/stats.h

index 0a23300d49af74be5887d2c57352e362fdcf18e1..0819c82dba920a50cb2b1408c7d528760f207469 100644 (file)
@@ -21,7 +21,17 @@ enum psi_task_count {
         * don't have to special case any state tracking for it.
         */
        NR_ONCPU,
-       NR_PSI_TASK_COUNTS = 4,
+       /*
+        * For IO and CPU stalls the presence of running/oncpu tasks
+        * in the domain means a partial rather than a full stall.
+        * For memory it's not so simple because of page reclaimers:
+        * they are running/oncpu while representing a stall. To tell
+        * whether a domain has productivity left or not, we need to
+        * distinguish between regular running (i.e. productive)
+        * threads and memstall ones.
+        */
+       NR_MEMSTALL_RUNNING,
+       NR_PSI_TASK_COUNTS = 5,
 };
 
 /* Task state bitmasks */
@@ -29,6 +39,7 @@ enum psi_task_count {
 #define TSK_MEMSTALL   (1 << NR_MEMSTALL)
 #define TSK_RUNNING    (1 << NR_RUNNING)
 #define TSK_ONCPU      (1 << NR_ONCPU)
+#define TSK_MEMSTALL_RUNNING   (1 << NR_MEMSTALL_RUNNING)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
index 1652f2bb54b791720a7751fbd30a6074bd907b86..69b19d3af690ff4d4bce4b4dd23ca9c24c41d4d5 100644 (file)
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
  *     SOME = nr_delayed_tasks != 0
- *     FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ *     FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exists at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
  *
  * The percentage of wallclock time spent in those compound stall
  * states gives pressure numbers between 0 and 100 for each resource,
  *
  *     threads = min(nr_nonidle_tasks, nr_cpus)
  *        SOME = min(nr_delayed_tasks / threads, 1)
- *        FULL = (threads - min(nr_running_tasks, threads)) / threads
+ *        FULL = (threads - min(nr_productive_tasks, threads)) / threads
  *
  * For the 257 number crunchers on 256 CPUs, this yields:
  *
  *     threads = min(257, 256)
  *        SOME = min(1 / 256, 1)             = 0.4%
- *        FULL = (256 - min(257, 256)) / 256 = 0%
+ *        FULL = (256 - min(256, 256)) / 256 = 0%
  *
  * For the 1 out of 4 memory-delayed tasks, this yields:
  *
  * For each runqueue, we track:
  *
  *        tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- *        tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ *        tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
  *     tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
  *
  * and then periodically aggregate:
@@ -233,7 +239,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
        case PSI_MEM_SOME:
                return unlikely(tasks[NR_MEMSTALL]);
        case PSI_MEM_FULL:
-               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+               return unlikely(tasks[NR_MEMSTALL] &&
+                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
        case PSI_CPU_SOME:
                return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
        case PSI_CPU_FULL:
@@ -710,10 +717,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
                if (groupc->tasks[t]) {
                        groupc->tasks[t]--;
                } else if (!psi_bug) {
-                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
                                        cpu, t, groupc->tasks[0],
                                        groupc->tasks[1], groupc->tasks[2],
-                                       groupc->tasks[3], clear, set);
+                                       groupc->tasks[3], groupc->tasks[4],
+                                       clear, set);
                        psi_bug = 1;
                }
        }
@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                int clear = TSK_ONCPU, set = 0;
 
                /*
-                * When we're going to sleep, psi_dequeue() lets us handle
-                * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
-                * with TSK_ONCPU and save walking common ancestors twice.
+                * When we're going to sleep, psi_dequeue() lets us
+                * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+                * TSK_IOWAIT here, where we can combine it with
+                * TSK_ONCPU and save walking common ancestors twice.
                 */
                if (sleep) {
                        clear |= TSK_RUNNING;
+                       if (prev->in_memstall)
+                               clear |= TSK_MEMSTALL_RUNNING;
                        if (prev->in_iowait)
                                set |= TSK_IOWAIT;
                }
@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
        rq = this_rq_lock_irq(&rf);
 
        current->in_memstall = 1;
-       psi_task_change(current, 0, TSK_MEMSTALL);
+       psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
        rq_unlock_irq(rq, &rf);
 }
@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
        rq = this_rq_lock_irq(&rf);
 
        current->in_memstall = 0;
-       psi_task_change(current, TSK_MEMSTALL, 0);
+       psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
 
        rq_unlock_irq(rq, &rf);
 }
index d8f8eb0c655ba9a962201ebad7d99f71fcb3ceab..606a3982d13a513f42b4fdc3d06915eb4b1b0d1e 100644 (file)
@@ -69,6 +69,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
        if (static_branch_likely(&psi_disabled))
                return;
 
+       if (p->in_memstall)
+               set |= TSK_MEMSTALL_RUNNING;
+
        if (!wakeup || p->sched_psi_wake_requeue) {
                if (p->in_memstall)
                        set |= TSK_MEMSTALL;
@@ -99,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
                return;
 
        if (p->in_memstall)
-               clear |= TSK_MEMSTALL;
+               clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
        psi_task_change(p, clear, 0);
 }