psi: Use ONCPU state tracking machinery to detect reclaim
author Chengming Zhou <zhouchengming@bytedance.com>
Wed, 3 Mar 2021 03:46:57 +0000 (11:46 +0800)
committer Ingo Molnar <mingo@kernel.org>
Sat, 6 Mar 2021 11:40:22 +0000 (12:40 +0100)
Move the reclaim detection from the timer tick to the task state
tracking machinery, using the recently added ONCPU state. We also
add a check for task psi_flags changes to the psi_task_switch()
optimization, so that the parent groups are updated properly.
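
For illustration, a minimal sketch of the flow after this change:
example_reclaim_path() is a hypothetical caller, while
psi_memstall_enter()/psi_memstall_leave() and current->in_memstall
are the existing kernel API that the new ONCPU check keys off:

        /* Hypothetical reclaim path, bracketed by the PSI memstall API. */
        static void example_reclaim_path(void)
        {
                unsigned long pflags;

                psi_memstall_enter(&pflags);    /* sets current->in_memstall */
                /*
                 * Reclaim work happens here: the task is ONCPU with
                 * in_memstall set, so psi_group_change() now raises
                 * PSI_MEM_FULL at state-change time instead of from
                 * the timer tick.
                 */
                psi_memstall_leave(&pflags);
        }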

In terms of performance and cost, this ONCPU task state tracking is
not cheaper than the previous timer tick in aggregate. But the code
is simpler and shorter this way, so it's a maintainability win.
Johannes also did some testing with perf bench; the performance and
cost changes should be acceptable for real workloads.

Thanks to Johannes Weiner for pointing out the subtleties of the
psi_task_switch() optimization and for the clearer changelog.
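
To make the psi_task_switch() change concrete, here is a condensed
sketch of the new fast path, boiled down from the psi.c hunk below
(the five-argument psi_group_change() call is the form used in this
tree):

        identical_state = prev->psi_flags == next->psi_flags;
        iter = NULL;
        while ((group = iterate_groups(next, &iter))) {
                /*
                 * The common-ancestor shortcut is only safe when prev
                 * and next carry identical PSI state; otherwise the
                 * groups shared with prev still need their flags
                 * updated, so keep walking the ancestors.
                 */
                if (identical_state &&
                    per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
                        common = group;
                        break;
                }

                psi_group_change(group, cpu, 0, TSK_ONCPU, true);
        }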

Co-developed-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20210303034659.91735-3-zhouchengming@bytedance.com
include/linux/psi.h
kernel/sched/core.c
kernel/sched/psi.c
kernel/sched/stats.h

index 7361023f3fdd50b52083797aab2e5b33461154bc..65eb1476ac705614c9cb1b1f338fce9e05624d9b 100644
@@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                     bool sleep);
 
-void psi_memstall_tick(struct task_struct *task, int cpu);
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
index 361974efc2430fe9ed2b065d318cdbcb56ddb5dc..d2629fd2af0732ae5e328c45983d84bc546879f2 100644
@@ -4551,7 +4551,6 @@ void scheduler_tick(void)
        update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
        curr->sched_class->task_tick(rq, curr, 0);
        calc_global_load_tick(rq);
-       psi_task_tick(rq);
 
        rq_unlock(rq, &rf);
 
index 2293c45d289dca898bdcdc759ef71c21d4fa9427..0fe6ff6a6a150521a538c63947193c9081252841 100644
@@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
        wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu,
-                        bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, int cpu)
 {
        u32 delta;
        u64 now;
@@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
                groupc->times[PSI_MEM_SOME] += delta;
                if (groupc->state_mask & (1 << PSI_MEM_FULL))
                        groupc->times[PSI_MEM_FULL] += delta;
-               else if (memstall_tick) {
-                       u32 sample;
-                       /*
-                        * Since we care about lost potential, a
-                        * memstall is FULL when there are no other
-                        * working tasks, but also when the CPU is
-                        * actively reclaiming and nothing productive
-                        * could run even if it were runnable.
-                        *
-                        * When the timer tick sees a reclaiming CPU,
-                        * regardless of runnable tasks, sample a FULL
-                        * tick (or less if it hasn't been a full tick
-                        * since the last state change).
-                        */
-                       sample = min(delta, (u32)jiffies_to_nsecs(1));
-                       groupc->times[PSI_MEM_FULL] += sample;
-               }
        }
 
        if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
@@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
         */
        write_seqcount_begin(&groupc->seq);
 
-       record_times(groupc, cpu, false);
+       record_times(groupc, cpu);
 
        for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
                if (!(m & (1 << t)))
@@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
                if (test_state(groupc->tasks, s))
                        state_mask |= (1 << s);
        }
+
+       /*
+        * Since we care about lost potential, a memstall is FULL
+        * when there are no other working tasks, but also when
+        * the CPU is actively reclaiming and nothing productive
+        * could run even if it were runnable. So when the current
+        * task in a cgroup is in_memstall, the corresponding groupc
+        * on that cpu is in PSI_MEM_FULL state.
+        */
+       if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)
+               state_mask |= (1 << PSI_MEM_FULL);
+
        groupc->state_mask = state_mask;
 
        write_seqcount_end(&groupc->seq);
@@ -823,17 +817,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
        void *iter;
 
        if (next->pid) {
+               bool identical_state;
+
                psi_flags_change(next, 0, TSK_ONCPU);
                /*
-                * When moving state between tasks, the group that
-                * contains them both does not change: we can stop
-                * updating the tree once we reach the first common
-                * ancestor. Iterate @next's ancestors until we
-                * encounter @prev's state.
+                * When switching between tasks that have an identical
+                * runtime state, the cgroup that contains both tasks
+                * does not change: we can stop updating the tree once
+                * we reach the first common ancestor. Iterate @next's
+                * ancestors only until we encounter @prev's ONCPU.
                 */
+               identical_state = prev->psi_flags == next->psi_flags;
                iter = NULL;
                while ((group = iterate_groups(next, &iter))) {
-                       if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+                       if (identical_state &&
+                           per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
                                common = group;
                                break;
                        }
@@ -859,21 +857,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
        }
 }
 
-void psi_memstall_tick(struct task_struct *task, int cpu)
-{
-       struct psi_group *group;
-       void *iter = NULL;
-
-       while ((group = iterate_groups(task, &iter))) {
-               struct psi_group_cpu *groupc;
-
-               groupc = per_cpu_ptr(group->pcpu, cpu);
-               write_seqcount_begin(&groupc->seq);
-               record_times(groupc, cpu, true);
-               write_seqcount_end(&groupc->seq);
-       }
-}
-
 /**
  * psi_memstall_enter - mark the beginning of a memory stall section
  * @flags: flags to handle nested sections
index 33d0daf83842a1a8ae8a9548c4a0e81f5dc1d9fb..9e4e67a94731b37a8a2aca39a5b079f469c9eebb 100644
@@ -144,14 +144,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
        psi_task_switch(prev, next, sleep);
 }
 
-static inline void psi_task_tick(struct rq *rq)
-{
-       if (static_branch_likely(&psi_disabled))
-               return;
-
-       if (unlikely(rq->curr->in_memstall))
-               psi_memstall_tick(rq->curr, cpu_of(rq));
-}
 #else /* CONFIG_PSI */
 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
 static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
@@ -159,7 +151,6 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
                                    struct task_struct *next,
                                    bool sleep) {}
-static inline void psi_task_tick(struct rq *rq) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO