sched/core: Forced idle accounting
author Josh Don <joshdon@google.com>
Mon, 18 Oct 2021 20:34:28 +0000 (13:34 -0700)
committer Peter Zijlstra <peterz@infradead.org>
Wed, 17 Nov 2021 13:49:00 +0000 (14:49 +0100)
Add accounting for "forced idle" time, which is the time during which a
cookie'd task forces its SMT sibling to idle, despite the presence of
runnable tasks.

Forced idle time is one means to measure the cost of enabling core
scheduling (i.e. the capacity lost due to the need to force idle).

Forced idle time is attributed to the thread responsible for causing
the forced idle.

A few details:
 - Forced idle time is displayed via /proc/PID/sched, and is only
   accounted when schedstats is enabled.
 - Forced idle is only accounted when a sibling hyperthread is held
   idle despite the presence of runnable tasks. No time is charged if
   a sibling is idle but has no runnable tasks.
 - Tasks with 0 cookie are never charged forced idle.
 - For SMT > 2, we scale the amount of forced idle charged based on the
   number of forced idle siblings. Additionally, we split the time up and
   evenly charge it to all running tasks, as each is equally responsible
   for the forced idle.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211018203428.2025792-1-joshdon@google.com
include/linux/sched.h
kernel/sched/core.c
kernel/sched/core_sched.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/sched.h

index 78c351e35fec6361973c86c1a1337080af7a688a..d2e261adb8ea2a19e11a4ac39803c8ea3b2e0f50 100644 (file)
@@ -523,7 +523,11 @@ struct sched_statistics {
        u64                             nr_wakeups_affine_attempts;
        u64                             nr_wakeups_passive;
        u64                             nr_wakeups_idle;
+
+#ifdef CONFIG_SCHED_CORE
+       u64                             core_forceidle_sum;
 #endif
+#endif /* CONFIG_SCHEDSTATS */
 } ____cacheline_aligned;
 
 struct sched_entity {
index 3c9b0fda64ac08b00723227bad6203279d8a9b1b..beaa8be6241e1711a26536d1d516697767660a89 100644 (file)
@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
                return false;
 
        /* flip prio, so high prio is leftmost */
-       if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+       if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
                return true;
 
        return false;
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
        rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
 }
 
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
 {
        rq->core->core_task_seq++;
 
-       if (!sched_core_enqueued(p))
-               return;
+       if (sched_core_enqueued(p)) {
+               rb_erase(&p->core_node, &rq->core_tree);
+               RB_CLEAR_NODE(&p->core_node);
+       }
 
-       rb_erase(&p->core_node, &rq->core_tree);
-       RB_CLEAR_NODE(&p->core_node);
+       /*
+        * Migrating the last task off the cpu, with the cpu in forced idle
+        * state. Reschedule to create an accounting edge for forced idle,
+        * and re-examine whether the core is still in forced idle state.
+        */
+       if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+           rq->core->core_forceidle_count && rq->curr == rq->idle)
+               resched_curr(rq);
 }
 
 /*
@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
                for_each_cpu(t, smt_mask)
                        cpu_rq(t)->core_enabled = enabled;
 
+               cpu_rq(cpu)->core->core_forceidle_start = 0;
+
                sched_core_unlock(cpu, &flags);
 
                cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -364,7 +374,8 @@ void sched_core_put(void)
 #else /* !CONFIG_SCHED_CORE */
 
 static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
 
 #endif /* CONFIG_SCHED_CORE */
 
@@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        if (sched_core_enabled(rq))
-               sched_core_dequeue(rq, p);
+               sched_core_dequeue(rq, p, flags);
 
        if (!(flags & DEQUEUE_NOCLOCK))
                update_rq_clock(rq);
@@ -5244,6 +5255,7 @@ void scheduler_tick(void)
        if (sched_feat(LATENCY_WARN))
                resched_latency = cpu_resched_latency(rq);
        calc_global_load_tick(rq);
+       sched_core_tick(rq);
 
        rq_unlock(rq, &rf);
 
@@ -5656,6 +5668,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        struct task_struct *next, *p, *max = NULL;
        const struct cpumask *smt_mask;
        bool fi_before = false;
+       bool core_clock_updated = (rq == rq->core);
        unsigned long cookie;
        int i, cpu, occ = 0;
        struct rq *rq_i;
@@ -5708,10 +5721,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
        /* reset state */
        rq->core->core_cookie = 0UL;
-       if (rq->core->core_forceidle) {
+       if (rq->core->core_forceidle_count) {
+               if (!core_clock_updated) {
+                       update_rq_clock(rq->core);
+                       core_clock_updated = true;
+               }
+               sched_core_account_forceidle(rq);
+               /* reset after accounting force idle */
+               rq->core->core_forceidle_start = 0;
+               rq->core->core_forceidle_count = 0;
+               rq->core->core_forceidle_occupation = 0;
                need_sync = true;
                fi_before = true;
-               rq->core->core_forceidle = false;
        }
 
        /*
@@ -5753,7 +5774,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        for_each_cpu_wrap(i, smt_mask, cpu) {
                rq_i = cpu_rq(i);
 
-               if (i != cpu)
+               /*
+                * Current cpu always has its clock updated on entrance to
+                * pick_next_task(). If the current cpu is not the core,
+                * the core may also have been updated above.
+                */
+               if (i != cpu && (rq_i != rq->core || !core_clock_updated))
                        update_rq_clock(rq_i);
 
                p = rq_i->core_pick = pick_task(rq_i);
@@ -5783,7 +5809,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
                if (p == rq_i->idle) {
                        if (rq_i->nr_running) {
-                               rq->core->core_forceidle = true;
+                               rq->core->core_forceidle_count++;
                                if (!fi_before)
                                        rq->core->core_forceidle_seq++;
                        }
@@ -5792,6 +5818,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                }
        }
 
+       if (schedstat_enabled() && rq->core->core_forceidle_count) {
+               if (cookie)
+                       rq->core->core_forceidle_start = rq_clock(rq->core);
+               rq->core->core_forceidle_occupation = occ;
+       }
+
        rq->core->core_pick_seq = rq->core->core_task_seq;
        next = rq->core_pick;
        rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5828,8 +5860,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                 *  1            0       1
                 *  1            1       0
                 */
-               if (!(fi_before && rq->core->core_forceidle))
-                       task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+               if (!(fi_before && rq->core->core_forceidle_count))
+                       task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
 
                rq_i->core_pick->core_occupation = occ;
 
@@ -6033,11 +6065,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
                goto unlock;
 
        /* copy the shared state to the new leader */
-       core_rq->core_task_seq      = rq->core_task_seq;
-       core_rq->core_pick_seq      = rq->core_pick_seq;
-       core_rq->core_cookie        = rq->core_cookie;
-       core_rq->core_forceidle     = rq->core_forceidle;
-       core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+       core_rq->core_task_seq             = rq->core_task_seq;
+       core_rq->core_pick_seq             = rq->core_pick_seq;
+       core_rq->core_cookie               = rq->core_cookie;
+       core_rq->core_forceidle_count      = rq->core_forceidle_count;
+       core_rq->core_forceidle_seq        = rq->core_forceidle_seq;
+       core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+       /*
+        * Accounting edge for forced idle is handled in pick_next_task().
+        * Don't need another one here, since the hotplug thread shouldn't
+        * have a cookie.
+        */
+       core_rq->core_forceidle_start = 0;
 
        /* install new leader */
        for_each_cpu(t, smt_mask) {
@@ -9413,7 +9453,9 @@ void __init sched_init(void)
                rq->core_pick = NULL;
                rq->core_enabled = 0;
                rq->core_tree = RB_ROOT;
-               rq->core_forceidle = false;
+               rq->core_forceidle_count = 0;
+               rq->core_forceidle_occupation = 0;
+               rq->core_forceidle_start = 0;
 
                rq->core_cookie = 0UL;
 #endif
index 517f72b008f508be66b8f63cfedbc47d50c61f51..1fb45672ec85060d42f56d742526930e0e3c2202 100644 (file)
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
 
        enqueued = sched_core_enqueued(p);
        if (enqueued)
-               sched_core_dequeue(rq, p);
+               sched_core_dequeue(rq, p, DEQUEUE_SAVE);
 
        old_cookie = p->core_cookie;
        p->core_cookie = cookie;
@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
         * If task is currently running, it may not be compatible anymore after
         * the cookie change, so enter the scheduler on its CPU to schedule it
         * away.
+        *
+        * Note that it is possible that as a result of this cookie change, the
+        * core has now entered/left forced idle state. Defer accounting to the
+        * next scheduling edge, rather than always forcing a reschedule here.
         */
        if (task_running(rq, p))
                resched_curr(rq);
@@ -232,3 +236,63 @@ out:
        return err;
 }
 
+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+       const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+       u64 delta, now = rq_clock(rq->core);
+       struct rq *rq_i;
+       struct task_struct *p;
+       int i;
+
+       lockdep_assert_rq_held(rq);
+
+       WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+       if (rq->core->core_forceidle_start == 0)
+               return;
+
+       delta = now - rq->core->core_forceidle_start;
+       if (unlikely((s64)delta <= 0))
+               return;
+
+       rq->core->core_forceidle_start = now;
+
+       if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+               /* can't be forced idle without a running task */
+       } else if (rq->core->core_forceidle_count > 1 ||
+                  rq->core->core_forceidle_occupation > 1) {
+               /*
+                * For larger SMT configurations, we need to scale the charged
+                * forced idle amount since there can be more than one forced
+                * idle sibling and more than one running cookied task.
+                */
+               delta *= rq->core->core_forceidle_count;
+               delta = div_u64(delta, rq->core->core_forceidle_occupation);
+       }
+
+       for_each_cpu(i, smt_mask) {
+               rq_i = cpu_rq(i);
+               p = rq_i->core_pick ?: rq_i->curr;
+
+               if (!p->core_cookie)
+                       continue;
+
+               __schedstat_add(p->stats.core_forceidle_sum, delta);
+       }
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+       if (!rq->core->core_forceidle_count)
+               return;
+
+       if (rq != rq->core)
+               update_rq_clock(rq->core);
+
+       __sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
index 7dcbaa31c5d917adc8d53fe941016cb1e88c1c16..aa29211de1bf816ebacc9653c9377c8dd7060ea2 100644 (file)
@@ -1023,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 
                __PN(avg_atom);
                __PN(avg_per_cpu);
+
+#ifdef CONFIG_SCHED_CORE
+               PN_SCHEDSTAT(core_forceidle_sum);
+#endif
        }
 
        __P(nr_switches);
index 6e476f6d94351bd346202c8bb19c1342677dda7b..884f29d0796370a966dec46e13509aed80ffd4b4 100644 (file)
@@ -11068,7 +11068,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
         * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
         * if we need to give up the CPU.
         */
-       if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+       if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
            __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
                resched_curr(rq);
 }
index 0e66749486e7559e02e7d784d8ef3455ca203ccb..eb971151e7e45d5ca844135e94c41947bb30cabc 100644 (file)
@@ -1111,8 +1111,10 @@ struct rq {
        unsigned int            core_task_seq;
        unsigned int            core_pick_seq;
        unsigned long           core_cookie;
-       unsigned char           core_forceidle;
+       unsigned int            core_forceidle_count;
        unsigned int            core_forceidle_seq;
+       unsigned int            core_forceidle_occupation;
+       u64                     core_forceidle_start;
 #endif
 };
 
@@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p)
 }
 
 extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
-extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
 
 extern void sched_core_get(void);
 extern void sched_core_put(void);
@@ -1854,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { }
 #include "stats.h"
 #include "autogroup.h"
 
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+       if (schedstat_enabled())
+               __sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+       if (sched_core_enabled(rq) && schedstat_enabled())
+               __sched_core_tick(rq);
+}
+
+#else
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
+
+#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
+
 #ifdef CONFIG_CGROUP_SCHED
 
 /*