sched/core: Forced idle accounting

author    Josh Don <joshdon@google.com>
          Mon, 18 Oct 2021 20:34:28 +0000 (13:34 -0700)
committer Peter Zijlstra <peterz@infradead.org>
          Wed, 17 Nov 2021 13:49:00 +0000 (14:49 +0100)

diff --git a/include/linux/sched.h b/include/linux/sched.h

index 78c351e35fec6361973c86c1a1337080af7a688a..d2e261adb8ea2a19e11a4ac39803c8ea3b2e0f50 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,7 +523,11 @@ struct sched_statistics {
         u64                             nr_wakeups_affine_attempts;
         u64                             nr_wakeups_passive;
         u64                             nr_wakeups_idle;
+
+#ifdef CONFIG_SCHED_CORE
+       u64                             core_forceidle_sum;
  #endif
+#endif /* CONFIG_SCHEDSTATS */
  } ____cacheline_aligned;
  
  struct sched_entity {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 3c9b0fda64ac08b00723227bad6203279d8a9b1b..beaa8be6241e1711a26536d1d516697767660a89 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
                 return false;
  
         /* flip prio, so high prio is leftmost */
-       if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+       if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
                 return true;
  
         return false;
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
         rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
  }
  
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
  {
         rq->core->core_task_seq++;
  
-       if (!sched_core_enqueued(p))
-               return;
+       if (sched_core_enqueued(p)) {
+               rb_erase(&p->core_node, &rq->core_tree);
+               RB_CLEAR_NODE(&p->core_node);
+       }
  
-       rb_erase(&p->core_node, &rq->core_tree);
-       RB_CLEAR_NODE(&p->core_node);
+       /*
+        * Migrating the last task off the cpu, with the cpu in forced idle
+        * state. Reschedule to create an accounting edge for forced idle,
+        * and re-examine whether the core is still in forced idle state.
+        */
+       if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+           rq->core->core_forceidle_count && rq->curr == rq->idle)
+               resched_curr(rq);
  }
  
  /*
@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
                 for_each_cpu(t, smt_mask)
                         cpu_rq(t)->core_enabled = enabled;
  
+               cpu_rq(cpu)->core->core_forceidle_start = 0;
+
                 sched_core_unlock(cpu, &flags);
  
                 cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -364,7 +374,8 @@ void sched_core_put(void)
  #else /* !CONFIG_SCHED_CORE */
  
  static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
  
  #endif /* CONFIG_SCHED_CORE */
  
@@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  {
         if (sched_core_enabled(rq))
-               sched_core_dequeue(rq, p);
+               sched_core_dequeue(rq, p, flags);
  
         if (!(flags & DEQUEUE_NOCLOCK))
                 update_rq_clock(rq);
@@ -5244,6 +5255,7 @@ void scheduler_tick(void)
         if (sched_feat(LATENCY_WARN))
                 resched_latency = cpu_resched_latency(rq);
         calc_global_load_tick(rq);
+       sched_core_tick(rq);
  
         rq_unlock(rq, &rf);
  
@@ -5656,6 +5668,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
         struct task_struct *next, *p, *max = NULL;
         const struct cpumask *smt_mask;
         bool fi_before = false;
+       bool core_clock_updated = (rq == rq->core);
         unsigned long cookie;
         int i, cpu, occ = 0;
         struct rq *rq_i;
@@ -5708,10 +5721,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  
         /* reset state */
         rq->core->core_cookie = 0UL;
-       if (rq->core->core_forceidle) {
+       if (rq->core->core_forceidle_count) {
+               if (!core_clock_updated) {
+                       update_rq_clock(rq->core);
+                       core_clock_updated = true;
+               }
+               sched_core_account_forceidle(rq);
+               /* reset after accounting force idle */
+               rq->core->core_forceidle_start = 0;
+               rq->core->core_forceidle_count = 0;
+               rq->core->core_forceidle_occupation = 0;
                 need_sync = true;
                 fi_before = true;
-               rq->core->core_forceidle = false;
         }
  
         /*
@@ -5753,7 +5774,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
         for_each_cpu_wrap(i, smt_mask, cpu) {
                 rq_i = cpu_rq(i);
  
-               if (i != cpu)
+               /*
+                * Current cpu always has its clock updated on entrance to
+                * pick_next_task(). If the current cpu is not the core,
+                * the core may also have been updated above.
+                */
+               if (i != cpu && (rq_i != rq->core || !core_clock_updated))
                         update_rq_clock(rq_i);
  
                 p = rq_i->core_pick = pick_task(rq_i);
@@ -5783,7 +5809,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  
                 if (p == rq_i->idle) {
                         if (rq_i->nr_running) {
-                               rq->core->core_forceidle = true;
+                               rq->core->core_forceidle_count++;
                                 if (!fi_before)
                                         rq->core->core_forceidle_seq++;
                         }
@@ -5792,6 +5818,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                 }
         }
  
+       if (schedstat_enabled() && rq->core->core_forceidle_count) {
+               if (cookie)
+                       rq->core->core_forceidle_start = rq_clock(rq->core);
+               rq->core->core_forceidle_occupation = occ;
+       }
+
         rq->core->core_pick_seq = rq->core->core_task_seq;
         next = rq->core_pick;
         rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5828,8 +5860,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                  *  1            0       1
                  *  1            1       0
                  */
-               if (!(fi_before && rq->core->core_forceidle))
-                       task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+               if (!(fi_before && rq->core->core_forceidle_count))
+                       task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
  
                 rq_i->core_pick->core_occupation = occ;
  
@@ -6033,11 +6065,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
                 goto unlock;
  
         /* copy the shared state to the new leader */
-       core_rq->core_task_seq      = rq->core_task_seq;
-       core_rq->core_pick_seq      = rq->core_pick_seq;
-       core_rq->core_cookie        = rq->core_cookie;
-       core_rq->core_forceidle     = rq->core_forceidle;
-       core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+       core_rq->core_task_seq             = rq->core_task_seq;
+       core_rq->core_pick_seq             = rq->core_pick_seq;
+       core_rq->core_cookie               = rq->core_cookie;
+       core_rq->core_forceidle_count      = rq->core_forceidle_count;
+       core_rq->core_forceidle_seq        = rq->core_forceidle_seq;
+       core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+       /*
+        * Accounting edge for forced idle is handled in pick_next_task().
+        * Don't need another one here, since the hotplug thread shouldn't
+        * have a cookie.
+        */
+       core_rq->core_forceidle_start = 0;
  
         /* install new leader */
         for_each_cpu(t, smt_mask) {
@@ -9413,7 +9453,9 @@ void __init sched_init(void)
                 rq->core_pick = NULL;
                 rq->core_enabled = 0;
                 rq->core_tree = RB_ROOT;
-               rq->core_forceidle = false;
+               rq->core_forceidle_count = 0;
+               rq->core_forceidle_occupation = 0;
+               rq->core_forceidle_start = 0;
  
                 rq->core_cookie = 0UL;
  #endif
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c

index 517f72b008f508be66b8f63cfedbc47d50c61f51..1fb45672ec85060d42f56d742526930e0e3c2202 100644 (file)
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
  
         enqueued = sched_core_enqueued(p);
         if (enqueued)
-               sched_core_dequeue(rq, p);
+               sched_core_dequeue(rq, p, DEQUEUE_SAVE);
  
         old_cookie = p->core_cookie;
         p->core_cookie = cookie;
@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
          * If task is currently running, it may not be compatible anymore after
          * the cookie change, so enter the scheduler on its CPU to schedule it
          * away.
+        *
+        * Note that it is possible that as a result of this cookie change, the
+        * core has now entered/left forced idle state. Defer accounting to the
+        * next scheduling edge, rather than always forcing a reschedule here.
          */
         if (task_running(rq, p))
                 resched_curr(rq);
@@ -232,3 +236,63 @@ out:
         return err;
  }
  
+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+       const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+       u64 delta, now = rq_clock(rq->core);
+       struct rq *rq_i;
+       struct task_struct *p;
+       int i;
+
+       lockdep_assert_rq_held(rq);
+
+       WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+       if (rq->core->core_forceidle_start == 0)
+               return;
+
+       delta = now - rq->core->core_forceidle_start;
+       if (unlikely((s64)delta <= 0))
+               return;
+
+       rq->core->core_forceidle_start = now;
+
+       if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+               /* can't be forced idle without a running task */
+       } else if (rq->core->core_forceidle_count > 1 ||
+                  rq->core->core_forceidle_occupation > 1) {
+               /*
+                * For larger SMT configurations, we need to scale the charged
+                * forced idle amount since there can be more than one forced
+                * idle sibling and more than one running cookied task.
+                */
+               delta *= rq->core->core_forceidle_count;
+               delta = div_u64(delta, rq->core->core_forceidle_occupation);
+       }
+
+       for_each_cpu(i, smt_mask) {
+               rq_i = cpu_rq(i);
+               p = rq_i->core_pick ?: rq_i->curr;
+
+               if (!p->core_cookie)
+                       continue;
+
+               __schedstat_add(p->stats.core_forceidle_sum, delta);
+       }
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+       if (!rq->core->core_forceidle_count)
+               return;
+
+       if (rq != rq->core)
+               update_rq_clock(rq->core);
+
+       __sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 7dcbaa31c5d917adc8d53fe941016cb1e88c1c16..aa29211de1bf816ebacc9653c9377c8dd7060ea2 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1023,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
  
                 __PN(avg_atom);
                 __PN(avg_per_cpu);
+
+#ifdef CONFIG_SCHED_CORE
+               PN_SCHEDSTAT(core_forceidle_sum);
+#endif
         }
  
         __P(nr_switches);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 6e476f6d94351bd346202c8bb19c1342677dda7b..884f29d0796370a966dec46e13509aed80ffd4b4 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11068,7 +11068,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
          * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
          * if we need to give up the CPU.
          */
-       if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+       if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
             __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
                 resched_curr(rq);
  }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 0e66749486e7559e02e7d784d8ef3455ca203ccb..eb971151e7e45d5ca844135e94c41947bb30cabc 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1111,8 +1111,10 @@ struct rq {
         unsigned int            core_task_seq;
         unsigned int            core_pick_seq;
         unsigned long           core_cookie;
-       unsigned char           core_forceidle;
+       unsigned int            core_forceidle_count;
         unsigned int            core_forceidle_seq;
+       unsigned int            core_forceidle_occupation;
+       u64                     core_forceidle_start;
  #endif
  };
  
@@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p)
  }
  
  extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
-extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
  
  extern void sched_core_get(void);
  extern void sched_core_put(void);
@@ -1854,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { }
  #include "stats.h"
  #include "autogroup.h"
  
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+       if (schedstat_enabled())
+               __sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+       if (sched_core_enabled(rq) && schedstat_enabled())
+               __sched_core_tick(rq);
+}
+
+#else
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
+
+#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
+
  #ifdef CONFIG_CGROUP_SCHED
  
  /*
author	Josh Don <joshdon@google.com>
	Mon, 18 Oct 2021 20:34:28 +0000 (13:34 -0700)
committer	Peter Zijlstra <peterz@infradead.org>
	Wed, 17 Nov 2021 13:49:00 +0000 (14:49 +0100)
Files changed:
  include/linux/sched.h
  kernel/sched/core.c
  kernel/sched/core_sched.c
  kernel/sched/debug.c
  kernel/sched/fair.c
  kernel/sched/sched.h