sched/fair: Simplify util_est
author Vincent Guittot <vincent.guittot@linaro.org>
Fri, 1 Dec 2023 16:16:52 +0000 (17:16 +0100)
committer Ingo Molnar <mingo@kernel.org>
Sat, 23 Dec 2023 14:59:58 +0000 (15:59 +0100)
With UTIL_EST_FASTUP now being permanent, we can take advantage of the
fact that the ewma jumps directly to a higher utilization at dequeue to
simplify util_est and remove the enqueued field.
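
For illustration only (not part of the patch): with the EWMA and the
UTIL_AVG_UNCHANGED flag folded into a single unsigned int, the
dequeue-time update reduces to "jump up on increase, decay on decrease".
The helper below is a standalone sketch; its name and the omission of
the margin, CPU-capacity and runnable checks are specific to this
example.

  #define UTIL_EST_WEIGHT_SHIFT	2	/* w = 1/4 */

  /* Hypothetical helper, for illustration only. */
  static unsigned int util_est_next(unsigned int ewma, unsigned int dequeued)
  {
  	unsigned int last_ewma_diff;

  	/* Utilization increased since the last sample: jump to it. */
  	if (ewma <= dequeued)
  		return dequeued;

  	/*
  	 * Utilization decreased: decay towards it.
  	 *
  	 *  ewma(t) = ewma(t-1) - w * (ewma(t-1) - dequeued), with w = 1/4
  	 */
  	last_ewma_diff = ewma - dequeued;
  	ewma <<= UTIL_EST_WEIGHT_SHIFT;
  	ewma  -= last_ewma_diff;
  	ewma >>= UTIL_EST_WEIGHT_SHIFT;

  	return ewma;
  }

The real util_est_update() additionally ORs UTIL_AVG_UNCHANGED into the
value before storing it; cfs_se_util_change() clears that bit once
util_avg is updated again, which is what lets a later dequeue skip a
redundant util_est update.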

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Hongyan Xia <hongyan.xia2@arm.com>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/20231201161652.1241695-3-vincent.guittot@linaro.org
include/linux/sched.h
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/pelt.h

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d258162deb0a0694c2f8379134c44258f272310..03bfe9ab29511398f6062ec58ed35be59b8092f3 100644
@@ -415,42 +415,6 @@ struct load_weight {
        u32                             inv_weight;
 };
 
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma:     the Exponential Weighted Moving Average (EWMA)
- *            utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task:   the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
-       unsigned int                    enqueued;
-       unsigned int                    ewma;
-#define UTIL_EST_WEIGHT_SHIFT          2
-#define UTIL_AVG_UNCHANGED             0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
-
 /*
  * The load/runnable/util_avg accumulates an infinite geometric series
  * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -505,9 +469,20 @@ struct sched_avg {
        unsigned long                   load_avg;
        unsigned long                   runnable_avg;
        unsigned long                   util_avg;
-       struct util_est                 util_est;
+       unsigned int                    util_est;
 } ____cacheline_aligned;
 
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT          2
+#define UTIL_AVG_UNCHANGED             0x80000000
+
 struct sched_statistics {
 #ifdef CONFIG_SCHEDSTATS
        u64                             wait_start;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 168eecc209b49438059ddad6ee163548ff165e24..8d5d98a5834dfdb95f39b6b9294daa732364d2bc 100644
@@ -684,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                        cfs_rq->avg.runnable_avg);
        SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
                        cfs_rq->avg.util_avg);
-       SEQ_printf(m, "  .%-30s: %u\n", "util_est_enqueued",
-                       cfs_rq->avg.util_est.enqueued);
+       SEQ_printf(m, "  .%-30s: %u\n", "util_est",
+                       cfs_rq->avg.util_est);
        SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
                        cfs_rq->removed.load_avg);
        SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
@@ -1075,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
        P(se.avg.runnable_avg);
        P(se.avg.util_avg);
        P(se.avg.last_update_time);
-       P(se.avg.util_est.ewma);
-       PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
+       PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
 #endif
 #ifdef CONFIG_UCLAMP_TASK
        __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e94d65da8d665128f1c7b9900fd7f2cf3200ecaa..823dd76d054691f95c4590a3799b7a5cc31b3421 100644
@@ -4781,9 +4781,7 @@ static inline unsigned long task_runnable(struct task_struct *p)
 
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
-       struct util_est ue = READ_ONCE(p->se.avg.util_est);
-
-       return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+       return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -4800,9 +4798,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
                return;
 
        /* Update root cfs_rq's estimated utilization */
-       enqueued  = cfs_rq->avg.util_est.enqueued;
+       enqueued  = cfs_rq->avg.util_est;
        enqueued += _task_util_est(p);
-       WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+       WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
 
        trace_sched_util_est_cfs_tp(cfs_rq);
 }
@@ -4816,34 +4814,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
                return;
 
        /* Update root cfs_rq's estimated utilization */
-       enqueued  = cfs_rq->avg.util_est.enqueued;
+       enqueued  = cfs_rq->avg.util_est;
        enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
-       WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+       WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
 
        trace_sched_util_est_cfs_tp(cfs_rq);
 }
 
 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
 
-/*
- * Check if a (signed) value is within a specified (unsigned) margin,
- * based on the observation that:
- *
- *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
- *
- * NOTE: this only works when value + margin < INT_MAX.
- */
-static inline bool within_margin(int value, int margin)
-{
-       return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
-}
-
 static inline void util_est_update(struct cfs_rq *cfs_rq,
                                   struct task_struct *p,
                                   bool task_sleep)
 {
-       long last_ewma_diff, last_enqueued_diff;
-       struct util_est ue;
+       unsigned int ewma, dequeued, last_ewma_diff;
 
        if (!sched_feat(UTIL_EST))
                return;
@@ -4855,23 +4839,25 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
        if (!task_sleep)
                return;
 
+       /* Get current estimate of utilization */
+       ewma = READ_ONCE(p->se.avg.util_est);
+
        /*
         * If the PELT values haven't changed since enqueue time,
         * skip the util_est update.
         */
-       ue = p->se.avg.util_est;
-       if (ue.enqueued & UTIL_AVG_UNCHANGED)
+       if (ewma & UTIL_AVG_UNCHANGED)
                return;
 
-       last_enqueued_diff = ue.enqueued;
+       /* Get utilization at dequeue */
+       dequeued = task_util(p);
 
        /*
         * Reset EWMA on utilization increases, the moving average is used only
         * to smooth utilization decreases.
         */
-       ue.enqueued = task_util(p);
-       if (ue.ewma < ue.enqueued) {
-               ue.ewma = ue.enqueued;
+       if (ewma <= dequeued) {
+               ewma = dequeued;
                goto done;
        }
 
@@ -4879,27 +4865,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
         * Skip update of task's estimated utilization when its members are
         * already ~1% close to its last activation value.
         */
-       last_ewma_diff = ue.enqueued - ue.ewma;
-       last_enqueued_diff -= ue.enqueued;
-       if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
-               if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
-                       goto done;
-
-               return;
-       }
+       last_ewma_diff = ewma - dequeued;
+       if (last_ewma_diff < UTIL_EST_MARGIN)
+               goto done;
 
        /*
         * To avoid overestimation of actual task utilization, skip updates if
         * we cannot grant there is idle time in this CPU.
         */
-       if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
+       if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
                return;
 
        /*
         * To avoid underestimate of task utilization, skip updates of EWMA if
         * we cannot grant that thread got all CPU time it wanted.
         */
-       if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
+       if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
                goto done;
 
 
@@ -4907,25 +4888,24 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
         * Update Task's estimated utilization
         *
         * When *p completes an activation we can consolidate another sample
-        * of the task size. This is done by storing the current PELT value
-        * as ue.enqueued and by using this value to update the Exponential
-        * Weighted Moving Average (EWMA):
+        * of the task size. This is done by using this value to update the
+        * Exponential Weighted Moving Average (EWMA):
         *
         *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
         *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
         *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
-        *          = w * (      last_ewma_diff            ) +     ewma(t-1)
-        *          = w * (last_ewma_diff  +  ewma(t-1) / w)
+        *          = w * (      -last_ewma_diff           ) +     ewma(t-1)
+        *          = w * (-last_ewma_diff +  ewma(t-1) / w)
         *
         * Where 'w' is the weight of new samples, which is configured to be
         * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
         */
-       ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
-       ue.ewma  += last_ewma_diff;
-       ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+       ewma <<= UTIL_EST_WEIGHT_SHIFT;
+       ewma  -= last_ewma_diff;
+       ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
-       ue.enqueued |= UTIL_AVG_UNCHANGED;
-       WRITE_ONCE(p->se.avg.util_est, ue);
+       ewma |= UTIL_AVG_UNCHANGED;
+       WRITE_ONCE(p->se.avg.util_est, ewma);
 
        trace_sched_util_est_se_tp(&p->se);
 }
@@ -7653,16 +7633,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
        if (sched_feat(UTIL_EST)) {
                unsigned long util_est;
 
-               util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+               util_est = READ_ONCE(cfs_rq->avg.util_est);
 
                /*
                 * During wake-up @p isn't enqueued yet and doesn't contribute
-                * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+                * to any cpu_rq(cpu)->cfs.avg.util_est.
                 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
                 * has been enqueued.
                 *
                 * During exec (@dst_cpu = -1) @p is enqueued and does
-                * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+                * contribute to cpu_rq(cpu)->cfs.util_est.
                 * Remove it to "simulate" cpu_util without @p's contribution.
                 *
                 * Despite the task_on_rq_queued(@p) check there is still a
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 3a0e0dc28721960276b97a67eba0bc3ea7343ff5..9e1083465fbc3d2f382e1439bc4cc7c3b2efc935 100644
@@ -52,13 +52,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
                return;
 
        /* Avoid store if the flag has been already reset */
-       enqueued = avg->util_est.enqueued;
+       enqueued = avg->util_est;
        if (!(enqueued & UTIL_AVG_UNCHANGED))
                return;
 
        /* Reset flag to report util_avg has been updated */
        enqueued &= ~UTIL_AVG_UNCHANGED;
-       WRITE_ONCE(avg->util_est.enqueued, enqueued);
+       WRITE_ONCE(avg->util_est, enqueued);
 }
 
 static inline u64 rq_clock_pelt(struct rq *rq)