u32 inv_weight;
};
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma: the Exponential Weighted Moving Average (EWMA)
- * utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task: the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
- unsigned int enqueued;
- unsigned int ewma;
-#define UTIL_EST_WEIGHT_SHIFT 2
-#define UTIL_AVG_UNCHANGED 0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
-
/*
* The load/runnable/util_avg accumulates an infinite geometric series
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
unsigned long load_avg;
unsigned long runnable_avg;
unsigned long util_avg;
- struct util_est util_est;
+ unsigned int util_est;
} ____cacheline_aligned;
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB of util_est at dequeue time.
+ * Since the maximum value of util_est for a task is 1024 (the PELT
+ * util_avg of a task), it is safe to use the MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT 2
+#define UTIL_AVG_UNCHANGED 0x80000000
+
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
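With the struct gone, the estimate and the synchronization flag share one
32-bit word: the value sits in the low bits (at most 1024 for a task) and
UTIL_AVG_UNCHANGED occupies the MSB, so a single mask recovers the estimate,
exactly as _task_util_est() does in the next hunk. A minimal user-space
sketch of that packing, with hypothetical helper names that are not part of
the patch:

#include <assert.h>

#define UTIL_AVG_UNCHANGED 0x80000000U

/* Hypothetical helpers, for illustration only. */
static unsigned int util_est_mark_unchanged(unsigned int est)
{
	return est | UTIL_AVG_UNCHANGED;	/* set the MSB flag at dequeue */
}

static unsigned int util_est_value(unsigned int est)
{
	return est & ~UTIL_AVG_UNCHANGED;	/* mask the flag to read the value */
}

int main(void)
{
	unsigned int est = util_est_mark_unchanged(742);

	/* The flag cannot collide with the value: 742 <= 1024 < 2^31. */
	assert(util_est_value(est) == 742);
	return 0;
}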
static inline unsigned long _task_util_est(struct task_struct *p)
{
- struct util_est ue = READ_ONCE(p->se.avg.util_est);
-
- return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+ return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
}
static inline unsigned long task_util_est(struct task_struct *p)
return;
/* Update root cfs_rq's estimated utilization */
- enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued = cfs_rq->avg.util_est;
enqueued += _task_util_est(p);
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
}
return;
/* Update root cfs_rq's estimated utilization */
- enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued = cfs_rq->avg.util_est;
enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
}
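Both update paths above keep the root cfs_rq signal as a plain sum of the
enqueued tasks' estimates; the dequeue side clamps at zero via min_t()
because a task's contribution may have changed since it was added to the
sum, and the unsigned total must not underflow. A standalone sketch of that
aggregation, assuming a bare stand-in struct rather than the kernel's
cfs_rq:

#include <assert.h>

struct fake_rq { unsigned int util_est; };

static void sketch_enqueue(struct fake_rq *rq, unsigned int est)
{
	rq->util_est += est;
}

static void sketch_dequeue(struct fake_rq *rq, unsigned int est)
{
	unsigned int cur = rq->util_est;

	/* Subtract at most the current sum, mirroring the kernel's min_t(). */
	rq->util_est = cur - (est < cur ? est : cur);
}

int main(void)
{
	struct fake_rq rq = { 0 };

	sketch_enqueue(&rq, 300);
	sketch_dequeue(&rq, 500);	/* stale, larger estimate: clamps to 0 */
	assert(rq.util_est == 0);
	return 0;
}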
#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
-/*
- * Check if a (signed) value is within a specified (unsigned) margin,
- * based on the observation that:
- *
- * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
- *
- * NOTE: this only works when value + margin < INT_MAX.
- */
-static inline bool within_margin(int value, int margin)
-{
- return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
-}
-
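The helper can be dropped because the rewritten util_est_update() below
computes last_ewma_diff = ewma - dequeued only after bailing out when
ewma <= dequeued, so the difference is known non-negative and a single
unsigned compare against UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100 = 10)
replaces the two-sided test. For reference, a standalone check of the
removed identity (plain user-space C, not kernel code):

#include <assert.h>
#include <stdlib.h>

/* The dropped helper: one unsigned compare implements -margin < value < margin,
 * valid as long as value + margin stays below INT_MAX. */
static int within_margin(int value, int margin)
{
	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

int main(void)
{
	int margin = 10;	/* UTIL_EST_MARGIN with SCHED_CAPACITY_SCALE = 1024 */

	for (int v = -50; v <= 50; v++)
		assert(within_margin(v, margin) == (abs(v) < margin));
	return 0;
}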
static inline void util_est_update(struct cfs_rq *cfs_rq,
struct task_struct *p,
bool task_sleep)
{
- long last_ewma_diff, last_enqueued_diff;
- struct util_est ue;
+ unsigned int ewma, dequeued, last_ewma_diff;
if (!sched_feat(UTIL_EST))
return;
if (!task_sleep)
return;
+ /* Get current estimate of utilization */
+ ewma = READ_ONCE(p->se.avg.util_est);
+
/*
* If the PELT values haven't changed since enqueue time,
* skip the util_est update.
*/
- ue = p->se.avg.util_est;
- if (ue.enqueued & UTIL_AVG_UNCHANGED)
+ if (ewma & UTIL_AVG_UNCHANGED)
return;
- last_enqueued_diff = ue.enqueued;
+ /* Get utilization at dequeue */
+ dequeued = task_util(p);
/*
- * Reset EWMA on utilization increases, the moving average is used only
- * to smooth utilization decreases.
+ * Reset the EWMA on utilization increases; the moving average is used
+ * only to smooth utilization decreases.
*/
- ue.enqueued = task_util(p);
- if (ue.ewma < ue.enqueued) {
- ue.ewma = ue.enqueued;
+ if (ewma <= dequeued) {
+ ewma = dequeued;
goto done;
}
- * Skip update of task's estimated utilization when its members are
+ * Skip update of task's estimated utilization when its EWMA is
* already ~1% close to its last activation value.
*/
- last_ewma_diff = ue.enqueued - ue.ewma;
- last_enqueued_diff -= ue.enqueued;
- if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
- if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
- goto done;
-
- return;
- }
+ last_ewma_diff = ewma - dequeued;
+ if (last_ewma_diff < UTIL_EST_MARGIN)
+ goto done;
/*
* To avoid overestimation of actual task utilization, skip updates if
- * we cannot grant there is idle time in this CPU.
+ * we cannot guarantee there is idle time on this CPU.
*/
- if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
+ if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
return;
/*
- * To avoid underestimate of task utilization, skip updates of EWMA if
- * we cannot grant that thread got all CPU time it wanted.
+ * To avoid underestimating task utilization, skip updates of the EWMA
+ * if we cannot guarantee the thread got all the CPU time it wanted.
*/
- if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
+ if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
goto done;
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
- * of the task size. This is done by storing the current PELT value
- * as ue.enqueued and by using this value to update the Exponential
- * Weighted Moving Average (EWMA):
+ * of the task size. This is done by using the utilization at dequeue
+ * time to update the Exponential Weighted Moving Average (EWMA):
*
* ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
* = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
* = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
- * = w * ( last_ewma_diff ) + ewma(t-1)
- * = w * (last_ewma_diff + ewma(t-1) / w)
+ * = w * ( -last_ewma_diff ) + ewma(t-1)
+ * = w * (-last_ewma_diff + ewma(t-1) / w)
*
* Where 'w' is the weight of new samples, which is configured to be
* 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
*/
- ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
- ue.ewma += last_ewma_diff;
- ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+ ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ewma -= last_ewma_diff;
+ ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
- ue.enqueued |= UTIL_AVG_UNCHANGED;
- WRITE_ONCE(p->se.avg.util_est, ue);
+ ewma |= UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(p->se.avg.util_est, ewma);
trace_sched_util_est_se_tp(&p->se);
}
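With w = 1/4, the three shift/subtract steps above compute exactly
w * dequeued + (1 - w) * ewma(t-1). A worked example with made-up sample
values:

#include <assert.h>

#define UTIL_EST_WEIGHT_SHIFT 2

int main(void)
{
	/* Assumed inputs: ewma(t-1) = 512, utilization at dequeue = 400. */
	unsigned int ewma = 512, dequeued = 400;
	unsigned int last_ewma_diff = ewma - dequeued;	/* 112 */

	ewma <<= UTIL_EST_WEIGHT_SHIFT;	/* 2048 == ewma(t-1) / w */
	ewma -= last_ewma_diff;		/* 1936                  */
	ewma >>= UTIL_EST_WEIGHT_SHIFT;	/* 484                   */

	/* Same result as w * dequeued + (1 - w) * ewma(t-1)
	 * = 400 / 4 + 512 * 3 / 4 = 100 + 384 = 484. */
	assert(ewma == 484);
	return 0;
}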
if (sched_feat(UTIL_EST)) {
unsigned long util_est;
- util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+ util_est = READ_ONCE(cfs_rq->avg.util_est);
/*
* During wake-up @p isn't enqueued yet and doesn't contribute
- * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+ * to any cpu_rq(cpu)->cfs.avg.util_est.
* If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
* has been enqueued.
*
* During exec (@dst_cpu = -1) @p is enqueued and does
- * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+ * contribute to cpu_rq(cpu)->cfs.avg.util_est.
* Remove it to "simulate" cpu_util without @p's contribution.
*
* Despite the task_on_rq_queued(@p) check there is still a