Extrapolating from the existing framework that tracks rt/dl utilization using
PELT signals, add a similar mechanism to track thermal pressure. The
difference here from rt/dl utilization tracking is that, instead of
tracking time spent by a CPU running a RT/DL task through util_avg, the
average thermal pressure is tracked through load_avg. This is because
thermal pressure signal is time-weighted "delta" capacity, unlike util_avg
which is binary. "delta capacity" here means the delta between the actual
capacity of a CPU and the decreased capacity of a CPU due to a thermal event.
In order to track average thermal pressure, a new sched_avg variable
avg_thermal is introduced. Function update_thermal_load_avg can be called
to do the periodic bookkeeping (accumulate, decay and average) of the
thermal pressure.
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200222005213.3873-2-thara.gopinath@linaro.org
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));
 
+DECLARE_TRACE(pelt_thermal_tp,
+       TP_PROTO(struct rq *rq),
+       TP_ARGS(rq)); /* emitted by update_thermal_load_avg() via trace_pelt_thermal_tp(rq) */
+
 DECLARE_TRACE(pelt_irq_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));
 
        depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
        depends on SMP
 
+config SCHED_THERMAL_PRESSURE
+       bool "Enable periodic averaging of thermal pressure"
+       depends on SMP
+       help
+         Maintain a PELT-style running average of thermal pressure in
+         each runqueue. Thermal pressure here is the difference between
+         the actual capacity of a CPU and its capacity when capped by a
+         thermal event. The average is only kept up to date if
+         update_thermal_load_avg() is called periodically.
+
 config BSD_PROCESS_ACCT
        bool "BSD Process Accounting"
        depends on MULTIUSER
 
        return 0;
 }
 
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+/*
+ * thermal:
+ *
+ *   load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
+ *
+ *   util_avg and runnable_load_avg are not supported and meaningless.
+ *
+ * Unlike rt/dl utilization tracking, which tracks the time spent by a cpu
+ * running a rt/dl task through util_avg, the average thermal pressure is
+ * tracked through load_avg. This is because the thermal pressure signal is
+ * time-weighted "delta" capacity, unlike util_avg which is binary.
+ * "delta capacity" =  actual capacity  -
+ *                     capped capacity of a cpu due to a thermal event.
+ *
+ * update_thermal_load_avg() passes @capacity to ___update_load_sum() for
+ * rq->avg_thermal as each of its three trailing arguments. If that call
+ * returns non-zero, ___update_load_avg() is invoked on rq->avg_thermal
+ * with 1 as its second argument, the pelt_thermal_tp tracepoint is
+ * emitted and 1 is returned. Otherwise nothing else is done and 0 is
+ * returned.
+ *
+ * @now:      timestamp handed through to ___update_load_sum()
+ * @rq:       runqueue whose avg_thermal signal is updated
+ * @capacity: presumably the "delta capacity" described above;
+ *            NOTE(review): confirm against the callers
+ */
+
+int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+       if (___update_load_sum(now, &rq->avg_thermal,
+                              capacity,
+                              capacity,
+                              capacity)) {
+               ___update_load_avg(&rq->avg_thermal, 1);
+               trace_pelt_thermal_tp(rq);
+               return 1;
+       }
+
+       return 0;
+}
+#endif /* CONFIG_SCHED_THERMAL_PRESSURE */
+
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 /*
  * irq:
 
 int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
 
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); /* out-of-line; returns 1 or 0 */
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+       return READ_ONCE(rq->avg_thermal.load_avg); /* load_avg of the rq's avg_thermal signal */
+}
+#else /* !CONFIG_SCHED_THERMAL_PRESSURE */
+static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+       return 0; /* stub: arguments are ignored, always returns 0 */
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+       return 0; /* stub: always reports zero thermal pressure */
+}
+#endif /* CONFIG_SCHED_THERMAL_PRESSURE */
+
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 int update_irq_load_avg(struct rq *rq, u64 running);
 #else
        return 0;
 }
 
+static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+       return 0; /* stub: arguments are ignored; NOTE(review): presumably the branch where PELT is not built - confirm from the enclosing #if */
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+       return 0; /* stub: always reports zero thermal pressure */
+}
+
 static inline int
 update_irq_load_avg(struct rq *rq, u64 running)
 {
 
        struct sched_avg        avg_dl;
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
        struct sched_avg        avg_irq;
+#endif
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+       struct sched_avg        avg_thermal;
 #endif
        u64                     idle_stamp;
        u64                     avg_idle;