PM: EM: convert power field to micro-Watts precision and align drivers

author Lukasz Luba <lukasz.luba@arm.com>

Thu, 7 Jul 2022 07:15:52 +0000 (08:15 +0100)

committer Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Fri, 15 Jul 2022 17:17:30 +0000 (19:17 +0200)
author Lukasz Luba <lukasz.luba@arm.com>
Thu, 7 Jul 2022 07:15:52 +0000 (08:15 +0100)
committer Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Fri, 15 Jul 2022 17:17:30 +0000 (19:17 +0200)
diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c

index 813cccbfe9348b58479d4906517a00639355fae0..f0e0a35c7f21744e4e5ac042c9bd3f8fc11387d1 100644 (file)
--- a/drivers/cpufreq/mediatek-cpufreq-hw.c
+++ b/drivers/cpufreq/mediatek-cpufreq-hw.c
@@ -51,7 +51,7 @@ static const u16 cpufreq_mtk_offsets[REG_ARRAY_SIZE] = {
  };
  
  static int __maybe_unused
-mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW,
+mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *uW,
                           unsigned long *KHz)
  {
         struct mtk_cpufreq_data *data;
@@ -71,8 +71,9 @@ mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW,
         i--;
  
         *KHz = data->table[i].frequency;
-       *mW = readl_relaxed(data->reg_bases[REG_EM_POWER_TBL] +
-                           i * LUT_ROW_SIZE) / 1000;
+       /* Provide micro-Watts value to the Energy Model */
+       *uW = readl_relaxed(data->reg_bases[REG_EM_POWER_TBL] +
+                           i * LUT_ROW_SIZE);
  
         return 0;
  }
diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c

index 6d2a4cf46db708601b6428fd5a04134af6ff2566..bfd35583d653202e655bf1e534f8508b11ab145f 100644 (file)
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -19,6 +19,7 @@
  #include <linux/slab.h>
  #include <linux/scmi_protocol.h>
  #include <linux/types.h>
+#include <linux/units.h>
  
  struct scmi_data {
         int domain_id;
@@ -99,6 +100,7 @@ static int __maybe_unused
  scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power,
                    unsigned long *KHz)
  {
+       bool power_scale_mw = perf_ops->power_scale_mw_get(ph);
         unsigned long Hz;
         int ret, domain;
  
@@ -112,6 +114,10 @@ scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power,
         if (ret)
                 return ret;
  
+       /* Provide higher-resolution power to the Energy Model */
+       if (power_scale_mw)
+               *power *= MICROWATT_PER_MILLIWATT;
+
         /* The EM framework specifies the frequency in KHz. */
         *KHz = Hz / 1000;
  
diff --git a/drivers/opp/of.c b/drivers/opp/of.c

index 30394929d700e0abde673178ba890885ac7f7424..eb89c9a75985985b3541c6281651e0d6735453bd 100644 (file)
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1443,12 +1443,12 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node);
   * It provides the power used by @dev at @kHz if it is the frequency of an
   * existing OPP, or at the frequency of the first OPP above @kHz otherwise
   * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled
- * frequency and @mW to the associated power.
+ * frequency and @uW to the associated power.
   *
   * Returns 0 on success or a proper -EINVAL value in case of error.
   */
  static int __maybe_unused
-_get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz)
+_get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz)
  {
         struct dev_pm_opp *opp;
         unsigned long opp_freq, opp_power;
@@ -1465,7 +1465,7 @@ _get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz)
                 return -EINVAL;
  
         *kHz = opp_freq / 1000;
-       *mW = opp_power / 1000;
+       *uW = opp_power;
  
         return 0;
  }
@@ -1475,14 +1475,14 @@ _get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz)
   * This computes the power estimated by @dev at @kHz if it is the frequency
   * of an existing OPP, or at the frequency of the first OPP above @kHz otherwise
   * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled
- * frequency and @mW to the associated power. The power is estimated as
+ * frequency and @uW to the associated power. The power is estimated as
   * P = C * V^2 * f with C being the device's capacitance and V and f
   * respectively the voltage and frequency of the OPP.
   *
   * Returns -EINVAL if the power calculation failed because of missing
   * parameters, 0 otherwise.
   */
-static int __maybe_unused _get_power(struct device *dev, unsigned long *mW,
+static int __maybe_unused _get_power(struct device *dev, unsigned long *uW,
                                      unsigned long *kHz)
  {
         struct dev_pm_opp *opp;
@@ -1512,9 +1512,10 @@ static int __maybe_unused _get_power(struct device *dev, unsigned long *mW,
                 return -EINVAL;
  
         tmp = (u64)cap * mV * mV * (Hz / 1000000);
-       do_div(tmp, 1000000000);
+       /* Provide power in micro-Watts */
+       do_div(tmp, 1000000);
  
-       *mW = (unsigned long)tmp;
+       *uW = (unsigned long)tmp;
         *kHz = Hz / 1000;
  
         return 0;
diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c

index f5eced0842b36d158cf18a2c648482513a4496cc..61c5ff80bd30369a2e2c301053eb9a3b2db54ef8 100644 (file)
--- a/drivers/powercap/dtpm_cpu.c
+++ b/drivers/powercap/dtpm_cpu.c
@@ -53,7 +53,7 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
  
         for (i = 0; i < pd->nr_perf_states; i++) {
  
-               power = pd->table[i].power * MICROWATT_PER_MILLIWATT * nr_cpus;
+               power = pd->table[i].power * nr_cpus;
  
                 if (power > power_limit)
                         break;
@@ -63,8 +63,7 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
  
         freq_qos_update_request(&dtpm_cpu->qos_req, freq);
  
-       power_limit = pd->table[i - 1].power *
-               MICROWATT_PER_MILLIWATT * nr_cpus;
+       power_limit = pd->table[i - 1].power * nr_cpus;
  
         return power_limit;
  }
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c

index b8151d95a8068b83fb6d1ce8788dd7d3f8ee671d..dc19e7c80751af4159a3f263f5fba806614d189a 100644 (file)
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -21,6 +21,7 @@
  #include <linux/pm_qos.h>
  #include <linux/slab.h>
  #include <linux/thermal.h>
+#include <linux/units.h>
  
  #include <trace/events/thermal.h>
  
@@ -101,6 +102,7 @@ static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
  static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
                              u32 freq)
  {
+       unsigned long power_mw;
         int i;
  
         for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
@@ -108,16 +110,23 @@ static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
                         break;
         }
  
-       return cpufreq_cdev->em->table[i + 1].power;
+       power_mw = cpufreq_cdev->em->table[i + 1].power;
+       power_mw /= MICROWATT_PER_MILLIWATT;
+
+       return power_mw;
  }
  
  static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
                              u32 power)
  {
+       unsigned long em_power_mw;
         int i;
  
         for (i = cpufreq_cdev->max_level; i > 0; i--) {
-               if (power >= cpufreq_cdev->em->table[i].power)
+               /* Convert EM power to milli-Watts to make safe comparison */
+               em_power_mw = cpufreq_cdev->em->table[i].power;
+               em_power_mw /= MICROWATT_PER_MILLIWATT;
+               if (power >= em_power_mw)
                         break;
         }
  
diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c

index 8c76f9655e5774446fbef0d0e16be75b63f1a015..8d1260f65061e9c4f5a48e88b8e3e98fcb2199f5 100644 (file)
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -200,7 +200,11 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
                 res = dfc->power_ops->get_real_power(df, power, freq, voltage);
                 if (!res) {
                         state = dfc->capped_state;
+
+                       /* Convert EM power into milli-Watts first */
                         dfc->res_util = dfc->em_pd->table[state].power;
+                       dfc->res_util /= MICROWATT_PER_MILLIWATT;
+
                         dfc->res_util *= SCALE_ERROR_MITIGATION;
  
                         if (*power > 1)
@@ -218,8 +222,10 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
  
                 _normalize_load(&status);
  
-               /* Scale power for utilization */
+               /* Convert EM power into milli-Watts first */
                 *power = dfc->em_pd->table[perf_idx].power;
+               *power /= MICROWATT_PER_MILLIWATT;
+               /* Scale power for utilization */
                 *power *= status.busy_time;
                 *power >>= 10;
         }
@@ -244,6 +250,7 @@ static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev,
  
         perf_idx = dfc->max_state - state;
         *power = dfc->em_pd->table[perf_idx].power;
+       *power /= MICROWATT_PER_MILLIWATT;
  
         return 0;
  }
@@ -254,7 +261,7 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
         struct devfreq_cooling_device *dfc = cdev->devdata;
         struct devfreq *df = dfc->devfreq;
         struct devfreq_dev_status status;
-       unsigned long freq;
+       unsigned long freq, em_power_mw;
         s32 est_power;
         int i;
  
@@ -279,9 +286,13 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
          * Find the first cooling state that is within the power
          * budget. The EM power table is sorted ascending.
          */
-       for (i = dfc->max_state; i > 0; i--)
-               if (est_power >= dfc->em_pd->table[i].power)
+       for (i = dfc->max_state; i > 0; i--) {
+               /* Convert EM power to milli-Watts to make safe comparison */
+               em_power_mw = dfc->em_pd->table[i].power;
+               em_power_mw /= MICROWATT_PER_MILLIWATT;
+               if (est_power >= em_power_mw)
                         break;
+       }
  
         *state = dfc->max_state - i;
         dfc->capped_state = *state;
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h

index 8419bffb4398f8ce569bacf255fd343b3e02c8fc..b9caa01dfac48594e463e954a3bbd65a5b8ad96e 100644 (file)
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -62,7 +62,7 @@ struct em_perf_domain {
  /*
   *  em_perf_domain flags:
   *
- *  EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some
+ *  EM_PERF_DOMAIN_MICROWATTS: The power values are in micro-Watts or some
   *  other scale.
   *
   *  EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating
@@ -71,7 +71,7 @@ struct em_perf_domain {
   *  EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might be
   *  created by platform missing real power information
   */
-#define EM_PERF_DOMAIN_MILLIWATTS BIT(0)
+#define EM_PERF_DOMAIN_MICROWATTS BIT(0)
  #define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1)
  #define EM_PERF_DOMAIN_ARTIFICIAL BIT(2)
  
@@ -79,22 +79,44 @@ struct em_perf_domain {
  #define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL)
  
  #ifdef CONFIG_ENERGY_MODEL
-#define EM_MAX_POWER 0xFFFF
+/*
+ * The max power value in micro-Watts. The limit of 64 Watts is set as
+ * a safety net to not overflow multiplications on 32bit platforms. The
+ * 32bit value limit for the total Perf Domain power limits the maximum
+ * number of CPUs in such a domain to 64.
+ */
+#define EM_MAX_POWER (64000000) /* 64 Watts */
+
+/*
+ * To avoid possible energy estimation overflow on 32bit machines add
+ * limits to number of CPUs in the Perf. Domain.
+ * We are safe on 64bit machine, thus some big number.
+ */
+#ifdef CONFIG_64BIT
+#define EM_MAX_NUM_CPUS 4096
+#else
+#define EM_MAX_NUM_CPUS 16
+#endif
  
  /*
- * Increase resolution of energy estimation calculations for 64-bit
- * architectures. The extra resolution improves decision made by EAS for the
- * task placement when two Performance Domains might provide similar energy
- * estimation values (w/o better resolution the values could be equal).
+ * To avoid an overflow on 32bit machines while calculating the energy,
+ * use a different order of operations. First divide by 'scale_cpu', which
+ * reduces the big value stored in the 'cost' field, then multiply by
+ * 'sum_util'. This allows handling existing platforms, which have e.g.
+ * power ~1.3 Watt at max freq, so the 'cost' value > 1mln micro-Watts.
+ * In such a scenario, with 4 CPUs in the Perf. Domain, 'sum_util' could
+ * be 4096, and the multiplication 'cost' * 'sum_util' would overflow.
+ * This reordering of operations has a limitation: we lose a small amount
+ * of precision in the estimation (compared to 64bit platform w/o reordering).
   *
- * We increase resolution only if we have enough bits to allow this increased
- * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
- * are pretty high and the returns do not justify the increased costs.
+ * We are safe on 64bit machine.
   */
  #ifdef CONFIG_64BIT
-#define em_scale_power(p) ((p) * 1000)
+#define em_estimate_energy(cost, sum_util, scale_cpu) \
+       (((cost) * (sum_util)) / (scale_cpu))
  #else
-#define em_scale_power(p) (p)
+#define em_estimate_energy(cost, sum_util, scale_cpu) \
+       (((cost) / (scale_cpu)) * (sum_util))
  #endif
  
  struct em_data_callback {
@@ -112,7 +134,7 @@ struct em_data_callback {
          * and frequency.
          *
          * In case of CPUs, the power is the one of a single CPU in the domain,
-        * expressed in milli-Watts or an abstract scale. It is expected to
+        * expressed in micro-Watts or an abstract scale. It is expected to
          * fit in the [0, EM_MAX_POWER] range.
          *
          * Return 0 on success.
@@ -148,7 +170,7 @@ struct em_perf_domain *em_cpu_get(int cpu);
  struct em_perf_domain *em_pd_get(struct device *dev);
  int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
                                 struct em_data_callback *cb, cpumask_t *span,
-                               bool milliwatts);
+                               bool microwatts);
  void em_dev_unregister_perf_domain(struct device *dev);
  
  /**
@@ -273,7 +295,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
          *   pd_nrg = ------------------------                       (4)
          *                  scale_cpu
          */
-       return ps->cost * sum_util / scale_cpu;
+       return em_estimate_energy(ps->cost, sum_util, scale_cpu);
  }
  
  /**
@@ -297,7 +319,7 @@ struct em_data_callback {};
  static inline
  int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
                                 struct em_data_callback *cb, cpumask_t *span,
-                               bool milliwatts)
+                               bool microwatts)
  {
         return -EINVAL;
  }
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c

index 6c373f2960e71d1f402aa487d57ba96fee67cdd6..f82111837b8d1da67386595c8207b9be06bcacb3 100644 (file)
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -145,7 +145,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
  
                 /*
                  * The power returned by active_state() is expected to be
-                * positive and to fit into 16 bits.
+                * positive and be in range.
                  */
                 if (!power || power > EM_MAX_POWER) {
                         dev_err(dev, "EM: invalid power: %lu\n",
@@ -170,7 +170,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
                                 goto free_ps_table;
                         }
                 } else {
-                       power_res = em_scale_power(table[i].power);
+                       power_res = table[i].power;
                         cost = div64_u64(fmax * power_res, table[i].frequency);
                 }
  
@@ -201,9 +201,17 @@ static int em_create_pd(struct device *dev, int nr_states,
  {
         struct em_perf_domain *pd;
         struct device *cpu_dev;
-       int cpu, ret;
+       int cpu, ret, num_cpus;
  
         if (_is_cpu_device(dev)) {
+               num_cpus = cpumask_weight(cpus);
+
+               /* Prevent overflow of the max possible energy calculation */
+               if (num_cpus > EM_MAX_NUM_CPUS) {
+                       dev_err(dev, "EM: too many CPUs, overflow possible\n");
+                       return -EINVAL;
+               }
+
                 pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
                 if (!pd)
                         return -ENOMEM;
@@ -314,13 +322,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
   * @cpus       : Pointer to cpumask_t, which in case of a CPU device is
   *             obligatory. It can be taken from i.e. 'policy->cpus'. For other
   *             type of devices this should be set to NULL.
- * @milliwatts : Flag indicating that the power values are in milliWatts or
+ * @microwatts : Flag indicating that the power values are in micro-Watts or
   *             in some other scale. It must be set properly.
   *
   * Create Energy Model tables for a performance domain using the callbacks
   * defined in cb.
   *
- * The @milliwatts is important to set with correct value. Some kernel
+ * The @microwatts is important to set with correct value. Some kernel
   * sub-systems might rely on this flag and check if all devices in the EM are
   * using the same scale.
   *
@@ -331,7 +339,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
   */
  int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
                                 struct em_data_callback *cb, cpumask_t *cpus,
-                               bool milliwatts)
+                               bool microwatts)
  {
         unsigned long cap, prev_cap = 0;
         unsigned long flags = 0;
@@ -381,8 +389,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
                 }
         }
  
-       if (milliwatts)
-               flags |= EM_PERF_DOMAIN_MILLIWATTS;
+       if (microwatts)
+               flags |= EM_PERF_DOMAIN_MICROWATTS;
         else if (cb->get_cost)
                 flags |= EM_PERF_DOMAIN_ARTIFICIAL;
author	Lukasz Luba <lukasz.luba@arm.com>
	Thu, 7 Jul 2022 07:15:52 +0000 (08:15 +0100)
committer	Rafael J. Wysocki <rafael.j.wysocki@intel.com>
	Fri, 15 Jul 2022 17:17:30 +0000 (19:17 +0200)
drivers/cpufreq/mediatek-cpufreq-hw.c		patch \| blob \| history
drivers/cpufreq/scmi-cpufreq.c		patch \| blob \| history
drivers/opp/of.c		patch \| blob \| history
drivers/powercap/dtpm_cpu.c		patch \| blob \| history
drivers/thermal/cpufreq_cooling.c		patch \| blob \| history
drivers/thermal/devfreq_cooling.c		patch \| blob \| history
include/linux/energy_model.h		patch \| blob \| history
kernel/power/energy_model.c		patch \| blob \| history