#include <linux/cpu.h>
 #include <linux/cpufreq.h>
-#include <linux/delay.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
 #include <linux/of.h>
 
 #define KHZ                     1000
 #define REF_CLK_MHZ             408 /* 408 MHz */
-#define US_DELAY                500
 #define CPUFREQ_TBL_STEP_HZ     (50 * KHZ * KHZ)
 #define MAX_CNT                 ~0U
 
+#define MAX_DELTA_KHZ          115200 /* warn when |table freq - measured rate| exceeds this */
+
 #define NDIV_MASK              0x1FF
 
 #define CORE_OFFSET(cpu)                       (cpu * 8)
        int maxcpus_per_cluster;
        unsigned int num_clusters;
        phys_addr_t actmon_cntr_base;
+       u32 refclk_delta_min;
 };
 
 struct tegra194_cpufreq_data {
 {
        struct tegra194_cpufreq_data *data = cpufreq_get_driver_data();
        void __iomem *actmon_reg;
+       u32 delta_refcnt;
+       int cnt = 0;
        u64 val;
 
        actmon_reg = CORE_ACTMON_CNTR_REG(data, data->cpu_data[c->cpu].clusterid,
        val = readq(actmon_reg);
        c->last_refclk_cnt = upper_32_bits(val);
        c->last_coreclk_cnt = lower_32_bits(val);
-       udelay(US_DELAY);
-       val = readq(actmon_reg);
-       c->refclk_cnt = upper_32_bits(val);
-       c->coreclk_cnt = lower_32_bits(val);
+
+       /*
+        * The sampling window is based on the minimum number of reference
+        * clock cycles which is known to give a stable value of CPU frequency.
+        */
+       do {
+               val = readq(actmon_reg);
+               c->refclk_cnt = upper_32_bits(val);
+               c->coreclk_cnt = lower_32_bits(val);
+               if (c->refclk_cnt < c->last_refclk_cnt)
+                       delta_refcnt = c->refclk_cnt + (MAX_CNT - c->last_refclk_cnt);
+               else
+                       delta_refcnt = c->refclk_cnt - c->last_refclk_cnt;
+               if (++cnt >= 0xFFFF) {
+                       pr_warn("cpufreq: problem with refclk on cpu:%d, delta_refcnt:%u, cnt:%d\n",
+                               c->cpu, delta_refcnt, cnt);
+                       break;
+               }
+       } while (delta_refcnt < data->soc->refclk_delta_min);
 }
 
 static struct tegra_cpufreq_ops tegra234_cpufreq_ops = {
        .actmon_cntr_base = 0x9000,
        .maxcpus_per_cluster = 4,
        .num_clusters = 3,
+       .refclk_delta_min = 16000, /* min refclk count delta that ends the counter sampling loop */
 };
 
 static const struct tegra_cpufreq_soc tegra239_cpufreq_soc = {
        .actmon_cntr_base = 0x4000,
        .maxcpus_per_cluster = 8,
        .num_clusters = 1,
+       .refclk_delta_min = 16000, /* min refclk count delta that ends the counter sampling loop */
 };
 
 static void tegra194_get_cpu_cluster_id(u32 cpu, u32 *cpuid, u32 *clusterid)
 
 static void tegra194_read_counters(struct tegra_cpu_ctr *c)
 {
+       struct tegra194_cpufreq_data *data = cpufreq_get_driver_data();
+       u32 delta_refcnt;
+       int cnt = 0;
        u64 val;
 
        val = read_freq_feedback();
        c->last_refclk_cnt = lower_32_bits(val);
        c->last_coreclk_cnt = upper_32_bits(val);
-       udelay(US_DELAY);
-       val = read_freq_feedback();
-       c->refclk_cnt = lower_32_bits(val);
-       c->coreclk_cnt = upper_32_bits(val);
+
+       /*
+        * The sampling window is based on the minimum number of reference
+        * clock cycles which is known to give a stable value of CPU frequency.
+        */
+       do {
+               val = read_freq_feedback();
+               c->refclk_cnt = lower_32_bits(val);
+               c->coreclk_cnt = upper_32_bits(val);
+               if (c->refclk_cnt < c->last_refclk_cnt)
+                       delta_refcnt = c->refclk_cnt + (MAX_CNT - c->last_refclk_cnt);
+               else
+                       delta_refcnt = c->refclk_cnt - c->last_refclk_cnt;
+               if (++cnt >= 0xFFFF) {
+                       pr_warn("cpufreq: problem with refclk on cpu:%d, delta_refcnt:%u, cnt:%d\n",
+                               c->cpu, delta_refcnt, cnt);
+                       break;
+               }
+       } while (delta_refcnt < data->soc->refclk_delta_min);
 }
 
 static void tegra_read_counters(struct work_struct *work)
        u32 rate_mhz;
 
        /*
-        * udelay() is required to reconstruct cpu frequency over an
-        * observation window. Using workqueue to call udelay() with
-        * interrupts enabled.
+        * Reconstruct cpu frequency over an observation/sampling window.
+        * Using workqueue to keep interrupts enabled during the interval.
         */
        read_counters_work.c.cpu = cpu;
        INIT_WORK_ONSTACK(&read_counters_work.work, tegra_read_counters);
                if (pos->driver_data != ndiv)
                        continue;
 
-               if (abs(pos->frequency - rate) > 115200) {
-                       pr_warn("cpufreq: cpu%d,cur:%u,set:%u,set ndiv:%llu\n",
-                               cpu, rate, pos->frequency, ndiv);
+               if (abs(pos->frequency - rate) > MAX_DELTA_KHZ) {
+                       pr_warn("cpufreq: cpu%d,cur:%u,set:%u,delta:%d,set ndiv:%llu\n",
+                               cpu, rate, pos->frequency, abs(rate - pos->frequency), ndiv);
                } else {
                        rate = pos->frequency;
                }
        .ops = &tegra194_cpufreq_ops,
        .maxcpus_per_cluster = 2,
        .num_clusters = 4,
+       .refclk_delta_min = 16000,
 };
 
 static void tegra194_cpufreq_free_resources(void)
 
        soc = of_device_get_match_data(&pdev->dev);
 
-       if (soc->ops && soc->maxcpus_per_cluster && soc->num_clusters) {
+       if (soc->ops && soc->maxcpus_per_cluster && soc->num_clusters && soc->refclk_delta_min) {
                data->soc = soc;
        } else {
                dev_err(&pdev->dev, "soc data missing\n");