sched/rt: cpupri_find: Implement fallback mechanism for !fit case

author Qais Yousef <qais.yousef@arm.com>

Mon, 2 Mar 2020 13:27:16 +0000 (13:27 +0000)

committer Ingo Molnar <mingo@kernel.org>

Fri, 6 Mar 2020 11:57:26 +0000 (12:57 +0100)
author Qais Yousef <qais.yousef@arm.com>
Mon, 2 Mar 2020 13:27:16 +0000 (13:27 +0000)
committer Ingo Molnar <mingo@kernel.org>
Fri, 6 Mar 2020 11:57:26 +0000 (12:57 +0100)
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c

index 1a2719e1350a89563bfdf1960802943e9f70be7d..1bcfa19955508bbc14abf75c52c2a5ec8f2b54fb 100644 (file)
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -41,6 +41,59 @@ static int convert_prio(int prio)
         return cpupri;
  }
  
+static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
+                               struct cpumask *lowest_mask, int idx)
+{
+       struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
+       int skip = 0;
+
+       if (!atomic_read(&(vec)->count))
+               skip = 1;
+       /*
+        * When looking at the vector, we need to read the counter,
+        * do a memory barrier, then read the mask.
+        *
+        * Note: This is still all racy, but we can deal with it.
+        *  Ideally, we only want to look at masks that are set.
+        *
+        *  If a mask is not set, then the only thing wrong is that we
+        *  did a little more work than necessary.
+        *
+        *  If we read a zero count but the mask is set, because of the
+        *  memory barriers, that can only happen when the highest prio
+        *  task for a run queue has left the run queue, in which case,
+        *  it will be followed by a pull. If the task we are processing
+        *  fails to find a proper place to go, that pull request will
+        *  pull this task if the run queue is running at a lower
+        *  priority.
+        */
+       smp_rmb();
+
+       /* Need to do the rmb on every call, so only bail out after it */
+       if (skip)
+               return 0;
+
+       if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+               return 0;
+
+       if (lowest_mask) {
+               cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+
+               /*
+                * We have to ensure that we have at least one bit
+                * still set in the array, since the map could have
+                * been concurrently emptied between the first and
+                * second reads of vec->mask.  If we hit this
+                * condition, simply act as though we never hit this
+                * priority level and continue on.
+                */
+               if (cpumask_empty(lowest_mask))
+                       return 0;
+       }
+
+       return 1;
+}
+
  /**
   * cpupri_find - find the best (lowest-pri) CPU in the system
   * @cp: The cpupri context
@@ -62,80 +115,72 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
                 struct cpumask *lowest_mask,
                 bool (*fitness_fn)(struct task_struct *p, int cpu))
  {
-       int idx = 0;
         int task_pri = convert_prio(p->prio);
+       int best_unfit_idx = -1;
+       int idx = 0, cpu;
  
         BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
  
         for (idx = 0; idx < task_pri; idx++) {
-               struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
-               int skip = 0;
  
-               if (!atomic_read(&(vec)->count))
-                       skip = 1;
-               /*
-                * When looking at the vector, we need to read the counter,
-                * do a memory barrier, then read the mask.
-                *
-                * Note: This is still all racey, but we can deal with it.
-                *  Ideally, we only want to look at masks that are set.
-                *
-                *  If a mask is not set, then the only thing wrong is that we
-                *  did a little more work than necessary.
-                *
-                *  If we read a zero count but the mask is set, because of the
-                *  memory barriers, that can only happen when the highest prio
-                *  task for a run queue has left the run queue, in which case,
-                *  it will be followed by a pull. If the task we are processing
-                *  fails to find a proper place to go, that pull request will
-                *  pull this task if the run queue is running at a lower
-                *  priority.
-                */
-               smp_rmb();
-
-               /* Need to do the rmb for every iteration */
-               if (skip)
-                       continue;
-
-               if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+               if (!__cpupri_find(cp, p, lowest_mask, idx))
                         continue;
  
-               if (lowest_mask) {
-                       int cpu;
+               if (!lowest_mask || !fitness_fn)
+                       return 1;
  
-                       cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+               /* Ensure the capacity of the CPUs fit the task */
+               for_each_cpu(cpu, lowest_mask) {
+                       if (!fitness_fn(p, cpu))
+                               cpumask_clear_cpu(cpu, lowest_mask);
+               }
  
+               /*
+                * If no CPU at the current priority can fit the task
+                * continue looking
+                */
+               if (cpumask_empty(lowest_mask)) {
                         /*
-                        * We have to ensure that we have at least one bit
-                        * still set in the array, since the map could have
-                        * been concurrently emptied between the first and
-                        * second reads of vec->mask.  If we hit this
-                        * condition, simply act as though we never hit this
-                        * priority level and continue on.
+                        * Store our fallback priority in case we
+                        * didn't find a fitting CPU
                          */
-                       if (cpumask_empty(lowest_mask))
-                               continue;
+                       if (best_unfit_idx == -1)
+                               best_unfit_idx = idx;
  
-                       if (!fitness_fn)
-                               return 1;
-
-                       /* Ensure the capacity of the CPUs fit the task */
-                       for_each_cpu(cpu, lowest_mask) {
-                               if (!fitness_fn(p, cpu))
-                                       cpumask_clear_cpu(cpu, lowest_mask);
-                       }
-
-                       /*
-                        * If no CPU at the current priority can fit the task
-                        * continue looking
-                        */
-                       if (cpumask_empty(lowest_mask))
-                               continue;
+                       continue;
                 }
  
                 return 1;
         }
  
+       /*
+        * If we failed to find a fitting lowest_mask, make sure we fall back
+        * to the first unfitting priority level recorded in best_unfit_idx.
+        *
+        * Note that the map of the recorded idx might have changed since then,
+        * so we must repeat the full lookup to make sure that level still
+        * holds a valid lowest_mask.
+        *
+        * As per above, the map could have been concurrently emptied while we
+        * were busy searching for a fitting lowest_mask at the other priority
+        * levels.
+        *
+        * This rule favours honouring priority over fitting the task in the
+        * correct CPU (Capacity Awareness being the only user now).
+        * The idea is that if a higher priority task can run, then it should
+        * run even if this ends up being on an unfitting CPU.
+        *
+        * The cost of this trade-off is not entirely clear and will probably
+        * be good for some workloads and bad for others.
+        *
+        * The main idea here is that if some CPUs were overcommitted, we try
+        * to spread which is what the scheduler traditionally did. Sys admins
+        * must do proper RT planning to avoid overloading the system if they
+        * really care.
+        */
+       if (best_unfit_idx != -1)
+               return __cpupri_find(cp, p, lowest_mask, best_unfit_idx);
+
         return 0;
  }
author	Qais Yousef <qais.yousef@arm.com>
	Mon, 2 Mar 2020 13:27:16 +0000 (13:27 +0000)
committer	Ingo Molnar <mingo@kernel.org>
	Fri, 6 Mar 2020 11:57:26 +0000 (12:57 +0100)