sched/numa: Find an alternative idle CPU if the CPU is part of an active NUMA balance
author Mel Gorman <mgorman@techsingularity.net>
Mon, 24 Feb 2020 09:52:21 +0000 (09:52 +0000)
committer Ingo Molnar <mingo@kernel.org>
Mon, 24 Feb 2020 10:36:39 +0000 (11:36 +0100)
Multiple tasks can attempt to select an idle CPU but fail because
numa_migrate_on is already set and the migration fails. Instead of failing,
scan for an alternative idle CPU. select_idle_sibling is not used because
it requires IRQs to be disabled and it ignores numa_migrate_on allowing
multiple tasks to stack. This scan may still fail even if there are idle
candidate CPUs due to races but if this occurs, it's best that a task
stay on an available CPU than move to a contended one.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Hillf Danton <hdanton@sina.com>
Link: https://lore.kernel.org/r/20200224095223.13361-12-mgorman@techsingularity.net
kernel/sched/fair.c

index 2da21f44e4d0610e76b3f5cf641cd8802db4363a..050c1b19bfc04f1443439e7d72cf968019ea1f3f 100644 (file)
@@ -1624,15 +1624,34 @@ static void task_numa_assign(struct task_numa_env *env,
 {
        struct rq *rq = cpu_rq(env->dst_cpu);
 
-       /* Bail out if run-queue part of active NUMA balance. */
-       if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1))
+       /* Check if run-queue part of active NUMA balance. */
+       if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
+               int cpu;
+               int start = env->dst_cpu;
+
+               /* Find alternative idle CPU. */
+               for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+                       if (cpu == env->best_cpu || !idle_cpu(cpu) ||
+                           !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
+                               continue;
+                       }
+
+                       env->dst_cpu = cpu;
+                       rq = cpu_rq(env->dst_cpu);
+                       if (!xchg(&rq->numa_migrate_on, 1))
+                               goto assign;
+               }
+
+               /* Failed to find an alternative idle CPU */
                return;
+       }
 
+assign:
        /*
         * Clear previous best_cpu/rq numa-migrate flag, since task now
         * found a better CPU to move/swap.
         */
-       if (env->best_cpu != -1) {
+       if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
                rq = cpu_rq(env->best_cpu);
                WRITE_ONCE(rq->numa_migrate_on, 0);
        }
@@ -1806,21 +1825,6 @@ assign:
                        cpu = env->best_cpu;
                }
 
-               /*
-                * Use select_idle_sibling if the previously found idle CPU is
-                * not idle any more.
-                */
-               if (!idle_cpu(cpu)) {
-                       /*
-                        * select_idle_siblings() uses an per-CPU cpumask that
-                        * can be used from IRQ context.
-                        */
-                       local_irq_disable();
-                       cpu = select_idle_sibling(env->p, env->src_cpu,
-                                                  env->dst_cpu);
-                       local_irq_enable();
-               }
-
                env->dst_cpu = cpu;
        }