sched/numa: Apply imbalance limitations consistently

author Mel Gorman <mgorman@techsingularity.net>

Fri, 20 May 2022 10:35:18 +0000 (11:35 +0100)

committer Peter Zijlstra <peterz@infradead.org>

Mon, 13 Jun 2022 08:29:59 +0000 (10:29 +0200)
author Mel Gorman <mgorman@techsingularity.net>
Fri, 20 May 2022 10:35:18 +0000 (11:35 +0100)
committer Peter Zijlstra <peterz@infradead.org>
Mon, 13 Jun 2022 08:29:59 +0000 (10:29 +0200)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 23da36c9cacb857713a7d22423001d6da43a50c5..166f5f9bdb4faff9682cfef8ff41ed8bbc91c827 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1055,6 +1055,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
   * Scheduling class queueing methods:
   */
  
+#ifdef CONFIG_NUMA
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+       /*
+        * Allow a NUMA imbalance if busy CPUs is less than the maximum
+        * threshold. Above this threshold, individual tasks may be contending
+        * for both memory bandwidth and any shared HT resources.  This is an
+        * approximation as the number of running tasks may not be related to
+        * the number of busy CPUs due to sched_setaffinity.
+        */
+       if (dst_running > imb_numa_nr)
+               return imbalance;
+
+       /*
+        * Allow a small imbalance based on a simple pair of communicating
+        * tasks that remain local when the destination is lightly loaded.
+        */
+       if (imbalance <= NUMA_IMBALANCE_MIN)
+               return 0;
+
+       return imbalance;
+}
+#endif /* CONFIG_NUMA */
+
  #ifdef CONFIG_NUMA_BALANCING
  /*
   * Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -1548,8 +1575,6 @@ struct task_numa_env {
  
  static unsigned long cpu_load(struct rq *rq);
  static unsigned long cpu_runnable(struct rq *rq);
-static inline long adjust_numa_imbalance(int imbalance,
-                                       int dst_running, int imb_numa_nr);
  
  static inline enum
  numa_type numa_classify(unsigned int imbalance_pct,
@@ -9067,16 +9092,6 @@ static bool update_pick_idlest(struct sched_group *idlest,
         return true;
  }
  
-/*
- * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
- * This is an approximation as the number of running tasks may not be
- * related to the number of busy CPUs due to sched_setaffinity.
- */
-static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
-{
-       return running <= imb_numa_nr;
-}
-
  /*
   * find_idlest_group() finds and returns the least busy CPU group within the
   * domain.
@@ -9193,6 +9208,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                 break;
  
         case group_has_spare:
+#ifdef CONFIG_NUMA
                 if (sd->flags & SD_NUMA) {
  #ifdef CONFIG_NUMA_BALANCING
                         int idlest_cpu;
@@ -9206,7 +9222,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                         idlest_cpu = cpumask_first(sched_group_span(idlest));
                         if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
                                 return idlest;
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
                         /*
                          * Otherwise, keep the task close to the wakeup source
                          * and improve locality if the number of running tasks
@@ -9214,9 +9230,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                          * allowed. If there is a real need of migration,
                          * periodic load balance will take care of it.
                          */
-                       if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
+                       imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
+                       if (!adjust_numa_imbalance(imbalance,
+                                                  local_sgs.sum_nr_running + 1,
+                                                  sd->imb_numa_nr)) {
                                 return NULL;
+                       }
                 }
+#endif /* CONFIG_NUMA */
  
                 /*
                  * Select group with highest number of idle CPUs. We could also
@@ -9303,24 +9324,6 @@ next_group:
         }
  }
  
-#define NUMA_IMBALANCE_MIN 2
-
-static inline long adjust_numa_imbalance(int imbalance,
-                               int dst_running, int imb_numa_nr)
-{
-       if (!allow_numa_imbalance(dst_running, imb_numa_nr))
-               return imbalance;
-
-       /*
-        * Allow a small imbalance based on a simple pair of communicating
-        * tasks that remain local when the destination is lightly loaded.
-        */
-       if (imbalance <= NUMA_IMBALANCE_MIN)
-               return 0;
-
-       return imbalance;
-}
-
  /**
   * calculate_imbalance - Calculate the amount of imbalance present within the
   *                      groups of a given sched_domain during load balance.
@@ -9405,7 +9408,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                          */
                         env->migration_type = migrate_task;
                         lsub_positive(&nr_diff, local->sum_nr_running);
-                       env->imbalance = nr_diff >> 1;
+                       env->imbalance = nr_diff;
                 } else {
  
                         /*
@@ -9413,15 +9416,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                          * idle cpus.
                          */
                         env->migration_type = migrate_task;
-                       env->imbalance = max_t(long, 0, (local->idle_cpus -
-                                                busiest->idle_cpus) >> 1);
+                       env->imbalance = max_t(long, 0,
+                                              (local->idle_cpus - busiest->idle_cpus));
                 }
  
+#ifdef CONFIG_NUMA
                 /* Consider allowing a small imbalance between NUMA groups */
                 if (env->sd->flags & SD_NUMA) {
                         env->imbalance = adjust_numa_imbalance(env->imbalance,
-                               local->sum_nr_running + 1, env->sd->imb_numa_nr);
+                                                              local->sum_nr_running + 1,
+                                                              env->sd->imb_numa_nr);
                 }
+#endif
+
+               /* Number of tasks to move to restore balance */
+               env->imbalance >>= 1;
  
                 return;
         }
author	Mel Gorman <mgorman@techsingularity.net>
	Fri, 20 May 2022 10:35:18 +0000 (11:35 +0100)
committer	Peter Zijlstra <peterz@infradead.org>
	Mon, 13 Jun 2022 08:29:59 +0000 (10:29 +0200)