sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

author Mel Gorman <mgorman@techsingularity.net>

Tue, 8 Feb 2022 09:43:34 +0000 (09:43 +0000)

committer Peter Zijlstra <peterz@infradead.org>

Fri, 11 Feb 2022 22:30:08 +0000 (23:30 +0100)
author Mel Gorman <mgorman@techsingularity.net>
Tue, 8 Feb 2022 09:43:34 +0000 (09:43 +0000)
committer Peter Zijlstra <peterz@infradead.org>
Fri, 11 Feb 2022 22:30:08 +0000 (23:30 +0100)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h

index 8054641c0a7bfe71f4fbc6c823c7a68dc21e7a0c..56cffe42abbc4b634b376c3198783806ae1417a6 100644 (file)
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -93,6 +93,7 @@ struct sched_domain {
         unsigned int busy_factor;       /* less balancing by factor if busy */
         unsigned int imbalance_pct;     /* No balance until over watermark */
         unsigned int cache_nice_tries;  /* Leave cache hot tasks for # tries */
+       unsigned int imb_numa_nr;       /* Nr running tasks that allows a NUMA imbalance */
  
         int nohz_idle;                  /* NOHZ IDLE status */
         int flags;                      /* See SD_* */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index ea710168ae91b687154b64c1f4065b777c97a4b4..5c4bfffe8c2cb90d1b7098393edeeb7cd365cb23 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1489,6 +1489,7 @@ struct task_numa_env {
  
         int src_cpu, src_nid;
         int dst_cpu, dst_nid;
+       int imb_numa_nr;
  
         struct numa_stats src_stats, dst_stats;
  
@@ -1503,7 +1504,7 @@ struct task_numa_env {
  static unsigned long cpu_load(struct rq *rq);
  static unsigned long cpu_runnable(struct rq *rq);
  static inline long adjust_numa_imbalance(int imbalance,
-                                       int dst_running, int dst_weight);
+                                       int dst_running, int imb_numa_nr);
  
  static inline enum
  numa_type numa_classify(unsigned int imbalance_pct,
@@ -1884,7 +1885,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
                 dst_running = env->dst_stats.nr_running + 1;
                 imbalance = max(0, dst_running - src_running);
                 imbalance = adjust_numa_imbalance(imbalance, dst_running,
-                                                       env->dst_stats.weight);
+                                                 env->imb_numa_nr);
  
                 /* Use idle CPU if there is no imbalance */
                 if (!imbalance) {
@@ -1949,8 +1950,10 @@ static int task_numa_migrate(struct task_struct *p)
          */
         rcu_read_lock();
         sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-       if (sd)
+       if (sd) {
                 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+               env.imb_numa_nr = sd->imb_numa_nr;
+       }
         rcu_read_unlock();
  
         /*
@@ -9005,10 +9008,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
   * This is an approximation as the number of running tasks may not be
   * related to the number of busy CPUs due to sched_setaffinity.
   */
-static inline bool
-allow_numa_imbalance(unsigned int running, unsigned int weight)
+static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
  {
-       return (running < (weight >> 2));
+       return running <= imb_numa_nr;
  }
  
  /*
@@ -9148,7 +9150,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                          * allowed. If there is a real need of migration,
                          * periodic load balance will take care of it.
                          */
-                       if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight))
+                       if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
                                 return NULL;
                 }
  
@@ -9240,9 +9242,9 @@ next_group:
  #define NUMA_IMBALANCE_MIN 2
  
  static inline long adjust_numa_imbalance(int imbalance,
-                               int dst_running, int dst_weight)
+                               int dst_running, int imb_numa_nr)
  {
-       if (!allow_numa_imbalance(dst_running, dst_weight))
+       if (!allow_numa_imbalance(dst_running, imb_numa_nr))
                 return imbalance;
  
         /*
@@ -9354,7 +9356,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                 /* Consider allowing a small imbalance between NUMA groups */
                 if (env->sd->flags & SD_NUMA) {
                         env->imbalance = adjust_numa_imbalance(env->imbalance,
-                               local->sum_nr_running + 1, local->group_weight);
+                               local->sum_nr_running + 1, env->sd->imb_numa_nr);
                 }
  
                 return;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c

index d201a7052a299fbc231f79ad5d98ebde3073bbe4..e6cd55951304275acd06c9ced3554711f2b9477e 100644 (file)
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2242,6 +2242,59 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
                 }
         }
  
+       /*
+        * Calculate an allowed NUMA imbalance such that LLCs do not get
+        * imbalanced.
+        */
+       for_each_cpu(i, cpu_map) {
+               unsigned int imb = 0;
+               unsigned int imb_span = 1;
+
+               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+                       struct sched_domain *child = sd->child;
+
+                       if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
+                           (child->flags & SD_SHARE_PKG_RESOURCES)) {
+                               struct sched_domain *top, *top_p;
+                               unsigned int nr_llcs;
+
+                               /*
+                                * For a single LLC per node, allow an
+                                * imbalance up to 25% of the node. This is an
+                                * arbitrary cutoff based on SMT-2 to balance
+                                * between memory bandwidth and avoiding
+                                * premature sharing of HT resources and SMT-4
+                                * or SMT-8 *may* benefit from a different
+                                * cutoff.
+                                *
+                                * For multiple LLCs, allow an imbalance
+                                * until multiple tasks would share an LLC
+                                * on one node while LLCs on another node
+                                * remain idle.
+                                */
+                               nr_llcs = sd->span_weight / child->span_weight;
+                               if (nr_llcs == 1)
+                                       imb = sd->span_weight >> 2;
+                               else
+                                       imb = nr_llcs;
+                               sd->imb_numa_nr = imb;
+
+                               /* Set span based on the first NUMA domain. */
+                               top = sd;
+                               top_p = top->parent;
+                               while (top_p && !(top_p->flags & SD_NUMA)) {
+                                       top = top->parent;
+                                       top_p = top->parent;
+                               }
+                               imb_span = top_p ? top_p->span_weight : sd->span_weight;
+                       } else {
+                               int factor = max(1U, (sd->span_weight / imb_span));
+
+                               sd->imb_numa_nr = imb * factor;
+                       }
+               }
+       }
+
         /* Calculate CPU capacity for physical packages and nodes */
         for (i = nr_cpumask_bits-1; i >= 0; i--) {
                 if (!cpumask_test_cpu(i, cpu_map))
author	Mel Gorman <mgorman@techsingularity.net>
	Tue, 8 Feb 2022 09:43:34 +0000 (09:43 +0000)
committer	Peter Zijlstra <peterz@infradead.org>
	Fri, 11 Feb 2022 22:30:08 +0000 (23:30 +0100)
include/linux/sched/topology.h		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/topology.c		patch \| blob \| history