cgroup/cpuset: Make cpuset hotplug processing synchronous

author Waiman Long <longman@redhat.com>

Thu, 4 Apr 2024 13:47:48 +0000 (09:47 -0400)

committer Tejun Heo <tj@kernel.org>

Mon, 8 Apr 2024 17:39:16 +0000 (07:39 -1000)
author Waiman Long <longman@redhat.com>
Thu, 4 Apr 2024 13:47:48 +0000 (09:47 -0400)
committer Tejun Heo <tj@kernel.org>
Mon, 8 Apr 2024 17:39:16 +0000 (07:39 -1000)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h

index 0ce6ff0d9c9aad7f5f6fb31ba5f15207061cdfd9..de4cf0ee96f79eeae224e4433b905b1692c5fc6f 100644 (file)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -70,7 +70,6 @@ extern int cpuset_init(void);
  extern void cpuset_init_smp(void);
  extern void cpuset_force_rebuild(void);
  extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
  extern void inc_dl_tasks_cs(struct task_struct *task);
  extern void dec_dl_tasks_cs(struct task_struct *task);
  extern void cpuset_lock(void);
@@ -185,8 +184,6 @@ static inline void cpuset_update_active_cpus(void)
         partition_sched_domains(1, NULL, NULL);
  }
  
-static inline void cpuset_wait_for_hotplug(void) { }
-
  static inline void inc_dl_tasks_cs(struct task_struct *task) { }
  static inline void dec_dl_tasks_cs(struct task_struct *task) { }
  static inline void cpuset_lock(void) { }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c

index 4237c8748715d4c66f2d46787fb7297643abcddb..d8d3439eda4e4350566a31604daebc43944865cd 100644 (file)
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -201,6 +201,14 @@ struct cpuset {
         struct list_head remote_sibling;
  };
  
+/*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
+ */
+struct cpuset_remove_tasks_struct {
+       struct work_struct work;
+       struct cpuset *cs;
+};
+
  /*
   * Exclusive CPUs distributed out to sub-partitions of top_cpuset
   */
@@ -449,12 +457,6 @@ static DEFINE_SPINLOCK(callback_lock);
  
  static struct workqueue_struct *cpuset_migrate_mm_wq;
  
-/*
- * CPU / memory hotplug is handled asynchronously.
- */
-static void cpuset_hotplug_workfn(struct work_struct *work);
-static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
-
  static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
  
  static inline void check_insane_mems_config(nodemask_t *nodes)
@@ -540,22 +542,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
         rcu_read_lock();
         cs = task_cs(tsk);
  
-       while (!cpumask_intersects(cs->effective_cpus, pmask)) {
+       while (!cpumask_intersects(cs->effective_cpus, pmask))
                 cs = parent_cs(cs);
-               if (unlikely(!cs)) {
-                       /*
-                        * The top cpuset doesn't have any online cpu as a
-                        * consequence of a race between cpuset_hotplug_work
-                        * and cpu hotplug notifier.  But we know the top
-                        * cpuset's effective_cpus is on its way to be
-                        * identical to cpu_online_mask.
-                        */
-                       goto out_unlock;
-               }
-       }
-       cpumask_and(pmask, pmask, cs->effective_cpus);
  
-out_unlock:
+       cpumask_and(pmask, pmask, cs->effective_cpus);
         rcu_read_unlock();
  }
  
@@ -1217,7 +1207,7 @@ static void rebuild_sched_domains_locked(void)
         /*
          * If we have raced with CPU hotplug, return early to avoid
          * passing doms with offlined cpu to partition_sched_domains().
-        * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+        * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
          *
          * With no CPUs in any subpartitions, top_cpuset's effective CPUs
          * should be the same as the active CPUs, so checking only top_cpuset
@@ -1260,12 +1250,17 @@ static void rebuild_sched_domains_locked(void)
  }
  #endif /* CONFIG_SMP */
  
-void rebuild_sched_domains(void)
+static void rebuild_sched_domains_cpuslocked(void)
  {
-       cpus_read_lock();
         mutex_lock(&cpuset_mutex);
         rebuild_sched_domains_locked();
         mutex_unlock(&cpuset_mutex);
+}
+
+void rebuild_sched_domains(void)
+{
+       cpus_read_lock();
+       rebuild_sched_domains_cpuslocked();
         cpus_read_unlock();
  }
  
@@ -2079,14 +2074,11 @@ write_error:
  
         /*
          * For partcmd_update without newmask, it is being called from
-        * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
-        * Update the load balance flag and scheduling domain if
-        * cpus_read_trylock() is successful.
+        * cpuset_handle_hotplug(). Update the load balance flag and
+        * scheduling domain accordingly.
          */
-       if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+       if ((cmd == partcmd_update) && !newmask)
                 update_partition_sd_lb(cs, old_prs);
-               cpus_read_unlock();
-       }
  
         notify_partition_change(cs, old_prs);
         return 0;
@@ -3599,8 +3591,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
          * proceeding, so that we don't end up keep removing tasks added
          * after execution capability is restored.
          *
-        * cpuset_hotplug_work calls back into cgroup core via
-        * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+        * cpuset_handle_hotplug may call back into cgroup core asynchronously
+        * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
          * operation like this one can lead to a deadlock through kernfs
          * active_ref protection.  Let's break the protection.  Losing the
          * protection is okay as we check whether @cs is online after
@@ -3609,7 +3601,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
          */
         css_get(&cs->css);
         kernfs_break_active_protection(of->kn);
-       flush_work(&cpuset_hotplug_work);
  
         cpus_read_lock();
         mutex_lock(&cpuset_mutex);
@@ -4354,6 +4345,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
         }
  }
  
+static void cpuset_migrate_tasks_workfn(struct work_struct *work)
+{
+       struct cpuset_remove_tasks_struct *s;
+
+       s = container_of(work, struct cpuset_remove_tasks_struct, work);
+       remove_tasks_in_empty_cpuset(s->cs);
+       css_put(&s->cs->css);
+       kfree(s);
+}
+
  static void
  hotplug_update_tasks_legacy(struct cpuset *cs,
                             struct cpumask *new_cpus, nodemask_t *new_mems,
@@ -4383,12 +4384,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
         /*
          * Move tasks to the nearest ancestor with execution resources,
          * This is full cgroup operation which will also call back into
-        * cpuset. Should be done outside any lock.
+        * cpuset. Execute it asynchronously using workqueue.
          */
-       if (is_empty) {
-               mutex_unlock(&cpuset_mutex);
-               remove_tasks_in_empty_cpuset(cs);
-               mutex_lock(&cpuset_mutex);
+       if (is_empty && cs->css.cgroup->nr_populated_csets &&
+           css_tryget_online(&cs->css)) {
+               struct cpuset_remove_tasks_struct *s;
+
+               s = kzalloc(sizeof(*s), GFP_KERNEL);
+               if (WARN_ON_ONCE(!s)) {
+                       css_put(&cs->css);
+                       return;
+               }
+
+               s->cs = cs;
+               INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
+               schedule_work(&s->work);
         }
  }
  
@@ -4421,30 +4431,6 @@ void cpuset_force_rebuild(void)
         force_rebuild = true;
  }
  
-/*
- * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
- * progress.
- * Return: true if successful, false otherwise
- *
- * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
- * cpus_read_trylock() is used here to acquire the lock.
- */
-static bool cpuset_hotplug_cpus_read_trylock(void)
-{
-       int retries = 0;
-
-       while (!cpus_read_trylock()) {
-               /*
-                * CPU hotplug still in progress. Retry 5 times
-                * with a 10ms wait before bailing out.
-                */
-               if (++retries > 5)
-                       return false;
-               msleep(10);
-       }
-       return true;
-}
-
  /**
   * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
   * @cs: cpuset in interest
@@ -4493,13 +4479,11 @@ retry:
                 compute_partition_effective_cpumask(cs, &new_cpus);
  
         if (remote && cpumask_empty(&new_cpus) &&
-           partition_is_populated(cs, NULL) &&
-           cpuset_hotplug_cpus_read_trylock()) {
+           partition_is_populated(cs, NULL)) {
                 remote_partition_disable(cs, tmp);
                 compute_effective_cpumask(&new_cpus, cs, parent);
                 remote = false;
                 cpuset_force_rebuild();
-               cpus_read_unlock();
         }
  
         /*
@@ -4519,18 +4503,8 @@ retry:
         else if (is_partition_valid(parent) && is_partition_invalid(cs))
                 partcmd = partcmd_update;
  
-       /*
-        * cpus_read_lock needs to be held before calling
-        * update_parent_effective_cpumask(). To avoid circular lock
-        * dependency between cpuset_mutex and cpus_read_lock,
-        * cpus_read_trylock() is used here to acquire the lock.
-        */
         if (partcmd >= 0) {
-               if (!cpuset_hotplug_cpus_read_trylock())
-                       goto update_tasks;
-
                 update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
-               cpus_read_unlock();
                 if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
                         compute_partition_effective_cpumask(cs, &new_cpus);
                         cpuset_force_rebuild();
@@ -4558,8 +4532,7 @@ unlock:
  }
  
  /**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
- * @work: unused
+ * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
   *
   * This function is called after either CPU or memory configuration has
   * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -4573,8 +4546,10 @@ unlock:
   *
   * Note that CPU offlining during suspend is ignored.  We don't modify
   * cpusets across suspend/resume cycles at all.
+ *
+ * CPU / memory hotplug is handled synchronously.
   */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_handle_hotplug(void)
  {
         static cpumask_t new_cpus;
         static nodemask_t new_mems;
@@ -4585,6 +4560,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
         if (on_dfl && !alloc_cpumasks(NULL, &tmp))
                 ptmp = &tmp;
  
+       lockdep_assert_cpus_held();
         mutex_lock(&cpuset_mutex);
  
         /* fetch the available cpus/mems and find out which changed how */
@@ -4666,7 +4642,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
         /* rebuild sched domains if cpus_allowed has changed */
         if (cpus_updated || force_rebuild) {
                 force_rebuild = false;
-               rebuild_sched_domains();
+               rebuild_sched_domains_cpuslocked();
         }
  
         free_cpumasks(NULL, ptmp);
@@ -4679,12 +4655,7 @@ void cpuset_update_active_cpus(void)
          * inside cgroup synchronization.  Bounce actual hotplug processing
          * to a work item to avoid reverse locking order.
          */
-       schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
-       flush_work(&cpuset_hotplug_work);
+       cpuset_handle_hotplug();
  }
  
  /*
@@ -4695,7 +4666,7 @@ void cpuset_wait_for_hotplug(void)
  static int cpuset_track_online_nodes(struct notifier_block *self,
                                 unsigned long action, void *arg)
  {
-       schedule_work(&cpuset_hotplug_work);
+       cpuset_handle_hotplug();
         return NOTIFY_OK;
  }
  
diff --git a/kernel/cpu.c b/kernel/cpu.c

index 8f6affd051f77564f96ca4682a58d0c131f62c56..49ce3f3096882b8947abb58d81fe334714a7d6e0 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1208,52 +1208,6 @@ void __init cpuhp_threads_init(void)
         kthread_unpark(this_cpu_read(cpuhp_state.thread));
  }
  
-/*
- *
- * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
- * protected region.
- *
- * The operation is still serialized against concurrent CPU hotplug via
- * cpu_add_remove_lock, i.e. CPU map protection.  But it is _not_
- * serialized against other hotplug related activity like adding or
- * removing of state callbacks and state instances, which invoke either the
- * startup or the teardown callback of the affected state.
- *
- * This is required for subsystems which are unfixable vs. CPU hotplug and
- * evade lock inversion problems by scheduling work which has to be
- * completed _before_ cpu_up()/_cpu_down() returns.
- *
- * Don't even think about adding anything to this for any new code or even
- * drivers. It's only purpose is to keep existing lock order trainwrecks
- * working.
- *
- * For cpu_down() there might be valid reasons to finish cleanups which are
- * not required to be done under cpu_hotplug_lock, but that's a different
- * story and would be not invoked via this.
- */
-static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
-{
-       /*
-        * cpusets delegate hotplug operations to a worker to "solve" the
-        * lock order problems. Wait for the worker, but only if tasks are
-        * _not_ frozen (suspend, hibernate) as that would wait forever.
-        *
-        * The wait is required because otherwise the hotplug operation
-        * returns with inconsistent state, which could even be observed in
-        * user space when a new CPU is brought up. The CPU plug uevent
-        * would be delivered and user space reacting on it would fail to
-        * move tasks to the newly plugged CPU up to the point where the
-        * work has finished because up to that point the newly plugged CPU
-        * is not assignable in cpusets/cgroups. On unplug that's not
-        * necessarily a visible issue, but it is still inconsistent state,
-        * which is the real problem which needs to be "fixed". This can't
-        * prevent the transient state between scheduling the work and
-        * returning from waiting for it.
-        */
-       if (!tasks_frozen)
-               cpuset_wait_for_hotplug();
-}
-
  #ifdef CONFIG_HOTPLUG_CPU
  #ifndef arch_clear_mm_cpumask_cpu
  #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
@@ -1494,7 +1448,6 @@ out:
          */
         lockup_detector_cleanup();
         arch_smt_update();
-       cpu_up_down_serialize_trainwrecks(tasks_frozen);
         return ret;
  }
  
@@ -1728,7 +1681,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
  out:
         cpus_write_unlock();
         arch_smt_update();
-       cpu_up_down_serialize_trainwrecks(tasks_frozen);
         return ret;
  }
  
diff --git a/kernel/power/process.c b/kernel/power/process.c

index cae81a87cc91e3a2f06da590f8db68fb22b4db26..66ac067d9ae64e022ba7259554628d175494ed13 100644 (file)
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -194,8 +194,6 @@ void thaw_processes(void)
         __usermodehelper_set_disable_depth(UMH_FREEZING);
         thaw_workqueues();
  
-       cpuset_wait_for_hotplug();
-
         read_lock(&tasklist_lock);
         for_each_process_thread(g, p) {
                 /* No other threads should have PF_SUSPEND_TASK set */
author	Waiman Long <longman@redhat.com>
	Thu, 4 Apr 2024 13:47:48 +0000 (09:47 -0400)
committer	Tejun Heo <tj@kernel.org>
	Mon, 8 Apr 2024 17:39:16 +0000 (07:39 -1000)
include/linux/cpuset.h		patch \| blob \| history
kernel/cgroup/cpuset.c		patch \| blob \| history
kernel/cpu.c		patch \| blob \| history
kernel/power/process.c		patch \| blob \| history