sched: Introduce per-memory-map concurrency ID

author Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

Tue, 22 Nov 2022 20:39:09 +0000 (15:39 -0500)

committer Peter Zijlstra <peterz@infradead.org>

Tue, 27 Dec 2022 11:52:11 +0000 (12:52 +0100)
author Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Tue, 22 Nov 2022 20:39:09 +0000 (15:39 -0500)
committer Peter Zijlstra <peterz@infradead.org>
Tue, 27 Dec 2022 11:52:11 +0000 (12:52 +0100)
diff --git a/fs/exec.c b/fs/exec.c

index ab913243a367b767c80e2f016833796dfbb34c29..58f16312b983a97ce9a4a11adaf8186a6a7fe268 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1010,6 +1010,7 @@ static int exec_mmap(struct mm_struct *mm)
         active_mm = tsk->active_mm;
         tsk->active_mm = mm;
         tsk->mm = mm;
+       mm_init_cid(mm);
         /*
          * This prevents preemption while active_mm is being loaded and
          * it and mm are being updated, which could cause problems for
@@ -1822,6 +1823,7 @@ static int bprm_execve(struct linux_binprm *bprm,
          */
         check_unsafe_exec(bprm);
         current->in_execve = 1;
+       sched_mm_cid_before_execve(current);
  
         file = do_open_execat(fd, filename, flags);
         retval = PTR_ERR(file);
@@ -1852,6 +1854,7 @@ static int bprm_execve(struct linux_binprm *bprm,
         if (retval < 0)
                 goto out;
  
+       sched_mm_cid_after_execve(current);
         /* execve succeeded */
         current->fs->in_exec = 0;
         current->in_execve = 0;
@@ -1871,6 +1874,7 @@ out:
                 force_fatal_sig(SIGSEGV);
  
  out_unmark:
+       sched_mm_cid_after_execve(current);
         current->fs->in_exec = 0;
         current->in_execve = 0;
  
diff --git a/include/linux/mm.h b/include/linux/mm.h

index f3f196e4d66d6f42c74366731890a5d4102bf75b..cf008c26a883a61f807ad2ea7974a76f154b3654 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1976,6 +1976,31 @@ struct zap_details {
  /* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
  #define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))
  
+#ifdef CONFIG_SCHED_MM_CID
+/*
+ * Per-mm concurrency ID (mm_cid) hooks, implemented in kernel/sched/core.c.
+ * They are called around execve, from copy_mm() at fork, and from
+ * exit_signals() to keep a task's mm_cid / mm_cid_active fields up to date.
+ */
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit_signals(struct task_struct *t);
+/* Return the concurrency ID currently recorded in @t (-1 when none is held). */
+static inline int task_mm_cid(struct task_struct *t)
+{
+       return t->mm_cid;
+}
+#else
+/* CONFIG_SCHED_MM_CID is disabled: the hooks compile away to nothing. */
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline int task_mm_cid(struct task_struct *t)
+{
+       /*
+        * Use the processor id as a fall-back when the mm cid feature is
+        * disabled. This provides functional per-cpu data structure accesses
+        * in user-space, although it won't provide the memory usage benefits.
+        */
+       return raw_smp_processor_id();
+}
+#endif
+
  #ifdef CONFIG_MMU
  extern bool can_do_mlock(void);
  #else
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index 3b8475007734dcaf227e35cad41bd77ada806a49..1c3bf76063d2175b6670cdd1aa20faf593860c56 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -645,7 +645,18 @@ struct mm_struct {
                  * &struct mm_struct is freed.
                  */
                 atomic_t mm_count;
-
+#ifdef CONFIG_SCHED_MM_CID
+               /**
+                * @cid_lock: Protect cid bitmap updates vs lookups.
+                *
+                * Prevent situations where updates to the cid bitmap happen
+                * concurrently with lookups. Those can lead to situations
+                * where a lookup cannot find a free bit simply because it was
+                * unlucky enough to load, non-atomically, bitmap words as they
+                * were being concurrently updated by the updaters.
+                */
+               raw_spinlock_t cid_lock;
+#endif
  #ifdef CONFIG_MMU
                 atomic_long_t pgtables_bytes;   /* PTE page table pages */
  #endif
@@ -909,6 +920,36 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
         vmi->mas.node = MAS_START;
  }
  
+#ifdef CONFIG_SCHED_MM_CID
+/*
+ * Return the cid bitmap of @mm. The bitmap is not a named member: it lives in
+ * the storage that follows struct mm_struct's cpu_bitmap, cpumask_size()
+ * bytes past the start of cpu_bitmap.
+ */
+static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
+{
+       unsigned long addr;
+
+       /* Locate cpu_bitmap, then step over it to reach the cid bitmap. */
+       addr = (unsigned long)mm + offsetof(struct mm_struct, cpu_bitmap);
+       addr += cpumask_size();
+       return (struct cpumask *)addr;
+}
+
+/*
+ * Initialise the concurrency ID state of @mm: set up cid_lock and clear every
+ * bit of the cid bitmap so that no cid is marked as allocated.
+ */
+static inline void mm_init_cid(struct mm_struct *mm)
+{
+       raw_spin_lock_init(&mm->cid_lock);
+       cpumask_clear(mm_cidmask(mm));
+}
+
+/*
+ * Number of extra bytes to reserve after struct mm_struct for the cid bitmap;
+ * mm_cache_init() adds this to the mm_struct slab object size.
+ */
+static inline unsigned int mm_cid_size(void)
+{
+       return cpumask_size();
+}
+#else /* CONFIG_SCHED_MM_CID */
+/* Without CONFIG_SCHED_MM_CID there is no cid state to initialise... */
+static inline void mm_init_cid(struct mm_struct *mm) { }
+/* ...and no extra bitmap storage to reserve behind struct mm_struct. */
+static inline unsigned int mm_cid_size(void)
+{
+       return 0;
+}
+#endif /* CONFIG_SCHED_MM_CID */
+
  struct mmu_gather;
  extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
  extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/sched.h b/include/linux/sched.h

index e0bc020a63a9ac59543cfc07b7210a9aab68a036..4df2b3e76b305baa71dddff370c1395eb2787e08 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1311,6 +1311,11 @@ struct task_struct {
         unsigned long rseq_event_mask;
  #endif
  
+#ifdef CONFIG_SCHED_MM_CID
+       int                             mm_cid;         /* Current cid in mm */
+       int                             mm_cid_active;  /* Whether cid bitmap is active */
+#endif
+
         struct tlbflush_unmap_batch     tlb_ubc;
  
         union {
diff --git a/init/Kconfig b/init/Kconfig

index 7e5c3ddc341de319115e47399beb53e9b9960363..1ce960aa453e0ad609dd1696b1b95dcaf2bea785 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1041,6 +1041,10 @@ config RT_GROUP_SCHED
  
  endif #CGROUP_SCHED
  
+config SCHED_MM_CID
+       def_bool y
+       depends on SMP && RSEQ
+
  config UCLAMP_TASK_GROUP
         bool "Utilization clamping per group of tasks"
         depends on CGROUP_SCHED
diff --git a/kernel/fork.c b/kernel/fork.c

index 9f7fe354189785c2cb72f74568536a86fd898fce..82b2b5846aae16df1e3cfb523a2317ed5ecfcf02 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1060,6 +1060,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
         tsk->reported_split_lock = 0;
  #endif
  
+#ifdef CONFIG_SCHED_MM_CID
+       tsk->mm_cid = -1;
+       tsk->mm_cid_active = 0;
+#endif
         return tsk;
  
  free_stack:
@@ -1169,6 +1173,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
  
         mm->user_ns = get_user_ns(user_ns);
         lru_gen_init_mm(mm);
+       mm_init_cid(mm);
         return mm;
  
  fail_pcpu:
@@ -1601,6 +1606,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
  
         tsk->mm = mm;
         tsk->active_mm = mm;
+       sched_mm_cid_fork(tsk);
         return 0;
  }
  
@@ -3034,7 +3040,7 @@ void __init mm_cache_init(void)
          * dynamically sized based on the maximum CPU number this system
          * can have, taking hotplug into account (nr_cpu_ids).
          */
-       mm_size = sizeof(struct mm_struct) + cpumask_size();
+       mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
  
         mm_cachep = kmem_cache_create_usercopy("mm_struct",
                         mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 25b582b6ee5f78949d7646b2c23f0c4ba593a5d6..75830b7dee8f6b68dfef5b7f04916ccf1882b67b 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5052,6 +5052,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
         sched_info_switch(rq, prev, next);
         perf_event_task_sched_out(prev, next);
         rseq_preempt(prev);
+       switch_mm_cid(prev, next);
         fire_sched_out_preempt_notifiers(prev, next);
         kmap_local_sched_out();
         prepare_task(next);
@@ -11305,3 +11306,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
  {
          trace_sched_update_nr_running_tp(rq, count);
  }
+
+#ifdef CONFIG_SCHED_MM_CID
+/*
+ * Give back @t's concurrency ID when the task starts exiting (called from
+ * exit_signals()): clear its bit in the mm's cid bitmap and mark the task as
+ * no longer taking part in cid allocation. Does nothing if @t has no mm.
+ */
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+       struct mm_struct *mm = t->mm;
+       unsigned long flags;
+
+       if (!mm)
+               return;
+       /* mm_cid_put() asserts that interrupts are disabled. */
+       local_irq_save(flags);
+       mm_cid_put(mm, t->mm_cid);
+       t->mm_cid = -1;
+       t->mm_cid_active = 0;
+       local_irq_restore(flags);
+}
+
+/*
+ * Drop @t's concurrency ID before execve proceeds (called from
+ * bprm_execve()): release the cid to the current mm's bitmap and mark the
+ * task inactive until sched_mm_cid_after_execve() re-enables it.
+ * Does nothing if @t has no mm.
+ */
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+       struct mm_struct *mm = t->mm;
+       unsigned long flags;
+
+       if (!mm)
+               return;
+       /* mm_cid_put() asserts that interrupts are disabled. */
+       local_irq_save(flags);
+       mm_cid_put(mm, t->mm_cid);
+       t->mm_cid = -1;
+       t->mm_cid_active = 0;
+       local_irq_restore(flags);
+}
+
+/*
+ * Re-acquire a concurrency ID for @t once execve is over. bprm_execve() calls
+ * this on its success path and on its out_unmark failure path, so @t->mm may
+ * be either the new mm or whatever the task had before the attempt.
+ *
+ * A task without an mm has no cid bitmap to allocate from: return early
+ * rather than let mm_cid_get() dereference a NULL mm (a warning alone does
+ * not stop execution). This mirrors the !mm check in
+ * sched_mm_cid_before_execve().
+ */
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+       struct mm_struct *mm = t->mm;
+       unsigned long flags;
+
+       if (!mm)
+               return;
+       /* mm_cid_get() asserts that interrupts are disabled. */
+       local_irq_save(flags);
+       t->mm_cid = mm_cid_get(mm);
+       t->mm_cid_active = 1;
+       local_irq_restore(flags);
+       /* The cid changed: have rseq refresh its view on return to user. */
+       rseq_set_notify_resume(t);
+}
+
+/*
+ * Enable concurrency ID tracking for a newly forked task (called from
+ * copy_mm() once tsk->mm is set). Only the active flag is set here; the cid
+ * itself is still -1 and is obtained in switch_mm_cid() when the task is
+ * switched in. Warns once if @t is a kernel thread, has no mm, or already
+ * holds a cid.
+ */
+void sched_mm_cid_fork(struct task_struct *t)
+{
+       WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm || t->mm_cid != -1);
+       t->mm_cid_active = 1;
+}
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index b3d6e819127c973dce807750bb95f368053f193c..c2d7467fdde1c184117b569d86a2c0064c929498 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3269,4 +3269,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
         cgroup_account_cputime(curr, delta_exec);
  }
  
+#ifdef CONFIG_SCHED_MM_CID
+/*
+ * Allocate the lowest free concurrency ID in @mm's cid bitmap.
+ *
+ * Returns the allocated cid, or -1 when every id below nr_cpu_ids is taken.
+ * Uses the non-atomic bitmap helper; mm_cid_get() calls this with
+ * mm->cid_lock held.
+ */
+static inline int __mm_cid_get(struct mm_struct *mm)
+{
+       struct cpumask *cidmask = mm_cidmask(mm);
+       int id = cpumask_first_zero(cidmask);
+
+       if (id < nr_cpu_ids) {
+               __cpumask_set_cpu(id, cidmask);
+               return id;
+       }
+       return -1;
+}
+
+/*
+ * Return concurrency ID @cid to @mm's cid bitmap. A negative @cid means
+ * "nothing held" and is ignored. Must be called with interrupts disabled;
+ * the bitmap update is serialised by mm->cid_lock.
+ */
+static inline void mm_cid_put(struct mm_struct *mm, int cid)
+{
+       lockdep_assert_irqs_disabled();
+       if (cid < 0)
+               return;
+       raw_spin_lock(&mm->cid_lock);
+       __cpumask_clear_cpu(cid, mm_cidmask(mm));
+       raw_spin_unlock(&mm->cid_lock);
+}
+
+/*
+ * Allocate a concurrency ID from @mm under mm->cid_lock. Must be called with
+ * interrupts disabled. Returns the cid, or -1 if __mm_cid_get() found no free
+ * id.
+ */
+static inline int mm_cid_get(struct mm_struct *mm)
+{
+       int ret;
+
+       lockdep_assert_irqs_disabled();
+       raw_spin_lock(&mm->cid_lock);
+       ret = __mm_cid_get(mm);
+       raw_spin_unlock(&mm->cid_lock);
+       return ret;
+}
+
+/*
+ * Move concurrency ID ownership across a context switch from @prev to @next
+ * (called from prepare_task_switch()).
+ *
+ * - Both tasks active and sharing one mm: @next inherits @prev's cid without
+ *   touching the bitmap or its lock.
+ * - Otherwise an active @prev releases its cid to its mm, and an active
+ *   @next allocates a fresh one from its own mm.
+ * Tasks with mm_cid_active == 0 neither release nor acquire a cid.
+ */
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+{
+       if (prev->mm_cid_active) {
+               if (next->mm_cid_active && next->mm == prev->mm) {
+                       /*
+                        * Context switch between threads in same mm, hand over
+                        * the mm_cid from prev to next.
+                        */
+                       next->mm_cid = prev->mm_cid;
+                       prev->mm_cid = -1;
+                       return;
+               }
+               /* Different mm (or next inactive): give prev's cid back. */
+               mm_cid_put(prev->mm, prev->mm_cid);
+               prev->mm_cid = -1;
+       }
+       if (next->mm_cid_active)
+               next->mm_cid = mm_cid_get(next->mm);
+}
+
+#else
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+#endif
+
  #endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/signal.c b/kernel/signal.c

index ae26da61c4d9fa6ab8844c3d3f8481ef5693ca52..8cb28f1df29411bcc4d25e0e845aafdf28d914e9 100644 (file)
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2951,6 +2951,7 @@ void exit_signals(struct task_struct *tsk)
         cgroup_threadgroup_change_begin(tsk);
  
         if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+               sched_mm_cid_exit_signals(tsk);
                 tsk->flags |= PF_EXITING;
                 cgroup_threadgroup_change_end(tsk);
                 return;
@@ -2961,6 +2962,7 @@ void exit_signals(struct task_struct *tsk)
          * From now this task is not visible for group-wide signals,
          * see wants_signal(), do_signal_stop().
          */
+       sched_mm_cid_exit_signals(tsk);
         tsk->flags |= PF_EXITING;
  
         cgroup_threadgroup_change_end(tsk);
author	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
	Tue, 22 Nov 2022 20:39:09 +0000 (15:39 -0500)
committer	Peter Zijlstra <peterz@infradead.org>
	Tue, 27 Dec 2022 11:52:11 +0000 (12:52 +0100)
fs/exec.c		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
include/linux/mm_types.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
init/Kconfig		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history
kernel/signal.c		patch \| blob \| history