sched/numa: enhance vma scanning logic
authorRaghavendra K T <raghavendra.kt@amd.com>
Wed, 1 Mar 2023 12:19:01 +0000 (17:49 +0530)
committerAndrew Morton <akpm@linux-foundation.org>
Thu, 6 Apr 2023 03:03:03 +0000 (20:03 -0700)
During Numa scanning make sure only relevant vmas of the tasks are
scanned.

Before:
 All the tasks of a process participate in scanning the vma even if they
 do not access the vma in its lifespan.

Now:
 Except cases of first few unconditional scans, if a process does
 not touch a vma (excluding false positive cases of PID collisions)
 tasks no longer scan all vmas.

Logic used:

1) 6 bits of PID used to mark active bit in vma numab status during
   fault to remember PIDs accessing vma.  (Thanks Mel)

2) Subsequently in scan path, vma scanning is skipped if current PID
   had not accessed vma.

3) First two times we do allow unconditional scan to preserve earlier
   behaviour of scanning.

Acknowledgement to Bharata B Rao <bharata@amd.com> for initial patch to
store pid information and Peter Zijlstra <peterz@infradead.org> (Usage of
test and set bit)

Link: https://lkml.kernel.org/r/092f03105c7c1d3450f4636b1ea350407f07640e.1677672277.git.raghavendra.kt@amd.com
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Cc: David Hildenbrand <david@redhat.com>
Cc: Disha Talreja <dishaa.talreja@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mm.h
include/linux/mm_types.h
kernel/sched/fair.c
mm/memory.c

index 9f08a11b355c407630b4ad02735f94715db8edbb..215327daffaea33edf55b2544b20735d8950154a 100644 (file)
@@ -1686,6 +1686,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
        last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
        return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+       unsigned int pid_bit;
+
+       pid_bit = current->pid % BITS_PER_LONG;
+       if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids)) {
+               __set_bit(pid_bit, &vma->numab_state->access_pids);
+       }
+}
 #else /* !CONFIG_NUMA_BALANCING */
 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
@@ -1735,6 +1745,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 {
        return false;
 }
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
index 3e1a426737692424f8245a721372390f8f99664c..f8cbd8efc7cbf180bf97f0675ebc63fd6cc83619 100644 (file)
@@ -477,6 +477,7 @@ struct vma_lock {
 
 struct vma_numab_state {
        unsigned long next_scan;
+       unsigned long access_pids;
 };
 
 /*
index 7072de1686d572ca97e7cb2f5ca66baea1b42f2c..ef27b593148017974bfc3ad1ca6ee22d6037bd15 100644 (file)
@@ -2928,6 +2928,21 @@ static void reset_ptenuma_scan(struct task_struct *p)
        p->mm->numa_scan_offset = 0;
 }
 
+static bool vma_is_accessed(struct vm_area_struct *vma)
+{
+       /*
+        * Allow unconditional access first two times, so that all the pages
+        * of VMAs get prot_none fault introduced irrespective of accesses.
+        * This is also done to avoid any side effect of task scanning
+        * amplifying the unfairness of disjoint set of VMAs' access.
+        */
+       if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+               return true;
+
+       return test_bit(current->pid % BITS_PER_LONG,
+                               &vma->numab_state->access_pids);
+}
+
 /*
  * The expensive part of numa migration is done from task_work context.
  * Triggered from task_tick_numa().
@@ -3046,6 +3061,10 @@ static void task_numa_work(struct callback_head *work)
                                                vma->numab_state->next_scan))
                        continue;
 
+               /* Do not scan the VMA if task has not accessed */
+               if (!vma_is_accessed(vma))
+                       continue;
+
                do {
                        start = max(start, vma->vm_start);
                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
index 9999574a963653f7ea131a591384facfca64c465..f77fccb5310cb9352400892ca54875b96dd20e12 100644 (file)
@@ -4661,6 +4661,9 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 {
        get_page(page);
 
+       /* Record the current PID accessing VMA */
+       vma_set_access_pid_bit(vma);
+
        count_vm_numa_event(NUMA_HINT_FAULTS);
        if (page_nid == numa_node_id()) {
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);