sched/numa: Trace decisions related to skipping VMAs
author Mel Gorman <mgorman@techsingularity.net>
Tue, 10 Oct 2023 08:31:40 +0000 (09:31 +0100)
committer Ingo Molnar <mingo@kernel.org>
Tue, 10 Oct 2023 09:10:00 +0000 (11:10 +0200)
NUMA balancing skips or scans VMAs for a variety of reasons. In preparation
for completing scans of VMAs regardless of PID access, trace the reasons
why a VMA was skipped. In a later patch, the tracing will be used to track
whether a VMA was forcibly scanned.
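
For illustration, a line emitted by the new tracepoint follows the
TP_printk() format defined below; the field values in this sample are
hypothetical, not captured output:

  sched_skip_vma_numa: numa_scan_offset=22E000 vm_start=7F86DCE00000 vm_end=7F86DD000000 reason=scan_delay

Once applied, the event is reachable through the usual tracefs interface
under events/sched/sched_skip_vma_numa.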

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010083143.19593-4-mgorman@techsingularity.net
include/linux/sched/numa_balancing.h
include/trace/events/sched.h
kernel/sched/fair.c

index 3988762efe15c0e5a80602e2c9acb6a5820a740e..c127a1509e2faad8ca856e9866791500dacc2e4b 100644
 #define TNF_FAULT_LOCAL        0x08
 #define TNF_MIGRATE_FAIL 0x10
 
+enum numa_vmaskip_reason {
+       NUMAB_SKIP_UNSUITABLE,
+       NUMAB_SKIP_SHARED_RO,
+       NUMAB_SKIP_INACCESSIBLE,
+       NUMAB_SKIP_SCAN_DELAY,
+       NUMAB_SKIP_PID_INACTIVE,
+};
+
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
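
Note that the enum is added outside the #ifdef CONFIG_NUMA_BALANCING
section, so it is visible to the tracing header regardless of
configuration. As a hedged sketch of how the pieces fit together
(NUMAB_SKIP_EXAMPLE below is hypothetical, not part of this patch), a
later skip reason would touch three places:

	/* 1) include/linux/sched/numa_balancing.h: extend the enum */
	enum numa_vmaskip_reason {
		NUMAB_SKIP_UNSUITABLE,
		NUMAB_SKIP_SHARED_RO,
		NUMAB_SKIP_INACCESSIBLE,
		NUMAB_SKIP_SCAN_DELAY,
		NUMAB_SKIP_PID_INACTIVE,
		NUMAB_SKIP_EXAMPLE,	/* hypothetical new reason */
	};

	/* 2) include/trace/events/sched.h: add a matching string to the
	 *    NUMAB_SKIP_REASON list (next hunk), keeping EMe() on the
	 *    final entry */
	EM( NUMAB_SKIP_PID_INACTIVE,	"pid_inactive" )	\
	EMe(NUMAB_SKIP_EXAMPLE,		"example" )

	/* 3) the skip site, e.g. in task_numa_work() */
	trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_EXAMPLE);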
index a13d5d06be9d6c0ab4ebe0acae095c8d02a3b0b2..d82a04d6a1bc723345ea203a486867090434af2c 100644
@@ -664,6 +664,56 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
        TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
 );
 
+#ifdef CONFIG_NUMA_BALANCING
+#define NUMAB_SKIP_REASON                                      \
+       EM( NUMAB_SKIP_UNSUITABLE,              "unsuitable" )  \
+       EM( NUMAB_SKIP_SHARED_RO,               "shared_ro" )   \
+       EM( NUMAB_SKIP_INACCESSIBLE,            "inaccessible" )        \
+       EM( NUMAB_SKIP_SCAN_DELAY,              "scan_delay" )  \
+       EMe(NUMAB_SKIP_PID_INACTIVE,            "pid_inactive" )
+
+/* Redefine for export. */
+#undef EM
+#undef EMe
+#define EM(a, b)       TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)      TRACE_DEFINE_ENUM(a);
+
+NUMAB_SKIP_REASON
+
+/* Redefine for symbolic printing. */
+#undef EM
+#undef EMe
+#define EM(a, b)       { a, b },
+#define EMe(a, b)      { a, b }
+
+TRACE_EVENT(sched_skip_vma_numa,
+
+       TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma,
+                enum numa_vmaskip_reason reason),
+
+       TP_ARGS(mm, vma, reason),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, numa_scan_offset)
+               __field(unsigned long, vm_start)
+               __field(unsigned long, vm_end)
+               __field(enum numa_vmaskip_reason, reason)
+       ),
+
+       TP_fast_assign(
+               __entry->numa_scan_offset       = mm->numa_scan_offset;
+               __entry->vm_start               = vma->vm_start;
+               __entry->vm_end                 = vma->vm_end;
+               __entry->reason                 = reason;
+       ),
+
+       TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s",
+                 __entry->numa_scan_offset,
+                 __entry->vm_start,
+                 __entry->vm_end,
+                 __print_symbolic(__entry->reason, NUMAB_SKIP_REASON))
+);
+#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * Tracepoint for waking a polling cpu without an IPI.
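
For readers unfamiliar with the EM()/EMe() idiom used above: the
NUMAB_SKIP_REASON list is expanded twice. Simplified, and eliding the
middle entries, the two passes reduce to roughly:

	/* First pass: export the enum values so that userspace tools can
	 * resolve them when parsing the event's format file */
	TRACE_DEFINE_ENUM(NUMAB_SKIP_UNSUITABLE);
	/* ... */
	TRACE_DEFINE_ENUM(NUMAB_SKIP_PID_INACTIVE);

	/* Second pass: build the { value, "string" } pairs consumed by
	 * __print_symbolic() in TP_printk() */
	__print_symbolic(__entry->reason,
			 { NUMAB_SKIP_UNSUITABLE,	"unsuitable" },
			 /* ... */
			 { NUMAB_SKIP_PID_INACTIVE,	"pid_inactive" })
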
index 6b47edcbe834627158add1f4113b6711f748e1b4..31cfdb0794fbcfc8a28bbf6abf1406f974233c35 100644
@@ -3210,6 +3210,7 @@ static void task_numa_work(struct callback_head *work)
        do {
                if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
                        is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
                        continue;
                }
 
@@ -3220,15 +3221,19 @@ static void task_numa_work(struct callback_head *work)
                 * as migrating the pages will be of marginal benefit.
                 */
                if (!vma->vm_mm ||
-                   (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+                   (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
                        continue;
+               }
 
                /*
                 * Skip inaccessible VMAs to avoid any confusion between
                 * PROT_NONE and NUMA hinting ptes
                 */
-               if (!vma_is_accessible(vma))
+               if (!vma_is_accessible(vma)) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
                        continue;
+               }
 
                /* Initialise new per-VMA NUMAB state. */
                if (!vma->numab_state) {
@@ -3250,12 +3255,16 @@ static void task_numa_work(struct callback_head *work)
                 * delay the scan for new VMAs.
                 */
                if (mm->numa_scan_seq && time_before(jiffies,
-                                               vma->numab_state->next_scan))
+                                               vma->numab_state->next_scan)) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
                        continue;
+               }
 
                /* Do not scan the VMA if task has not accessed */
-               if (!vma_is_accessed(vma))
+               if (!vma_is_accessed(vma)) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
                        continue;
+               }
 
                /*
                 * RESET access PIDs regularly for old VMAs. Resetting after checking