TP_ARGS(cgrp, path, val)
 );
 
+DECLARE_EVENT_CLASS(cgroup_rstat,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
+
+       TP_ARGS(cgrp, cpu_in_loop, contended),
+
+       TP_STRUCT__entry(
+               __field(        int,            root                    )
+               __field(        int,            level                   )
+               __field(        u64,            id                      )
+               __field(        int,            cpu_in_loop             )
+               __field(        bool,           contended               )
+       ),
+
+       TP_fast_assign(
+               __entry->root = cgrp->root->hierarchy_id;
+               __entry->id = cgroup_id(cgrp);
+               __entry->level = cgrp->level;
+               __entry->cpu_in_loop = cpu_in_loop;
+               __entry->contended = contended;
+       ),
+
+       TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
+                 __entry->root, __entry->id, __entry->level,
+                 __entry->cpu_in_loop, __entry->contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
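A rough, illustrative sketch of consuming the events above from kernel code, using the register_trace_*() helpers that DEFINE_EVENT() generates. The probe name and the initcall are hypothetical, and the sketch assumes the tracepoint is reachable from the caller (built-in code, or an exported tracepoint), which this patch does not itself guarantee:

#include <linux/cgroup.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <trace/events/cgroup.h>

/* Hypothetical probe: log every contended attempt on cgroup_rstat_lock. */
static void probe_rstat_contended(void *data, struct cgroup *cgrp,
                                  int cpu_in_loop, bool contended)
{
        /* cpu_in_loop == -1 is the top-level acquisition (see helpers below). */
        pr_debug("rstat contended: id=%llu level=%d cpu_in_loop=%d\n",
                 cgroup_id(cgrp), cgrp->level, cpu_in_loop);
}

static int __init rstat_contention_probe_init(void)
{
        return register_trace_cgroup_rstat_lock_contended(probe_rstat_contended,
                                                           NULL);
}
late_initcall(rstat_contention_probe_init);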
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
 
 #include <linux/btf.h>
 #include <linux/btf_ids.h>
 
+#include <trace/events/cgroup.h>
+
 static DEFINE_SPINLOCK(cgroup_rstat_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
 
 
 __bpf_hook_end();
 
+/*
+ * Helper functions for locking cgroup_rstat_lock.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments.  The parameter @cpu_in_loop indicates that the
+ * lock was released and re-taken while collecting data from the CPUs.
+ * The value -1 is used when obtaining the main lock; otherwise it is the
+ * CPU number processed last.
+ */
+static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
+       __acquires(&cgroup_rstat_lock)
+{
+       bool contended;
+
+       contended = !spin_trylock_irq(&cgroup_rstat_lock);
+       if (contended) {
+               trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+               spin_lock_irq(&cgroup_rstat_lock);
+       }
+       trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+}
+
+static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
+       __releases(&cgroup_rstat_lock)
+{
+       trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+       spin_unlock_irq(&cgroup_rstat_lock);
+}
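Condensed illustration (not additional code in the patch) of how these helpers pair up across the flush path further down, and of what ends up in the tracepoints' cpu_in_loop field:

__cgroup_rstat_lock(cgrp, -1);                    /* top-level acquisition */
for_each_possible_cpu(cpu) {
        /* ... flush this CPU's deltas into cgrp ... */
        if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
                __cgroup_rstat_unlock(cgrp, cpu); /* yield while contended */
                __cgroup_rstat_lock(cgrp, cpu);   /* re-take; cpu_in_loop = last CPU */
        }
}
__cgroup_rstat_unlock(cgrp, -1);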
+
 /* see cgroup_rstat_flush() */
 static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
        __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
 
                /* play nice and yield if necessary */
                if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
-                       spin_unlock_irq(&cgroup_rstat_lock);
+                       __cgroup_rstat_unlock(cgrp, cpu);
                        if (!cond_resched())
                                cpu_relax();
-                       spin_lock_irq(&cgroup_rstat_lock);
+                       __cgroup_rstat_lock(cgrp, cpu);
                }
        }
 }
 {
        might_sleep();
 
-       spin_lock_irq(&cgroup_rstat_lock);
+       __cgroup_rstat_lock(cgrp, -1);
        cgroup_rstat_flush_locked(cgrp);
-       spin_unlock_irq(&cgroup_rstat_lock);
+       __cgroup_rstat_unlock(cgrp, -1);
 }
 
 /**
        __acquires(&cgroup_rstat_lock)
 {
        might_sleep();
-       spin_lock_irq(&cgroup_rstat_lock);
+       __cgroup_rstat_lock(cgrp, -1);
        cgroup_rstat_flush_locked(cgrp);
 }
 
 /**
  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
  */
-void cgroup_rstat_flush_release(void)
+void cgroup_rstat_flush_release(struct cgroup *cgrp)
        __releases(&cgroup_rstat_lock)
 {
-       spin_unlock_irq(&cgroup_rstat_lock);
+       __cgroup_rstat_unlock(cgrp, -1);
 }
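With the signature change, a hold/release pair is expected to pass the same cgroup to both calls, as in this condensed form of the caller updated further down:

cgroup_rstat_flush_hold(cgrp);    /* locks with cpu_in_loop = -1 */
/* ... read cgrp->bstat while the lock is held ... */
cgroup_rstat_flush_release(cgrp); /* same cgrp, so lock/unlock events correlate */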
 
 int cgroup_rstat_init(struct cgroup *cgrp)
 #ifdef CONFIG_SCHED_CORE
                forceidle_time = cgrp->bstat.forceidle_sum;
 #endif
-               cgroup_rstat_flush_release();
+               cgroup_rstat_flush_release(cgrp);
        } else {
                root_cgroup_cputime(&bstat);
                usage = bstat.cputime.sum_exec_runtime;