From 0d12cdd5f883f508d33b85c1bae98fa28987c8c7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 8 Nov 2008 16:19:55 +0100
Subject: [PATCH] sched: improve sched_clock() performance

in scheduler-intense workloads native_read_tsc() overhead accounts for
20% of the system overhead:

 659567 system_call                              41222.9375
 686796 schedule                                 435.7843
 718382 __switch_to                              665.1685
 823875 switch_mm                                4526.7857
 1883122 native_read_tsc                          55385.9412
 9761990 total                                      2.8468

this is large part due to the rdtsc_barrier() that is done before
and after reading the TSC.

But sched_clock() is not a precise clock in the GTOD sense, using such
barriers is completely pointless. So remove the barriers and only use
them in vget_cycles().

This improves lat_ctx performance by about 5%.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/msr.h | 2 --
 arch/x86/include/asm/tsc.h | 8 +++++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 46be2fa7ac266..c2a812ebde890 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -108,9 +108,7 @@ static __always_inline unsigned long long __native_read_tsc(void)
 {
 	DECLARE_ARGS(val, low, high);
 
-	rdtsc_barrier();
 	asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
-	rdtsc_barrier();
 
 	return EAX_EDX_VAL(val, low, high);
 }
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 38ae163cc91b0..9cd83a8e40d59 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -34,6 +34,8 @@ static inline cycles_t get_cycles(void)
 
 static __always_inline cycles_t vget_cycles(void)
 {
+	cycles_t cycles;
+
 	/*
 	 * We only do VDSOs on TSC capable CPUs, so this shouldnt
 	 * access boot_cpu_data (which is not VDSO-safe):
@@ -42,7 +44,11 @@ static __always_inline cycles_t vget_cycles(void)
 	if (!cpu_has_tsc)
 		return 0;
 #endif
-	return (cycles_t)__native_read_tsc();
+	rdtsc_barrier();
+	cycles = (cycles_t)__native_read_tsc();
+	rdtsc_barrier();
+
+	return cycles;
 }
 
 extern void tsc_init(void);
-- 
2.30.2