From a3ebdb6c423dff420168a3faf25c76e9e5f59258 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Tue, 18 Dec 2007 16:22:46 -0800
Subject: [PATCH] IA64: Slim down __clear_bit_unlock

__clear_bit_unlock does not need to perform an atomic read-modify-write
on the word: the caller still holds the lock, so no other CPU may
legitimately modify the word concurrently.  Avoid the cmpxchg and simply
do a store with release semantics.  Add a compiler barrier so that gcc
cannot reorder the preceding critical-section accesses past the store.
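
For illustration only, the before/after pattern looks roughly like the
sketch below.  GCC's generic __atomic builtins stand in for the IA64
primitives (cmpxchg4.acq and st4.rel), and the function names are
made up for this note; it is the shape of the change, not the patch
itself:

	/* Before: clear the bit with a full cmpxchg loop. */
	static void clear_bit_unlock_cmpxchg(int nr, volatile unsigned int *addr)
	{
		volatile unsigned int *m = addr + (nr >> 5);
		unsigned int mask = ~(1U << (nr & 31));
		unsigned int old = *m;

		/* On failure the builtin reloads 'old', so the loop
		 * retries against the current word value. */
		while (!__atomic_compare_exchange_n((unsigned int *)m, &old,
						    old & mask, 0,
						    __ATOMIC_RELEASE,
						    __ATOMIC_RELAXED))
			;
	}

	/* After: plain load and mask, then a single store with release
	 * semantics.  Safe because the lock holder is the only writer. */
	static void clear_bit_unlock_release(int nr, volatile unsigned int *addr)
	{
		volatile unsigned int *m = addr + (nr >> 5);
		unsigned int new = *m & ~(1U << (nr & 31));

		__atomic_store_n((unsigned int *)m, new, __ATOMIC_RELEASE);
	}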

Tony: Use an intrinsic rather than inline assembler

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/asm-ia64/bitops.h       | 17 ++++++++++++++---
 include/asm-ia64/gcc_intrin.h   |  5 +++++
 include/asm-ia64/intel_intrin.h |  3 +++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/include/asm-ia64/bitops.h b/include/asm-ia64/bitops.h
index a977affaebeca..a1b9719f5fbb7 100644
--- a/include/asm-ia64/bitops.h
+++ b/include/asm-ia64/bitops.h
@@ -124,10 +124,21 @@ clear_bit_unlock (int nr, volatile void *addr)
 /**
  * __clear_bit_unlock - Non-atomically clear a bit with release
  *
- * This is like clear_bit_unlock, but the implementation may use a non-atomic
- * store (this one uses an atomic, however).
+ * This is like clear_bit_unlock, but the implementation uses a store
+ * with release semantics. See also __raw_spin_unlock().
  */
-#define __clear_bit_unlock clear_bit_unlock
+static __inline__ void
+__clear_bit_unlock(int nr, volatile void *addr)
+{
+	__u32 mask, new;
+	volatile __u32 *m;
+
+	m = (volatile __u32 *)addr + (nr >> 5);
+	mask = ~(1 << (nr & 31));
+	new = *m & mask;
+	barrier();
+	ia64_st4_rel_nta(m, new);
+}
 
 /**
  * __clear_bit - Clears a bit in memory (non-atomic version)
diff --git a/include/asm-ia64/gcc_intrin.h b/include/asm-ia64/gcc_intrin.h
index 4fb4e439b05c3..e58d3298fa109 100644
--- a/include/asm-ia64/gcc_intrin.h
+++ b/include/asm-ia64/gcc_intrin.h
@@ -191,6 +191,11 @@ register unsigned long ia64_r13 asm ("r13") __attribute_used__;
 	asm volatile ("ldf.fill %0=[%1]" :"=f"(__f__): "r"(x));	\
 })
 
+#define ia64_st4_rel_nta(m, val)					\
+({									\
+	asm volatile ("st4.rel.nta [%0] = %1\n\t" :: "r"(m), "r"(val));	\
+})
+
 #define ia64_stfs(x, regnum)						\
 ({									\
 	register double __f__ asm ("f"#regnum);				\
diff --git a/include/asm-ia64/intel_intrin.h b/include/asm-ia64/intel_intrin.h
index d069b6acddce1..a520d103d8086 100644
--- a/include/asm-ia64/intel_intrin.h
+++ b/include/asm-ia64/intel_intrin.h
@@ -110,6 +110,9 @@
 #define ia64_st4_rel		__st4_rel
 #define ia64_st8_rel		__st8_rel
 
+/* FIXME: need st4.rel.nta intrinsic */
+#define ia64_st4_rel_nta	__st4_rel
+
 #define ia64_ld1_acq		__ld1_acq
 #define ia64_ld2_acq		__ld2_acq
 #define ia64_ld4_acq		__ld4_acq
-- 
2.30.2