locking/mutex: Optimize __mutex_trylock_fast()
Use try_cmpxchg to avoid the pointless TEST instruction.
And add the (missing) atomic_long_try_cmpxchg*() wrappery.
On x86_64 this gives (before on the left, after on the right):
0000000000000710 <mutex_lock>:                                          0000000000000710 <mutex_lock>:
 710:   65 48 8b 14 25 00 00    mov    %gs:0x0,%rdx                      710:   65 48 8b 14 25 00 00    mov    %gs:0x0,%rdx
 717:   00 00                                                            717:   00 00
                        715: R_X86_64_32S       current_task                                    715: R_X86_64_32S       current_task
 719:   31 c0                   xor    %eax,%eax                         719:   31 c0                   xor    %eax,%eax
 71b:   f0 48 0f b1 17          lock cmpxchg %rdx,(%rdi)                 71b:   f0 48 0f b1 17          lock cmpxchg %rdx,(%rdi)
 720:   48 85 c0                test   %rax,%rax                         720:   75 02                   jne    724 <mutex_lock+0x14>
 723:   75 02                   jne    727 <mutex_lock+0x17>             722:   f3 c3                   repz retq
 725:   f3 c3                   repz retq                                724:   eb da                   jmp    700 <__mutex_lock_slowpath>
 727:   eb d7                   jmp    700 <__mutex_lock_slowpath>       726:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
 729:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)                         72d:   00 00 00
On ARM64 this gives (before on the left, after on the right):
0000000000000638 <mutex_lock>:                                          0000000000000638 <mutex_lock>:
     638:       d5384101        mrs     x1, sp_el0                           638:       d5384101        mrs     x1, sp_el0
     63c:       d2800002        mov     x2, #0x0                             63c:       d2800002        mov     x2, #0x0
     640:       f9800011        prfm    pstl1strm, [x0]                      640:       f9800011        prfm    pstl1strm, [x0]
     644:       c85ffc03        ldaxr   x3, [x0]                             644:       c85ffc03        ldaxr   x3, [x0]
     648:       ca020064        eor     x4, x3, x2                           648:       ca020064        eor     x4, x3, x2
     64c:       b5000064        cbnz    x4, 658 <mutex_lock+0x20>            64c:       b5000064        cbnz    x4, 658 <mutex_lock+0x20>
     650:       c8047c01        stxr    w4, x1, [x0]                         650:       c8047c01        stxr    w4, x1, [x0]
     654:       35ffff84        cbnz    w4, 644 <mutex_lock+0xc>             654:       35ffff84        cbnz    w4, 644 <mutex_lock+0xc>
     658:       b40000c3        cbz     x3, 670 <mutex_lock+0x38>            658:       b5000043        cbnz    x3, 660 <mutex_lock+0x28>
     65c:       a9bf7bfd        stp     x29, x30, [sp,#-16]!                 65c:       d65f03c0        ret
     660:       910003fd        mov     x29, sp                              660:       a9bf7bfd        stp     x29, x30, [sp,#-16]!
     664:       97ffffef        bl      620 <__mutex_lock_slowpath>          664:       910003fd        mov     x29, sp
     668:       a8c17bfd        ldp     x29, x30, [sp],#16                   668:       97ffffee        bl      620 <__mutex_lock_slowpath>
     66c:       d65f03c0        ret                                          66c:       a8c17bfd        ldp     x29, x30, [sp],#16
     670:       d65f03c0        ret                                          670:       d65f03c0        ret
Reported-by: Matthew Wilcox <mawilcox@microsoft.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>