 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
 
 #if _Q_PENDING_BITS == 8
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
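+       /* Pending occupies its own byte here, so a plain byte store is enough. */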
+       WRITE_ONCE(lock->pending, 0);
+}
+
 /**
  * clear_pending_set_locked - take ownership and clear the pending bit.
  * @lock: Pointer to queued spinlock structure
 
 #else /* _Q_PENDING_BITS == 8 */
 
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
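+       /*
+        * The pending bit shares its word with the tail fields here, so it
+        * must be cleared with an atomic AND-NOT rather than a plain store.
+        */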
+       atomic_andnot(_Q_PENDING_VAL, &lock->val);
+}
+
 /**
  * clear_pending_set_locked - take ownership and clear the pending bit.
  * @lock: Pointer to queued spinlock structure
 void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 {
        struct mcs_spinlock *prev, *next, *node;
-       u32 new, old, tail;
+       u32 old, tail;
        int idx;
 
        BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
                                               (VAL != _Q_PENDING_VAL) || !cnt--);
        }
 
+       /*
+        * If we observe any contention; queue.
+        */
+       if (val & ~_Q_LOCKED_MASK)
+               goto queue;
+
        /*
         * trylock || pending
         *
         * 0,0,0 -> 0,0,1 ; trylock
         * 0,0,1 -> 0,1,1 ; pending
         */
-       for (;;) {
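+       /*
+        * Set the pending bit unconditionally; the returned old value tells
+        * us whether another CPU already holds pending or has queued, in
+        * which case we back out and queue below.
+        */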
+       val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+       if (!(val & ~_Q_LOCKED_MASK)) {
                /*
-                * If we observe any contention; queue.
+                * We're pending, wait for the owner to go away.
+                *
+                * *,1,1 -> *,1,0
+                *
+                * this wait loop must be a load-acquire such that we match the
+                * store-release that clears the locked bit and create lock
+                * sequentiality; this is because not all
+                * clear_pending_set_locked() implementations imply full
+                * barriers.
                 */
-               if (val & ~_Q_LOCKED_MASK)
-                       goto queue;
-
-               new = _Q_LOCKED_VAL;
-               if (val == new)
-                       new |= _Q_PENDING_VAL;
+               if (val & _Q_LOCKED_MASK) {
+                       smp_cond_load_acquire(&lock->val.counter,
+                                             !(VAL & _Q_LOCKED_MASK));
+               }
 
                /*
-                * Acquire semantic is required here as the function may
-                * return immediately if the lock was free.
+                * take ownership and clear the pending bit.
+                *
+                * *,1,0 -> *,0,1
                 */
-               old = atomic_cmpxchg_acquire(&lock->val, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
-
-       /*
-        * we won the trylock
-        */
-       if (new == _Q_LOCKED_VAL)
+               clear_pending_set_locked(lock);
                return;
+       }
 
        /*
-        * we're pending, wait for the owner to go away.
-        *
-        * *,1,1 -> *,1,0
-        *
-        * this wait loop must be a load-acquire such that we match the
-        * store-release that clears the locked bit and create lock
-        * sequentiality; this is because not all clear_pending_set_locked()
-        * implementations imply full barriers.
-        */
-       smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
-
-       /*
-        * take ownership and clear the pending bit.
-        *
-        * *,1,0 -> *,0,1
+        * If pending was clear but there are waiters in the queue, then
+        * we need to undo our setting of pending before we queue ourselves.
         */
-       clear_pending_set_locked(lock);
-       return;
+       if (!(val & _Q_PENDING_MASK))
+               clear_pending(lock);
 
        /*
         * End of pending bit optimistic spinning and beginning of MCS
         * claim the lock:
         *
         * n,0,0 -> 0,0,1 : lock, uncontended
-        * *,0,0 -> *,0,1 : lock, contended
+        * *,*,0 -> *,*,1 : lock, contended
         *
-        * If the queue head is the only one in the queue (lock value == tail),
-        * clear the tail code and grab the lock. Otherwise, we only need
-        * to grab the lock.
+        * If the queue head is the only one in the queue (lock value == tail)
+        * and nobody is pending, clear the tail code and grab the lock.
+        * Otherwise, we only need to grab the lock.
         */
        for (;;) {
                /* In the PV case we might already have _Q_LOCKED_VAL set */
-               if ((val & _Q_TAIL_MASK) != tail) {
+               if ((val & _Q_TAIL_MASK) != tail || (val & _Q_PENDING_MASK)) {
                        set_locked(lock);
                        break;
                }