#include <linux/sched.h>
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
+#include <linux/sync_core.h>
 
 /*
  * Routines for handling mm_structs
        MEMBARRIER_STATE_PRIVATE_EXPEDITED                      = (1U << 1),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY                 = (1U << 2),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED                       = (1U << 3),
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY      = (1U << 4),
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE            = (1U << 5),
+};
+
+enum {
+       MEMBARRIER_FLAG_SYNC_CORE       = (1U << 0),
 };
 
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 #include <asm/membarrier.h>
 #endif
 
+static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+{
+       if (likely(!(atomic_read(&mm->membarrier_state) &
+                    MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
+               return;
+       sync_core_before_usermode();
+}
+
 static inline void membarrier_execve(struct task_struct *t)
 {
        atomic_set(&t->mm->membarrier_state, 0);
 static inline void membarrier_execve(struct task_struct *t)
 {
 }
+static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+{
+}
 #endif
 
 #endif /* _LINUX_SCHED_MM_H */
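For reference, the helper added above leans on sync_core_before_usermode() from the
newly included <linux/sync_core.h>: architectures whose return-to-usermode path is not
already core serializing are expected to provide a real implementation, while the rest
can fall back to a no-op. (Support for the membarrier command itself is advertised by
selecting ARCH_HAS_MEMBARRIER_SYNC_CORE, added in the Kconfig hunk further down.) A
minimal sketch of that contract follows; the guard symbol name is shown only for
illustration and is not part of this hunk:

	/* Illustrative sketch, not the patch's actual header. */
	#ifdef CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
	#include <asm/sync_core.h>	/* arch-provided sync_core_before_usermode() */
	#else
	/*
	 * Architectures that always return to user-space through a core
	 * serializing instruction (e.g. an iret-style return) need no
	 * extra work here.
	 */
	static inline void sync_core_before_usermode(void)
	{
	}
	#endif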
 
  *                          to and return from the system call
  *                          (non-running threads are de facto in such a
  *                          state). This only covers threads from the
- *                          same processes as the caller thread. This
+ *                          same process as the caller thread. This
  *                          command returns 0 on success. The
  *                          "expedited" commands complete faster than
  *                          the non-expedited ones, they never block,
  *                          Register the process intent to use
  *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
  *                          returns 0.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ *                          In addition to providing the memory ordering
+ *                          guarantees described in
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED, upon
+ *                          return from system call the caller thread
+ *                          has the guarantee that all its running
+ *                          thread siblings have executed a core
+ *                          serializing instruction. (Architectures are
+ *                          required to guarantee that non-running
+ *                          threads issue core serializing instructions
+ *                          before they resume user-space execution.)
+ *                          This only covers threads from the same
+ *                          process as the caller thread. This command
+ *                          returns 0 on success. The "expedited"
+ *                          commands complete faster than the
+ *                          non-expedited ones, they never block, but
+ *                          have the downside of causing extra
+ *                          overhead. If this command is not
+ *                          implemented by an architecture, -EINVAL is
+ *                          returned. A process needs to register its
+ *                          intent to use the private expedited sync
+ *                          core command prior to using it, otherwise
+ *                          this command returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE.
+ *                          If this command is not implemented by an
+ *                          architecture, -EINVAL is returned.
+ *                          Returns 0 on success.
  * @MEMBARRIER_CMD_SHARED:
  *                          Alias to MEMBARRIER_CMD_GLOBAL. Provided for
  *                          header backward compatibility.
        MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED                = (1 << 2),
        MEMBARRIER_CMD_PRIVATE_EXPEDITED                        = (1 << 3),
        MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED               = (1 << 4),
+       MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE              = (1 << 5),
+       MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE     = (1 << 6),
 
        /* Alias for header backward compatibility. */
        MEMBARRIER_CMD_SHARED                   = MEMBARRIER_CMD_GLOBAL,
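Putting the two new commands together from user-space: the sketch below shows the
intended register-then-use pattern, e.g. for a JIT that rewrites code its sibling
threads may be running. It is illustrative only; jit_init()/jit_publish() are made-up
names, error handling is abbreviated, and a raw syscall(2) wrapper is used since libc
may not expose membarrier().

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	/* Hypothetical setup: bail out unless the kernel and architecture
	 * support the sync-core command, then register intent to use it. */
	int jit_init(void)
	{
		int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

		if (mask < 0 ||
		    !(mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE))
			return -1;
		return membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
				  0);
	}

	/* Hypothetical publish step: after patching instructions, make sure
	 * every running thread of this process executes a core serializing
	 * instruction before it can run the freshly written code. */
	int jit_publish(void)
	{
		return membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0);
	}

Skipping the registration in jit_init() makes the jit_publish() call fail with EPERM,
and on kernels or architectures without ARCH_HAS_MEMBARRIER_SYNC_CORE the command is
simply absent from the QUERY bitmask, matching the documentation above.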
 
 config ARCH_HAS_MEMBARRIER_CALLBACKS
        bool
 
+config ARCH_HAS_MEMBARRIER_SYNC_CORE
+       bool
+
 config EMBEDDED
        bool "Embedded system"
        option allnoconfig_y
 
 
        fire_sched_in_preempt_notifiers(current);
        /*
-        * When transitioning from a kernel thread to a userspace
-        * thread, mmdrop()'s implicit full barrier is required by the
-        * membarrier system call, because the current ->active_mm can
-        * become the current mm without going through switch_mm().
+        * When switching through a kernel thread, the loop in
+        * membarrier_{private,global}_expedited() may have observed that
+        * kernel thread and not issued an IPI. It is therefore possible to
+        * schedule between user->kernel->user threads without passing through
+        * switch_mm(). Membarrier requires a barrier after storing to
+        * rq->curr, before returning to userspace, so provide them here:
+        *
+        * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+        *   provided by mmdrop(),
+        * - a sync_core for SYNC_CORE.
         */
-       if (mm)
+       if (mm) {
+               membarrier_mm_sync_core_before_usermode(mm);
                mmdrop(mm);
+       }
        if (unlikely(prev_state == TASK_DEAD)) {
                if (prev->sched_class->task_dead)
                        prev->sched_class->task_dead(prev);
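To make the scenario in the comment above concrete, here is one assumed interleaving
(illustrative only; the CPU numbers and thread names are invented) in which the
expedited loop skips the IPI, so the barrier and sync_core must come from
finish_task_switch():

	/*
	 * Illustrative interleaving (assumed, not taken from the patch):
	 *
	 *  CPU0: membarrier(SYNC_CORE) caller    CPU1
	 *  ----------------------------------    -------------------------------
	 *                                        user thread A (mm X) runs,
	 *                                        then schedules to a kernel
	 *                                        thread that borrows X as
	 *                                        active_mm
	 *  membarrier_private_expedited():
	 *    sees cpu_rq(1)->curr->mm == NULL
	 *    -> no IPI is sent to CPU1
	 *                                        kernel thread schedules to user
	 *                                        thread B (same mm X) without a
	 *                                        full switch_mm()
	 *                                        finish_task_switch():
	 *                                          membarrier_mm_sync_core_before_usermode(X)
	 *                                          mmdrop(X)  <- full barrier
	 *                                        B returns to user-space
	 */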
 
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
  */
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+       (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+       | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
+#endif
+
 #define MEMBARRIER_CMD_BITMASK \
        (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
        | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
        | MEMBARRIER_CMD_PRIVATE_EXPEDITED      \
-       | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
+       | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED     \
+       | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
 
 static void ipi_mb(void *info)
 {
        return 0;
 }
 
-static int membarrier_private_expedited(void)
+static int membarrier_private_expedited(int flags)
 {
        int cpu;
        bool fallback = false;
        cpumask_var_t tmpmask;
 
-       if (!(atomic_read(&current->mm->membarrier_state)
-                       & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
-               return -EPERM;
+       if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+               if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+                       return -EINVAL;
+               if (!(atomic_read(&current->mm->membarrier_state) &
+                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
+                       return -EPERM;
+       } else {
+               if (!(atomic_read(&current->mm->membarrier_state) &
+                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+                       return -EPERM;
+       }
 
        if (num_online_cpus() == 1)
                return 0;
        return 0;
 }
 
-static int membarrier_register_private_expedited(void)
+static int membarrier_register_private_expedited(int flags)
 {
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
+       int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+
+       if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+               if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+                       return -EINVAL;
+               state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+       }
 
        /*
         * We need to consider threads belonging to different thread
         * groups, which use the same mm. (CLONE_VM but not
         * CLONE_THREAD).
         */
-       if (atomic_read(&mm->membarrier_state)
-                       & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
+       if (atomic_read(&mm->membarrier_state) & state)
                return 0;
        atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
+       if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+               atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
+                         &mm->membarrier_state);
        if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
                /*
                 * Ensure all future scheduler executions will observe the
                 */
                synchronize_sched();
        }
-       atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
-                       &mm->membarrier_state);
+       atomic_or(state, &mm->membarrier_state);
        return 0;
 }
 
        case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
                return membarrier_register_global_expedited();
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-               return membarrier_private_expedited();
+               return membarrier_private_expedited(0);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
-               return membarrier_register_private_expedited();
+               return membarrier_register_private_expedited(0);
+       case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+               return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+       case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+               return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
        default:
                return -EINVAL;
        }