diff --git a/fs/exec.c b/fs/exec.c
index 5470d3c1892a..3e14ba25f678 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1802,6 +1802,7 @@ static int do_execveat_common(int fd, struct filename *filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
+	membarrier_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current);
 	free_bprm(bprm);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 46f4ecf5479a..1861ea8dba77 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -445,6 +445,9 @@ struct mm_struct {
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
 	struct core_state *core_state; /* coredumping support */
+#ifdef CONFIG_MEMBARRIER
+	atomic_t membarrier_state;
+#endif
 #ifdef CONFIG_AIO
 	spinlock_t			ioctx_lock;
 	struct kioctx_table __rcu	*ioctx_table;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index ae53e413fb13..ab9bf7b73954 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -211,4 +211,20 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
 	current->flags = (current->flags & ~PF_MEMALLOC) | flags;
 }
 
+#ifdef CONFIG_MEMBARRIER
+enum {
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY	= (1U << 0),
+	MEMBARRIER_STATE_SWITCH_MM			= (1U << 1),
+};
+
+static inline void membarrier_execve(struct task_struct *t)
+{
+	atomic_set(&t->mm->membarrier_state, 0);
+}
+#else
+static inline void membarrier_execve(struct task_struct *t)
+{
+}
+#endif
+
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index 6d47b3249d8a..4e01ad7ffe98 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -52,21 +52,30 @@
  *                          (non-running threads are de facto in such a
  *                          state). This only covers threads from the
  *                          same processes as the caller thread. This
- *                          command returns 0. The "expedited" commands
- *                          complete faster than the non-expedited ones,
- *                          they never block, but have the downside of
- *                          causing extra overhead.
+ *                          command returns 0 on success. The
+ *                          "expedited" commands complete faster than
+ *                          the non-expedited ones, they never block,
+ *                          but have the downside of causing extra
+ *                          overhead. A process needs to register its
+ *                          intent to use the private expedited command
+ *                          prior to using it, otherwise this command
+ *                          returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
+ *                          returns 0.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
  * the value 0.
  */
 enum membarrier_cmd {
-	MEMBARRIER_CMD_QUERY			= 0,
-	MEMBARRIER_CMD_SHARED			= (1 << 0),
+	MEMBARRIER_CMD_QUERY				= 0,
+	MEMBARRIER_CMD_SHARED				= (1 << 0),
 	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
 	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
-	MEMBARRIER_CMD_PRIVATE_EXPEDITED	= (1 << 3),
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED		= (1 << 3),
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED	= (1 << 4),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index a92fddc22747..dd7908743dab 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,7 @@
 #include <linux/membarrier.h>
 #include <linux/tick.h>
 #include <linux/cpumask.h>
+#include <linux/atomic.h>
 
 #include "sched.h"	/* for cpu_rq(). */
 
@@ -26,21 +27,26 @@
  * except MEMBARRIER_CMD_QUERY.
  */
 #define MEMBARRIER_CMD_BITMASK	\
-	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
 
 static void ipi_mb(void *info)
 {
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
-static void membarrier_private_expedited(void)
+static int membarrier_private_expedited(void)
 {
 	int cpu;
 	bool fallback = false;
 	cpumask_var_t tmpmask;
 
+	if (!(atomic_read(&current->mm->membarrier_state)
+			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+		return -EPERM;
+
 	if (num_online_cpus() == 1)
-		return;
+		return 0;
 
 	/*
 	 * Matches memory barriers around rq->curr modification in
@@ -94,6 +100,24 @@ static void membarrier_private_expedited(void)
 	 * rq->curr modification in scheduler.
 	 */
 	smp_mb();	/* exit from system call is not a mb */
+	return 0;
+}
+
+static void membarrier_register_private_expedited(void)
+{
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	/*
+	 * We need to consider threads belonging to different thread
+	 * groups, which use the same mm. (CLONE_VM but not
+	 * CLONE_THREAD).
+	 */
+	if (atomic_read(&mm->membarrier_state)
+			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
+		return;
+	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+			&mm->membarrier_state);
 }
 
 /**
@@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 			synchronize_sched();
 		return 0;
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		membarrier_private_expedited();
+		return membarrier_private_expedited();
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+		membarrier_register_private_expedited();
 		return 0;
 	default:
 		return -EINVAL;