From 6014bc27561f2cc63e0acc18adbc4ed810834e32 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Fri, 28 Apr 2023 12:55:10 -0700
Subject: [PATCH 1/5] x86-64: make access_ok() independent of LAM

The linear address masking (LAM) code made access_ok() more complicated,
in that it now needs to untag the address in order to verify the access
range. See commit 74c228d20a51 ("x86/uaccess: Provide untagged_addr()
and remove tags before address check").

We were able to avoid that overhead in the get_user/put_user code paths
by simply using the sign bit for the address check, and depending on the
GP fault if the address was non-canonical, which made it all independent
of LAM.

And we can do the same thing for access_ok(): simply check that the user
pointer range has the high bit clear. No need to bother with any address
bit masking.

In fact, we can go a bit further, and just check the starting address for
known small access ranges: any accesses that overflow will still be in
the non-canonical area and will still GP fault.

To still make syzkaller catch any potentially unchecked user addresses,
we'll continue to warn about GP faults that are caused by accesses in the
non-canonical range. But we'll limit that to purely "high bit set and
past the one-page 'slop' area".

We could probably just do that "check only starting address" for any
arbitrary range size: realistically all kernel accesses to user space
will be done starting at the low address. But let's leave that kind of
optimization for later. As it is, this already allows us to generate
simpler code and not worry about any tag bits in the address.

The one thing to look out for is the GUP address check: instead of
actually copying data in the virtual address range (and thus bad
addresses being caught by the GP fault), GUP will look up the page tables
manually. As a result, the page table limits need to be checked, and that
was previously implicitly done by the access_ok().

With the relaxed access_ok() check, we need to just do an explicit check
for TASK_SIZE_MAX in the GUP code instead. The GUP code already needs to
do the tag bit unmasking anyway, so this is all very straightforward, and
there are no LAM issues.

Cc: Kirill A. Shutemov
Cc: Dave Hansen
Cc: Peter Zijlstra (Intel)
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/uaccess.h | 39 +++++++++++++++++++++++++++----
 arch/x86/mm/extable.c          | 42 ++++++++++++++++++++++++++++------
 mm/gup.c                       |  2 ++
 3 files changed, 72 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 457e814712af..123135d60f72 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -75,6 +75,34 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
 #define untagged_addr(addr)	(addr)
 #endif
 
+#ifdef CONFIG_X86_64
+/*
+ * On x86-64, we may have tag bits in the user pointer. Rather than
+ * mask them off, just change the rules for __access_ok().
+ *
+ * Make the rule be that 'ptr+size' must not overflow, and must not
+ * have the high bit set. Compilers generally understand about
+ * unsigned overflow and the CF bit and generate reasonable code for
+ * this. Although it looks like the combination confuses at least
+ * clang (and instead of just doing an "add" followed by a test of
+ * SF and CF, you'll see that unnecessary comparison).
+ *
+ * For the common case of small sizes that can be checked at compile
+ * time, don't even bother with the addition, and just check that the
+ * base pointer is ok.
+ */
+static inline bool __access_ok(const void __user *ptr, unsigned long size)
+{
+	if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) {
+		return (long)ptr >= 0;
+	} else {
+		unsigned long sum = size + (unsigned long)ptr;
+		return (long) sum >= 0 && sum >= (unsigned long)ptr;
+	}
+}
+#define __access_ok __access_ok
+#endif
+
 /**
  * access_ok - Checks if a user space pointer is valid
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
@@ -91,11 +119,14 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
  *
  * Return: true (nonzero) if the memory block may be valid, false (zero)
  * if it is definitely invalid.
+ *
+ * This should not be x86-specific. The only odd thing out here is
+ * the WARN_ON_IN_IRQ(), which doesn't exist in the generic version.
  */
-#define access_ok(addr, size)					\
-({								\
-	WARN_ON_IN_IRQ();					\
-	likely(__access_ok(untagged_addr(addr), size));		\
+#define access_ok(addr, size)					\
+({								\
+	WARN_ON_IN_IRQ();					\
+	likely(__access_ok(addr, size));			\
 })
 
 #include <asm-generic/access_ok.h>
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 60814e110a54..8d38dedadbb1 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -130,10 +130,36 @@ static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
 	return true;
 }
 
-static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
-			       struct pt_regs *regs, int trapnr)
+/*
+ * On x86-64, we end up being imprecise with 'access_ok()', and allow
+ * non-canonical user addresses to make the range comparisons simpler,
+ * and to not have to worry about LAM being enabled.
+ *
+ * In fact, we allow up to one page of "slop" at the sign boundary,
+ * which means that we can do access_ok() by just checking the sign
+ * of the pointer for the common case of having a small access size.
+ */
+static bool gp_fault_address_ok(unsigned long fault_address)
 {
-	WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
+#ifdef CONFIG_X86_64
+	/* Is it in the "user space" part of the non-canonical space? */
+	if ((long) fault_address >= 0)
+		return true;
+
+	/* .. or just above it? */
+	fault_address -= PAGE_SIZE;
+	if ((long) fault_address >= 0)
+		return true;
+#endif
+	return false;
+}
+
+static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
+			       struct pt_regs *regs, int trapnr,
+			       unsigned long fault_address)
+{
+	WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address),
+		  "General protection fault in user access. Non-canonical address?");
 	return ex_handler_default(fixup, regs);
 }
 
@@ -189,10 +215,12 @@ static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
 }
 
 static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
-				 struct pt_regs *regs, int trapnr, int reg, int imm)
+				 struct pt_regs *regs, int trapnr,
+				 unsigned long fault_address,
+				 int reg, int imm)
 {
 	regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
-	return ex_handler_uaccess(fixup, regs, trapnr);
+	return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
 }
 
 int ex_get_fixup_type(unsigned long ip)
@@ -238,7 +266,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
 	case EX_TYPE_FAULT_MCE_SAFE:
 		return ex_handler_fault(e, regs, trapnr);
 	case EX_TYPE_UACCESS:
-		return ex_handler_uaccess(e, regs, trapnr);
+		return ex_handler_uaccess(e, regs, trapnr, fault_addr);
 	case EX_TYPE_COPY:
 		return ex_handler_copy(e, regs, trapnr);
 	case EX_TYPE_CLEAR_FS:
@@ -269,7 +297,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
 	case EX_TYPE_FAULT_SGX:
 		return ex_handler_sgx(e, regs, trapnr);
 	case EX_TYPE_UCOPY_LEN:
-		return ex_handler_ucopy_len(e, regs, trapnr, reg, imm);
+		return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
 	case EX_TYPE_ZEROPAD:
 		return ex_handler_zeropad(e, regs, fault_addr);
 	}
diff --git a/mm/gup.c b/mm/gup.c
index ff689c88a357..bbe416236593 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2970,6 +2970,8 @@ static int internal_get_user_pages_fast(unsigned long start,
 	len = nr_pages << PAGE_SHIFT;
 	if (check_add_overflow(start, len, &end))
 		return 0;
+	if (end > TASK_SIZE_MAX)
+		return -EFAULT;
 	if (unlikely(!access_ok((void __user *)start, len)))
 		return -EFAULT;
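As an aside, the effect of the new rule is easy to demonstrate outside
the kernel. The following stand-alone sketch mirrors the logic of the new
x86-64 __access_ok(); it is an illustration only: PAGE_SIZE is a stand-in
for the kernel macro, the __builtin_constant_p() test is dropped, and the
sample addresses are made up.

	#include <stdbool.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096UL	/* stand-in for the kernel definition */

	/* Same rule as the new __access_ok(): sign bit only, no untagging. */
	static bool access_ok_sketch(unsigned long ptr, unsigned long size)
	{
		if (size <= PAGE_SIZE)		/* small range: check the start only */
			return (long)ptr >= 0;

		unsigned long sum = ptr + size;
		/* no unsigned wrap, and the end must not reach the high half */
		return (long)sum >= 0 && sum >= ptr;
	}

	int main(void)
	{
		printf("%d\n", access_ok_sketch(0x1000, 4096));			   /* 1: low user address */
		printf("%d\n", access_ok_sketch(0xffff800000000000UL, 8));	   /* 0: sign bit set */
		printf("%d\n", access_ok_sketch(0x7fffffffffff0000UL, 1UL << 20)); /* 0: end overflows */
		return 0;
	}

Note that a pointer with LAM-style tag bits but a clear sign bit (say
0x0200000000001000) passes the check, which is the point: with LAM the
hardware ignores those bits, and without LAM the access simply GP faults,
so access_ok() no longer needs to care either way.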
From 6ccdc91d6af922f3ded5de494ff27daedeb6d6c9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Tue, 2 May 2023 12:35:01 -0700
Subject: [PATCH 2/5] x86: mm: remove architecture-specific 'access_ok()'
 define

There's already a generic definition of 'access_ok()' in the
asm-generic/access_ok.h header file, and the only difference between that
and the x86-specific one is the added check for WARN_ON_IN_IRQ().

And it turns out that the reason for that check is long gone: it used to
use a "user_addr_max()" inline function that depended on the current
thread, and caused problems in non-thread contexts.

For details, see commits 7c4788950ba5 ("x86/uaccess, sched/preempt:
Verify access_ok() context") and in particular commit ae31fe51a3cc
("perf/x86: Restore TASK_SIZE check on frame pointer") about how and why
this came to be.

But that "current task" issue was removed in the big set_fs() removal by
Christoph Hellwig in commit 47058bb54b57 ("x86: remove address space
overrides using set_fs()").

So the reason for the test no longer exists, and the
architecture-specific access_ok() define is actually harmful these days.
For example, it led to various 'copy_from_user_nmi()' games (eg using
__range_not_ok() instead, later converted to __access_ok() when that
became ok).

And that in turn meant that LAM was broken for the frame following code
before this series, because __access_ok() used to not do the address
untagging.

Accessing user state still needs care in many contexts, but access_ok()
is not the place for this test.

Acked-by: Peter Zijlstra (Intel)
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/uaccess.h | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 123135d60f72..cad17e11aa83 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -16,14 +16,6 @@
 #include <asm/extable.h>
 #include <asm/tlbflush.h>
 
-#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-static inline bool pagefault_disabled(void);
-# define WARN_ON_IN_IRQ() \
-	WARN_ON_ONCE(!in_task() && !pagefault_disabled())
-#else
-# define WARN_ON_IN_IRQ()
-#endif
-
 #ifdef CONFIG_ADDRESS_MASKING
 /*
  * Mask out tag bits from the address.
@@ -103,32 +95,6 @@ static inline bool __access_ok(const void __user *ptr, unsigned long size)
 #define __access_ok __access_ok
 #endif
 
-/**
- * access_ok - Checks if a user space pointer is valid
- * @addr: User space pointer to start of block to check
- * @size: Size of block to check
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Checks if a pointer to a block of memory in user space is valid.
- *
- * Note that, depending on architecture, this function probably just
- * checks that the pointer is in the user space range - after calling
- * this function, memory access functions may still return -EFAULT.
- *
- * Return: true (nonzero) if the memory block may be valid, false (zero)
- * if it is definitely invalid.
- *
- * This should not be x86-specific. The only odd thing out here is
- * the WARN_ON_IN_IRQ(), which doesn't exist in the generic version.
- */
-#define access_ok(addr, size)					\
-({								\
-	WARN_ON_IN_IRQ();					\
-	likely(__access_ok(addr, size));			\
-})
-
 #include <asm-generic/access_ok.h>
 
 extern int __get_user_1(void);
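For context, the generic fallback that x86 now uses is short. Paraphrased
from include/asm-generic/access_ok.h (a simplified sketch from memory,
with the !CONFIG_MMU and alternate-address-space special cases elided,
not the verbatim header):

	/* rough shape of the generic __access_ok()/access_ok() pair */
	static inline int __access_ok(const void __user *ptr, unsigned long size)
	{
		unsigned long limit = TASK_SIZE_MAX;
		unsigned long addr = (unsigned long)ptr;

		return (size <= limit) && (addr <= (limit - size));
	}

	#define access_ok(addr, size) likely(__access_ok(addr, size))

Both definitions are guarded so that an architecture override wins; since
patch 1 already supplied an x86-64 __access_ok(), only the access_ok()
wrapper itself changes hands here, and WARN_ON_IN_IRQ() is the only
behavioral difference being removed.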
From b9bd9f605c4a6f04a83e6640a7d1d6dda80f17ca Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Tue, 2 May 2023 16:39:59 -0700
Subject: [PATCH 3/5] x86: uaccess: move 32-bit and 64-bit parts into proper
 header

The x86 <asm/uaccess.h> file has grown features that are specific to
x86-64 like LAM support and the related access_ok() changes. They really
should be in the <asm/uaccess_64.h> file and not pollute the generic x86
header.

Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/uaccess.h    | 87 ++-----------------------------
 arch/x86/include/asm/uaccess_32.h |  3 ++
 arch/x86/include/asm/uaccess_64.h | 77 ++++++++++++++++++++++++++-
 3 files changed, 82 insertions(+), 85 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index cad17e11aa83..8bae40a66282 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -16,83 +16,10 @@
 #include <asm/extable.h>
 #include <asm/tlbflush.h>
 
-#ifdef CONFIG_ADDRESS_MASKING
-/*
- * Mask out tag bits from the address.
- *
- * Magic with the 'sign' allows to untag userspace pointer without any branches
- * while leaving kernel addresses intact.
- */
-static inline unsigned long __untagged_addr(unsigned long addr)
-{
-	long sign;
-
-	/*
-	 * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation
-	 * in alternative instructions. The relocation gets wrong when gets
-	 * copied to the target place.
-	 */
-	asm (ALTERNATIVE("",
-			 "sar $63, %[sign]\n\t" /* user_ptr ? 0 : -1UL */
-			 "or %%gs:tlbstate_untag_mask, %[sign]\n\t"
-			 "and %[sign], %[addr]\n\t", X86_FEATURE_LAM)
-	     : [addr] "+r" (addr), [sign] "=r" (sign)
-	     : "m" (tlbstate_untag_mask), "[sign]" (addr));
-
-	return addr;
-}
-
-#define untagged_addr(addr)	({					\
-	unsigned long __addr = (__force unsigned long)(addr);		\
-	(__force __typeof__(addr))__untagged_addr(__addr);		\
-})
-
-static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
-						   unsigned long addr)
-{
-	long sign = addr >> 63;
-
-	mmap_assert_locked(mm);
-	addr &= (mm)->context.untag_mask | sign;
-
-	return addr;
-}
-
-#define untagged_addr_remote(mm, addr)	({				\
-	unsigned long __addr = (__force unsigned long)(addr);		\
-	(__force __typeof__(addr))__untagged_addr_remote(mm, __addr);	\
-})
-
+#ifdef CONFIG_X86_32
+# include <asm/uaccess_32.h>
 #else
-#define untagged_addr(addr)	(addr)
-#endif
-
-#ifdef CONFIG_X86_64
-/*
- * On x86-64, we may have tag bits in the user pointer. Rather than
- * mask them off, just change the rules for __access_ok().
- *
- * Make the rule be that 'ptr+size' must not overflow, and must not
- * have the high bit set. Compilers generally understand about
- * unsigned overflow and the CF bit and generate reasonable code for
- * this. Although it looks like the combination confuses at least
- * clang (and instead of just doing an "add" followed by a test of
- * SF and CF, you'll see that unnecessary comparison).
- *
- * For the common case of small sizes that can be checked at compile
- * time, don't even bother with the addition, and just check that the
- * base pointer is ok.
- */
-static inline bool __access_ok(const void __user *ptr, unsigned long size)
-{
-	if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) {
-		return (long)ptr >= 0;
-	} else {
-		unsigned long sum = size + (unsigned long)ptr;
-		return (long) sum >= 0 && sum >= (unsigned long)ptr;
-	}
-}
-#define __access_ok __access_ok
+# include <asm/uaccess_64.h>
 #endif
 
 #include <asm-generic/access_ok.h>
@@ -583,14 +510,6 @@ extern struct movsl_mask {
 
 #define ARCH_HAS_NOCACHE_UACCESS 1
 
-#ifdef CONFIG_X86_32
-unsigned long __must_check clear_user(void __user *mem, unsigned long len);
-unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
-# include <asm/uaccess_32.h>
-#else
-# include <asm/uaccess_64.h>
-#endif
-
 /*
  * The "unsafe" user accesses aren't really "unsafe", but the naming
  * is a big fat warning: you have to not only do the access_ok()
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 388a40660c7b..40379a1adbb8 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -33,4 +33,7 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
 	return __copy_from_user_ll_nocache_nozero(to, from, n);
 }
 
+unsigned long __must_check clear_user(void __user *mem, unsigned long len);
+unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
+
 #endif /* _ASM_X86_UACCESS_32_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index c972bd21aa23..20411e69e67f 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -12,6 +12,81 @@
 #include
 #include
 
+#ifdef CONFIG_ADDRESS_MASKING
+/*
+ * Mask out tag bits from the address.
+ *
+ * Magic with the 'sign' allows to untag userspace pointer without any branches
+ * while leaving kernel addresses intact.
+ */
+static inline unsigned long __untagged_addr(unsigned long addr)
+{
+	long sign;
+
+	/*
+	 * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation
+	 * in alternative instructions. The relocation gets wrong when gets
+	 * copied to the target place.
+	 */
+	asm (ALTERNATIVE("",
+			 "sar $63, %[sign]\n\t" /* user_ptr ? 0 : -1UL */
+			 "or %%gs:tlbstate_untag_mask, %[sign]\n\t"
+			 "and %[sign], %[addr]\n\t", X86_FEATURE_LAM)
+	     : [addr] "+r" (addr), [sign] "=r" (sign)
+	     : "m" (tlbstate_untag_mask), "[sign]" (addr));
+
+	return addr;
+}
+
+#define untagged_addr(addr)	({					\
+	unsigned long __addr = (__force unsigned long)(addr);		\
+	(__force __typeof__(addr))__untagged_addr(__addr);		\
+})
+
+static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
+						   unsigned long addr)
+{
+	long sign = addr >> 63;
+
+	mmap_assert_locked(mm);
+	addr &= (mm)->context.untag_mask | sign;
+
+	return addr;
+}
+
+#define untagged_addr_remote(mm, addr)	({				\
+	unsigned long __addr = (__force unsigned long)(addr);		\
+	(__force __typeof__(addr))__untagged_addr_remote(mm, __addr);	\
+})
+
+#endif
+
+/*
+ * On x86-64, we may have tag bits in the user pointer. Rather than
+ * mask them off, just change the rules for __access_ok().
+ *
+ * Make the rule be that 'ptr+size' must not overflow, and must not
+ * have the high bit set. Compilers generally understand about
+ * unsigned overflow and the CF bit and generate reasonable code for
+ * this. Although it looks like the combination confuses at least
+ * clang (and instead of just doing an "add" followed by a test of
+ * SF and CF, you'll see that unnecessary comparison).
+ *
+ * For the common case of small sizes that can be checked at compile
+ * time, don't even bother with the addition, and just check that the
+ * base pointer is ok.
+ */
+static inline bool __access_ok(const void __user *ptr, unsigned long size)
+{
+	if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) {
+		return (long)ptr >= 0;
+	} else {
+		unsigned long sum = size + (unsigned long)ptr;
+		return (long) sum >= 0 && sum >= (unsigned long)ptr;
+	}
+}
+#define __access_ok __access_ok
+
 /*
  * Copy To/From Userspace
  */
@@ -106,7 +181,7 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
 
 static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
 {
-	if (access_ok(to, n))
+	if (__access_ok(to, n))
 		return __clear_user(to, n);
 	return n;
 }
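As background for the next patch: under LAM_U57 the hardware treats bits
62:57 of a user pointer as tag bits, and the per-mm untag mask clears
exactly those bits while keeping bit 63 set. A stand-alone sketch of the
masking arithmetic (the mask and addresses are illustrative values
derived from the LAM_U57 layout, not read from a real mm):

	#include <stdio.h>

	int main(void)
	{
		/* LAM_U57 untag mask: clear tag bits 62:57, keep sign bit 63 */
		unsigned long untag_mask = ~(0x3fUL << 57);

		unsigned long tagged = (0x2aUL << 57) | 0x12345678; /* user ptr with tag 0x2a */
		unsigned long kernel = 0xffffffff81000000UL;	    /* a kernel-half address */

		printf("%#lx\n", tagged & untag_mask);	/* 0x12345678: tag removed */
		printf("%#lx\n", kernel & untag_mask);	/* bit 63 survives the mask */
		return 0;
	}

Because the mask always has bit 63 set, plain masking can never turn a
kernel address into a user-half address; that observation is what makes
the 'sign' games in the code just moved here superfluous, as the next
patch argues.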
From 1dbc0a9515fdf1f0b9d6c9b1954a347c94e5f5f9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 3 May 2023 09:38:58 -0700
Subject: [PATCH 4/5] x86: mm: remove 'sign' games from LAM untagged_addr*()
 macros

The intent of the sign games was to not modify kernel addresses when
untagging them. However, that had two issues:

 (a) it didn't actually work as intended, since the mask was calculated
     as 'addr >> 63' on an _unsigned_ address. So instead of getting a
     mask of all ones for kernel addresses, you just got '1'.

 (b) untagging a kernel address isn't actually a valid operation anyway.

Now, (a) had originally been true for both 'untagged_addr()' and the
remote version of it, but had accidentally been fixed for the regular
version of untagged_addr() by commit e0bddc19ba95 ("x86/mm: Reduce
untagged_addr() overhead for systems without LAM"). That one rewrote the
shift to be part of the alternative asm code, and in the process changed
the unsigned shift into a signed 'sar' instruction.

And while it is true that we don't want to turn what looks like a kernel
address into a user address by masking off the high bit, that doesn't
need these sign masking games - all it needs is that the mm context
'untag_mask' value has the high bit set.

Which it always does.

So simplify the code by just removing the superfluous (and in the case of
untagged_addr_remote(), still buggy) sign bit games in the address
masking.

Acked-by: Dave Hansen
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/uaccess_64.h | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 20411e69e67f..e5b23e917f41 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -15,25 +15,17 @@
 #ifdef CONFIG_ADDRESS_MASKING
 /*
  * Mask out tag bits from the address.
- *
- * Magic with the 'sign' allows to untag userspace pointer without any branches
- * while leaving kernel addresses intact.
 */
 static inline unsigned long __untagged_addr(unsigned long addr)
 {
-	long sign;
-
 	/*
	 * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation
	 * in alternative instructions. The relocation gets wrong when gets
	 * copied to the target place.
	 */
 	asm (ALTERNATIVE("",
-			 "sar $63, %[sign]\n\t" /* user_ptr ? 0 : -1UL */
-			 "or %%gs:tlbstate_untag_mask, %[sign]\n\t"
-			 "and %[sign], %[addr]\n\t", X86_FEATURE_LAM)
-	     : [addr] "+r" (addr), [sign] "=r" (sign)
-	     : "m" (tlbstate_untag_mask), "[sign]" (addr));
+			 "and %%gs:tlbstate_untag_mask, %[addr]\n\t", X86_FEATURE_LAM)
+	     : [addr] "+r" (addr) : "m" (tlbstate_untag_mask));
 
 	return addr;
 }
@@ -46,12 +38,8 @@ static inline unsigned long __untagged_addr(unsigned long addr)
 static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
 						   unsigned long addr)
 {
-	long sign = addr >> 63;
-
 	mmap_assert_locked(mm);
-	addr &= (mm)->context.untag_mask | sign;
-
-	return addr;
+	return addr & (mm)->context.untag_mask;
 }
 
 #define untagged_addr_remote(mm, addr)	({				\
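Issue (a) is easy to reproduce in isolation. A stand-alone sketch (the
sample address is made up; the arithmetic right shift of a negative
signed value is the gcc/clang behavior the kernel relies on):

	#include <stdio.h>

	int main(void)
	{
		unsigned long addr = 0xffff800000000000UL; /* kernel-half address */

		unsigned long usign = addr >> 63;	/* logical shift:    1 */
		long ssign = (long)addr >> 63;		/* arithmetic shift: -1 */

		printf("%#lx %#lx\n", usign, (unsigned long)ssign);
		return 0;
	}

This prints "0x1 0xffffffffffffffff": with the unsigned shift,
'untag_mask | sign' only set bit 0 (which the mask has set anyway)
instead of preserving the whole kernel address, while the asm version's
'sar' is the arithmetic variant, which is how the non-remote case got
accidentally fixed.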
From 798dec3304f69b97cdf78f485473fb5653fc22d1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 3 May 2023 10:13:41 -0700
Subject: [PATCH 5/5] x86-64: mm: clarify the 'positive addresses' user
 address rules

Dave Hansen found the "(long) addr >= 0" code in the x86-64 access_ok()
checks somewhat confusing, and suggested using a helper to clarify what
the code is doing.

So this does exactly that: clarifying what the sign bit check is all
about, by adding a helper macro that makes it clear what it is testing.

This also adds some explicit comments talking about how even with LAM
enabled, any addresses with the sign bit set will still GP-fault in the
non-canonical region just above the sign bit.

This is what allows us to do the user address checks with just the sign
bit, and furthermore be a bit cavalier about accesses that might be done
with an additional offset even past that point.

(And yes, this talks about 'positive' even though zero is also a valid
user address and so technically we should call them 'non-negative'. But
I don't think using 'non-negative' ends up being more understandable.)

Suggested-by: Dave Hansen
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/uaccess_64.h | 44 ++++++++++++++++++++++---------
 arch/x86/mm/extable.c             |  4 +--
 2 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index e5b23e917f41..81b826d3b753 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -50,27 +50,45 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
 #endif
 
 /*
- * On x86-64, we may have tag bits in the user pointer. Rather than
- * mask them off, just change the rules for __access_ok().
+ * The virtual address space is logically divided into a kernel half
+ * and a user half. When cast to a signed type, user pointers are
+ * positive and kernel pointers are negative.
+ */
+#define valid_user_address(x) ((long)(x) >= 0)
+
+/*
+ * User pointers can have tag bits on x86-64. This scheme tolerates
+ * arbitrary values in those bits rather than masking them off.
  *
- * Make the rule be that 'ptr+size' must not overflow, and must not
- * have the high bit set. Compilers generally understand about
- * unsigned overflow and the CF bit and generate reasonable code for
- * this. Although it looks like the combination confuses at least
- * clang (and instead of just doing an "add" followed by a test of
- * SF and CF, you'll see that unnecessary comparison).
+ * Enforce two rules:
+ * 1. 'ptr' must be in the user half of the address space
+ * 2. 'ptr+size' must not overflow into kernel addresses
  *
- * For the common case of small sizes that can be checked at compile
- * time, don't even bother with the addition, and just check that the
- * base pointer is ok.
+ * Note that addresses around the sign change are not valid addresses,
+ * and will GP-fault even with LAM enabled if the sign bit is set (see
+ * "CR3.LAM_SUP" that can narrow the canonicality check if we ever
+ * enable it, but not remove it entirely).
+ *
+ * So the "overflow into kernel addresses" does not imply some sudden
+ * exact boundary at the sign bit, and we can allow a lot of slop on the
+ * size check.
+ *
+ * In fact, we could probably remove the size check entirely, since
+ * any kernel accesses will be in increasing address order starting
+ * at 'ptr', and even if the end might be in kernel space, we'll
+ * hit the GP faults for non-canonical accesses before we ever get
+ * there.
+ *
+ * That's a separate optimization, for now just handle the small
+ * constant case.
  */
 static inline bool __access_ok(const void __user *ptr, unsigned long size)
 {
 	if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) {
-		return (long)ptr >= 0;
+		return valid_user_address(ptr);
 	} else {
 		unsigned long sum = size + (unsigned long)ptr;
-		return (long) sum >= 0 && sum >= (unsigned long)ptr;
+		return valid_user_address(sum) && sum >= (unsigned long)ptr;
 	}
 }
 #define __access_ok __access_ok
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 8d38dedadbb1..271dcb2deabc 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -143,12 +143,12 @@ static bool gp_fault_address_ok(unsigned long fault_address)
 {
 #ifdef CONFIG_X86_64
 	/* Is it in the "user space" part of the non-canonical space? */
-	if ((long) fault_address >= 0)
+	if (valid_user_address(fault_address))
 		return true;
 
 	/* .. or just above it? */
 	fault_address -= PAGE_SIZE;
-	if ((long) fault_address >= 0)
+	if (valid_user_address(fault_address))
 		return true;
 #endif
 	return false;
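Taken together, the final rules are easy to exercise. One last
stand-alone sketch (valid_user_address() is copied from the patch above;
PAGE_SIZE, the CONFIG_X86_64 guard, and the sample addresses are
simplified or illustrative):

	#include <stdbool.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096UL
	#define valid_user_address(x) ((long)(x) >= 0)

	/* Same logic as the new gp_fault_address_ok(): true means the
	   faulting address is explainable by the relaxed access_ok(),
	   so no warning is warranted. */
	static bool gp_fault_address_ok(unsigned long fault_address)
	{
		if (valid_user_address(fault_address))
			return true;
		return valid_user_address(fault_address - PAGE_SIZE);
	}

	int main(void)
	{
		printf("%d\n", gp_fault_address_ok(0x00007fffffffffffUL)); /* 1: user half */
		printf("%d\n", gp_fault_address_ok(0x8000000000000000UL)); /* 1: one-page slop */
		printf("%d\n", gp_fault_address_ok(0x8000000000001000UL)); /* 0: keeps the syzkaller warning */
		return 0;
	}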