Merge branch 'perf/timer' into perf/core

This WIP branch is now ready to be merged.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar 2015-04-02 13:22:35 +02:00
commit 223aa646d5
116 changed files with 1813 additions and 861 deletions


@ -1186,7 +1186,7 @@ M: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
F: arch/arm/mach-mvebu/
F: drivers/rtc/armada38x-rtc
F: drivers/rtc/rtc-armada38x.c
ARM/Marvell Berlin SoC support
M: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
@ -1675,8 +1675,8 @@ F: drivers/misc/eeprom/at24.c
F: include/linux/platform_data/at24.h
ATA OVER ETHERNET (AOE) DRIVER
M: "Ed L. Cashin" <ecashin@coraid.com>
W: http://support.coraid.com/support/linux
M: "Ed L. Cashin" <ed.cashin@acm.org>
W: http://www.openaoe.org/
S: Supported
F: Documentation/aoe/
F: drivers/block/aoe/
@ -3252,6 +3252,13 @@ S: Maintained
F: Documentation/hwmon/dme1737
F: drivers/hwmon/dme1737.c
DMI/SMBIOS SUPPORT
M: Jean Delvare <jdelvare@suse.de>
S: Maintained
F: drivers/firmware/dmi-id.c
F: drivers/firmware/dmi_scan.c
F: include/linux/dmi.h
DOCKING STATION DRIVER
M: Shaohua Li <shaohua.li@intel.com>
L: linux-acpi@vger.kernel.org


@ -103,7 +103,7 @@ int __init omap_init_clocksource_32k(void __iomem *vbase)
/*
* 120000 rough estimate from the calculations in
* __clocksource_updatefreq_scale.
* __clocksource_update_freq_scale.
*/
clocks_calc_mult_shift(&persistent_mult, &persistent_shift,
32768, NSEC_PER_SEC, 120000);


@ -246,14 +246,30 @@ static inline unsigned long __cmpxchg_mb(volatile void *ptr, unsigned long old,
__ret; \
})
#define this_cpu_cmpxchg_1(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n)
#define this_cpu_cmpxchg_2(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n)
#define this_cpu_cmpxchg_4(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n)
#define this_cpu_cmpxchg_8(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n)
#define _protect_cmpxchg_local(pcp, o, n) \
({ \
typeof(*raw_cpu_ptr(&(pcp))) __ret; \
preempt_disable(); \
__ret = cmpxchg_local(raw_cpu_ptr(&(pcp)), o, n); \
preempt_enable(); \
__ret; \
})
#define this_cpu_cmpxchg_double_8(ptr1, ptr2, o1, o2, n1, n2) \
cmpxchg_double_local(raw_cpu_ptr(&(ptr1)), raw_cpu_ptr(&(ptr2)), \
o1, o2, n1, n2)
#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
#define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
#define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
#define this_cpu_cmpxchg_double_8(ptr1, ptr2, o1, o2, n1, n2) \
({ \
int __ret; \
preempt_disable(); \
__ret = cmpxchg_double_local( raw_cpu_ptr(&(ptr1)), \
raw_cpu_ptr(&(ptr2)), \
o1, o2, n1, n2); \
preempt_enable(); \
__ret; \
})
#define cmpxchg64(ptr,o,n) cmpxchg((ptr),(o),(n))
#define cmpxchg64_local(ptr,o,n) cmpxchg_local((ptr),(o),(n))
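Editor's note, not part of the patch: the hunk above (and the percpu.h hunk further down) wrap the per-CPU cmpxchg in a preempt_disable()/preempt_enable() pair so the task cannot migrate between resolving the per-CPU address and performing the exchange. A minimal sketch of the pattern, assuming the kernel's cmpxchg_local()/raw_cpu_ptr() primitives; the macro name here is illustrative:

/*
 * Without the preempt guard the task could be moved to another CPU between
 * raw_cpu_ptr() and the exchange, so the cmpxchg would hit the wrong CPU's
 * copy of the variable.
 */
#define protected_cpu_cmpxchg(pcp, old, new)                            \
({                                                                      \
        typeof(pcp) __ret;                                              \
        preempt_disable();              /* stay on this CPU */          \
        __ret = cmpxchg_local(raw_cpu_ptr(&(pcp)), old, new);           \
        preempt_enable();                                               \
        __ret;                                                          \
})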


@ -151,6 +151,15 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
{
unsigned int cpu = smp_processor_id();
/*
* init_mm.pgd does not contain any user mappings and it is always
* active for kernel addresses in TTBR1. Just set the reserved TTBR0.
*/
if (next == &init_mm) {
cpu_set_reserved_ttbr0();
return;
}
if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next)
check_and_switch_context(next, tsk);
}


@ -204,25 +204,47 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
return ret;
}
#define _percpu_add(pcp, val) \
__percpu_add(raw_cpu_ptr(&(pcp)), val, sizeof(pcp))
#define _percpu_read(pcp) \
({ \
typeof(pcp) __retval; \
preempt_disable(); \
__retval = (typeof(pcp))__percpu_read(raw_cpu_ptr(&(pcp)), \
sizeof(pcp)); \
preempt_enable(); \
__retval; \
})
#define _percpu_add_return(pcp, val) (typeof(pcp)) (_percpu_add(pcp, val))
#define _percpu_write(pcp, val) \
do { \
preempt_disable(); \
__percpu_write(raw_cpu_ptr(&(pcp)), (unsigned long)(val), \
sizeof(pcp)); \
preempt_enable(); \
} while(0) \
#define _pcp_protect(operation, pcp, val) \
({ \
typeof(pcp) __retval; \
preempt_disable(); \
__retval = (typeof(pcp))operation(raw_cpu_ptr(&(pcp)), \
(val), sizeof(pcp)); \
preempt_enable(); \
__retval; \
})
#define _percpu_add(pcp, val) \
_pcp_protect(__percpu_add, pcp, val)
#define _percpu_add_return(pcp, val) _percpu_add(pcp, val)
#define _percpu_and(pcp, val) \
__percpu_and(raw_cpu_ptr(&(pcp)), val, sizeof(pcp))
_pcp_protect(__percpu_and, pcp, val)
#define _percpu_or(pcp, val) \
__percpu_or(raw_cpu_ptr(&(pcp)), val, sizeof(pcp))
#define _percpu_read(pcp) (typeof(pcp)) \
(__percpu_read(raw_cpu_ptr(&(pcp)), sizeof(pcp)))
#define _percpu_write(pcp, val) \
__percpu_write(raw_cpu_ptr(&(pcp)), (unsigned long)(val), sizeof(pcp))
_pcp_protect(__percpu_or, pcp, val)
#define _percpu_xchg(pcp, val) (typeof(pcp)) \
(__percpu_xchg(raw_cpu_ptr(&(pcp)), (unsigned long)(val), sizeof(pcp)))
_pcp_protect(__percpu_xchg, pcp, (unsigned long)(val))
#define this_cpu_add_1(pcp, val) _percpu_add(pcp, val)
#define this_cpu_add_2(pcp, val) _percpu_add(pcp, val)


@ -200,7 +200,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
void update_vsyscall(struct timekeeper *tk)
{
struct timespec xtime_coarse;
u32 use_syscall = strcmp(tk->tkr.clock->name, "arch_sys_counter");
u32 use_syscall = strcmp(tk->tkr_mono.clock->name, "arch_sys_counter");
++vdso_data->tb_seq_count;
smp_wmb();
@ -213,11 +213,11 @@ void update_vsyscall(struct timekeeper *tk)
vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec;
if (!use_syscall) {
vdso_data->cs_cycle_last = tk->tkr.cycle_last;
vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last;
vdso_data->xtime_clock_sec = tk->xtime_sec;
vdso_data->xtime_clock_nsec = tk->tkr.xtime_nsec;
vdso_data->cs_mult = tk->tkr.mult;
vdso_data->cs_shift = tk->tkr.shift;
vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
vdso_data->cs_mult = tk->tkr_mono.mult;
vdso_data->cs_shift = tk->tkr_mono.shift;
}
smp_wmb();
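Editor's note: the vsyscall/vDSO hunks in this merge mostly rename tk->tkr to tk->tkr_mono while copying the same clocksource parameters into the vDSO data page. As a rough sketch of how a reader combines those fields (not from the patch; function and parameter names are illustrative):

/*
 * cs_cycle_last, cs_mult, cs_shift and xtime_clock_nsec as copied above;
 * 'now_cycles' would come from the architecture's counter.
 */
static inline u64 vdso_mono_nsec(u64 now_cycles, u64 cycle_last,
                                 u32 mult, u32 shift, u64 xtime_snsec)
{
        u64 delta = now_cycles - cycle_last;            /* cycles since last update */
        return (xtime_snsec + delta * mult) >> shift;   /* shifted nsec -> nsec */
}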


@ -2,6 +2,7 @@
#define _ASM_METAG_IO_H
#include <linux/types.h>
#include <asm/pgtable-bits.h>
#define IO_SPACE_LIMIT 0


@ -0,0 +1,104 @@
/*
* Meta page table definitions.
*/
#ifndef _METAG_PGTABLE_BITS_H
#define _METAG_PGTABLE_BITS_H
#include <asm/metag_mem.h>
/*
* Definitions for MMU descriptors
*
* These are the hardware bits in the MMCU pte entries.
* Derived from the Meta toolkit headers.
*/
#define _PAGE_PRESENT MMCU_ENTRY_VAL_BIT
#define _PAGE_WRITE MMCU_ENTRY_WR_BIT
#define _PAGE_PRIV MMCU_ENTRY_PRIV_BIT
/* Write combine bit - this can cause writes to occur out of order */
#define _PAGE_WR_COMBINE MMCU_ENTRY_WRC_BIT
/* Sys coherent bit - this bit is never used by Linux */
#define _PAGE_SYS_COHERENT MMCU_ENTRY_SYS_BIT
#define _PAGE_ALWAYS_ZERO_1 0x020
#define _PAGE_CACHE_CTRL0 0x040
#define _PAGE_CACHE_CTRL1 0x080
#define _PAGE_ALWAYS_ZERO_2 0x100
#define _PAGE_ALWAYS_ZERO_3 0x200
#define _PAGE_ALWAYS_ZERO_4 0x400
#define _PAGE_ALWAYS_ZERO_5 0x800
/* These are software bits that we stuff into the gaps in the hardware
* pte entries that are not used. Note, these DO get stored in the actual
* hardware, but the hardware just does not use them.
*/
#define _PAGE_ACCESSED _PAGE_ALWAYS_ZERO_1
#define _PAGE_DIRTY _PAGE_ALWAYS_ZERO_2
/* Pages owned, and protected by, the kernel. */
#define _PAGE_KERNEL _PAGE_PRIV
/* No cacheing of this page */
#define _PAGE_CACHE_WIN0 (MMCU_CWIN_UNCACHED << MMCU_ENTRY_CWIN_S)
/* burst cacheing - good for data streaming */
#define _PAGE_CACHE_WIN1 (MMCU_CWIN_BURST << MMCU_ENTRY_CWIN_S)
/* One cache way per thread */
#define _PAGE_CACHE_WIN2 (MMCU_CWIN_C1SET << MMCU_ENTRY_CWIN_S)
/* Full on cacheing */
#define _PAGE_CACHE_WIN3 (MMCU_CWIN_CACHED << MMCU_ENTRY_CWIN_S)
#define _PAGE_CACHEABLE (_PAGE_CACHE_WIN3 | _PAGE_WR_COMBINE)
/* which bits are used for cache control ... */
#define _PAGE_CACHE_MASK (_PAGE_CACHE_CTRL0 | _PAGE_CACHE_CTRL1 | \
_PAGE_WR_COMBINE)
/* This is a mask of the bits that pte_modify is allowed to change. */
#define _PAGE_CHG_MASK (PAGE_MASK)
#define _PAGE_SZ_SHIFT 1
#define _PAGE_SZ_4K (0x0)
#define _PAGE_SZ_8K (0x1 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_16K (0x2 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_32K (0x3 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_64K (0x4 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_128K (0x5 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_256K (0x6 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_512K (0x7 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_1M (0x8 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_2M (0x9 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_4M (0xa << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_MASK (0xf << _PAGE_SZ_SHIFT)
#if defined(CONFIG_PAGE_SIZE_4K)
#define _PAGE_SZ (_PAGE_SZ_4K)
#elif defined(CONFIG_PAGE_SIZE_8K)
#define _PAGE_SZ (_PAGE_SZ_8K)
#elif defined(CONFIG_PAGE_SIZE_16K)
#define _PAGE_SZ (_PAGE_SZ_16K)
#endif
#define _PAGE_TABLE (_PAGE_SZ | _PAGE_PRESENT)
#if defined(CONFIG_HUGETLB_PAGE_SIZE_8K)
# define _PAGE_SZHUGE (_PAGE_SZ_8K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_16K)
# define _PAGE_SZHUGE (_PAGE_SZ_16K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_32K)
# define _PAGE_SZHUGE (_PAGE_SZ_32K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
# define _PAGE_SZHUGE (_PAGE_SZ_64K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_128K)
# define _PAGE_SZHUGE (_PAGE_SZ_128K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_256K)
# define _PAGE_SZHUGE (_PAGE_SZ_256K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K)
# define _PAGE_SZHUGE (_PAGE_SZ_512K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_1M)
# define _PAGE_SZHUGE (_PAGE_SZ_1M)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_2M)
# define _PAGE_SZHUGE (_PAGE_SZ_2M)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4M)
# define _PAGE_SZHUGE (_PAGE_SZ_4M)
#endif
#endif /* _METAG_PGTABLE_BITS_H */


@ -5,6 +5,7 @@
#ifndef _METAG_PGTABLE_H
#define _METAG_PGTABLE_H
#include <asm/pgtable-bits.h>
#include <asm-generic/pgtable-nopmd.h>
/* Invalid regions on Meta: 0x00000000-0x001FFFFF and 0xFFFF0000-0xFFFFFFFF */
@ -20,100 +21,6 @@
#define VMALLOC_END 0x7FFFFFFF
#endif
/*
* Definitions for MMU descriptors
*
* These are the hardware bits in the MMCU pte entries.
* Derived from the Meta toolkit headers.
*/
#define _PAGE_PRESENT MMCU_ENTRY_VAL_BIT
#define _PAGE_WRITE MMCU_ENTRY_WR_BIT
#define _PAGE_PRIV MMCU_ENTRY_PRIV_BIT
/* Write combine bit - this can cause writes to occur out of order */
#define _PAGE_WR_COMBINE MMCU_ENTRY_WRC_BIT
/* Sys coherent bit - this bit is never used by Linux */
#define _PAGE_SYS_COHERENT MMCU_ENTRY_SYS_BIT
#define _PAGE_ALWAYS_ZERO_1 0x020
#define _PAGE_CACHE_CTRL0 0x040
#define _PAGE_CACHE_CTRL1 0x080
#define _PAGE_ALWAYS_ZERO_2 0x100
#define _PAGE_ALWAYS_ZERO_3 0x200
#define _PAGE_ALWAYS_ZERO_4 0x400
#define _PAGE_ALWAYS_ZERO_5 0x800
/* These are software bits that we stuff into the gaps in the hardware
* pte entries that are not used. Note, these DO get stored in the actual
* hardware, but the hardware just does not use them.
*/
#define _PAGE_ACCESSED _PAGE_ALWAYS_ZERO_1
#define _PAGE_DIRTY _PAGE_ALWAYS_ZERO_2
/* Pages owned, and protected by, the kernel. */
#define _PAGE_KERNEL _PAGE_PRIV
/* No cacheing of this page */
#define _PAGE_CACHE_WIN0 (MMCU_CWIN_UNCACHED << MMCU_ENTRY_CWIN_S)
/* burst cacheing - good for data streaming */
#define _PAGE_CACHE_WIN1 (MMCU_CWIN_BURST << MMCU_ENTRY_CWIN_S)
/* One cache way per thread */
#define _PAGE_CACHE_WIN2 (MMCU_CWIN_C1SET << MMCU_ENTRY_CWIN_S)
/* Full on cacheing */
#define _PAGE_CACHE_WIN3 (MMCU_CWIN_CACHED << MMCU_ENTRY_CWIN_S)
#define _PAGE_CACHEABLE (_PAGE_CACHE_WIN3 | _PAGE_WR_COMBINE)
/* which bits are used for cache control ... */
#define _PAGE_CACHE_MASK (_PAGE_CACHE_CTRL0 | _PAGE_CACHE_CTRL1 | \
_PAGE_WR_COMBINE)
/* This is a mask of the bits that pte_modify is allowed to change. */
#define _PAGE_CHG_MASK (PAGE_MASK)
#define _PAGE_SZ_SHIFT 1
#define _PAGE_SZ_4K (0x0)
#define _PAGE_SZ_8K (0x1 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_16K (0x2 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_32K (0x3 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_64K (0x4 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_128K (0x5 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_256K (0x6 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_512K (0x7 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_1M (0x8 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_2M (0x9 << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_4M (0xa << _PAGE_SZ_SHIFT)
#define _PAGE_SZ_MASK (0xf << _PAGE_SZ_SHIFT)
#if defined(CONFIG_PAGE_SIZE_4K)
#define _PAGE_SZ (_PAGE_SZ_4K)
#elif defined(CONFIG_PAGE_SIZE_8K)
#define _PAGE_SZ (_PAGE_SZ_8K)
#elif defined(CONFIG_PAGE_SIZE_16K)
#define _PAGE_SZ (_PAGE_SZ_16K)
#endif
#define _PAGE_TABLE (_PAGE_SZ | _PAGE_PRESENT)
#if defined(CONFIG_HUGETLB_PAGE_SIZE_8K)
# define _PAGE_SZHUGE (_PAGE_SZ_8K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_16K)
# define _PAGE_SZHUGE (_PAGE_SZ_16K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_32K)
# define _PAGE_SZHUGE (_PAGE_SZ_32K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
# define _PAGE_SZHUGE (_PAGE_SZ_64K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_128K)
# define _PAGE_SZHUGE (_PAGE_SZ_128K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_256K)
# define _PAGE_SZHUGE (_PAGE_SZ_256K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K)
# define _PAGE_SZHUGE (_PAGE_SZ_512K)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_1M)
# define _PAGE_SZHUGE (_PAGE_SZ_1M)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_2M)
# define _PAGE_SZHUGE (_PAGE_SZ_2M)
#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4M)
# define _PAGE_SZHUGE (_PAGE_SZ_4M)
#endif
/*
* The Linux memory management assumes a three-level page table setup. On
* Meta, we use that, but "fold" the mid level into the top-level page


@ -153,6 +153,7 @@
#define PPC_INST_MFSPR_PVR_MASK 0xfc1fffff
#define PPC_INST_MFTMR 0x7c0002dc
#define PPC_INST_MSGSND 0x7c00019c
#define PPC_INST_MSGCLR 0x7c0001dc
#define PPC_INST_MSGSNDP 0x7c00011c
#define PPC_INST_MTTMR 0x7c0003dc
#define PPC_INST_NOP 0x60000000
@ -309,6 +310,8 @@
___PPC_RB(b) | __PPC_EH(eh))
#define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \
___PPC_RB(b))
#define PPC_MSGCLR(b) stringify_in_c(.long PPC_INST_MSGCLR | \
___PPC_RB(b))
#define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \
___PPC_RB(b))
#define PPC_POPCNTB(a, s) stringify_in_c(.long PPC_INST_POPCNTB | \


@ -608,13 +608,16 @@
#define SRR1_ISI_N_OR_G 0x10000000 /* ISI: Access is no-exec or G */
#define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */
#define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */
#define SRR1_WAKEMASK_P8 0x003c0000 /* reason for wakeup on POWER8 */
#define SRR1_WAKESYSERR 0x00300000 /* System error */
#define SRR1_WAKEEE 0x00200000 /* External interrupt */
#define SRR1_WAKEMT 0x00280000 /* mtctrl */
#define SRR1_WAKEHMI 0x00280000 /* Hypervisor maintenance */
#define SRR1_WAKEDEC 0x00180000 /* Decrementer interrupt */
#define SRR1_WAKEDBELL 0x00140000 /* Privileged doorbell on P8 */
#define SRR1_WAKETHERM 0x00100000 /* Thermal management interrupt */
#define SRR1_WAKERESET 0x00100000 /* System reset */
#define SRR1_WAKEHDBELL 0x000c0000 /* Hypervisor doorbell on P8 */
#define SRR1_WAKESTATE 0x00030000 /* Powersave exit mask [46:47] */
#define SRR1_WS_DEEPEST 0x00030000 /* Some resources not maintained,
* may not be recoverable */


@ -437,6 +437,26 @@ static struct cpu_spec __initdata cpu_specs[] = {
.machine_check_early = __machine_check_early_realmode_p8,
.platform = "power8",
},
{ /* Power8NVL */
.pvr_mask = 0xffff0000,
.pvr_value = 0x004c0000,
.cpu_name = "POWER8NVL (raw)",
.cpu_features = CPU_FTRS_POWER8,
.cpu_user_features = COMMON_USER_POWER8,
.cpu_user_features2 = COMMON_USER2_POWER8,
.mmu_features = MMU_FTRS_POWER8,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 6,
.pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/power8",
.oprofile_type = PPC_OPROFILE_INVALID,
.cpu_setup = __setup_cpu_power8,
.cpu_restore = __restore_cpu_power8,
.flush_tlb = __flush_tlb_power8,
.machine_check_early = __machine_check_early_realmode_p8,
.platform = "power8",
},
{ /* Power8 DD1: Does not support doorbell IPIs */
.pvr_mask = 0xffffff00,
.pvr_value = 0x004d0100,


@ -17,6 +17,7 @@
#include <asm/dbell.h>
#include <asm/irq_regs.h>
#include <asm/kvm_ppc.h>
#ifdef CONFIG_SMP
void doorbell_setup_this_cpu(void)
@ -41,6 +42,7 @@ void doorbell_exception(struct pt_regs *regs)
may_hard_irq_enable();
kvmppc_set_host_ipi(smp_processor_id(), 0);
__this_cpu_inc(irq_stat.doorbell_irqs);
smp_ipi_demux();


@ -1408,7 +1408,7 @@ machine_check_handle_early:
bne 9f /* continue in V mode if we are. */
5:
#ifdef CONFIG_KVM_BOOK3S_64_HV
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
/*
* We are coming from kernel context. Check if we are coming from
* guest. if yes, then we can continue. We will fall through


@ -33,6 +33,8 @@
#include <asm/runlatch.h>
#include <asm/code-patching.h>
#include <asm/dbell.h>
#include <asm/kvm_ppc.h>
#include <asm/ppc-opcode.h>
#include "powernv.h"
@ -149,7 +151,7 @@ static int pnv_smp_cpu_disable(void)
static void pnv_smp_cpu_kill_self(void)
{
unsigned int cpu;
unsigned long srr1;
unsigned long srr1, wmask;
u32 idle_states;
/* Standard hot unplug procedure */
@ -161,6 +163,10 @@ static void pnv_smp_cpu_kill_self(void)
generic_set_cpu_dead(cpu);
smp_wmb();
wmask = SRR1_WAKEMASK;
if (cpu_has_feature(CPU_FTR_ARCH_207S))
wmask = SRR1_WAKEMASK_P8;
idle_states = pnv_get_supported_cpuidle_states();
/* We don't want to take decrementer interrupts while we are offline,
* so clear LPCR:PECE1. We keep PECE2 enabled.
@ -191,10 +197,14 @@ static void pnv_smp_cpu_kill_self(void)
* having finished executing in a KVM guest, then srr1
* contains 0.
*/
if ((srr1 & SRR1_WAKEMASK) == SRR1_WAKEEE) {
if ((srr1 & wmask) == SRR1_WAKEEE) {
icp_native_flush_interrupt();
local_paca->irq_happened &= PACA_IRQ_HARD_DIS;
smp_mb();
} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
asm volatile(PPC_MSGCLR(%0) : : "r" (msg));
kvmppc_set_host_ipi(cpu, 0);
}
if (cpu_core_split_required())


@ -25,10 +25,10 @@
static struct kobject *mobility_kobj;
struct update_props_workarea {
u32 phandle;
u32 state;
u64 reserved;
u32 nprops;
__be32 phandle;
__be32 state;
__be64 reserved;
__be32 nprops;
} __packed;
#define NODE_ACTION_MASK 0xff000000
@ -54,11 +54,11 @@ static int mobility_rtas_call(int token, char *buf, s32 scope)
return rc;
}
static int delete_dt_node(u32 phandle)
static int delete_dt_node(__be32 phandle)
{
struct device_node *dn;
dn = of_find_node_by_phandle(phandle);
dn = of_find_node_by_phandle(be32_to_cpu(phandle));
if (!dn)
return -ENOENT;
@ -127,7 +127,7 @@ static int update_dt_property(struct device_node *dn, struct property **prop,
return 0;
}
static int update_dt_node(u32 phandle, s32 scope)
static int update_dt_node(__be32 phandle, s32 scope)
{
struct update_props_workarea *upwa;
struct device_node *dn;
@ -136,6 +136,7 @@ static int update_dt_node(u32 phandle, s32 scope)
char *prop_data;
char *rtas_buf;
int update_properties_token;
u32 nprops;
u32 vd;
update_properties_token = rtas_token("ibm,update-properties");
@ -146,7 +147,7 @@ static int update_dt_node(u32 phandle, s32 scope)
if (!rtas_buf)
return -ENOMEM;
dn = of_find_node_by_phandle(phandle);
dn = of_find_node_by_phandle(be32_to_cpu(phandle));
if (!dn) {
kfree(rtas_buf);
return -ENOENT;
@ -162,6 +163,7 @@ static int update_dt_node(u32 phandle, s32 scope)
break;
prop_data = rtas_buf + sizeof(*upwa);
nprops = be32_to_cpu(upwa->nprops);
/* On the first call to ibm,update-properties for a node the
* the first property value descriptor contains an empty
@ -170,17 +172,17 @@ static int update_dt_node(u32 phandle, s32 scope)
*/
if (*prop_data == 0) {
prop_data++;
vd = *(u32 *)prop_data;
vd = be32_to_cpu(*(__be32 *)prop_data);
prop_data += vd + sizeof(vd);
upwa->nprops--;
nprops--;
}
for (i = 0; i < upwa->nprops; i++) {
for (i = 0; i < nprops; i++) {
char *prop_name;
prop_name = prop_data;
prop_data += strlen(prop_name) + 1;
vd = *(u32 *)prop_data;
vd = be32_to_cpu(*(__be32 *)prop_data);
prop_data += sizeof(vd);
switch (vd) {
@ -212,13 +214,13 @@ static int update_dt_node(u32 phandle, s32 scope)
return 0;
}
static int add_dt_node(u32 parent_phandle, u32 drc_index)
static int add_dt_node(__be32 parent_phandle, __be32 drc_index)
{
struct device_node *dn;
struct device_node *parent_dn;
int rc;
parent_dn = of_find_node_by_phandle(parent_phandle);
parent_dn = of_find_node_by_phandle(be32_to_cpu(parent_phandle));
if (!parent_dn)
return -ENOENT;
@ -237,7 +239,7 @@ static int add_dt_node(u32 parent_phandle, u32 drc_index)
int pseries_devicetree_update(s32 scope)
{
char *rtas_buf;
u32 *data;
__be32 *data;
int update_nodes_token;
int rc;
@ -254,17 +256,17 @@ int pseries_devicetree_update(s32 scope)
if (rc && rc != 1)
break;
data = (u32 *)rtas_buf + 4;
while (*data & NODE_ACTION_MASK) {
data = (__be32 *)rtas_buf + 4;
while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
int i;
u32 action = *data & NODE_ACTION_MASK;
int node_count = *data & NODE_COUNT_MASK;
u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;
data++;
for (i = 0; i < node_count; i++) {
u32 phandle = *data++;
u32 drc_index;
__be32 phandle = *data++;
__be32 drc_index;
switch (action) {
case DELETE_DT_NODE:

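Editor's note: the pseries mobility hunk above switches the RTAS work-area parsing from host-order u32 reads to explicit __be32 plus be32_to_cpu(), because the firmware buffer is big-endian regardless of the kernel's endianness. A minimal sketch of that access pattern, not part of the patch (the helper name is illustrative):

/* Read one big-endian 32-bit field from a firmware buffer and convert it
 * to host byte order before using it in comparisons or arithmetic. */
static u32 read_be32_field(const void *buf)
{
        const __be32 *field = buf;       /* stored big-endian in the buffer */
        return be32_to_cpu(*field);      /* host order for the caller */
}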

@ -211,7 +211,7 @@ do { \
extern unsigned long mmap_rnd_mask;
#define STACK_RND_MASK (mmap_rnd_mask)
#define STACK_RND_MASK (test_thread_flag(TIF_31BIT) ? 0x7ff : mmap_rnd_mask)
#define ARCH_DLINFO \
do { \


@ -57,6 +57,44 @@
unsigned long ftrace_plt;
static inline void ftrace_generate_orig_insn(struct ftrace_insn *insn)
{
#ifdef CC_USING_HOTPATCH
/* brcl 0,0 */
insn->opc = 0xc004;
insn->disp = 0;
#else
/* stg r14,8(r15) */
insn->opc = 0xe3e0;
insn->disp = 0xf0080024;
#endif
}
static inline int is_kprobe_on_ftrace(struct ftrace_insn *insn)
{
#ifdef CONFIG_KPROBES
if (insn->opc == BREAKPOINT_INSTRUCTION)
return 1;
#endif
return 0;
}
static inline void ftrace_generate_kprobe_nop_insn(struct ftrace_insn *insn)
{
#ifdef CONFIG_KPROBES
insn->opc = BREAKPOINT_INSTRUCTION;
insn->disp = KPROBE_ON_FTRACE_NOP;
#endif
}
static inline void ftrace_generate_kprobe_call_insn(struct ftrace_insn *insn)
{
#ifdef CONFIG_KPROBES
insn->opc = BREAKPOINT_INSTRUCTION;
insn->disp = KPROBE_ON_FTRACE_CALL;
#endif
}
int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long addr)
{
@ -72,16 +110,9 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
return -EFAULT;
if (addr == MCOUNT_ADDR) {
/* Initial code replacement */
#ifdef CC_USING_HOTPATCH
/* We expect to see brcl 0,0 */
ftrace_generate_nop_insn(&orig);
#else
/* We expect to see stg r14,8(r15) */
orig.opc = 0xe3e0;
orig.disp = 0xf0080024;
#endif
ftrace_generate_orig_insn(&orig);
ftrace_generate_nop_insn(&new);
} else if (old.opc == BREAKPOINT_INSTRUCTION) {
} else if (is_kprobe_on_ftrace(&old)) {
/*
* If we find a breakpoint instruction, a kprobe has been
* placed at the beginning of the function. We write the
@ -89,9 +120,8 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
* bytes of the original instruction so that the kprobes
* handler can execute a nop, if it reaches this breakpoint.
*/
new.opc = orig.opc = BREAKPOINT_INSTRUCTION;
orig.disp = KPROBE_ON_FTRACE_CALL;
new.disp = KPROBE_ON_FTRACE_NOP;
ftrace_generate_kprobe_call_insn(&orig);
ftrace_generate_kprobe_nop_insn(&new);
} else {
/* Replace ftrace call with a nop. */
ftrace_generate_call_insn(&orig, rec->ip);
@ -111,7 +141,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old)))
return -EFAULT;
if (old.opc == BREAKPOINT_INSTRUCTION) {
if (is_kprobe_on_ftrace(&old)) {
/*
* If we find a breakpoint instruction, a kprobe has been
* placed at the beginning of the function. We write the
@ -119,9 +149,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
* bytes of the original instruction so that the kprobes
* handler can execute a brasl if it reaches this breakpoint.
*/
new.opc = orig.opc = BREAKPOINT_INSTRUCTION;
orig.disp = KPROBE_ON_FTRACE_NOP;
new.disp = KPROBE_ON_FTRACE_CALL;
ftrace_generate_kprobe_nop_insn(&orig);
ftrace_generate_kprobe_call_insn(&new);
} else {
/* Replace nop with an ftrace call. */
ftrace_generate_nop_insn(&orig);


@ -1415,7 +1415,7 @@ CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG);
static struct attribute *cpumsf_pmu_events_attr[] = {
CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC),
CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG),
NULL,
NULL,
};
@ -1606,8 +1606,11 @@ static int __init init_cpum_sampling_pmu(void)
return -EINVAL;
}
if (si.ad)
if (si.ad) {
sfb_set_limits(CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB);
cpumsf_pmu_events_attr[1] =
CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG);
}
sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80);
if (!sfdbg)


@ -177,6 +177,17 @@ restart_entry:
lhi %r1,1
sigp %r1,%r0,SIGP_SET_ARCHITECTURE
sam64
#ifdef CONFIG_SMP
larl %r1,smp_cpu_mt_shift
icm %r1,15,0(%r1)
jz smt_done
llgfr %r1,%r1
smt_loop:
sigp %r1,%r0,SIGP_SET_MULTI_THREADING
brc 8,smt_done /* accepted */
brc 2,smt_loop /* busy, try again */
smt_done:
#endif
larl %r1,.Lnew_pgm_check_psw
lpswe 0(%r1)
pgm_check_entry:


@ -215,20 +215,20 @@ void update_vsyscall(struct timekeeper *tk)
{
u64 nsecps;
if (tk->tkr.clock != &clocksource_tod)
if (tk->tkr_mono.clock != &clocksource_tod)
return;
/* Make userspace gettimeofday spin until we're done. */
++vdso_data->tb_update_count;
smp_wmb();
vdso_data->xtime_tod_stamp = tk->tkr.cycle_last;
vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last;
vdso_data->xtime_clock_sec = tk->xtime_sec;
vdso_data->xtime_clock_nsec = tk->tkr.xtime_nsec;
vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
vdso_data->wtom_clock_sec =
tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
vdso_data->wtom_clock_nsec = tk->tkr.xtime_nsec +
+ ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr.shift);
nsecps = (u64) NSEC_PER_SEC << tk->tkr.shift;
vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec +
+ ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift;
while (vdso_data->wtom_clock_nsec >= nsecps) {
vdso_data->wtom_clock_nsec -= nsecps;
vdso_data->wtom_clock_sec++;
@ -236,7 +236,7 @@ void update_vsyscall(struct timekeeper *tk)
vdso_data->xtime_coarse_sec = tk->xtime_sec;
vdso_data->xtime_coarse_nsec =
(long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
(long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
vdso_data->wtom_coarse_sec =
vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec;
vdso_data->wtom_coarse_nsec =
@ -246,8 +246,8 @@ void update_vsyscall(struct timekeeper *tk)
vdso_data->wtom_coarse_sec++;
}
vdso_data->tk_mult = tk->tkr.mult;
vdso_data->tk_shift = tk->tkr.shift;
vdso_data->tk_mult = tk->tkr_mono.mult;
vdso_data->tk_shift = tk->tkr_mono.shift;
smp_wmb();
++vdso_data->tb_update_count;
}
@ -283,7 +283,7 @@ void __init time_init(void)
if (register_external_irq(EXT_IRQ_TIMING_ALERT, timing_alert_interrupt))
panic("Couldn't request external interrupt 0x1406");
if (clocksource_register(&clocksource_tod) != 0)
if (__clocksource_register(&clocksource_tod) != 0)
panic("Could not register TOD clock source");
/* Enable TOD clock interrupts on the boot cpu. */


@ -2957,6 +2957,17 @@ unsigned long sun4v_t5_set_perfreg(unsigned long reg_num,
unsigned long reg_val);
#endif
#define HV_FAST_M7_GET_PERFREG 0x43
#define HV_FAST_M7_SET_PERFREG 0x44
#ifndef __ASSEMBLY__
unsigned long sun4v_m7_get_perfreg(unsigned long reg_num,
unsigned long *reg_val);
unsigned long sun4v_m7_set_perfreg(unsigned long reg_num,
unsigned long reg_val);
#endif
/* Function numbers for HV_CORE_TRAP. */
#define HV_CORE_SET_VER 0x00
#define HV_CORE_PUTCHAR 0x01
@ -2981,6 +2992,7 @@ unsigned long sun4v_t5_set_perfreg(unsigned long reg_num,
#define HV_GRP_SDIO 0x0108
#define HV_GRP_SDIO_ERR 0x0109
#define HV_GRP_REBOOT_DATA 0x0110
#define HV_GRP_M7_PERF 0x0114
#define HV_GRP_NIAG_PERF 0x0200
#define HV_GRP_FIRE_PERF 0x0201
#define HV_GRP_N2_CPU 0x0202


@ -48,6 +48,7 @@ static struct api_info api_table[] = {
{ .group = HV_GRP_VT_CPU, },
{ .group = HV_GRP_T5_CPU, },
{ .group = HV_GRP_DIAG, .flags = FLAG_PRE_API },
{ .group = HV_GRP_M7_PERF, },
};
static DEFINE_SPINLOCK(hvapi_lock);


@ -837,3 +837,19 @@ ENTRY(sun4v_t5_set_perfreg)
retl
nop
ENDPROC(sun4v_t5_set_perfreg)
ENTRY(sun4v_m7_get_perfreg)
mov %o1, %o4
mov HV_FAST_M7_GET_PERFREG, %o5
ta HV_FAST_TRAP
stx %o1, [%o4]
retl
nop
ENDPROC(sun4v_m7_get_perfreg)
ENTRY(sun4v_m7_set_perfreg)
mov HV_FAST_M7_SET_PERFREG, %o5
ta HV_FAST_TRAP
retl
nop
ENDPROC(sun4v_m7_set_perfreg)


@ -217,6 +217,31 @@ static const struct pcr_ops n5_pcr_ops = {
.pcr_nmi_disable = PCR_N4_PICNPT,
};
static u64 m7_pcr_read(unsigned long reg_num)
{
unsigned long val;
(void) sun4v_m7_get_perfreg(reg_num, &val);
return val;
}
static void m7_pcr_write(unsigned long reg_num, u64 val)
{
(void) sun4v_m7_set_perfreg(reg_num, val);
}
static const struct pcr_ops m7_pcr_ops = {
.read_pcr = m7_pcr_read,
.write_pcr = m7_pcr_write,
.read_pic = n4_pic_read,
.write_pic = n4_pic_write,
.nmi_picl_value = n4_picl_value,
.pcr_nmi_enable = (PCR_N4_PICNPT | PCR_N4_STRACE |
PCR_N4_UTRACE | PCR_N4_TOE |
(26 << PCR_N4_SL_SHIFT)),
.pcr_nmi_disable = PCR_N4_PICNPT,
};
static unsigned long perf_hsvc_group;
static unsigned long perf_hsvc_major;
@ -248,6 +273,10 @@ static int __init register_perf_hsvc(void)
perf_hsvc_group = HV_GRP_T5_CPU;
break;
case SUN4V_CHIP_SPARC_M7:
perf_hsvc_group = HV_GRP_M7_PERF;
break;
default:
return -ENODEV;
}
@ -293,6 +322,10 @@ static int __init setup_sun4v_pcr_ops(void)
pcr_ops = &n5_pcr_ops;
break;
case SUN4V_CHIP_SPARC_M7:
pcr_ops = &m7_pcr_ops;
break;
default:
ret = -ENODEV;
break;


@ -792,6 +792,42 @@ static const struct sparc_pmu niagara4_pmu = {
.num_pic_regs = 4,
};
static void sparc_m7_write_pmc(int idx, u64 val)
{
u64 pcr;
pcr = pcr_ops->read_pcr(idx);
/* ensure ov and ntc are reset */
pcr &= ~(PCR_N4_OV | PCR_N4_NTC);
pcr_ops->write_pic(idx, val & 0xffffffff);
pcr_ops->write_pcr(idx, pcr);
}
static const struct sparc_pmu sparc_m7_pmu = {
.event_map = niagara4_event_map,
.cache_map = &niagara4_cache_map,
.max_events = ARRAY_SIZE(niagara4_perfmon_event_map),
.read_pmc = sparc_vt_read_pmc,
.write_pmc = sparc_m7_write_pmc,
.upper_shift = 5,
.lower_shift = 5,
.event_mask = 0x7ff,
.user_bit = PCR_N4_UTRACE,
.priv_bit = PCR_N4_STRACE,
/* We explicitly don't support hypervisor tracing. */
.hv_bit = 0,
.irq_bit = PCR_N4_TOE,
.upper_nop = 0,
.lower_nop = 0,
.flags = 0,
.max_hw_events = 4,
.num_pcrs = 4,
.num_pic_regs = 4,
};
static const struct sparc_pmu *sparc_pmu __read_mostly;
static u64 event_encoding(u64 event_id, int idx)
@ -960,6 +996,8 @@ static void calculate_single_pcr(struct cpu_hw_events *cpuc)
cpuc->pcr[0] |= cpuc->event[0]->hw.config_base;
}
static void sparc_pmu_start(struct perf_event *event, int flags);
/* On this PMU each PIC has it's own PCR control register. */
static void calculate_multiple_pcrs(struct cpu_hw_events *cpuc)
{
@ -972,20 +1010,13 @@ static void calculate_multiple_pcrs(struct cpu_hw_events *cpuc)
struct perf_event *cp = cpuc->event[i];
struct hw_perf_event *hwc = &cp->hw;
int idx = hwc->idx;
u64 enc;
if (cpuc->current_idx[i] != PIC_NO_INDEX)
continue;
sparc_perf_event_set_period(cp, hwc, idx);
cpuc->current_idx[i] = idx;
enc = perf_event_get_enc(cpuc->events[i]);
cpuc->pcr[idx] &= ~mask_for_index(idx);
if (hwc->state & PERF_HES_STOPPED)
cpuc->pcr[idx] |= nop_for_index(idx);
else
cpuc->pcr[idx] |= event_encoding(enc, idx);
sparc_pmu_start(cp, PERF_EF_RELOAD);
}
out:
for (i = 0; i < cpuc->n_events; i++) {
@ -1101,7 +1132,6 @@ static void sparc_pmu_del(struct perf_event *event, int _flags)
int i;
local_irq_save(flags);
perf_pmu_disable(event->pmu);
for (i = 0; i < cpuc->n_events; i++) {
if (event == cpuc->event[i]) {
@ -1127,7 +1157,6 @@ static void sparc_pmu_del(struct perf_event *event, int _flags)
}
}
perf_pmu_enable(event->pmu);
local_irq_restore(flags);
}
@ -1361,7 +1390,6 @@ static int sparc_pmu_add(struct perf_event *event, int ef_flags)
unsigned long flags;
local_irq_save(flags);
perf_pmu_disable(event->pmu);
n0 = cpuc->n_events;
if (n0 >= sparc_pmu->max_hw_events)
@ -1394,7 +1422,6 @@ static int sparc_pmu_add(struct perf_event *event, int ef_flags)
ret = 0;
out:
perf_pmu_enable(event->pmu);
local_irq_restore(flags);
return ret;
}
@ -1667,6 +1694,10 @@ static bool __init supported_pmu(void)
sparc_pmu = &niagara4_pmu;
return true;
}
if (!strcmp(sparc_pmu_type, "sparc-m7")) {
sparc_pmu = &sparc_m7_pmu;
return true;
}
return false;
}


@ -287,6 +287,8 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
printk(" TPC[%lx] O7[%lx] I7[%lx] RPC[%lx]\n",
gp->tpc, gp->o7, gp->i7, gp->rpc);
}
touch_nmi_watchdog();
}
memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot));
@ -362,6 +364,8 @@ static void pmu_snapshot_all_cpus(void)
(cpu == this_cpu ? '*' : ' '), cpu,
pp->pcr[0], pp->pcr[1], pp->pcr[2], pp->pcr[3],
pp->pic[0], pp->pic[1], pp->pic[2], pp->pic[3]);
touch_nmi_watchdog();
}
memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot));


@ -181,17 +181,13 @@ static struct clocksource timer_cs = {
.rating = 100,
.read = timer_cs_read,
.mask = CLOCKSOURCE_MASK(64),
.shift = 2,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static __init int setup_timer_cs(void)
{
timer_cs_enabled = 1;
timer_cs.mult = clocksource_hz2mult(sparc_config.clock_rate,
timer_cs.shift);
return clocksource_register(&timer_cs);
return clocksource_register_hz(&timer_cs, sparc_config.clock_rate);
}
#ifdef CONFIG_SMP


@ -8,9 +8,11 @@
.text
ENTRY(memmove) /* o0=dst o1=src o2=len */
mov %o0, %g1
brz,pn %o2, 99f
mov %o0, %g1
cmp %o0, %o1
bleu,pt %xcc, memcpy
bleu,pt %xcc, 2f
add %o1, %o2, %g7
cmp %g7, %o0
bleu,pt %xcc, memcpy
@ -24,7 +26,34 @@ ENTRY(memmove) /* o0=dst o1=src o2=len */
stb %g7, [%o0]
bne,pt %icc, 1b
sub %o0, 1, %o0
99:
retl
mov %g1, %o0
/* We can't just call memcpy for these memmove cases. On some
* chips the memcpy uses cache initializing stores and when dst
* and src are close enough, those can clobber the source data
* before we've loaded it in.
*/
2: or %o0, %o1, %g7
or %o2, %g7, %g7
andcc %g7, 0x7, %g0
bne,pn %xcc, 4f
nop
3: ldx [%o1], %g7
add %o1, 8, %o1
subcc %o2, 8, %o2
add %o0, 8, %o0
bne,pt %icc, 3b
stx %g7, [%o0 - 0x8]
ba,a,pt %xcc, 99b
4: ldub [%o1], %g7
add %o1, 1, %o1
subcc %o2, 1, %o2
add %o0, 1, %o0
bne,pt %icc, 4b
stb %g7, [%o0 - 0x1]
ba,a,pt %xcc, 99b
ENDPROC(memmove)
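Editor's note: the sparc64 hunk above stops forwarding the dst <= src case to memcpy, because that memcpy may use cache-initializing block stores that wipe a destination cache line before all overlapping source bytes in it have been loaded. The replacement is a plain load-then-store forward loop. A C analog of that new path, not part of the patch (assumes the kernel's u8/u64 types):

/* Copy forward with an explicit load before each store: 8 bytes at a time
 * when dst, src and len are all 8-byte aligned, otherwise byte by byte. */
static void *forward_copy(void *dst, const void *src, size_t len)
{
        if ((((unsigned long)dst | (unsigned long)src | len) & 0x7) == 0) {
                u64 *d = dst;
                const u64 *s = src;

                while (len) {
                        u64 tmp = *s++;   /* load the source word first ... */
                        *d++ = tmp;       /* ... then store it */
                        len -= 8;
                }
        } else {
                u8 *d = dst;
                const u8 *s = src;

                while (len--) {
                        u8 tmp = *s++;
                        *d++ = tmp;
                }
        }
        return dst;
}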


@ -257,34 +257,34 @@ void update_vsyscall_tz(void)
void update_vsyscall(struct timekeeper *tk)
{
if (tk->tkr.clock != &cycle_counter_cs)
if (tk->tkr_mono.clock != &cycle_counter_cs)
return;
write_seqcount_begin(&vdso_data->tb_seq);
vdso_data->cycle_last = tk->tkr.cycle_last;
vdso_data->mask = tk->tkr.mask;
vdso_data->mult = tk->tkr.mult;
vdso_data->shift = tk->tkr.shift;
vdso_data->cycle_last = tk->tkr_mono.cycle_last;
vdso_data->mask = tk->tkr_mono.mask;
vdso_data->mult = tk->tkr_mono.mult;
vdso_data->shift = tk->tkr_mono.shift;
vdso_data->wall_time_sec = tk->xtime_sec;
vdso_data->wall_time_snsec = tk->tkr.xtime_nsec;
vdso_data->wall_time_snsec = tk->tkr_mono.xtime_nsec;
vdso_data->monotonic_time_sec = tk->xtime_sec
+ tk->wall_to_monotonic.tv_sec;
vdso_data->monotonic_time_snsec = tk->tkr.xtime_nsec
vdso_data->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
+ ((u64)tk->wall_to_monotonic.tv_nsec
<< tk->tkr.shift);
<< tk->tkr_mono.shift);
while (vdso_data->monotonic_time_snsec >=
(((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
(((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
vdso_data->monotonic_time_snsec -=
((u64)NSEC_PER_SEC) << tk->tkr.shift;
((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
vdso_data->monotonic_time_sec++;
}
vdso_data->wall_time_coarse_sec = tk->xtime_sec;
vdso_data->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
tk->tkr.shift);
vdso_data->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
tk->tkr_mono.shift);
vdso_data->monotonic_time_coarse_sec =
vdso_data->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;


@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event,
data = cyc2ns_read_begin();
/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
*/
userpg->cap_user_time = 1;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
userpg->time_offset = data->cyc2ns_offset - now;
userpg->cap_user_time_zero = 1;
userpg->time_zero = data->cyc2ns_offset;
/*
* cap_user_time_zero doesn't make sense when we're using a different
* time base for the records.
*/
if (event->clock == &local_clock) {
userpg->cap_user_time_zero = 1;
userpg->time_zero = data->cyc2ns_offset;
}
cyc2ns_read_end(data);
}
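Editor's note: arch_perf_update_userpage() above exports mult/shift/offset so user space can turn raw cycle counts into time, and only advertises cap_user_time_zero when the event uses the local clock. A simplified sketch of the user-side conversion, not from the patch; real readers split the multiply to avoid 64-bit overflow and re-check the page's sequence counter:

/* Convert a raw cycle count using the userpage fields written above;
 * simplified, so the multiply may overflow for very large cycle values. */
static inline u64 userpage_cyc_to_ns(u64 cycles, u32 time_mult,
                                     u32 time_shift, u64 time_offset)
{
        return time_offset + ((cycles * time_mult) >> time_shift);
}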


@ -31,30 +31,30 @@ void update_vsyscall(struct timekeeper *tk)
gtod_write_begin(vdata);
/* copy vsyscall data */
vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode;
vdata->cycle_last = tk->tkr.cycle_last;
vdata->mask = tk->tkr.mask;
vdata->mult = tk->tkr.mult;
vdata->shift = tk->tkr.shift;
vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
vdata->cycle_last = tk->tkr_mono.cycle_last;
vdata->mask = tk->tkr_mono.mask;
vdata->mult = tk->tkr_mono.mult;
vdata->shift = tk->tkr_mono.shift;
vdata->wall_time_sec = tk->xtime_sec;
vdata->wall_time_snsec = tk->tkr.xtime_nsec;
vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
vdata->monotonic_time_sec = tk->xtime_sec
+ tk->wall_to_monotonic.tv_sec;
vdata->monotonic_time_snsec = tk->tkr.xtime_nsec
vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
+ ((u64)tk->wall_to_monotonic.tv_nsec
<< tk->tkr.shift);
<< tk->tkr_mono.shift);
while (vdata->monotonic_time_snsec >=
(((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
(((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
vdata->monotonic_time_snsec -=
((u64)NSEC_PER_SEC) << tk->tkr.shift;
((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
vdata->monotonic_time_sec++;
}
vdata->wall_time_coarse_sec = tk->xtime_sec;
vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
tk->tkr.shift);
vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
tk->tkr_mono.shift);
vdata->monotonic_time_coarse_sec =
vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;


@ -422,6 +422,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
struct kvm_ioapic *ioapic, int vector, int trigger_mode)
{
int i;
struct kvm_lapic *apic = vcpu->arch.apic;
for (i = 0; i < IOAPIC_NUM_PINS; i++) {
union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
@ -443,7 +444,8 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
spin_lock(&ioapic->lock);
if (trigger_mode != IOAPIC_LEVEL_TRIG)
if (trigger_mode != IOAPIC_LEVEL_TRIG ||
kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
continue;
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);


@ -833,8 +833,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
int trigger_mode;
if (apic_test_vector(vector, apic->regs + APIC_TMR))
trigger_mode = IOAPIC_LEVEL_TRIG;


@ -2479,8 +2479,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
vmx->nested.nested_vmx_secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST;
SECONDARY_EXEC_ENABLE_EPT;
vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_INVEPT_BIT;
@ -2494,6 +2493,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
} else
vmx->nested.nested_vmx_ept_caps = 0;
if (enable_unrestricted_guest)
vmx->nested.nested_vmx_secondary_ctls_high |=
SECONDARY_EXEC_UNRESTRICTED_GUEST;
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC,
vmx->nested.nested_vmx_misc_low,


@ -1070,19 +1070,19 @@ static void update_pvclock_gtod(struct timekeeper *tk)
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
u64 boot_ns;
boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));
boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
/* copy pvclock gtod data */
vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode;
vdata->clock.cycle_last = tk->tkr.cycle_last;
vdata->clock.mask = tk->tkr.mask;
vdata->clock.mult = tk->tkr.mult;
vdata->clock.shift = tk->tkr.shift;
vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
vdata->clock.mask = tk->tkr_mono.mask;
vdata->clock.mult = tk->tkr_mono.mult;
vdata->clock.shift = tk->tkr_mono.shift;
vdata->boot_ns = boot_ns;
vdata->nsec_base = tk->tkr.xtime_nsec;
vdata->nsec_base = tk->tkr_mono.xtime_nsec;
write_seqcount_end(&vdata->seq);
}


@ -592,7 +592,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) {
struct bio_vec *bprev;
bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1];
bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset))
return false;
}


@ -278,9 +278,11 @@ static int bt_get(struct blk_mq_alloc_data *data,
/*
* We're out of tags on this hardware queue, kick any
* pending IO submits before going to sleep waiting for
* some to complete.
* some to complete. Note that hctx can be NULL here for
* reserved tag allocation.
*/
blk_mq_run_hw_queue(hctx, false);
if (hctx)
blk_mq_run_hw_queue(hctx, false);
/*
* Retry tag allocation after running the hardware queue,


@ -1938,7 +1938,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
*/
if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
goto err_map;
goto err_mq_usage;
setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
blk_queue_rq_timeout(q, 30000);
@ -1981,7 +1981,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
if (blk_mq_init_hw_queues(q, set))
goto err_hw;
goto err_mq_usage;
mutex_lock(&all_q_mutex);
list_add_tail(&q->all_q_node, &all_q_list);
@ -1993,7 +1993,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
return q;
err_hw:
err_mq_usage:
blk_cleanup_queue(q);
err_hctxs:
kfree(map);

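Editor's note: the blk_mq_init_queue() hunk above points the later failure paths at a single label that calls blk_cleanup_queue(), so a partially initialised queue is torn down completely rather than piecemeal. A minimal sketch of that goto-unwind shape, not from the patch; the two setup helpers are hypothetical:

/* Once the queue exists, every later failure funnels to one label whose
 * teardown helper undoes all prior setup. */
static struct request_queue *init_queue_sketch(struct blk_mq_tag_set *set)
{
        struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

        if (!q)
                return NULL;
        if (setup_usage_counter(q))     /* hypothetical step */
                goto err_cleanup;
        if (setup_hw_queues(q, set))    /* hypothetical step */
                goto err_cleanup;
        return q;

err_cleanup:
        blk_cleanup_queue(q);           /* complete teardown of partial state */
        return NULL;
}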

@ -4737,7 +4737,7 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag)
return NULL;
/* libsas case */
if (!ap->scsi_host) {
if (ap->flags & ATA_FLAG_SAS_HOST) {
tag = ata_sas_allocate_tag(ap);
if (tag < 0)
return NULL;
@ -4776,7 +4776,7 @@ void ata_qc_free(struct ata_queued_cmd *qc)
tag = qc->tag;
if (likely(ata_tag_valid(tag))) {
qc->tag = ATA_TAG_POISON;
if (!ap->scsi_host)
if (ap->flags & ATA_FLAG_SAS_HOST)
ata_sas_free_tag(tag, ap);
}
}


@ -243,4 +243,12 @@ extern struct regcache_ops regcache_rbtree_ops;
extern struct regcache_ops regcache_lzo_ops;
extern struct regcache_ops regcache_flat_ops;
static inline const char *regmap_name(const struct regmap *map)
{
if (map->dev)
return dev_name(map->dev);
return map->name;
}
#endif


@ -218,7 +218,7 @@ int regcache_read(struct regmap *map,
ret = map->cache_ops->read(map, reg, value);
if (ret == 0)
trace_regmap_reg_read_cache(map->dev, reg, *value);
trace_regmap_reg_read_cache(map, reg, *value);
return ret;
}
@ -311,7 +311,7 @@ int regcache_sync(struct regmap *map)
dev_dbg(map->dev, "Syncing %s cache\n",
map->cache_ops->name);
name = map->cache_ops->name;
trace_regcache_sync(map->dev, name, "start");
trace_regcache_sync(map, name, "start");
if (!map->cache_dirty)
goto out;
@ -346,7 +346,7 @@ int regcache_sync(struct regmap *map)
regmap_async_complete(map);
trace_regcache_sync(map->dev, name, "stop");
trace_regcache_sync(map, name, "stop");
return ret;
}
@ -381,7 +381,7 @@ int regcache_sync_region(struct regmap *map, unsigned int min,
name = map->cache_ops->name;
dev_dbg(map->dev, "Syncing %s cache from %d-%d\n", name, min, max);
trace_regcache_sync(map->dev, name, "start region");
trace_regcache_sync(map, name, "start region");
if (!map->cache_dirty)
goto out;
@ -401,7 +401,7 @@ int regcache_sync_region(struct regmap *map, unsigned int min,
regmap_async_complete(map);
trace_regcache_sync(map->dev, name, "stop region");
trace_regcache_sync(map, name, "stop region");
return ret;
}
@ -428,7 +428,7 @@ int regcache_drop_region(struct regmap *map, unsigned int min,
map->lock(map->lock_arg);
trace_regcache_drop_region(map->dev, min, max);
trace_regcache_drop_region(map, min, max);
ret = map->cache_ops->drop(map, min, max);
@ -455,7 +455,7 @@ void regcache_cache_only(struct regmap *map, bool enable)
map->lock(map->lock_arg);
WARN_ON(map->cache_bypass && enable);
map->cache_only = enable;
trace_regmap_cache_only(map->dev, enable);
trace_regmap_cache_only(map, enable);
map->unlock(map->lock_arg);
}
EXPORT_SYMBOL_GPL(regcache_cache_only);
@ -493,7 +493,7 @@ void regcache_cache_bypass(struct regmap *map, bool enable)
map->lock(map->lock_arg);
WARN_ON(map->cache_only && enable);
map->cache_bypass = enable;
trace_regmap_cache_bypass(map->dev, enable);
trace_regmap_cache_bypass(map, enable);
map->unlock(map->lock_arg);
}
EXPORT_SYMBOL_GPL(regcache_cache_bypass);


@ -1281,7 +1281,7 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg,
if (map->async && map->bus->async_write) {
struct regmap_async *async;
trace_regmap_async_write_start(map->dev, reg, val_len);
trace_regmap_async_write_start(map, reg, val_len);
spin_lock_irqsave(&map->async_lock, flags);
async = list_first_entry_or_null(&map->async_free,
@ -1339,8 +1339,7 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg,
return ret;
}
trace_regmap_hw_write_start(map->dev, reg,
val_len / map->format.val_bytes);
trace_regmap_hw_write_start(map, reg, val_len / map->format.val_bytes);
/* If we're doing a single register write we can probably just
* send the work_buf directly, otherwise try to do a gather
@ -1372,8 +1371,7 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg,
kfree(buf);
}
trace_regmap_hw_write_done(map->dev, reg,
val_len / map->format.val_bytes);
trace_regmap_hw_write_done(map, reg, val_len / map->format.val_bytes);
return ret;
}
@ -1407,12 +1405,12 @@ static int _regmap_bus_formatted_write(void *context, unsigned int reg,
map->format.format_write(map, reg, val);
trace_regmap_hw_write_start(map->dev, reg, 1);
trace_regmap_hw_write_start(map, reg, 1);
ret = map->bus->write(map->bus_context, map->work_buf,
map->format.buf_size);
trace_regmap_hw_write_done(map->dev, reg, 1);
trace_regmap_hw_write_done(map, reg, 1);
return ret;
}
@ -1470,7 +1468,7 @@ int _regmap_write(struct regmap *map, unsigned int reg,
dev_info(map->dev, "%x <= %x\n", reg, val);
#endif
trace_regmap_reg_write(map->dev, reg, val);
trace_regmap_reg_write(map, reg, val);
return map->reg_write(context, reg, val);
}
@ -1773,7 +1771,7 @@ static int _regmap_raw_multi_reg_write(struct regmap *map,
for (i = 0; i < num_regs; i++) {
int reg = regs[i].reg;
int val = regs[i].def;
trace_regmap_hw_write_start(map->dev, reg, 1);
trace_regmap_hw_write_start(map, reg, 1);
map->format.format_reg(u8, reg, map->reg_shift);
u8 += reg_bytes + pad_bytes;
map->format.format_val(u8, val, 0);
@ -1788,7 +1786,7 @@ static int _regmap_raw_multi_reg_write(struct regmap *map,
for (i = 0; i < num_regs; i++) {
int reg = regs[i].reg;
trace_regmap_hw_write_done(map->dev, reg, 1);
trace_regmap_hw_write_done(map, reg, 1);
}
return ret;
}
@ -2059,15 +2057,13 @@ static int _regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
*/
u8[0] |= map->read_flag_mask;
trace_regmap_hw_read_start(map->dev, reg,
val_len / map->format.val_bytes);
trace_regmap_hw_read_start(map, reg, val_len / map->format.val_bytes);
ret = map->bus->read(map->bus_context, map->work_buf,
map->format.reg_bytes + map->format.pad_bytes,
val, val_len);
trace_regmap_hw_read_done(map->dev, reg,
val_len / map->format.val_bytes);
trace_regmap_hw_read_done(map, reg, val_len / map->format.val_bytes);
return ret;
}
@ -2123,7 +2119,7 @@ static int _regmap_read(struct regmap *map, unsigned int reg,
dev_info(map->dev, "%x => %x\n", reg, *val);
#endif
trace_regmap_reg_read(map->dev, reg, *val);
trace_regmap_reg_read(map, reg, *val);
if (!map->cache_bypass)
regcache_write(map, reg, *val);
@ -2480,7 +2476,7 @@ void regmap_async_complete_cb(struct regmap_async *async, int ret)
struct regmap *map = async->map;
bool wake;
trace_regmap_async_io_complete(map->dev);
trace_regmap_async_io_complete(map);
spin_lock(&map->async_lock);
list_move(&async->list, &map->async_free);
@ -2525,7 +2521,7 @@ int regmap_async_complete(struct regmap *map)
if (!map->bus || !map->bus->async_write)
return 0;
trace_regmap_async_complete_start(map->dev);
trace_regmap_async_complete_start(map);
wait_event(map->async_waitq, regmap_async_is_done(map));
@ -2534,7 +2530,7 @@ int regmap_async_complete(struct regmap *map)
map->async_ret = 0;
spin_unlock_irqrestore(&map->async_lock, flags);
trace_regmap_async_complete_done(map->dev);
trace_regmap_async_complete_done(map);
return ret;
}


@ -803,10 +803,6 @@ static int __init nbd_init(void)
return -EINVAL;
}
nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
if (!nbd_dev)
return -ENOMEM;
part_shift = 0;
if (max_part > 0) {
part_shift = fls(max_part);
@ -828,6 +824,10 @@ static int __init nbd_init(void)
if (nbds_max > 1UL << (MINORBITS - part_shift))
return -EINVAL;
nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
if (!nbd_dev)
return -ENOMEM;
for (i = 0; i < nbds_max; i++) {
struct gendisk *disk = alloc_disk(1 << part_shift);
if (!disk)


@ -3003,6 +3003,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
}
get_device(dev->device);
INIT_LIST_HEAD(&dev->node);
INIT_WORK(&dev->probe_work, nvme_async_probe);
schedule_work(&dev->probe_work);
return 0;


@ -210,7 +210,7 @@ static int em_sti_clocksource_enable(struct clocksource *cs)
ret = em_sti_start(p, USER_CLOCKSOURCE);
if (!ret)
__clocksource_updatefreq_hz(cs, p->rate);
__clocksource_update_freq_hz(cs, p->rate);
return ret;
}


@ -641,7 +641,7 @@ static int sh_cmt_clocksource_enable(struct clocksource *cs)
ret = sh_cmt_start(ch, FLAG_CLOCKSOURCE);
if (!ret) {
__clocksource_updatefreq_hz(cs, ch->rate);
__clocksource_update_freq_hz(cs, ch->rate);
ch->cs_enabled = true;
}
return ret;


@ -272,7 +272,7 @@ static int sh_tmu_clocksource_enable(struct clocksource *cs)
ret = sh_tmu_enable(ch);
if (!ret) {
__clocksource_updatefreq_hz(cs, ch->rate);
__clocksource_update_freq_hz(cs, ch->rate);
ch->cs_enabled = true;
}


@ -525,17 +525,6 @@ void drm_framebuffer_reference(struct drm_framebuffer *fb)
}
EXPORT_SYMBOL(drm_framebuffer_reference);
static void drm_framebuffer_free_bug(struct kref *kref)
{
BUG();
}
static void __drm_framebuffer_unreference(struct drm_framebuffer *fb)
{
DRM_DEBUG("%p: FB ID: %d (%d)\n", fb, fb->base.id, atomic_read(&fb->refcount.refcount));
kref_put(&fb->refcount, drm_framebuffer_free_bug);
}
/**
* drm_framebuffer_unregister_private - unregister a private fb from the lookup idr
* @fb: fb to unregister
@ -1320,7 +1309,7 @@ void drm_plane_force_disable(struct drm_plane *plane)
return;
}
/* disconnect the plane from the fb and crtc: */
__drm_framebuffer_unreference(plane->old_fb);
drm_framebuffer_unreference(plane->old_fb);
plane->old_fb = NULL;
plane->fb = NULL;
plane->crtc = NULL;


@ -2737,24 +2737,11 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
WARN_ON(i915_verify_lists(ring->dev));
/* Move any buffers on the active list that are no longer referenced
* by the ringbuffer to the flushing/inactive lists as appropriate,
* before we free the context associated with the requests.
/* Retire requests first as we use it above for the early return.
* If we retire requests last, we may use a later seqno and so clear
* the requests lists without clearing the active list, leading to
* confusion.
*/
while (!list_empty(&ring->active_list)) {
struct drm_i915_gem_object *obj;
obj = list_first_entry(&ring->active_list,
struct drm_i915_gem_object,
ring_list);
if (!i915_gem_request_completed(obj->last_read_req, true))
break;
i915_gem_object_move_to_inactive(obj);
}
while (!list_empty(&ring->request_list)) {
struct drm_i915_gem_request *request;
struct intel_ringbuffer *ringbuf;
@ -2789,6 +2776,23 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
i915_gem_free_request(request);
}
/* Move any buffers on the active list that are no longer referenced
* by the ringbuffer to the flushing/inactive lists as appropriate,
* before we free the context associated with the requests.
*/
while (!list_empty(&ring->active_list)) {
struct drm_i915_gem_object *obj;
obj = list_first_entry(&ring->active_list,
struct drm_i915_gem_object,
ring_list);
if (!i915_gem_request_completed(obj->last_read_req, true))
break;
i915_gem_object_move_to_inactive(obj);
}
if (unlikely(ring->trace_irq_req &&
i915_gem_request_completed(ring->trace_irq_req, true))) {
ring->irq_put(ring);


@ -2438,8 +2438,15 @@ intel_find_plane_obj(struct intel_crtc *intel_crtc,
if (!intel_crtc->base.primary->fb)
return;
if (intel_alloc_plane_obj(intel_crtc, plane_config))
if (intel_alloc_plane_obj(intel_crtc, plane_config)) {
struct drm_plane *primary = intel_crtc->base.primary;
primary->state->crtc = &intel_crtc->base;
primary->crtc = &intel_crtc->base;
update_state_fb(primary);
return;
}
kfree(intel_crtc->base.primary->fb);
intel_crtc->base.primary->fb = NULL;
@ -2462,11 +2469,15 @@ intel_find_plane_obj(struct intel_crtc *intel_crtc,
continue;
if (i915_gem_obj_ggtt_offset(obj) == plane_config->base) {
struct drm_plane *primary = intel_crtc->base.primary;
if (obj->tiling_mode != I915_TILING_NONE)
dev_priv->preserve_bios_swizzle = true;
drm_framebuffer_reference(c->primary->fb);
intel_crtc->base.primary->fb = c->primary->fb;
primary->fb = c->primary->fb;
primary->state->crtc = &intel_crtc->base;
primary->crtc = &intel_crtc->base;
obj->frontbuffer_bits |= INTEL_FRONTBUFFER_PRIMARY(intel_crtc->pipe);
break;
}
@ -6663,7 +6674,6 @@ i9xx_get_initial_plane_config(struct intel_crtc *crtc,
plane_config->size);
crtc->base.primary->fb = fb;
update_state_fb(crtc->base.primary);
}
static void chv_crtc_clock_get(struct intel_crtc *crtc,
@ -7704,7 +7714,6 @@ skylake_get_initial_plane_config(struct intel_crtc *crtc,
plane_config->size);
crtc->base.primary->fb = fb;
update_state_fb(crtc->base.primary);
return;
error:
@ -7798,7 +7807,6 @@ ironlake_get_initial_plane_config(struct intel_crtc *crtc,
plane_config->size);
crtc->base.primary->fb = fb;
update_state_fb(crtc->base.primary);
}
static bool ironlake_get_pipe_config(struct intel_crtc *crtc,


@ -433,7 +433,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
dm_get(md);
atomic_inc(&md->open_count);
out:
spin_unlock(&_minor_lock);
@ -442,16 +441,20 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
struct mapped_device *md = disk->private_data;
struct mapped_device *md;
spin_lock(&_minor_lock);
md = disk->private_data;
if (WARN_ON(!md))
goto out;
if (atomic_dec_and_test(&md->open_count) &&
(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
queue_work(deferred_remove_workqueue, &deferred_remove_work);
dm_put(md);
out:
spin_unlock(&_minor_lock);
}
@ -2241,7 +2244,6 @@ static void free_dev(struct mapped_device *md)
int minor = MINOR(disk_devt(md->disk));
unlock_fs(md);
bdput(md->bdev);
destroy_workqueue(md->wq);
if (md->kworker_task)
@ -2252,19 +2254,22 @@ static void free_dev(struct mapped_device *md)
mempool_destroy(md->rq_pool);
if (md->bs)
bioset_free(md->bs);
blk_integrity_unregister(md->disk);
del_gendisk(md->disk);
cleanup_srcu_struct(&md->io_barrier);
free_table_devices(&md->table_devices);
free_minor(minor);
dm_stats_cleanup(&md->stats);
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
spin_unlock(&_minor_lock);
if (blk_get_integrity(md->disk))
blk_integrity_unregister(md->disk);
del_gendisk(md->disk);
put_disk(md->disk);
blk_cleanup_queue(md->queue);
dm_stats_cleanup(&md->stats);
bdput(md->bdev);
free_minor(minor);
module_put(THIS_MODULE);
kfree(md);
}
@ -2642,8 +2647,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
might_sleep();
spin_lock(&_minor_lock);
map = dm_get_live_table(md, &srcu_idx);
spin_lock(&_minor_lock);
idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);


@ -739,7 +739,7 @@ static int __init kempld_init(void)
for (id = kempld_dmi_table;
id->matches[0].slot != DMI_NONE; id++)
if (strstr(id->ident, force_device_id))
if (id->callback && id->callback(id))
if (id->callback && !id->callback(id))
break;
if (id->matches[0].slot == DMI_NONE)
return -ENODEV;

View file

@ -196,18 +196,27 @@ EXPORT_SYMBOL_GPL(rtsx_usb_ep0_write_register);
int rtsx_usb_ep0_read_register(struct rtsx_ucr *ucr, u16 addr, u8 *data)
{
u16 value;
u8 *buf;
int ret;
if (!data)
return -EINVAL;
*data = 0;
buf = kzalloc(sizeof(u8), GFP_KERNEL);
if (!buf)
return -ENOMEM;
addr |= EP0_READ_REG_CMD << EP0_OP_SHIFT;
value = swab16(addr);
return usb_control_msg(ucr->pusb_dev,
ret = usb_control_msg(ucr->pusb_dev,
usb_rcvctrlpipe(ucr->pusb_dev, 0), RTSX_USB_REQ_REG_OP,
USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
value, 0, data, 1, 100);
value, 0, buf, 1, 100);
*data = *buf;
kfree(buf);
return ret;
}
EXPORT_SYMBOL_GPL(rtsx_usb_ep0_read_register);
@ -288,18 +297,27 @@ static int rtsx_usb_get_status_with_bulk(struct rtsx_ucr *ucr, u16 *status)
int rtsx_usb_get_card_status(struct rtsx_ucr *ucr, u16 *status)
{
int ret;
u16 *buf;
if (!status)
return -EINVAL;
if (polling_pipe == 0)
if (polling_pipe == 0) {
buf = kzalloc(sizeof(u16), GFP_KERNEL);
if (!buf)
return -ENOMEM;
ret = usb_control_msg(ucr->pusb_dev,
usb_rcvctrlpipe(ucr->pusb_dev, 0),
RTSX_USB_REQ_POLL,
USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
0, 0, status, 2, 100);
else
0, 0, buf, 2, 100);
*status = *buf;
kfree(buf);
} else {
ret = rtsx_usb_get_status_with_bulk(ucr, status);
}
/* usb_control_msg may return positive on success */
if (ret < 0)

View file

@ -1543,7 +1543,7 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
{
struct pcnet32_private *lp;
int i, media;
int fdx, mii, fset, dxsuflo;
int fdx, mii, fset, dxsuflo, sram;
int chip_version;
char *chipname;
struct net_device *dev;
@ -1580,7 +1580,7 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
}
/* initialize variables */
fdx = mii = fset = dxsuflo = 0;
fdx = mii = fset = dxsuflo = sram = 0;
chip_version = (chip_version >> 12) & 0xffff;
switch (chip_version) {
@ -1613,6 +1613,7 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
chipname = "PCnet/FAST III 79C973"; /* PCI */
fdx = 1;
mii = 1;
sram = 1;
break;
case 0x2626:
chipname = "PCnet/Home 79C978"; /* PCI */
@ -1636,6 +1637,7 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
chipname = "PCnet/FAST III 79C975"; /* PCI */
fdx = 1;
mii = 1;
sram = 1;
break;
case 0x2628:
chipname = "PCnet/PRO 79C976";
@ -1664,6 +1666,31 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
dxsuflo = 1;
}
/*
* The Am79C973/Am79C975 controllers come with 12K of SRAM
* which we can use for the Tx/Rx buffers but most importantly,
* the use of SRAM allows us to use the BCR18:NOUFLO bit to avoid
* Tx fifo underflows.
*/
if (sram) {
/*
* The SRAM is configured in two steps. First we
* set the SRAM size in the BCR25:SRAM_SIZE bits. According
* to the datasheet, each bit corresponds to a 512-byte
* page so we can have at most 24 pages. The SRAM_SIZE
* holds the value of the upper 8 bits of the 16-bit SRAM size.
* The low 8-bits start at 0x00 and end at 0xff. So the
* address range is from 0x0000 up to 0x17ff. Therefore,
* the SRAM_SIZE is set to 0x17. The next step is to set
* the BCR26:SRAM_BND midway through so the Tx and Rx
* buffers can share the SRAM equally.
*/
a->write_bcr(ioaddr, 25, 0x17);
a->write_bcr(ioaddr, 26, 0xc);
/* And finally enable the NOUFLO bit */
a->write_bcr(ioaddr, 18, a->read_bcr(ioaddr, 18) | (1 << 11));
}
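The BCR25/BCR26 values above follow directly from the 512-byte page geometry described in the comment. As a hedged, stand-alone illustration (host-side C, not driver code; the 0x17/0xc values and page size are simply taken from the comment and code above):

#include <stdio.h>

/* Geometry quoted from the driver comment: 24 pages of 512 bytes = 12K SRAM. */
#define SRAM_PAGE_SIZE  512
#define SRAM_PAGES      24

int main(void)
{
        unsigned int sram_bytes = SRAM_PAGES * SRAM_PAGE_SIZE;   /* 12288 */
        unsigned int sram_size = 0x17;   /* upper 8 bits of the 16-bit size */
        unsigned int top_addr = (sram_size << 8) | 0xff;         /* 0x17ff */
        unsigned int sram_bnd = (sram_size + 1) / 2;             /* 0xc, midway */

        printf("SRAM: %u bytes, BCR25:SRAM_SIZE=0x%02x, top address 0x%04x\n",
               sram_bytes, sram_size, top_addr);
        printf("BCR26:SRAM_BND=0x%x splits the range evenly between Tx and Rx\n",
               sram_bnd);
        return 0;
}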
dev = alloc_etherdev(sizeof(*lp));
if (!dev) {
ret = -ENOMEM;

View file

@ -354,6 +354,7 @@ struct be_vf_cfg {
u16 vlan_tag;
u32 tx_rate;
u32 plink_tracking;
u32 privileges;
};
enum vf_state {
@ -423,6 +424,7 @@ struct be_adapter {
u8 __iomem *csr; /* CSR BAR used only for BE2/3 */
u8 __iomem *db; /* Door Bell */
u8 __iomem *pcicfg; /* On SH,BEx only. Shadow of PCI config space */
struct mutex mbox_lock; /* For serializing mbox cmds to BE card */
struct be_dma_mem mbox_mem;

View file

@ -1902,15 +1902,11 @@ int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *set_eqd,
{
int num_eqs, i = 0;
if (lancer_chip(adapter) && num > 8) {
while (num) {
num_eqs = min(num, 8);
__be_cmd_modify_eqd(adapter, &set_eqd[i], num_eqs);
i += num_eqs;
num -= num_eqs;
}
} else {
__be_cmd_modify_eqd(adapter, set_eqd, num);
while (num) {
num_eqs = min(num, 8);
__be_cmd_modify_eqd(adapter, &set_eqd[i], num_eqs);
i += num_eqs;
num -= num_eqs;
}
return 0;
@ -1918,7 +1914,7 @@ int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *set_eqd,
/* Uses synchronous mcc */
int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array,
u32 num)
u32 num, u32 domain)
{
struct be_mcc_wrb *wrb;
struct be_cmd_req_vlan_config *req;
@ -1936,6 +1932,7 @@ int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array,
be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
OPCODE_COMMON_NTWK_VLAN_CONFIG, sizeof(*req),
wrb, NULL);
req->hdr.domain = domain;
req->interface_id = if_id;
req->untagged = BE_IF_FLAGS_UNTAGGED & be_if_cap_flags(adapter) ? 1 : 0;

View file

@ -2256,7 +2256,7 @@ int lancer_cmd_get_pport_stats(struct be_adapter *adapter,
int be_cmd_get_fw_ver(struct be_adapter *adapter);
int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *, int num);
int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array,
u32 num);
u32 num, u32 domain);
int be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 status);
int be_cmd_set_flow_control(struct be_adapter *adapter, u32 tx_fc, u32 rx_fc);
int be_cmd_get_flow_control(struct be_adapter *adapter, u32 *tx_fc, u32 *rx_fc);

View file

@ -1171,7 +1171,7 @@ static int be_vid_config(struct be_adapter *adapter)
for_each_set_bit(i, adapter->vids, VLAN_N_VID)
vids[num++] = cpu_to_le16(i);
status = be_cmd_vlan_config(adapter, adapter->if_handle, vids, num);
status = be_cmd_vlan_config(adapter, adapter->if_handle, vids, num, 0);
if (status) {
dev_err(dev, "Setting HW VLAN filtering failed\n");
/* Set to VLAN promisc mode as setting VLAN filter failed */
@ -1380,11 +1380,67 @@ static int be_get_vf_config(struct net_device *netdev, int vf,
return 0;
}
static int be_set_vf_tvt(struct be_adapter *adapter, int vf, u16 vlan)
{
struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf];
u16 vids[BE_NUM_VLANS_SUPPORTED];
int vf_if_id = vf_cfg->if_handle;
int status;
/* Enable Transparent VLAN Tagging */
status = be_cmd_set_hsw_config(adapter, vlan, vf + 1, vf_if_id, 0);
if (status)
return status;
/* Clear pre-programmed VLAN filters on VF if any, if TVT is enabled */
vids[0] = 0;
status = be_cmd_vlan_config(adapter, vf_if_id, vids, 1, vf + 1);
if (!status)
dev_info(&adapter->pdev->dev,
"Cleared guest VLANs on VF%d", vf);
/* After TVT is enabled, disallow VFs to program VLAN filters */
if (vf_cfg->privileges & BE_PRIV_FILTMGMT) {
status = be_cmd_set_fn_privileges(adapter, vf_cfg->privileges &
~BE_PRIV_FILTMGMT, vf + 1);
if (!status)
vf_cfg->privileges &= ~BE_PRIV_FILTMGMT;
}
return 0;
}
static int be_clear_vf_tvt(struct be_adapter *adapter, int vf)
{
struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf];
struct device *dev = &adapter->pdev->dev;
int status;
/* Reset Transparent VLAN Tagging. */
status = be_cmd_set_hsw_config(adapter, BE_RESET_VLAN_TAG_ID, vf + 1,
vf_cfg->if_handle, 0);
if (status)
return status;
/* Allow VFs to program VLAN filtering */
if (!(vf_cfg->privileges & BE_PRIV_FILTMGMT)) {
status = be_cmd_set_fn_privileges(adapter, vf_cfg->privileges |
BE_PRIV_FILTMGMT, vf + 1);
if (!status) {
vf_cfg->privileges |= BE_PRIV_FILTMGMT;
dev_info(dev, "VF%d: FILTMGMT priv enabled", vf);
}
}
dev_info(dev,
"Disable/re-enable i/f in VM to clear Transparent VLAN tag");
return 0;
}
static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos)
{
struct be_adapter *adapter = netdev_priv(netdev);
struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf];
int status = 0;
int status;
if (!sriov_enabled(adapter))
return -EPERM;
@ -1394,24 +1450,19 @@ static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos)
if (vlan || qos) {
vlan |= qos << VLAN_PRIO_SHIFT;
if (vf_cfg->vlan_tag != vlan)
status = be_cmd_set_hsw_config(adapter, vlan, vf + 1,
vf_cfg->if_handle, 0);
status = be_set_vf_tvt(adapter, vf, vlan);
} else {
/* Reset Transparent Vlan Tagging. */
status = be_cmd_set_hsw_config(adapter, BE_RESET_VLAN_TAG_ID,
vf + 1, vf_cfg->if_handle, 0);
status = be_clear_vf_tvt(adapter, vf);
}
if (status) {
dev_err(&adapter->pdev->dev,
"VLAN %d config on VF %d failed : %#x\n", vlan,
vf, status);
"VLAN %d config on VF %d failed : %#x\n", vlan, vf,
status);
return be_cmd_status(status);
}
vf_cfg->vlan_tag = vlan;
return 0;
}
@ -2772,14 +2823,12 @@ void be_detect_error(struct be_adapter *adapter)
}
}
} else {
pci_read_config_dword(adapter->pdev,
PCICFG_UE_STATUS_LOW, &ue_lo);
pci_read_config_dword(adapter->pdev,
PCICFG_UE_STATUS_HIGH, &ue_hi);
pci_read_config_dword(adapter->pdev,
PCICFG_UE_STATUS_LOW_MASK, &ue_lo_mask);
pci_read_config_dword(adapter->pdev,
PCICFG_UE_STATUS_HI_MASK, &ue_hi_mask);
ue_lo = ioread32(adapter->pcicfg + PCICFG_UE_STATUS_LOW);
ue_hi = ioread32(adapter->pcicfg + PCICFG_UE_STATUS_HIGH);
ue_lo_mask = ioread32(adapter->pcicfg +
PCICFG_UE_STATUS_LOW_MASK);
ue_hi_mask = ioread32(adapter->pcicfg +
PCICFG_UE_STATUS_HI_MASK);
ue_lo = (ue_lo & ~ue_lo_mask);
ue_hi = (ue_hi & ~ue_hi_mask);
@ -3339,7 +3388,6 @@ static int be_if_create(struct be_adapter *adapter, u32 *if_handle,
u32 cap_flags, u32 vf)
{
u32 en_flags;
int status;
en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST |
BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS |
@ -3347,10 +3395,7 @@ static int be_if_create(struct be_adapter *adapter, u32 *if_handle,
en_flags &= cap_flags;
status = be_cmd_if_create(adapter, cap_flags, en_flags,
if_handle, vf);
return status;
return be_cmd_if_create(adapter, cap_flags, en_flags, if_handle, vf);
}
static int be_vfs_if_create(struct be_adapter *adapter)
@ -3368,8 +3413,13 @@ static int be_vfs_if_create(struct be_adapter *adapter)
if (!BE3_chip(adapter)) {
status = be_cmd_get_profile_config(adapter, &res,
vf + 1);
if (!status)
if (!status) {
cap_flags = res.if_cap_flags;
/* Prevent VFs from enabling VLAN promiscuous
* mode
*/
cap_flags &= ~BE_IF_FLAGS_VLAN_PROMISCUOUS;
}
}
status = be_if_create(adapter, &vf_cfg->if_handle,
@ -3403,7 +3453,6 @@ static int be_vf_setup(struct be_adapter *adapter)
struct device *dev = &adapter->pdev->dev;
struct be_vf_cfg *vf_cfg;
int status, old_vfs, vf;
u32 privileges;
old_vfs = pci_num_vf(adapter->pdev);
@ -3433,15 +3482,18 @@ static int be_vf_setup(struct be_adapter *adapter)
for_all_vfs(adapter, vf_cfg, vf) {
/* Allow VFs to program MAC/VLAN filters */
status = be_cmd_get_fn_privileges(adapter, &privileges, vf + 1);
if (!status && !(privileges & BE_PRIV_FILTMGMT)) {
status = be_cmd_get_fn_privileges(adapter, &vf_cfg->privileges,
vf + 1);
if (!status && !(vf_cfg->privileges & BE_PRIV_FILTMGMT)) {
status = be_cmd_set_fn_privileges(adapter,
privileges |
vf_cfg->privileges |
BE_PRIV_FILTMGMT,
vf + 1);
if (!status)
if (!status) {
vf_cfg->privileges |= BE_PRIV_FILTMGMT;
dev_info(dev, "VF%d has FILTMGMT privilege\n",
vf);
}
}
/* Allow full available bandwidth */
@ -4820,24 +4872,37 @@ static int be_roce_map_pci_bars(struct be_adapter *adapter)
static int be_map_pci_bars(struct be_adapter *adapter)
{
struct pci_dev *pdev = adapter->pdev;
u8 __iomem *addr;
if (BEx_chip(adapter) && be_physfn(adapter)) {
adapter->csr = pci_iomap(adapter->pdev, 2, 0);
adapter->csr = pci_iomap(pdev, 2, 0);
if (!adapter->csr)
return -ENOMEM;
}
addr = pci_iomap(adapter->pdev, db_bar(adapter), 0);
addr = pci_iomap(pdev, db_bar(adapter), 0);
if (!addr)
goto pci_map_err;
adapter->db = addr;
if (skyhawk_chip(adapter) || BEx_chip(adapter)) {
if (be_physfn(adapter)) {
/* PCICFG is the 2nd BAR in BE2 */
addr = pci_iomap(pdev, BE2_chip(adapter) ? 1 : 0, 0);
if (!addr)
goto pci_map_err;
adapter->pcicfg = addr;
} else {
adapter->pcicfg = adapter->db + SRIOV_VF_PCICFG_OFFSET;
}
}
be_roce_map_pci_bars(adapter);
return 0;
pci_map_err:
dev_err(&adapter->pdev->dev, "Error in mapping PCI BARs\n");
dev_err(&pdev->dev, "Error in mapping PCI BARs\n");
be_unmap_pci_bars(adapter);
return -ENOMEM;
}

View file

@ -46,8 +46,7 @@ enum cx82310_status {
};
#define CMD_PACKET_SIZE 64
/* first command after power on can take around 8 seconds */
#define CMD_TIMEOUT 15000
#define CMD_TIMEOUT 100
#define CMD_REPLY_RETRY 5
#define CX82310_MTU 1514
@ -78,8 +77,9 @@ static int cx82310_cmd(struct usbnet *dev, enum cx82310_cmd cmd, bool reply,
ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, CMD_EP), buf,
CMD_PACKET_SIZE, &actual_len, CMD_TIMEOUT);
if (ret < 0) {
dev_err(&dev->udev->dev, "send command %#x: error %d\n",
cmd, ret);
if (cmd != CMD_GET_LINK_STATUS)
dev_err(&dev->udev->dev, "send command %#x: error %d\n",
cmd, ret);
goto end;
}
@ -90,8 +90,10 @@ static int cx82310_cmd(struct usbnet *dev, enum cx82310_cmd cmd, bool reply,
buf, CMD_PACKET_SIZE, &actual_len,
CMD_TIMEOUT);
if (ret < 0) {
dev_err(&dev->udev->dev,
"reply receive error %d\n", ret);
if (cmd != CMD_GET_LINK_STATUS)
dev_err(&dev->udev->dev,
"reply receive error %d\n",
ret);
goto end;
}
if (actual_len > 0)
@ -134,6 +136,8 @@ static int cx82310_bind(struct usbnet *dev, struct usb_interface *intf)
int ret;
char buf[15];
struct usb_device *udev = dev->udev;
u8 link[3];
int timeout = 50;
/* avoid ADSL modems - continue only if iProduct is "USB NET CARD" */
if (usb_string(udev, udev->descriptor.iProduct, buf, sizeof(buf)) > 0
@ -160,6 +164,20 @@ static int cx82310_bind(struct usbnet *dev, struct usb_interface *intf)
if (!dev->partial_data)
return -ENOMEM;
/* wait for firmware to become ready (indicated by the link being up) */
while (--timeout) {
ret = cx82310_cmd(dev, CMD_GET_LINK_STATUS, true, NULL, 0,
link, sizeof(link));
/* the command can time out during boot - it's not an error */
if (!ret && link[0] == 1 && link[2] == 1)
break;
msleep(500);
};
if (!timeout) {
dev_err(&udev->dev, "firmware not ready in time\n");
return -ETIMEDOUT;
}
/* enable ethernet mode (?) */
ret = cx82310_cmd(dev, CMD_ETHERNET_MODE, true, "\x01", 1, NULL, 0);
if (ret) {

View file

@ -1572,6 +1572,10 @@ static int palmas_regulators_probe(struct platform_device *pdev)
if (!pmic)
return -ENOMEM;
if (of_device_is_compatible(node, "ti,tps659038-pmic"))
palmas_generic_regs_info[PALMAS_REG_REGEN2].ctrl_addr =
TPS659038_REGEN2_CTRL;
pmic->dev = &pdev->dev;
pmic->palmas = palmas;
palmas->pmic = pmic;

View file

@ -413,8 +413,8 @@ static void rtc_mrst_do_remove(struct device *dev)
mrst->dev = NULL;
}
#ifdef CONFIG_PM
static int mrst_suspend(struct device *dev, pm_message_t mesg)
#ifdef CONFIG_PM_SLEEP
static int mrst_suspend(struct device *dev)
{
struct mrst_rtc *mrst = dev_get_drvdata(dev);
unsigned char tmp;
@ -453,7 +453,7 @@ static int mrst_suspend(struct device *dev, pm_message_t mesg)
*/
static inline int mrst_poweroff(struct device *dev)
{
return mrst_suspend(dev, PMSG_HIBERNATE);
return mrst_suspend(dev);
}
static int mrst_resume(struct device *dev)
@ -490,9 +490,11 @@ static int mrst_resume(struct device *dev)
return 0;
}
static SIMPLE_DEV_PM_OPS(mrst_pm_ops, mrst_suspend, mrst_resume);
#define MRST_PM_OPS (&mrst_pm_ops)
#else
#define mrst_suspend NULL
#define mrst_resume NULL
#define MRST_PM_OPS NULL
static inline int mrst_poweroff(struct device *dev)
{
@ -529,9 +531,8 @@ static struct platform_driver vrtc_mrst_platform_driver = {
.remove = vrtc_mrst_platform_remove,
.shutdown = vrtc_mrst_platform_shutdown,
.driver = {
.name = (char *) driver_name,
.suspend = mrst_suspend,
.resume = mrst_resume,
.name = driver_name,
.pm = MRST_PM_OPS,
}
};

View file

@ -6815,7 +6815,8 @@ static struct ata_port_operations ipr_sata_ops = {
};
static struct ata_port_info sata_port_info = {
.flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA,
.flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA |
ATA_FLAG_SAS_HOST,
.pio_mask = ATA_PIO4_ONLY,
.mwdma_mask = ATA_MWDMA2,
.udma_mask = ATA_UDMA6,

View file

@ -547,7 +547,8 @@ static struct ata_port_operations sas_sata_ops = {
};
static struct ata_port_info sata_port_info = {
.flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | ATA_FLAG_NCQ,
.flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | ATA_FLAG_NCQ |
ATA_FLAG_SAS_HOST,
.pio_mask = ATA_PIO4,
.mwdma_mask = ATA_MWDMA2,
.udma_mask = ATA_UDMA6,

View file

@ -108,7 +108,8 @@ static void dw_spi_dma_tx_done(void *arg)
{
struct dw_spi *dws = arg;
if (test_and_clear_bit(TX_BUSY, &dws->dma_chan_busy) & BIT(RX_BUSY))
clear_bit(TX_BUSY, &dws->dma_chan_busy);
if (test_bit(RX_BUSY, &dws->dma_chan_busy))
return;
dw_spi_xfer_done(dws);
}
@ -156,7 +157,8 @@ static void dw_spi_dma_rx_done(void *arg)
{
struct dw_spi *dws = arg;
if (test_and_clear_bit(RX_BUSY, &dws->dma_chan_busy) & BIT(TX_BUSY))
clear_bit(RX_BUSY, &dws->dma_chan_busy);
if (test_bit(TX_BUSY, &dws->dma_chan_busy))
return;
dw_spi_xfer_done(dws);
}

View file

@ -498,7 +498,7 @@ static int spi_qup_probe(struct platform_device *pdev)
struct resource *res;
struct device *dev;
void __iomem *base;
u32 max_freq, iomode;
u32 max_freq, iomode, num_cs;
int ret, irq, size;
dev = &pdev->dev;
@ -550,10 +550,11 @@ static int spi_qup_probe(struct platform_device *pdev)
}
/* use num-cs unless it is not present or is out of range */
if (of_property_read_u16(dev->of_node, "num-cs",
&master->num_chipselect) ||
(master->num_chipselect > SPI_NUM_CHIPSELECTS))
if (of_property_read_u32(dev->of_node, "num-cs", &num_cs) ||
num_cs > SPI_NUM_CHIPSELECTS)
master->num_chipselect = SPI_NUM_CHIPSELECTS;
else
master->num_chipselect = num_cs;
master->bus_num = pdev->id;
master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LOOP;

View file

@ -1105,13 +1105,14 @@ void spi_finalize_current_message(struct spi_master *master)
"failed to unprepare message: %d\n", ret);
}
}
trace_spi_message_done(mesg);
master->cur_msg_prepared = false;
mesg->state = NULL;
if (mesg->complete)
mesg->complete(mesg->context);
trace_spi_message_done(mesg);
}
EXPORT_SYMBOL_GPL(spi_finalize_current_message);

View file

@ -699,8 +699,10 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
boff = tmp % bsize;
if (boff) {
bh = affs_bread_ino(inode, bidx, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (IS_ERR(bh)) {
written = PTR_ERR(bh);
goto err_first_bh;
}
tmp = min(bsize - boff, to - from);
BUG_ON(boff + tmp > bsize || tmp > bsize);
memcpy(AFFS_DATA(bh) + boff, data + from, tmp);
@ -712,14 +714,16 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
bidx++;
} else if (bidx) {
bh = affs_bread_ino(inode, bidx - 1, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (IS_ERR(bh)) {
written = PTR_ERR(bh);
goto err_first_bh;
}
}
while (from + bsize <= to) {
prev_bh = bh;
bh = affs_getemptyblk_ino(inode, bidx);
if (IS_ERR(bh))
goto out;
goto err_bh;
memcpy(AFFS_DATA(bh), data + from, bsize);
if (buffer_new(bh)) {
AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
@ -751,7 +755,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
prev_bh = bh;
bh = affs_bread_ino(inode, bidx, 1);
if (IS_ERR(bh))
goto out;
goto err_bh;
tmp = min(bsize, to - from);
BUG_ON(tmp > bsize);
memcpy(AFFS_DATA(bh), data + from, tmp);
@ -790,12 +794,13 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
err_first_bh:
unlock_page(page);
page_cache_release(page);
return written;
out:
err_bh:
bh = prev_bh;
if (!written)
written = PTR_ERR(bh);

View file

@ -131,13 +131,16 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len)
hfs_bnode_write(node, entry, data_off + key_len, entry_len);
hfs_bnode_dump(node);
if (new_node) {
/* update parent key if we inserted a key
* at the start of the first node
*/
if (!rec && new_node != node)
hfs_brec_update_parent(fd);
/*
* update parent key if we inserted a key
* at the start of the node and it is not the new node
*/
if (!rec && new_node != node) {
hfs_bnode_read_key(node, fd->search_key, data_off + size);
hfs_brec_update_parent(fd);
}
if (new_node) {
hfs_bnode_put(fd->bnode);
if (!new_node->parent) {
hfs_btree_inc_height(tree);
@ -168,9 +171,6 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len)
goto again;
}
if (!rec)
hfs_brec_update_parent(fd);
return 0;
}
@ -370,6 +370,8 @@ static int hfs_brec_update_parent(struct hfs_find_data *fd)
if (IS_ERR(parent))
return PTR_ERR(parent);
__hfs_brec_find(parent, fd, hfs_find_rec_by_key);
if (fd->record < 0)
return -ENOENT;
hfs_bnode_dump(parent);
rec = fd->record;

View file

@ -39,6 +39,8 @@ enum clock_event_mode {
CLOCK_EVT_MODE_PERIODIC,
CLOCK_EVT_MODE_ONESHOT,
CLOCK_EVT_MODE_RESUME,
/* Legacy ->set_mode() callback doesn't support the modes below */
};
/*
@ -81,7 +83,11 @@ enum clock_event_mode {
* @mode: operating mode assigned by the management code
* @features: features
* @retries: number of forced programming retries
* @set_mode: set mode function
* @set_mode: legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME.
* @set_mode_periodic: switch mode to periodic, if !set_mode
* @set_mode_oneshot: switch mode to oneshot, if !set_mode
* @set_mode_shutdown: switch mode to shutdown, if !set_mode
* @set_mode_resume: resume clkevt device, if !set_mode
* @broadcast: function to broadcast events
* @min_delta_ticks: minimum delta value in ticks stored for reconfiguration
* @max_delta_ticks: maximum delta value in ticks stored for reconfiguration
@ -108,9 +114,20 @@ struct clock_event_device {
unsigned int features;
unsigned long retries;
void (*broadcast)(const struct cpumask *mask);
/*
* Mode transition callback(s): Only one of the two groups should be
* defined:
* - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME.
* - set_mode_{shutdown|periodic|oneshot|resume}().
*/
void (*set_mode)(enum clock_event_mode mode,
struct clock_event_device *);
int (*set_mode_periodic)(struct clock_event_device *);
int (*set_mode_oneshot)(struct clock_event_device *);
int (*set_mode_shutdown)(struct clock_event_device *);
int (*set_mode_resume)(struct clock_event_device *);
void (*broadcast)(const struct cpumask *mask);
void (*suspend)(struct clock_event_device *);
void (*resume)(struct clock_event_device *);
unsigned long min_delta_ticks;

View file

@ -56,6 +56,7 @@ struct module;
* @shift: cycle to nanosecond divisor (power of two)
* @max_idle_ns: max idle time permitted by the clocksource (nsecs)
* @maxadj: maximum adjustment value to mult (~11%)
* @max_cycles: maximum safe cycle value which won't overflow on multiplication
* @flags: flags describing special properties
* @archdata: arch-specific data
* @suspend: suspend function for the clocksource, if necessary
@ -76,7 +77,7 @@ struct clocksource {
#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
struct arch_clocksource_data archdata;
#endif
u64 max_cycles;
const char *name;
struct list_head list;
int rating;
@ -178,7 +179,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
}
extern int clocksource_register(struct clocksource*);
extern int clocksource_unregister(struct clocksource*);
extern void clocksource_touch_watchdog(void);
extern struct clocksource* clocksource_get_next(void);
@ -189,7 +189,7 @@ extern struct clocksource * __init clocksource_default_clock(void);
extern void clocksource_mark_unstable(struct clocksource *cs);
extern u64
clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask);
clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cycles);
extern void
clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec);
@ -200,7 +200,16 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec);
extern int
__clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq);
extern void
__clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq);
__clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq);
/*
* Don't call this unless you are a default clocksource
* (AKA: jiffies) and absolutely have to.
*/
static inline int __clocksource_register(struct clocksource *cs)
{
return __clocksource_register_scale(cs, 1, 0);
}
static inline int clocksource_register_hz(struct clocksource *cs, u32 hz)
{
@ -212,14 +221,14 @@ static inline int clocksource_register_khz(struct clocksource *cs, u32 khz)
return __clocksource_register_scale(cs, 1000, khz);
}
static inline void __clocksource_updatefreq_hz(struct clocksource *cs, u32 hz)
static inline void __clocksource_update_freq_hz(struct clocksource *cs, u32 hz)
{
__clocksource_updatefreq_scale(cs, 1, hz);
__clocksource_update_freq_scale(cs, 1, hz);
}
static inline void __clocksource_updatefreq_khz(struct clocksource *cs, u32 khz)
static inline void __clocksource_update_freq_khz(struct clocksource *cs, u32 khz)
{
__clocksource_updatefreq_scale(cs, 1000, khz);
__clocksource_update_freq_scale(cs, 1000, khz);
}

View file

@ -232,6 +232,7 @@ enum {
* led */
ATA_FLAG_NO_DIPM = (1 << 23), /* host not happy with DIPM */
ATA_FLAG_LOWTAG = (1 << 24), /* host wants lowest available tag */
ATA_FLAG_SAS_HOST = (1 << 25), /* SAS host */
/* bits 24:31 of ap->flags are reserved for LLD specific flags */

View file

@ -2999,6 +2999,9 @@ enum usb_irq_events {
#define PALMAS_GPADC_TRIM15 0x0E
#define PALMAS_GPADC_TRIM16 0x0F
/* TPS659038 regen2_ctrl offset is different from palmas */
#define TPS659038_REGEN2_CTRL 0x12
/* TPS65917 Interrupt registers */
/* Registers for function INTERRUPT */

View file

@ -173,6 +173,7 @@ struct perf_event;
* pmu::capabilities flags
*/
#define PERF_PMU_CAP_NO_INTERRUPT 0x01
#define PERF_PMU_CAP_NO_NMI 0x02
/**
* struct pmu - generic performance monitoring unit
@ -457,6 +458,7 @@ struct perf_event {
struct pid_namespace *ns;
u64 id;
u64 (*clock)(void);
perf_overflow_handler_t overflow_handler;
void *overflow_handler_context;

View file

@ -316,7 +316,7 @@ struct regulator_desc {
* @driver_data: private regulator data
* @of_node: OpenFirmware node to parse for device tree bindings (may be
* NULL).
* @regmap: regmap to use for core regmap helpers if dev_get_regulator() is
* @regmap: regmap to use for core regmap helpers if dev_get_regmap() is
* insufficient.
* @ena_gpio_initialized: GPIO controlling regulator enable was properly
* initialized, meaning that >= 0 is a valid gpio

View file

@ -1625,11 +1625,11 @@ struct task_struct {
/*
* numa_faults_locality tracks if faults recorded during the last
* scan window were remote/local. The task scan period is adapted
* based on the locality of the faults with different weights
* depending on whether they were shared or private faults
* scan window were remote/local or failed to migrate. The task scan
* period is adapted based on the locality of the faults with different
* weights depending on whether they were shared or private faults
*/
unsigned long numa_faults_locality[2];
unsigned long numa_faults_locality[3];
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
@ -1719,6 +1719,7 @@ struct task_struct {
#define TNF_NO_GROUP 0x02
#define TNF_SHARED 0x04
#define TNF_FAULT_LOCAL 0x08
#define TNF_MIGRATE_FAIL 0x10
#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int last_node, int node, int pages, int flags);

View file

@ -16,16 +16,16 @@
* @read: Read function of @clock
* @mask: Bitmask for two's complement subtraction of non 64bit clocks
* @cycle_last: @clock cycle value at last update
* @mult: NTP adjusted multiplier for scaled math conversion
* @mult: (NTP adjusted) multiplier for scaled math conversion
* @shift: Shift value for scaled math conversion
* @xtime_nsec: Shifted (fractional) nano seconds offset for readout
* @base_mono: ktime_t (nanoseconds) base time for readout
* @base: ktime_t (nanoseconds) base time for readout
*
* This struct has a size of 56 bytes on 64 bit. Together with a seqcount it
* occupies a single 64-byte cache line.
*
* The struct is separate from struct timekeeper as it is also used
* for a fast NMI safe accessor to clock monotonic.
* for fast NMI safe accessors.
*/
struct tk_read_base {
struct clocksource *clock;
@ -35,12 +35,13 @@ struct tk_read_base {
u32 mult;
u32 shift;
u64 xtime_nsec;
ktime_t base_mono;
ktime_t base;
};
/**
* struct timekeeper - Structure holding internal timekeeping values.
* @tkr: The readout base structure
* @tkr_mono: The readout base structure for CLOCK_MONOTONIC
* @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW
* @xtime_sec: Current CLOCK_REALTIME time in seconds
* @ktime_sec: Current CLOCK_MONOTONIC time in seconds
* @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset
@ -48,7 +49,6 @@ struct tk_read_base {
* @offs_boot: Offset clock monotonic -> clock boottime
* @offs_tai: Offset clock monotonic -> clock tai
* @tai_offset: The current UTC to TAI offset in seconds
* @base_raw: Monotonic raw base time in ktime_t format
* @raw_time: Monotonic raw base time in timespec64 format
* @cycle_interval: Number of clock cycles in one NTP interval
* @xtime_interval: Number of clock shifted nano seconds in one NTP
@ -76,7 +76,8 @@ struct tk_read_base {
* used instead.
*/
struct timekeeper {
struct tk_read_base tkr;
struct tk_read_base tkr_mono;
struct tk_read_base tkr_raw;
u64 xtime_sec;
unsigned long ktime_sec;
struct timespec64 wall_to_monotonic;
@ -84,7 +85,6 @@ struct timekeeper {
ktime_t offs_boot;
ktime_t offs_tai;
s32 tai_offset;
ktime_t base_raw;
struct timespec64 raw_time;
/* The following members are for timekeeping internal use */

View file

@ -214,12 +214,18 @@ static inline u64 ktime_get_boot_ns(void)
return ktime_to_ns(ktime_get_boottime());
}
static inline u64 ktime_get_tai_ns(void)
{
return ktime_to_ns(ktime_get_clocktai());
}
static inline u64 ktime_get_raw_ns(void)
{
return ktime_to_ns(ktime_get_raw());
}
extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
/*
* Timespec interfaces utilizing the ktime based ones

View file

@ -79,6 +79,16 @@ void nf_log_packet(struct net *net,
const struct nf_loginfo *li,
const char *fmt, ...);
__printf(8, 9)
void nf_log_trace(struct net *net,
u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
const struct nf_loginfo *li,
const char *fmt, ...);
struct nf_log_buf;
struct nf_log_buf *nf_log_buf_open(void);

View file

@ -7,27 +7,26 @@
#include <linux/ktime.h>
#include <linux/tracepoint.h>
struct device;
struct regmap;
#include "../../../drivers/base/regmap/internal.h"
/*
* Log register events
*/
DECLARE_EVENT_CLASS(regmap_reg,
TP_PROTO(struct device *dev, unsigned int reg,
TP_PROTO(struct regmap *map, unsigned int reg,
unsigned int val),
TP_ARGS(dev, reg, val),
TP_ARGS(map, reg, val),
TP_STRUCT__entry(
__string( name, dev_name(dev) )
__field( unsigned int, reg )
__field( unsigned int, val )
__string( name, regmap_name(map) )
__field( unsigned int, reg )
__field( unsigned int, val )
),
TP_fast_assign(
__assign_str(name, dev_name(dev));
__assign_str(name, regmap_name(map));
__entry->reg = reg;
__entry->val = val;
),
@ -39,45 +38,45 @@ DECLARE_EVENT_CLASS(regmap_reg,
DEFINE_EVENT(regmap_reg, regmap_reg_write,
TP_PROTO(struct device *dev, unsigned int reg,
TP_PROTO(struct regmap *map, unsigned int reg,
unsigned int val),
TP_ARGS(dev, reg, val)
TP_ARGS(map, reg, val)
);
DEFINE_EVENT(regmap_reg, regmap_reg_read,
TP_PROTO(struct device *dev, unsigned int reg,
TP_PROTO(struct regmap *map, unsigned int reg,
unsigned int val),
TP_ARGS(dev, reg, val)
TP_ARGS(map, reg, val)
);
DEFINE_EVENT(regmap_reg, regmap_reg_read_cache,
TP_PROTO(struct device *dev, unsigned int reg,
TP_PROTO(struct regmap *map, unsigned int reg,
unsigned int val),
TP_ARGS(dev, reg, val)
TP_ARGS(map, reg, val)
);
DECLARE_EVENT_CLASS(regmap_block,
TP_PROTO(struct device *dev, unsigned int reg, int count),
TP_PROTO(struct regmap *map, unsigned int reg, int count),
TP_ARGS(dev, reg, count),
TP_ARGS(map, reg, count),
TP_STRUCT__entry(
__string( name, dev_name(dev) )
__field( unsigned int, reg )
__field( int, count )
__string( name, regmap_name(map) )
__field( unsigned int, reg )
__field( int, count )
),
TP_fast_assign(
__assign_str(name, dev_name(dev));
__assign_str(name, regmap_name(map));
__entry->reg = reg;
__entry->count = count;
),
@ -89,48 +88,48 @@ DECLARE_EVENT_CLASS(regmap_block,
DEFINE_EVENT(regmap_block, regmap_hw_read_start,
TP_PROTO(struct device *dev, unsigned int reg, int count),
TP_PROTO(struct regmap *map, unsigned int reg, int count),
TP_ARGS(dev, reg, count)
TP_ARGS(map, reg, count)
);
DEFINE_EVENT(regmap_block, regmap_hw_read_done,
TP_PROTO(struct device *dev, unsigned int reg, int count),
TP_PROTO(struct regmap *map, unsigned int reg, int count),
TP_ARGS(dev, reg, count)
TP_ARGS(map, reg, count)
);
DEFINE_EVENT(regmap_block, regmap_hw_write_start,
TP_PROTO(struct device *dev, unsigned int reg, int count),
TP_PROTO(struct regmap *map, unsigned int reg, int count),
TP_ARGS(dev, reg, count)
TP_ARGS(map, reg, count)
);
DEFINE_EVENT(regmap_block, regmap_hw_write_done,
TP_PROTO(struct device *dev, unsigned int reg, int count),
TP_PROTO(struct regmap *map, unsigned int reg, int count),
TP_ARGS(dev, reg, count)
TP_ARGS(map, reg, count)
);
TRACE_EVENT(regcache_sync,
TP_PROTO(struct device *dev, const char *type,
TP_PROTO(struct regmap *map, const char *type,
const char *status),
TP_ARGS(dev, type, status),
TP_ARGS(map, type, status),
TP_STRUCT__entry(
__string( name, dev_name(dev) )
__string( status, status )
__string( type, type )
__field( int, type )
__string( name, regmap_name(map) )
__string( status, status )
__string( type, type )
__field( int, type )
),
TP_fast_assign(
__assign_str(name, dev_name(dev));
__assign_str(name, regmap_name(map));
__assign_str(status, status);
__assign_str(type, type);
),
@ -141,17 +140,17 @@ TRACE_EVENT(regcache_sync,
DECLARE_EVENT_CLASS(regmap_bool,
TP_PROTO(struct device *dev, bool flag),
TP_PROTO(struct regmap *map, bool flag),
TP_ARGS(dev, flag),
TP_ARGS(map, flag),
TP_STRUCT__entry(
__string( name, dev_name(dev) )
__field( int, flag )
__string( name, regmap_name(map) )
__field( int, flag )
),
TP_fast_assign(
__assign_str(name, dev_name(dev));
__assign_str(name, regmap_name(map));
__entry->flag = flag;
),
@ -161,32 +160,32 @@ DECLARE_EVENT_CLASS(regmap_bool,
DEFINE_EVENT(regmap_bool, regmap_cache_only,
TP_PROTO(struct device *dev, bool flag),
TP_PROTO(struct regmap *map, bool flag),
TP_ARGS(dev, flag)
TP_ARGS(map, flag)
);
DEFINE_EVENT(regmap_bool, regmap_cache_bypass,
TP_PROTO(struct device *dev, bool flag),
TP_PROTO(struct regmap *map, bool flag),
TP_ARGS(dev, flag)
TP_ARGS(map, flag)
);
DECLARE_EVENT_CLASS(regmap_async,
TP_PROTO(struct device *dev),
TP_PROTO(struct regmap *map),
TP_ARGS(dev),
TP_ARGS(map),
TP_STRUCT__entry(
__string( name, dev_name(dev) )
__string( name, regmap_name(map) )
),
TP_fast_assign(
__assign_str(name, dev_name(dev));
__assign_str(name, regmap_name(map));
),
TP_printk("%s", __get_str(name))
@ -194,50 +193,50 @@ DECLARE_EVENT_CLASS(regmap_async,
DEFINE_EVENT(regmap_block, regmap_async_write_start,
TP_PROTO(struct device *dev, unsigned int reg, int count),
TP_PROTO(struct regmap *map, unsigned int reg, int count),
TP_ARGS(dev, reg, count)
TP_ARGS(map, reg, count)
);
DEFINE_EVENT(regmap_async, regmap_async_io_complete,
TP_PROTO(struct device *dev),
TP_PROTO(struct regmap *map),
TP_ARGS(dev)
TP_ARGS(map)
);
DEFINE_EVENT(regmap_async, regmap_async_complete_start,
TP_PROTO(struct device *dev),
TP_PROTO(struct regmap *map),
TP_ARGS(dev)
TP_ARGS(map)
);
DEFINE_EVENT(regmap_async, regmap_async_complete_done,
TP_PROTO(struct device *dev),
TP_PROTO(struct regmap *map),
TP_ARGS(dev)
TP_ARGS(map)
);
TRACE_EVENT(regcache_drop_region,
TP_PROTO(struct device *dev, unsigned int from,
TP_PROTO(struct regmap *map, unsigned int from,
unsigned int to),
TP_ARGS(dev, from, to),
TP_ARGS(map, from, to),
TP_STRUCT__entry(
__string( name, dev_name(dev) )
__field( unsigned int, from )
__field( unsigned int, to )
__string( name, regmap_name(map) )
__field( unsigned int, from )
__field( unsigned int, to )
),
TP_fast_assign(
__assign_str(name, dev_name(dev));
__assign_str(name, regmap_name(map));
__entry->from = from;
__entry->to = to;
),

View file

@ -326,7 +326,8 @@ struct perf_event_attr {
exclude_callchain_user : 1, /* exclude user callchains */
mmap2 : 1, /* include mmap with inode data */
comm_exec : 1, /* flag comm events that are due to an exec */
__reserved_1 : 39;
use_clockid : 1, /* use @clockid for time fields */
__reserved_1 : 38;
union {
__u32 wakeup_events; /* wakeup every n events */
@ -355,8 +356,7 @@ struct perf_event_attr {
*/
__u32 sample_stack_user;
/* Align to u64. */
__u32 __reserved_2;
__s32 clockid;
/*
* Defines set of regs to dump for each sample
* state captured on:

View file

@ -327,6 +327,11 @@ static inline u64 perf_clock(void)
return local_clock();
}
static inline u64 perf_event_clock(struct perf_event *event)
{
return event->clock();
}
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
}
if (sample_type & PERF_SAMPLE_TIME)
data->time = perf_clock();
data->time = perf_event_clock(event);
if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
data->id = primary_event_id(event);
@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event,
task_event->event_id.tid = perf_event_tid(event, task);
task_event->event_id.ptid = perf_event_tid(event, current);
task_event->event_id.time = perf_event_clock(event);
perf_output_put(&handle, task_event->event_id);
perf_event__output_id_sample(event, &handle, &sample);
@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task,
/* .ppid */
/* .tid */
/* .ptid */
.time = perf_clock(),
/* .time */
},
};
@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
.misc = 0,
.size = sizeof(throttle_event),
},
.time = perf_clock(),
.time = perf_event_clock(event),
.id = primary_event_id(event),
.stream_id = event->id,
};
@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event)
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
.event_init = perf_swevent_init,
.add = perf_swevent_add,
.del = perf_swevent_del,
@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event)
static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
.del = cpu_clock_event_del,
@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct perf_event *event)
static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
.event_init = task_clock_event_init,
.add = task_clock_event_add,
.del = task_clock_event_del,
@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->hw.target = task;
}
event->clock = &local_clock;
if (parent_event)
event->clock = parent_event->clock;
if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
if (output_event->cpu == -1 && output_event->ctx != event->ctx)
goto out;
/*
* Mixing clocks in the same buffer is trouble you don't need.
*/
if (output_event->clock != event->clock)
goto out;
set:
mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;
switch (clk_id) {
case CLOCK_MONOTONIC:
event->clock = &ktime_get_mono_fast_ns;
nmi_safe = true;
break;
case CLOCK_MONOTONIC_RAW:
event->clock = &ktime_get_raw_fast_ns;
nmi_safe = true;
break;
case CLOCK_REALTIME:
event->clock = &ktime_get_real_ns;
break;
case CLOCK_BOOTTIME:
event->clock = &ktime_get_boot_ns;
break;
case CLOCK_TAI:
event->clock = &ktime_get_tai_ns;
break;
default:
return -EINVAL;
}
if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
return -EINVAL;
return 0;
}
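For context, perf_event_set_clock() is what backs the new use_clockid/clockid attr fields added in the uapi hunk above. A hedged userspace sketch of how a tool might request CLOCK_MONOTONIC_RAW timestamps (assumes headers from this series; error handling kept minimal):

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                               int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_TASK_CLOCK;
        attr.sample_period = 100000;
        attr.sample_type = PERF_SAMPLE_TIME;
        /* New in this series: sample/record times come from the chosen clock. */
        attr.use_clockid = 1;
        attr.clockid = CLOCK_MONOTONIC_RAW;

        fd = sys_perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        close(fd);
        return 0;
}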
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open,
*/
pmu = event->pmu;
if (attr.use_clockid) {
err = perf_event_set_clock(event, attr.clockid);
if (err)
goto err_alloc;
}
if (group_leader &&
(is_software_event(event) != is_software_event(group_leader))) {
if (is_software_event(event)) {
@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open,
*/
if (group_leader->group_leader != group_leader)
goto err_context;
/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
goto err_context;
/*
* Do not allow to attach to a group in a different
* task or CPU context:

View file

@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p,
/*
* If there were no record hinting faults then either the task is
* completely idle or all activity is in areas that are not of interest
* to automatic numa balancing. Scan slower
* to automatic numa balancing. Related to that, if there were failed
* migrations then it implies we are migrating too quickly or the local
* node is overloaded. In either case, scan slower
*/
if (local + shared == 0) {
if (local + shared == 0 || p->numa_faults_locality[2]) {
p->numa_scan_period = min(p->numa_scan_period_max,
p->numa_scan_period << 1);
@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
if (flags & TNF_MIGRATE_FAIL)
p->numa_faults_locality[2] += pages;
p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;

View file

@ -94,6 +94,57 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);
static int __clockevents_set_mode(struct clock_event_device *dev,
enum clock_event_mode mode)
{
/* Transition with legacy set_mode() callback */
if (dev->set_mode) {
/* Legacy callback doesn't support new modes */
if (mode > CLOCK_EVT_MODE_RESUME)
return -ENOSYS;
dev->set_mode(mode, dev);
return 0;
}
if (dev->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
/* Transition with new mode-specific callbacks */
switch (mode) {
case CLOCK_EVT_MODE_UNUSED:
/*
* This is an internal state, which is guaranteed to go from
* SHUTDOWN to UNUSED. No driver interaction required.
*/
return 0;
case CLOCK_EVT_MODE_SHUTDOWN:
return dev->set_mode_shutdown(dev);
case CLOCK_EVT_MODE_PERIODIC:
/* Core internal bug */
if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
return -ENOSYS;
return dev->set_mode_periodic(dev);
case CLOCK_EVT_MODE_ONESHOT:
/* Core internal bug */
if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
return -ENOSYS;
return dev->set_mode_oneshot(dev);
case CLOCK_EVT_MODE_RESUME:
/* Optional callback */
if (dev->set_mode_resume)
return dev->set_mode_resume(dev);
else
return 0;
default:
return -ENOSYS;
}
}
/**
* clockevents_set_mode - set the operating mode of a clock event device
* @dev: device to modify
@ -105,7 +156,9 @@ void clockevents_set_mode(struct clock_event_device *dev,
enum clock_event_mode mode)
{
if (dev->mode != mode) {
dev->set_mode(mode, dev);
if (__clockevents_set_mode(dev, mode))
return;
dev->mode = mode;
/*
@ -373,6 +426,35 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
}
EXPORT_SYMBOL_GPL(clockevents_unbind);
/* Sanity check of mode transition callbacks */
static int clockevents_sanity_check(struct clock_event_device *dev)
{
/* Legacy set_mode() callback */
if (dev->set_mode) {
/* We shouldn't be supporting new modes now */
WARN_ON(dev->set_mode_periodic || dev->set_mode_oneshot ||
dev->set_mode_shutdown || dev->set_mode_resume);
return 0;
}
if (dev->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
/* New mode-specific callbacks */
if (!dev->set_mode_shutdown)
return -EINVAL;
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
!dev->set_mode_periodic)
return -EINVAL;
if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
!dev->set_mode_oneshot)
return -EINVAL;
return 0;
}
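The sanity check above spells out the contract for the per-mode callbacks. Purely as a hedged illustration, an out-of-tree driver written against this series might look roughly like the sketch below (the foo_* names, register programming and rating are invented; this only builds inside a kernel tree carrying these patches):

#include <linux/types.h>
#include <linux/clockchips.h>

#define FOO_TICKS_PER_JIFFY     10000   /* made-up hardware constant */

static void foo_hw_program(unsigned long ticks, bool periodic)
{
        /* Hypothetical register writes would go here. */
}

static int foo_set_mode_shutdown(struct clock_event_device *ced)
{
        foo_hw_program(0, false);                       /* stop the timer */
        return 0;
}

static int foo_set_mode_periodic(struct clock_event_device *ced)
{
        foo_hw_program(FOO_TICKS_PER_JIFFY, true);
        return 0;
}

static int foo_set_mode_oneshot(struct clock_event_device *ced)
{
        foo_hw_program(0, false);                       /* wait for set_next_event() */
        return 0;
}

static int foo_set_next_event(unsigned long ticks, struct clock_event_device *ced)
{
        foo_hw_program(ticks, false);
        return 0;
}

static struct clock_event_device foo_clockevent = {
        .name                   = "foo-timer",
        .features               = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .rating                 = 300,
        /* New-style callbacks only; no legacy ->set_mode(). */
        .set_mode_shutdown      = foo_set_mode_shutdown,
        .set_mode_periodic      = foo_set_mode_periodic,
        .set_mode_oneshot       = foo_set_mode_oneshot,
        .set_next_event         = foo_set_next_event,
};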
/**
* clockevents_register_device - register a clock event device
* @dev: device to register
@ -382,6 +464,8 @@ void clockevents_register_device(struct clock_event_device *dev)
unsigned long flags;
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
BUG_ON(clockevents_sanity_check(dev));
if (!dev->cpumask) {
WARN_ON(num_possible_cpus() > 1);
dev->cpumask = cpumask_of(smp_processor_id());
@ -449,7 +533,7 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
return clockevents_program_event(dev, dev->next_event, false);
if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);
return __clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
return 0;
}

View file

@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs)
schedule_work(&watchdog_work);
}
static void clocksource_unstable(struct clocksource *cs, int64_t delta)
{
printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
cs->name, delta);
__clocksource_unstable(cs);
}
/**
* clocksource_mark_unstable - mark clocksource unstable via watchdog
* @cs: clocksource to be marked unstable
@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
static void clocksource_watchdog(unsigned long data)
{
struct clocksource *cs;
cycle_t csnow, wdnow, delta;
cycle_t csnow, wdnow, cslast, wdlast, delta;
int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data)
delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
wdlast = cs->wd_last; /* save these in case we print them */
cslast = cs->cs_last;
cs->cs_last = csnow;
cs->wd_last = wdnow;
@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data)
/* Check the deviation from the watchdog clocksource. */
if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
clocksource_unstable(cs, cs_nsec - wd_nsec);
pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, csnow, cslast, cs->mask);
__clocksource_unstable(cs);
continue;
}
@ -469,26 +469,22 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
* @shift: cycle to nanosecond divisor (power of two)
* @maxadj: maximum adjustment value to mult (~11%)
* @mask: bitmask for two's complement subtraction of non 64 bit counters
* @max_cyc: maximum cycle value before potential overflow (does not include
* any safety margin)
*
* NOTE: This function includes a safety margin of 50%, so that bad clock values
* can be detected.
*/
u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
{
u64 max_nsecs, max_cycles;
/*
* Calculate the maximum number of cycles that we can pass to the
* cyc2ns function without overflowing a 64-bit signed result. The
* maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
* which is equivalent to the below.
* max_cycles < (2^63)/(mult + maxadj)
* max_cycles < 2^(log2((2^63)/(mult + maxadj)))
* max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
* max_cycles < 2^(63 - log2(mult + maxadj))
* max_cycles < 1 << (63 - log2(mult + maxadj))
* Please note that we add 1 to the result of the log2 to account for
* any rounding errors, ensure the above inequality is satisfied and
* no overflow will occur.
* cyc2ns() function without overflowing a 64-bit result.
*/
max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
max_cycles = ULLONG_MAX;
do_div(max_cycles, mult+maxadj);
/*
* The actual maximum number of cycles we can defer the clocksource is
@ -499,27 +495,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
max_cycles = min(max_cycles, mask);
max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
/* return the max_cycles value as well if requested */
if (max_cyc)
*max_cyc = max_cycles;
/* Return 50% of the actual maximum, so we can detect bad values */
max_nsecs >>= 1;
return max_nsecs;
}
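Replacing the log2 estimate with an exact division is easiest to see with concrete numbers. A hedged user-space re-derivation of the same bound for a hypothetical 1 GHz, 32-bit counter (this mirrors, but is not, the kernel routine above):

#include <stdio.h>
#include <stdint.h>

/* Mirror of clocksource_cyc2ns(): ns = (cycles * mult) >> shift. */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        /* Hypothetical clocksource: 1 GHz counter, 32-bit mask, 1 cycle == 1 ns. */
        uint32_t mult = 1, shift = 0, maxadj = 0;
        uint64_t mask = 0xffffffffULL;
        uint64_t max_cycles, max_nsecs;

        /* Exact bound: largest cycle count whose ns conversion fits in 64 bits. */
        max_cycles = UINT64_MAX / (mult + maxadj);

        /* Never defer past the point where the counter itself wraps. */
        if (max_cycles > mask)
                max_cycles = mask;

        /* Convert with the smallest plausible mult, then keep a 50% margin. */
        max_nsecs = cyc2ns(max_cycles, mult - maxadj, shift) >> 1;

        printf("max_cycles=0x%llx max_idle_ns=%llu\n",
               (unsigned long long)max_cycles, (unsigned long long)max_nsecs);
        return 0;
}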
/**
* clocksource_max_deferment - Returns max time the clocksource can be deferred
* @cs: Pointer to clocksource
* clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
* @cs: Pointer to clocksource to be updated
*
*/
static u64 clocksource_max_deferment(struct clocksource *cs)
static inline void clocksource_update_max_deferment(struct clocksource *cs)
{
u64 max_nsecs;
max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
cs->mask);
/*
* To ensure that the clocksource does not wrap whilst we are idle,
* limit the time the clocksource can be deferred by 12.5%. Please
* note a margin of 12.5% is used because this can be computed with
* a shift, versus say 10% which would require division.
*/
return max_nsecs - (max_nsecs >> 3);
cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
cs->maxadj, cs->mask,
&cs->max_cycles);
}
#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@ -648,7 +643,7 @@ static void clocksource_enqueue(struct clocksource *cs)
}
/**
* __clocksource_updatefreq_scale - Used update clocksource with new freq
* __clocksource_update_freq_scale - Used to update a clocksource with a new freq
* @cs: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
@ -656,48 +651,64 @@ static void clocksource_enqueue(struct clocksource *cs)
* This should only be called from the clocksource->enable() method.
*
* This *SHOULD NOT* be called directly! Please use the
* clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
* __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
* functions.
*/
void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
u64 sec;
/*
* Calc the maximum number of seconds which we can run before
* wrapping around. For clocksources which have a mask > 32bit
* we need to limit the max sleep time to have a good
* conversion precision. 10 minutes is still a reasonable
* amount. That results in a shift value of 24 for a
* clocksource with mask >= 40bit and f >= 4GHz. That maps to
* ~ 0.06ppm granularity for NTP. We apply the same 12.5%
* margin as we do in clocksource_max_deferment()
* Default clocksources are *special* and self-define their mult/shift.
* But, you're not special, so you should specify a freq value.
*/
sec = (cs->mask - (cs->mask >> 3));
do_div(sec, freq);
do_div(sec, scale);
if (!sec)
sec = 1;
else if (sec > 600 && cs->mask > UINT_MAX)
sec = 600;
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
NSEC_PER_SEC / scale, sec * scale);
if (freq) {
/*
* Calc the maximum number of seconds which we can run before
* wrapping around. For clocksources which have a mask > 32-bit
* we need to limit the max sleep time to have a good
* conversion precision. 10 minutes is still a reasonable
* amount. That results in a shift value of 24 for a
* clocksource with mask >= 40-bit and f >= 4GHz. That maps to
* ~ 0.06ppm granularity for NTP.
*/
sec = cs->mask;
do_div(sec, freq);
do_div(sec, scale);
if (!sec)
sec = 1;
else if (sec > 600 && cs->mask > UINT_MAX)
sec = 600;
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
NSEC_PER_SEC / scale, sec * scale);
}
/*
* for clocksources that have large mults, to avoid overflow.
* Since mult may be adjusted by ntp, add an safety extra margin
*
* Ensure clocksources that have large 'mult' values don't overflow
* when adjusted.
*/
cs->maxadj = clocksource_max_adjustment(cs);
while ((cs->mult + cs->maxadj < cs->mult)
|| (cs->mult - cs->maxadj > cs->mult)) {
while (freq && ((cs->mult + cs->maxadj < cs->mult)
|| (cs->mult - cs->maxadj > cs->mult))) {
cs->mult >>= 1;
cs->shift--;
cs->maxadj = clocksource_max_adjustment(cs);
}
cs->max_idle_ns = clocksource_max_deferment(cs);
/*
* Only warn for *special* clocksources that self-define
* their mult/shift values and don't specify a freq.
*/
WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
"timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
cs->name);
clocksource_update_max_deferment(cs);
pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
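The freq handling above ultimately feeds clocks_calc_mult_shift(), which picks mult/shift so that ns = (cycles * mult) >> shift is accurate over the chosen range. A minimal sketch of just that relationship, assuming a hypothetical 24 MHz clock and an arbitrarily chosen shift of 24 (the kernel derives the shift far more carefully):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC    1000000000ULL

int main(void)
{
        /* Hypothetical 24 MHz clocksource; shift value chosen for illustration. */
        uint64_t freq = 24000000;
        uint32_t shift = 24;
        uint32_t mult = (uint32_t)((NSEC_PER_SEC << shift) / freq);

        /* Round trip: 24000 cycles at 24 MHz should read back as ~1 ms. */
        uint64_t cycles = 24000;
        uint64_t ns = (cycles * mult) >> shift;

        printf("mult=%u shift=%u -> %llu cycles = %llu ns\n",
               mult, shift, (unsigned long long)cycles, (unsigned long long)ns);
        return 0;
}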
EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
/**
* __clocksource_register_scale - Used to install new clocksources
@ -714,7 +725,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
/* Initialize mult/shift and max_idle_ns */
__clocksource_updatefreq_scale(cs, scale, freq);
__clocksource_update_freq_scale(cs, scale, freq);
/* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
@ -726,33 +737,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
/**
* clocksource_register - Used to install new clocksources
* @cs: clocksource to be registered
*
* Returns -EBUSY if registration fails, zero otherwise.
*/
int clocksource_register(struct clocksource *cs)
{
/* calculate max adjustment for given mult/shift */
cs->maxadj = clocksource_max_adjustment(cs);
WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
"Clocksource %s might overflow on 11%% adjustment\n",
cs->name);
/* calculate max idle time permitted for this clocksource */
cs->max_idle_ns = clocksource_max_deferment(cs);
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
}
EXPORT_SYMBOL(clocksource_register);
static void __clocksource_change_rating(struct clocksource *cs, int rating)
{
list_del(&cs->list);

View file

@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = {
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
.max_cycles = 10,
};
__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies);
static int __init init_jiffies_clocksource(void)
{
return clocksource_register(&clocksource_jiffies);
return __clocksource_register(&clocksource_jiffies);
}
core_initcall(init_jiffies_clocksource);
@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second)
refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
clocksource_register(&refined_jiffies);
__clocksource_register(&refined_jiffies);
return 0;
}

View file

@ -1,5 +1,6 @@
/*
* sched_clock.c: support for extending counters to full 64-bit ns counter
* sched_clock.c: Generic sched_clock() support, to extend low level
* hardware time counters to full 64-bit ns values.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@ -18,15 +19,53 @@
#include <linux/seqlock.h>
#include <linux/bitops.h>
struct clock_data {
ktime_t wrap_kt;
/**
* struct clock_read_data - data required to read from sched_clock()
*
* @epoch_ns: sched_clock() value at last update
* @epoch_cyc: Clock cycle value at last update.
* @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
* clocks.
* @read_sched_clock: Current clock source (or dummy source when suspended).
* @mult: Multiplier for scaled math conversion.
* @shift: Shift value for scaled math conversion.
*
* Care must be taken when updating this structure; it is read by
* some very hot code paths. It occupies <=40 bytes and, when combined
* with the seqcount used to synchronize access, comfortably fits into
* a 64 byte cache line.
*/
struct clock_read_data {
u64 epoch_ns;
u64 epoch_cyc;
seqcount_t seq;
unsigned long rate;
u64 sched_clock_mask;
u64 (*read_sched_clock)(void);
u32 mult;
u32 shift;
bool suspended;
};
/**
* struct clock_data - all data needed for sched_clock() (including
* registration of a new clock source)
*
* @seq: Sequence counter for protecting updates. The lowest
* bit is the index for @read_data.
* @read_data: Data required to read from sched_clock.
* @wrap_kt: Duration for which clock can run before wrapping.
* @rate: Tick rate of the registered clock.
* @actual_read_sched_clock: Registered hardware level clock read function.
*
* The ordering of this structure has been chosen to optimize cache
* performance. In particular 'seq' and 'read_data[0]' (combined) should fit
* into a single 64-byte cache line.
*/
struct clock_data {
seqcount_t seq;
struct clock_read_data read_data[2];
ktime_t wrap_kt;
unsigned long rate;
u64 (*actual_read_sched_clock)(void);
};
static struct hrtimer sched_clock_timer;
@ -34,12 +73,6 @@ static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
static struct clock_data cd = {
.mult = NSEC_PER_SEC / HZ,
};
static u64 __read_mostly sched_clock_mask;
static u64 notrace jiffy_sched_clock_read(void)
{
/*
@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void)
return (u64)(jiffies - INITIAL_JIFFIES);
}
static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
static struct clock_data cd ____cacheline_aligned = {
.read_data[0] = { .mult = NSEC_PER_SEC / HZ,
.read_sched_clock = jiffy_sched_clock_read, },
.actual_read_sched_clock = jiffy_sched_clock_read,
};
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void)
{
u64 epoch_ns;
u64 epoch_cyc;
u64 cyc;
u64 cyc, res;
unsigned long seq;
if (cd.suspended)
return cd.epoch_ns;
struct clock_read_data *rd;
do {
seq = raw_read_seqcount_begin(&cd.seq);
epoch_cyc = cd.epoch_cyc;
epoch_ns = cd.epoch_ns;
seq = raw_read_seqcount(&cd.seq);
rd = cd.read_data + (seq & 1);
cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
rd->sched_clock_mask;
res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
} while (read_seqcount_retry(&cd.seq, seq));
cyc = read_sched_clock();
cyc = (cyc - epoch_cyc) & sched_clock_mask;
return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
return res;
}
/*
* Atomically update the sched_clock epoch.
* Updating the data required to read the clock.
*
* sched_clock() will never observe mis-matched data even if called from
* an NMI. We do this by maintaining an odd/even copy of the data and
* steering sched_clock() to one or the other using a sequence counter.
* In order to preserve the data cache profile of sched_clock() as much
* as possible the system reverts back to the even copy when the update
* completes; the odd copy is used *only* during an update.
*/
static void notrace update_sched_clock(void)
static void update_clock_read_data(struct clock_read_data *rd)
{
/* update the backup (odd) copy with the new data */
cd.read_data[1] = *rd;
/* steer readers towards the odd copy */
raw_write_seqcount_latch(&cd.seq);
/* now it's safe for us to update the normal (even) copy */
cd.read_data[0] = *rd;
/* switch readers back to the even copy */
raw_write_seqcount_latch(&cd.seq);
}
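The two raw_write_seqcount_latch() calls above implement the odd/even scheme described in the comment: readers are steered to the backup copy while the normal copy is rewritten, and a changed sequence count makes them retry. A rough user-space analogue of that protocol, using C11 atomics and conservative seq_cst fences in place of the kernel's seqcount primitives:

/*
 * User-space analogue of the odd/even latch used by update_clock_read_data().
 * The torn read that can occur while a copy is being rewritten is discarded
 * by the retry loop, as in the kernel.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct snapshot {                       /* stands in for clock_read_data */
	uint64_t epoch_ns;
	uint64_t epoch_cyc;
};

static _Atomic unsigned int seq;        /* lowest bit selects the copy */
static struct snapshot copies[2];       /* [0] = normal, [1] = backup  */

static void publish(const struct snapshot *src)
{
	copies[1] = *src;                       /* update the backup copy     */
	atomic_thread_fence(memory_order_seq_cst);
	atomic_fetch_add(&seq, 1);              /* odd: readers -> copies[1]  */
	atomic_thread_fence(memory_order_seq_cst);
	copies[0] = *src;                       /* update the normal copy     */
	atomic_thread_fence(memory_order_seq_cst);
	atomic_fetch_add(&seq, 1);              /* even: readers -> copies[0] */
}

static struct snapshot read_snapshot(void)
{
	struct snapshot s;
	unsigned int start;

	do {
		start = atomic_load(&seq);
		s = copies[start & 1];
		atomic_thread_fence(memory_order_seq_cst);
	} while (atomic_load(&seq) != start);   /* retry on concurrent update */

	return s;
}

int main(void)
{
	struct snapshot s = { .epoch_ns = 123456789, .epoch_cyc = 42 };

	publish(&s);
	s = read_snapshot();
	printf("epoch_ns=%llu epoch_cyc=%llu\n",
	       (unsigned long long)s.epoch_ns, (unsigned long long)s.epoch_cyc);
	return 0;
}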
/*
* Atomically update the sched_clock() epoch.
*/
static void update_sched_clock(void)
{
unsigned long flags;
u64 cyc;
u64 ns;
struct clock_read_data rd;
cyc = read_sched_clock();
ns = cd.epoch_ns +
cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
cd.mult, cd.shift);
rd = cd.read_data[0];
raw_local_irq_save(flags);
raw_write_seqcount_begin(&cd.seq);
cd.epoch_ns = ns;
cd.epoch_cyc = cyc;
raw_write_seqcount_end(&cd.seq);
raw_local_irq_restore(flags);
cyc = cd.actual_read_sched_clock();
ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
rd.epoch_ns = ns;
rd.epoch_cyc = cyc;
update_clock_read_data(&rd);
}
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
{
update_sched_clock();
hrtimer_forward_now(hrt, cd.wrap_kt);
return HRTIMER_RESTART;
}
void __init sched_clock_register(u64 (*read)(void), int bits,
unsigned long rate)
void __init
sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
ktime_t new_wrap_kt;
unsigned long r;
char r_unit;
struct clock_read_data rd;
if (cd.rate > rate)
return;
WARN_ON(!irqs_disabled());
/* calculate the mult/shift to convert counter ticks to ns. */
/* Calculate the mult/shift to convert counter ticks to ns. */
clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
new_mask = CLOCKSOURCE_MASK(bits);
/* calculate how many ns until we wrap */
wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
/* update epoch for new counter and update epoch_ns from old counter*/
new_epoch = read();
cyc = read_sched_clock();
ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
cd.mult, cd.shift);
raw_write_seqcount_begin(&cd.seq);
read_sched_clock = read;
sched_clock_mask = new_mask;
cd.rate = rate;
cd.wrap_kt = new_wrap_kt;
cd.mult = new_mult;
cd.shift = new_shift;
cd.epoch_cyc = new_epoch;
cd.epoch_ns = ns;
raw_write_seqcount_end(&cd.seq);
/* Calculate how many nanosecs until we risk wrapping */
wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
cd.wrap_kt = ns_to_ktime(wrap);
rd = cd.read_data[0];
/* Update epoch for new counter and update 'epoch_ns' from old counter */
new_epoch = read();
cyc = cd.actual_read_sched_clock();
ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
cd.actual_read_sched_clock = read;
rd.read_sched_clock = read;
rd.sched_clock_mask = new_mask;
rd.mult = new_mult;
rd.shift = new_shift;
rd.epoch_cyc = new_epoch;
rd.epoch_ns = ns;
update_clock_read_data(&rd);
r = rate;
if (r >= 4000000) {
r /= 1000000;
r_unit = 'M';
} else if (r >= 1000) {
r /= 1000;
r_unit = 'k';
} else
r_unit = ' ';
} else {
if (r >= 1000) {
r /= 1000;
r_unit = 'k';
} else {
r_unit = ' ';
}
}
/* calculate the ns resolution of this counter */
/* Calculate the ns resolution of this counter */
res = cyc_to_ns(1ULL, new_mult, new_shift);
pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
bits, r, r_unit, res, wrap);
/* Enable IRQ time accounting if we have a fast enough sched_clock */
/* Enable IRQ time accounting if we have a fast enough sched_clock() */
if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
enable_sched_clock_irqtime();
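The registration path above boils down to picking a (mult, shift) pair so that ns = (cyc * mult) >> shift holds for the counter's rate, then deriving the resolution and wrap interval from it. A self-contained sketch of that scaled-math conversion for an assumed 24 MHz counter; the kernel derives the pair with clocks_calc_mult_shift(), the value below is hand-computed:

/*
 * Sketch of the (cyc * mult) >> shift conversion used by cyc_to_ns().
 * The 24 MHz rate and the shift of 24 are illustrative assumptions;
 * clocks_calc_mult_shift() additionally bounds the shift so the
 * multiplication cannot overflow 64 bits over the requested interval.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;
}

int main(void)
{
	uint32_t shift = 24;
	/* mult ~= (1e9 << shift) / rate, i.e. 1e9 * 2^24 / 24e6, rounded */
	uint32_t mult = 699050667;

	printf("resolution: %llu ns/tick\n",
	       (unsigned long long)cyc_to_ns(1, mult, shift));        /* ~41   */
	printf("one second of ticks: %llu ns\n",
	       (unsigned long long)cyc_to_ns(24000000, mult, shift)); /* ~1e9  */
	return 0;
}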
@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
void __init sched_clock_postinit(void)
{
/*
* If no sched_clock function has been provided at that point,
* If no sched_clock() function has been provided at that point,
* make it the final one.
*/
if (read_sched_clock == jiffy_sched_clock_read)
if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock();
@ -189,29 +251,53 @@ void __init sched_clock_postinit(void)
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
}
/*
* Clock read function for use when the clock is suspended.
*
* This function makes it appear to sched_clock() as if the clock
* stopped counting at its last update.
*
* This function must only be called from the critical
* section in sched_clock(). It relies on the read_seqcount_retry()
* at the end of the critical section to be sure we observe the
* correct copy of 'epoch_cyc'.
*/
static u64 notrace suspended_sched_clock_read(void)
{
unsigned long seq = raw_read_seqcount(&cd.seq);
return cd.read_data[seq & 1].epoch_cyc;
}
static int sched_clock_suspend(void)
{
struct clock_read_data *rd = &cd.read_data[0];
update_sched_clock();
hrtimer_cancel(&sched_clock_timer);
cd.suspended = true;
rd->read_sched_clock = suspended_sched_clock_read;
return 0;
}
static void sched_clock_resume(void)
{
cd.epoch_cyc = read_sched_clock();
struct clock_read_data *rd = &cd.read_data[0];
rd->epoch_cyc = cd.actual_read_sched_clock();
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
cd.suspended = false;
rd->read_sched_clock = cd.actual_read_sched_clock;
}
static struct syscore_ops sched_clock_ops = {
.suspend = sched_clock_suspend,
.resume = sched_clock_resume,
.suspend = sched_clock_suspend,
.resume = sched_clock_resume,
};
static int __init sched_clock_syscore_init(void)
{
register_syscore_ops(&sched_clock_ops);
return 0;
}
device_initcall(sched_clock_syscore_init);

View file

@ -59,6 +59,7 @@ struct tk_fast {
};
static struct tk_fast tk_fast_mono ____cacheline_aligned;
static struct tk_fast tk_fast_raw ____cacheline_aligned;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@ -68,8 +69,8 @@ bool __read_mostly persistent_clock_exist = false;
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
tk->xtime_sec++;
}
}
@ -79,20 +80,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk)
struct timespec64 ts;
ts.tv_sec = tk->xtime_sec;
ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
return ts;
}
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec += ts->tv_sec;
tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
}
@ -118,6 +119,117 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
/*
* These simple flag variables are managed
* without locks, which is racy, but ok since
* we don't really care about being super
* precise about how many events were seen,
* just that a problem was observed.
*/
static int timekeeping_underflow_seen;
static int timekeeping_overflow_seen;
/* last_warning is only modified under the timekeeping lock */
static long timekeeping_last_warning;
static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
{
cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
const char *name = tk->tkr_mono.clock->name;
if (offset > max_cycles) {
printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
offset, name, max_cycles);
printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
} else {
if (offset > (max_cycles >> 1)) {
printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n",
offset, name, max_cycles >> 1);
printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
}
}
if (timekeeping_underflow_seen) {
if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
printk_deferred(" Your kernel is probably still fine.\n");
timekeeping_last_warning = jiffies;
}
timekeeping_underflow_seen = 0;
}
if (timekeeping_overflow_seen) {
if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
printk_deferred(" Your kernel is probably still fine.\n");
timekeeping_last_warning = jiffies;
}
timekeeping_overflow_seen = 0;
}
}
static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
{
cycle_t now, last, mask, max, delta;
unsigned int seq;
/*
* Since we're called holding a seqlock, the data may shift
* under us while we're doing the calculation. This can cause
* false positives, since we'd note a problem but throw the
* results away. So nest another seqlock here to atomically
* grab the points we are checking with.
*/
do {
seq = read_seqcount_begin(&tk_core.seq);
now = tkr->read(tkr->clock);
last = tkr->cycle_last;
mask = tkr->mask;
max = tkr->clock->max_cycles;
} while (read_seqcount_retry(&tk_core.seq, seq));
delta = clocksource_delta(now, last, mask);
/*
* Try to catch underflows by checking if we are seeing small
* mask-relative negative values.
*/
if (unlikely((~delta & mask) < (mask >> 3))) {
timekeeping_underflow_seen = 1;
delta = 0;
}
/* Cap delta value to the max_cycles values to avoid mult overflows */
if (unlikely(delta > max)) {
timekeeping_overflow_seen = 1;
delta = tkr->clock->max_cycles;
}
return delta;
}
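The masked subtraction used by timekeeping_get_delta() treats the counter as a modular value: a legitimate wrap-around still produces a small positive delta, while a counter that appears to have run backwards yields a value just below the mask, which is what the (~delta & mask) < (mask >> 3) test catches. A small stand-alone sketch of both cases for an assumed 32-bit clocksource (clocksource_delta() is essentially a masked subtraction):

/*
 * Sketch of the masked-delta arithmetic and underflow heuristic above,
 * for a hypothetical 32-bit counter.
 */
#include <stdint.h>
#include <stdio.h>

#define MASK 0xffffffffULL              /* 32-bit clocksource */

static uint64_t clocksource_delta(uint64_t now, uint64_t last, uint64_t mask)
{
	return (now - last) & mask;
}

static void check(uint64_t now, uint64_t last)
{
	uint64_t delta = clocksource_delta(now, last, MASK);

	if ((~delta & MASK) < (MASK >> 3))
		printf("now=%#llx last=%#llx -> underflow suspected (delta %#llx)\n",
		       (unsigned long long)now, (unsigned long long)last,
		       (unsigned long long)delta);
	else
		printf("now=%#llx last=%#llx -> delta %llu cycles\n",
		       (unsigned long long)now, (unsigned long long)last,
		       (unsigned long long)delta);
}

int main(void)
{
	check(0x00000010, 0xfffffff0);   /* counter wrapped: delta = 0x20 */
	check(0xfffffff0, 0x00000010);   /* counter "ran backwards"       */
	return 0;
}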
#else
static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
{
}
static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
{
cycle_t cycle_now, delta;
/* read clocksource */
cycle_now = tkr->read(tkr->clock);
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
return delta;
}
#endif
/**
* tk_setup_internals - Set up internals to use clocksource clock.
*
@ -135,11 +247,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
u64 tmp, ntpinterval;
struct clocksource *old_clock;
old_clock = tk->tkr.clock;
tk->tkr.clock = clock;
tk->tkr.read = clock->read;
tk->tkr.mask = clock->mask;
tk->tkr.cycle_last = tk->tkr.read(clock);
old_clock = tk->tkr_mono.clock;
tk->tkr_mono.clock = clock;
tk->tkr_mono.read = clock->read;
tk->tkr_mono.mask = clock->mask;
tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
tk->tkr_raw.clock = clock;
tk->tkr_raw.read = clock->read;
tk->tkr_raw.mask = clock->mask;
tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
@ -163,11 +280,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
if (shift_change < 0)
tk->tkr.xtime_nsec >>= -shift_change;
tk->tkr_mono.xtime_nsec >>= -shift_change;
else
tk->tkr.xtime_nsec <<= shift_change;
tk->tkr_mono.xtime_nsec <<= shift_change;
}
tk->tkr.shift = clock->shift;
tk->tkr_raw.xtime_nsec = 0;
tk->tkr_mono.shift = clock->shift;
tk->tkr_raw.shift = clock->shift;
tk->ntp_error = 0;
tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
@ -178,7 +298,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
* active clocksource. These values will be adjusted via NTP
* to counteract clock drifting.
*/
tk->tkr.mult = clock->mult;
tk->tkr_mono.mult = clock->mult;
tk->tkr_raw.mult = clock->mult;
tk->ntp_err_mult = 0;
}
@ -193,14 +314,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; }
static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
cycle_t cycle_now, delta;
cycle_t delta;
s64 nsec;
/* read clocksource: */
cycle_now = tkr->read(tkr->clock);
/* calculate the delta since the last update_wall_time: */
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
delta = timekeeping_get_delta(tkr);
nsec = delta * tkr->mult + tkr->xtime_nsec;
nsec >>= tkr->shift;
@ -209,25 +326,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
return nsec + arch_gettimeoffset();
}
static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
{
struct clocksource *clock = tk->tkr.clock;
cycle_t cycle_now, delta;
s64 nsec;
/* read clocksource: */
cycle_now = tk->tkr.read(clock);
/* calculate the delta since the last update_wall_time: */
delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
/* convert delta to nanoseconds. */
nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
/* If arch requires, add in get_arch_timeoffset() */
return nsec + arch_gettimeoffset();
}
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
* @tkr: Timekeeping readout base from which we take the update
@ -267,18 +365,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
* slightly wrong timestamp (a few nanoseconds). See
* @ktime_get_mono_fast_ns.
*/
static void update_fast_timekeeper(struct tk_read_base *tkr)
static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf)
{
struct tk_read_base *base = tk_fast_mono.base;
struct tk_read_base *base = tkf->base;
/* Force readers off to base[1] */
raw_write_seqcount_latch(&tk_fast_mono.seq);
raw_write_seqcount_latch(&tkf->seq);
/* Update base[0] */
memcpy(base, tkr, sizeof(*base));
/* Force readers back to base[0] */
raw_write_seqcount_latch(&tk_fast_mono.seq);
raw_write_seqcount_latch(&tkf->seq);
/* Update base[1] */
memcpy(base + 1, base, sizeof(*base));
@ -316,22 +414,33 @@ static void update_fast_timekeeper(struct tk_read_base *tkr)
* of the following timestamps. Callers need to be aware of that and
* deal with it.
*/
u64 notrace ktime_get_mono_fast_ns(void)
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
struct tk_read_base *tkr;
unsigned int seq;
u64 now;
do {
seq = raw_read_seqcount(&tk_fast_mono.seq);
tkr = tk_fast_mono.base + (seq & 0x01);
now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
seq = raw_read_seqcount(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
} while (read_seqcount_retry(&tkf->seq, seq));
} while (read_seqcount_retry(&tk_fast_mono.seq, seq));
return now;
}
u64 ktime_get_mono_fast_ns(void)
{
return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
u64 ktime_get_raw_fast_ns(void)
{
return __ktime_get_fast_ns(&tk_fast_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
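Because the fast accessors never spin on the main timekeeping seqcount, they can be sampled from contexts where ktime_get() is unsafe (NMI and some tracing paths), at the cost of a possibly slightly stale value. A hypothetical module-style sketch of calling them; the module name and message below are made up for illustration:

/*
 * Illustrative (hypothetical) module sketch: sample the NMI-safe fast
 * accessors once at load time.
 */
#include <linux/module.h>
#include <linux/timekeeping.h>

static int __init fastclk_demo_init(void)
{
	u64 mono = ktime_get_mono_fast_ns();
	u64 raw  = ktime_get_raw_fast_ns();

	pr_info("fastclk_demo: mono=%llu ns raw=%llu ns\n", mono, raw);
	return 0;
}

static void __exit fastclk_demo_exit(void)
{
}

module_init(fastclk_demo_init);
module_exit(fastclk_demo_exit);
MODULE_LICENSE("GPL");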
/* Suspend-time cycles value for halted fast timekeeper. */
static cycle_t cycles_at_suspend;
@ -353,12 +462,17 @@ static cycle_t dummy_clock_read(struct clocksource *cs)
static void halt_fast_timekeeper(struct timekeeper *tk)
{
static struct tk_read_base tkr_dummy;
struct tk_read_base *tkr = &tk->tkr;
struct tk_read_base *tkr = &tk->tkr_mono;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
cycles_at_suspend = tkr->read(tkr->clock);
tkr_dummy.read = dummy_clock_read;
update_fast_timekeeper(&tkr_dummy);
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
tkr_dummy.read = dummy_clock_read;
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
@ -369,8 +483,8 @@ static inline void update_vsyscall(struct timekeeper *tk)
xt = timespec64_to_timespec(tk_xtime(tk));
wm = timespec64_to_timespec(tk->wall_to_monotonic);
update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
tk->tkr.cycle_last);
update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
tk->tkr_mono.cycle_last);
}
static inline void old_vsyscall_fixup(struct timekeeper *tk)
@ -387,11 +501,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
* users are removed, this can be killed.
*/
remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
tk->tkr.xtime_nsec -= remainder;
tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
tk->tkr_mono.xtime_nsec -= remainder;
tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
tk->ntp_error += remainder << tk->ntp_error_shift;
tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
}
#else
#define old_vsyscall_fixup(tk)
@ -456,17 +570,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
*/
seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
nsec = (u32) tk->wall_to_monotonic.tv_nsec;
tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
/* Update the monotonic raw base */
tk->base_raw = timespec64_to_ktime(tk->raw_time);
tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater/equal one second. Take
* this into account before updating tk->ktime_sec.
*/
nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift);
nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
if (nsec >= NSEC_PER_SEC)
seconds++;
tk->ktime_sec = seconds;
@ -489,7 +603,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
update_fast_timekeeper(&tk->tkr);
update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
}
/**
@ -501,22 +616,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
struct clocksource *clock = tk->tkr.clock;
struct clocksource *clock = tk->tkr_mono.clock;
cycle_t cycle_now, delta;
s64 nsec;
cycle_now = tk->tkr.read(clock);
delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
tk->tkr.cycle_last = cycle_now;
cycle_now = tk->tkr_mono.read(clock);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
tk->tkr.xtime_nsec += delta * tk->tkr.mult;
tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
/* If arch requires, add in get_arch_timeoffset() */
tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
timespec64_add_ns(&tk->raw_time, nsec);
}
@ -537,7 +653,7 @@ int __getnstimeofday64(struct timespec64 *ts)
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
nsecs = timekeeping_get_ns(&tk->tkr);
nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@ -577,8 +693,8 @@ ktime_t ktime_get(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
base = tk->tkr.base_mono;
nsecs = timekeeping_get_ns(&tk->tkr);
base = tk->tkr_mono.base;
nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@ -603,8 +719,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr.base_mono, *offset);
nsecs = timekeeping_get_ns(&tk->tkr);
base = ktime_add(tk->tkr_mono.base, *offset);
nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@ -645,8 +761,8 @@ ktime_t ktime_get_raw(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
base = tk->base_raw;
nsecs = timekeeping_get_ns_raw(tk);
base = tk->tkr_raw.base;
nsecs = timekeeping_get_ns(&tk->tkr_raw);
} while (read_seqcount_retry(&tk_core.seq, seq));
@ -674,7 +790,7 @@ void ktime_get_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
nsec = timekeeping_get_ns(&tk->tkr);
nsec = timekeeping_get_ns(&tk->tkr_mono);
tomono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
@ -759,8 +875,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
ts_real->tv_sec = tk->xtime_sec;
ts_real->tv_nsec = 0;
nsecs_raw = timekeeping_get_ns_raw(tk);
nsecs_real = timekeeping_get_ns(&tk->tkr);
nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@ -943,7 +1059,7 @@ static int change_clocksource(void *data)
*/
if (try_module_get(new->owner)) {
if (!new->enable || new->enable(new) == 0) {
old = tk->tkr.clock;
old = tk->tkr_mono.clock;
tk_setup_internals(tk, new);
if (old->disable)
old->disable(old);
@ -971,11 +1087,11 @@ int timekeeping_notify(struct clocksource *clock)
{
struct timekeeper *tk = &tk_core.timekeeper;
if (tk->tkr.clock == clock)
if (tk->tkr_mono.clock == clock)
return 0;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
return tk->tkr.clock == clock ? 0 : -1;
return tk->tkr_mono.clock == clock ? 0 : -1;
}
/**
@ -993,7 +1109,7 @@ void getrawmonotonic64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
nsecs = timekeeping_get_ns_raw(tk);
nsecs = timekeeping_get_ns(&tk->tkr_raw);
ts64 = tk->raw_time;
} while (read_seqcount_retry(&tk_core.seq, seq));
@ -1016,7 +1132,7 @@ int timekeeping_valid_for_hres(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
} while (read_seqcount_retry(&tk_core.seq, seq));
@ -1035,7 +1151,7 @@ u64 timekeeping_max_deferment(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
ret = tk->tkr.clock->max_idle_ns;
ret = tk->tkr_mono.clock->max_idle_ns;
} while (read_seqcount_retry(&tk_core.seq, seq));
@ -1114,7 +1230,6 @@ void __init timekeeping_init(void)
tk_set_xtime(tk, &now);
tk->raw_time.tv_sec = 0;
tk->raw_time.tv_nsec = 0;
tk->base_raw.tv64 = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
@ -1200,7 +1315,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
void timekeeping_resume(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct clocksource *clock = tk->tkr.clock;
struct clocksource *clock = tk->tkr_mono.clock;
unsigned long flags;
struct timespec64 ts_new, ts_delta;
struct timespec tmp;
@ -1228,16 +1343,16 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
cycle_now = tk->tkr.read(clock);
cycle_now = tk->tkr_mono.read(clock);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
cycle_now > tk->tkr.cycle_last) {
cycle_now > tk->tkr_mono.cycle_last) {
u64 num, max = ULLONG_MAX;
u32 mult = clock->mult;
u32 shift = clock->shift;
s64 nsec = 0;
cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
tk->tkr.mask);
cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
tk->tkr_mono.mask);
/*
* "cycle_delta * mutl" may cause 64 bits overflow, if the
@ -1263,7 +1378,9 @@ void timekeeping_resume(void)
__timekeeping_inject_sleeptime(tk, &ts_delta);
/* Re-base the last cycle value */
tk->tkr.cycle_last = cycle_now;
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
tk->ntp_error = 0;
timekeeping_suspended = 0;
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
@ -1416,15 +1533,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*
* XXX - TODO: Doc ntp_error calculation.
*/
if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
/* NTP adjustment caused clocksource mult overflow */
WARN_ON_ONCE(1);
return;
}
tk->tkr.mult += mult_adj;
tk->tkr_mono.mult += mult_adj;
tk->xtime_interval += interval;
tk->tkr.xtime_nsec -= offset;
tk->tkr_mono.xtime_nsec -= offset;
tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
}
@ -1486,13 +1603,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
tk->ntp_err_mult = 0;
}
if (unlikely(tk->tkr.clock->maxadj &&
(abs(tk->tkr.mult - tk->tkr.clock->mult)
> tk->tkr.clock->maxadj))) {
if (unlikely(tk->tkr_mono.clock->maxadj &&
(abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
> tk->tkr_mono.clock->maxadj))) {
printk_once(KERN_WARNING
"Adjusting %s more than 11%% (%ld vs %ld)\n",
tk->tkr.clock->name, (long)tk->tkr.mult,
(long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
(long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
}
/*
@ -1509,9 +1626,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
s64 neg = -(s64)tk->tkr.xtime_nsec;
tk->tkr.xtime_nsec = 0;
if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
tk->tkr_mono.xtime_nsec = 0;
tk->ntp_error += neg << tk->ntp_error_shift;
}
}
@ -1526,13 +1643,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
*/
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
unsigned int clock_set = 0;
while (tk->tkr.xtime_nsec >= nsecps) {
while (tk->tkr_mono.xtime_nsec >= nsecps) {
int leap;
tk->tkr.xtime_nsec -= nsecps;
tk->tkr_mono.xtime_nsec -= nsecps;
tk->xtime_sec++;
/* Figure out if it's a leap sec and apply if needed */
@ -1577,9 +1694,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
/* Accumulate one shifted interval */
offset -= interval;
tk->tkr.cycle_last += interval;
tk->tkr_mono.cycle_last += interval;
tk->tkr_raw.cycle_last += interval;
tk->tkr.xtime_nsec += tk->xtime_interval << shift;
tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
@ -1622,14 +1740,17 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
tk->tkr.cycle_last, tk->tkr.mask);
offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval)
goto out;
/* Do some additional sanity checking */
timekeeping_check_update(real_tk, offset);
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* (think "ticks") worth of time at once. To do this efficiently,
@ -1784,8 +1905,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
do {
seq = read_seqcount_begin(&tk_core.seq);
base = tk->tkr.base_mono;
nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
base = tk->tkr_mono.base;
nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
@ -1816,8 +1937,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
do {
seq = read_seqcount_begin(&tk_core.seq);
base = tk->tkr.base_mono;
nsecs = timekeeping_get_ns(&tk->tkr);
base = tk->tkr_mono.base;
nsecs = timekeeping_get_ns(&tk->tkr_mono);
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;

View file

@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->set_next_event);
SEQ_printf(m, "\n");
SEQ_printf(m, " set_mode: ");
print_name_offset(m, dev->set_mode);
SEQ_printf(m, "\n");
if (dev->set_mode) {
SEQ_printf(m, " set_mode: ");
print_name_offset(m, dev->set_mode);
SEQ_printf(m, "\n");
} else {
if (dev->set_mode_shutdown) {
SEQ_printf(m, " shutdown: ");
print_name_offset(m, dev->set_mode_shutdown);
SEQ_printf(m, "\n");
}
if (dev->set_mode_periodic) {
SEQ_printf(m, " periodic: ");
print_name_offset(m, dev->set_mode_periodic);
SEQ_printf(m, "\n");
}
if (dev->set_mode_oneshot) {
SEQ_printf(m, " oneshot: ");
print_name_offset(m, dev->set_mode_oneshot);
SEQ_printf(m, "\n");
}
if (dev->set_mode_resume) {
SEQ_printf(m, " resume: ");
print_name_offset(m, dev->set_mode_resume);
SEQ_printf(m, "\n");
}
}
SEQ_printf(m, " event_handler: ");
print_name_offset(m, dev->event_handler);

View file

@ -865,6 +865,19 @@ config SCHED_STACK_END_CHECK
data corruption or a sporadic crash at a later stage once the region
is examined. The runtime overhead introduced is minimal.
config DEBUG_TIMEKEEPING
bool "Enable extra timekeeping sanity checking"
help
This option will enable additional timekeeping sanity checks
which may be helpful when diagnosing issues where timekeeping
problems are suspected.
This may include checks in the timekeeping hotpaths, so this
option may have a (very small) performance impact on some
workloads.
If unsure, say N.
config TIMER_STATS
bool "Collect kernel timers statistics"
depends on DEBUG_KERNEL && PROC_FS

View file

@ -1260,6 +1260,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
int target_nid, last_cpupid = -1;
bool page_locked;
bool migrated = false;
bool was_writable;
int flags = 0;
/* A PROT_NONE fault should not end up here */
@ -1291,17 +1292,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
flags |= TNF_FAULT_LOCAL;
}
/*
* Avoid grouping on DSO/COW pages in specific and RO pages
* in general, RO pages shouldn't hurt as much anyway since
* they can be in shared cache state.
*
* FIXME! This checks "pmd_dirty()" as an approximation of
* "is this a read-only page", since checking "pmd_write()"
* is even more broken. We haven't actually turned this into
* a writable page, so pmd_write() will always be false.
*/
if (!pmd_dirty(pmd))
/* See similar comment in do_numa_page for explanation */
if (!(vma->vm_flags & VM_WRITE))
flags |= TNF_NO_GROUP;
/*
@ -1358,12 +1350,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
}
} else
flags |= TNF_MIGRATE_FAIL;
goto out;
clear_pmdnuma:
BUG_ON(!PageLocked(page));
was_writable = pmd_write(pmd);
pmd = pmd_modify(pmd, vma->vm_page_prot);
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
update_mmu_cache_pmd(vma, addr, pmdp);
unlock_page(page);
@ -1487,6 +1484,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
bool preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
/*
@ -1502,9 +1500,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (!prot_numa || !pmd_protnone(*pmd)) {
entry = pmdp_get_and_clear_notify(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mkwrite(entry);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(pmd_write(entry));
BUG_ON(!preserve_write && pmd_write(entry));
}
spin_unlock(ptl);
}

View file

@ -3035,6 +3035,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
int last_cpupid;
int target_nid;
bool migrated = false;
bool was_writable = pte_write(pte);
int flags = 0;
/* A PROT_NONE fault should not end up here */
@ -3059,6 +3060,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Make it present again */
pte = pte_modify(pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
set_pte_at(mm, addr, ptep, pte);
update_mmu_cache(vma, addr, ptep);
@ -3069,16 +3072,14 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
* Avoid grouping on DSO/COW pages in specific and RO pages
* in general, RO pages shouldn't hurt as much anyway since
* they can be in shared cache state.
*
* FIXME! This checks "pmd_dirty()" as an approximation of
* "is this a read-only page", since checking "pmd_write()"
* is even more broken. We haven't actually turned this into
* a writable page, so pmd_write() will always be false.
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
* much anyway since they can be in shared cache state. This misses
* the case where a mapping is writable but the process never writes
* to it but pte_write gets cleared during protection updates and
* pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour.
*/
if (!pte_dirty(pte))
if (!(vma->vm_flags & VM_WRITE))
flags |= TNF_NO_GROUP;
/*
@ -3102,7 +3103,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (migrated) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
}
} else
flags |= TNF_MIGRATE_FAIL;
out:
if (page_nid != -1)

View file

@ -1092,6 +1092,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
return NULL;
arch_refresh_nodedata(nid, pgdat);
} else {
/* Reset the nr_zones and classzone_idx to 0 before reuse */
pgdat->nr_zones = 0;
pgdat->classzone_idx = 0;
}
/* we can use NODE_DATA(nid) from here */
@ -1977,15 +1981,6 @@ void try_offline_node(int nid)
if (is_vmalloc_addr(zone->wait_table))
vfree(zone->wait_table);
}
/*
* Since there is no way to guarentee the address of pgdat/zone is not
* on stack of any kernel threads or used by other kernel objects
* without reference counting or other symchronizing method, do not
* reset node_data and free pgdat here. Just reset it to 0 and reuse
* the memory when the node is online again.
*/
memset(pgdat, 0, sizeof(*pgdat));
}
EXPORT_SYMBOL(try_offline_node);

View file

@ -774,10 +774,8 @@ again: remove_next = 1 + (end > next->vm_end);
importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
if (error) {
importer->anon_vma = NULL;
if (error)
return error;
}
}
}

View file

@ -75,6 +75,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
bool preserve_write = prot_numa && pte_write(oldpte);
/*
* Avoid trapping faults against the zero or KSM
@ -94,6 +95,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
ptent = ptep_modify_prot_start(mm, addr, pte);
ptent = pte_modify(ptent, newprot);
if (preserve_write)
ptent = pte_mkwrite(ptent);
/* Avoid taking write faults for known dirty pages */
if (dirty_accountable && pte_dirty(ptent) &&

View file

@ -857,8 +857,11 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
* write_bandwidth = (bw * elapsed + write_bandwidth * (period - elapsed)) / period
*
* @written may have decreased due to account_page_redirty().
* Avoid underflowing @bw calculation.
*/
bw = written - bdi->written_stamp;
bw = written - min(written, bdi->written_stamp);
bw *= HZ;
if (unlikely(elapsed > period)) {
do_div(bw, elapsed);
@ -922,7 +925,7 @@ static void global_update_bandwidth(unsigned long thresh,
unsigned long now)
{
static DEFINE_SPINLOCK(dirty_lock);
static unsigned long update_time;
static unsigned long update_time = INITIAL_JIFFIES;
/*
* check locklessly first to optimize away locking for the most time

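Numerically, the bandwidth update above blends the new throughput sample with the previous estimate, weighted by how much of the averaging period has elapsed, and the min() clamp prevents a bogus huge sample when @written has moved backwards. A user-space sketch of the arithmetic, with HZ and the averaging period chosen purely for illustration (the kernel also handles elapsed > period separately):

/*
 * Sketch of the weighted bandwidth blend described in the comment above.
 * Units follow the kernel code (pages and jiffies); HZ and PERIOD are
 * assumptions for this example, and elapsed is assumed non-zero and
 * no larger than PERIOD.
 */
#include <stdio.h>

#define HZ     100UL
#define PERIOD (3 * HZ)                 /* averaging period, assumed 3 s */

static unsigned long update_write_bandwidth(unsigned long old_bw,
					    unsigned long written,
					    unsigned long written_stamp,
					    unsigned long elapsed)
{
	/* clamp as in the hunk above: written - min(written, written_stamp) */
	unsigned long stamp = written_stamp < written ? written_stamp : written;
	unsigned long bw = (written - stamp) * HZ / elapsed;  /* pages/second */

	/* blend: bw * elapsed + old_bw * (PERIOD - elapsed), all over PERIOD */
	return (bw * elapsed + old_bw * (PERIOD - elapsed)) / PERIOD;
}

int main(void)
{
	/* old estimate 1000 pages/s; 220 pages written over 20 jiffies (0.2 s) */
	printf("new estimate: %lu pages/s\n",
	       update_write_bandwidth(1000, 1220, 1000, 20));   /* -> 1006 */
	return 0;
}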
View file

@ -103,6 +103,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
if (!is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
kernel_map_pages(page, (1 << order), 1);
set_page_refcounted(page);
isolated_page = page;
}

View file

@ -265,8 +265,15 @@ int walk_page_range(unsigned long start, unsigned long end,
vma = vma->vm_next;
err = walk_page_test(start, next, walk);
if (err > 0)
if (err > 0) {
/*
* positive return values are purely for
* controlling the pagewalk, so should never
* be passed to the callers.
*/
err = 0;
continue;
}
if (err < 0)
break;
}

View file

@ -287,6 +287,13 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
return 0;
enomem_failure:
/*
* dst->anon_vma is dropped here otherwise its degree can be incorrectly
* decremented in unlink_anon_vmas().
* We can safely do this because callers of anon_vma_clone() don't care
* about dst->anon_vma if anon_vma_clone() failed.
*/
dst->anon_vma = NULL;
unlink_anon_vmas(dst);
return -ENOMEM;
}

View file

@ -2449,7 +2449,8 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
do {
tid = this_cpu_read(s->cpu_slab->tid);
c = raw_cpu_ptr(s->cpu_slab);
} while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
} while (IS_ENABLED(CONFIG_PREEMPT) &&
unlikely(tid != READ_ONCE(c->tid)));
/*
* Irqless object alloc/free algorithm used here depends on sequence
@ -2718,7 +2719,8 @@ static __always_inline void slab_free(struct kmem_cache *s,
do {
tid = this_cpu_read(s->cpu_slab->tid);
c = raw_cpu_ptr(s->cpu_slab);
} while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
} while (IS_ENABLED(CONFIG_PREEMPT) &&
unlikely(tid != READ_ONCE(c->tid)));
/* Same with comment on barrier() in slab_alloc_node() */
barrier();

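The READ_ONCE() added in these two hunks forces the compiler to emit exactly one load of c->tid for the comparison, so it cannot fuse or re-read the value in a way that defeats the check for preemption between reading the tid and fetching the per-CPU pointer. A toy user-space rendering of the pattern; the macro below mirrors the kernel's definition only in spirit, and the per-CPU machinery is reduced to a single static slot:

/*
 * Toy sketch of the tid/per-CPU consistency check above. In the kernel,
 * this_cpu_read() and raw_cpu_ptr() may run on different CPUs if the task
 * is preempted in between; the retry loop detects that via the tid.
 */
#include <stdio.h>

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct kmem_cache_cpu {
	unsigned long tid;      /* transaction id, bumped on every update */
	void *freelist;
};

static struct kmem_cache_cpu cpu_slab;   /* stand-in for the per-CPU slot */

int main(void)
{
	unsigned long tid;
	struct kmem_cache_cpu *c;

	cpu_slab.tid = 42;

	do {
		tid = READ_ONCE(cpu_slab.tid);   /* would be this_cpu_read() */
		c = &cpu_slab;                   /* would be raw_cpu_ptr()   */
	} while (tid != READ_ONCE(c->tid));      /* retry if inconsistent    */

	printf("consistent snapshot: tid=%lu\n", tid);
	return 0;
}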