diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5ab1089d1422..62436bd5f34a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3204,6 +3204,10 @@ allowed (eg kernel_enable_fpu()/kernel_disable_fpu()). There is some performance impact when enabling this. + ppc_tm= [PPC] + Format: {"off"} + Disable Hardware Transactional Memory + print-fatal-signals= [KNL] debug: print fatal signals diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index cb782ac1c35d..c51e6ce42e7a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -139,9 +139,11 @@ config PPC select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_PMEM_API if PPC64 select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_UACCESS_FLUSHCACHE if PPC64 select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG @@ -335,7 +337,7 @@ config PPC_OF_PLATFORM_PCI default n config ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on PPC32 || PPC_STD_MMU_64 + depends on PPC32 || PPC_BOOK3S_64 def_bool y config ARCH_SUPPORTS_UPROBES @@ -722,7 +724,7 @@ config PPC_16K_PAGES config PPC_64K_PAGES bool "64k page size" - depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64) + depends on !PPC_FSL_BOOK3E && (44x || PPC_BOOK3S_64 || PPC_BOOK3E_64) select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64 config PPC_256K_PAGES @@ -781,7 +783,7 @@ config FORCE_MAX_ZONEORDER config PPC_SUBPAGE_PROT bool "Support setting protections for 4k subpages" - depends on PPC_STD_MMU_64 && PPC_64K_PAGES + depends on PPC_BOOK3S_64 && PPC_64K_PAGES help This option adds support for a system call to allow user programs to set access permissions (read/write, readonly, or no access) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index be1c8c5beb61..657c33cd4eee 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -370,4 +370,10 @@ config PPC_HTDUMP def_bool y depends on PPC_PTDUMP && PPC_BOOK3S +config PPC_FAST_ENDIAN_SWITCH + bool "Deprecated fast endian-switch syscall" + depends on DEBUG_KERNEL && PPC_BOOK3S_64 + help + If you're unsure what this is, say N. + endmenu diff --git a/arch/powerpc/boot/dts/acadia.dts b/arch/powerpc/boot/dts/acadia.dts index 57291f61ffe7..86266159521e 100644 --- a/arch/powerpc/boot/dts/acadia.dts +++ b/arch/powerpc/boot/dts/acadia.dts @@ -183,7 +183,7 @@ ieee1588@ef602800 { usb@ef603000 { compatible = "ohci-be"; reg = <0xef603000 0x80>; - interrupts-parent = <&UIC0>; + interrupt-parent = <&UIC0>; interrupts = <0xd 0x4 0xe 0x4>; }; diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index caee834760d2..4891bbed6258 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -192,6 +192,7 @@ CONFIG_IPMI_DEVICE_INTERFACE=y CONFIG_IPMI_POWERNV=y CONFIG_RAW_DRIVER=y CONFIG_MAX_RAW_DEVS=1024 +CONFIG_I2C_CHARDEV=y CONFIG_DRM=y CONFIG_DRM_AST=y CONFIG_FIRMWARE_EDID=y @@ -295,6 +296,7 @@ CONFIG_FUNCTION_GRAPH_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_PPC_EMULATED_STATS=y CONFIG_CODE_PATCHING_SELFTEST=y CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 3d935969e5a2..bde2cd1005a2 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -193,6 +193,7 @@ CONFIG_VIRTIO_CONSOLE=m CONFIG_IBM_BSR=m CONFIG_RAW_DRIVER=y CONFIG_MAX_RAW_DEVS=1024 +CONFIG_I2C_CHARDEV=y CONFIG_FB=y CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig new file mode 100644 index 000000000000..6bd5e7261335 --- /dev/null +++ b/arch/powerpc/configs/skiroot_defconfig @@ -0,0 +1,232 @@ +CONFIG_PPC64=y +CONFIG_ALTIVEC=y +CONFIG_VSX=y +CONFIG_NR_CPUS=2048 +CONFIG_CPU_LITTLE_ENDIAN=y +# CONFIG_SWAP is not set +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +# CONFIG_CROSS_MEMORY_ATTACH is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=20 +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +# CONFIG_RD_GZIP is not set +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_PERF_EVENTS=y +# CONFIG_COMPAT_BRK is not set +CONFIG_JUMP_LABEL=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_SIG=y +CONFIG_MODULE_SIG_FORCE=y +CONFIG_MODULE_SIG_SHA512=y +CONFIG_PARTITION_ADVANCED=y +# CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_PPC_PSERIES is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_IDLE=y +CONFIG_HZ_100=y +CONFIG_KEXEC=y +CONFIG_IRQ_ALL_CPUS=y +CONFIG_NUMA=y +# CONFIG_COMPACTION is not set +# CONFIG_MIGRATION is not set +# CONFIG_BOUNCE is not set +CONFIG_PPC_64K_PAGES=y +CONFIG_SCHED_SMT=y +CONFIG_CMDLINE_BOOL=y +CONFIG_CMDLINE="console=tty0 console=hvc0 powersave=off" +# CONFIG_SECCOMP is not set +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_NET_IPIP=y +CONFIG_SYN_COOKIES=y +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_IPV6 is not set +CONFIG_DNS_RESOLVER=y +# CONFIG_WIRELESS is not set +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_MTD=m +CONFIG_MTD_POWERNV_FLASH=m +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=65536 +CONFIG_VIRTIO_BLK=m +CONFIG_BLK_DEV_NVME=m +CONFIG_EEPROM_AT24=y +# CONFIG_CXL is not set +CONFIG_BLK_DEV_SD=m +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_SCAN_ASYNC=y +CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_BE2ISCSI=m +CONFIG_SCSI_AACRAID=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_IPR=m +# CONFIG_SCSI_IPR_TRACE is not set +# CONFIG_SCSI_IPR_DUMP is not set +CONFIG_SCSI_QLA_FC=m +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_SCSI_LPFC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_ALUA=m +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +# CONFIG_ATA_SFF is not set +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_BLK_DEV_DM=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_MIRROR=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_ACENIC=m +CONFIG_ACENIC_OMIT_TIGON_I=y +CONFIG_TIGON3=y +CONFIG_BNX2X=m +CONFIG_CHELSIO_T1=y +CONFIG_BE2NET=m +CONFIG_S2IO=m +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_MLX4_EN=m +CONFIG_MLX5_CORE=m +CONFIG_MLX5_CORE_EN=y +CONFIG_MYRI10GE=m +CONFIG_QLGE=m +CONFIG_NETXEN_NIC=m +CONFIG_SFC=m +# CONFIG_USB_NET_DRIVERS is not set +# CONFIG_WLAN is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_MISC=y +# CONFIG_SERIO_SERPORT is not set +# CONFIG_DEVMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_IPMI_HANDLER=y +CONFIG_IPMI_DEVICE_INTERFACE=y +CONFIG_IPMI_POWERNV=y +CONFIG_HW_RANDOM=y +CONFIG_TCG_TIS_I2C_NUVOTON=y +# CONFIG_I2C_COMPAT is not set +CONFIG_I2C_CHARDEV=y +# CONFIG_I2C_HELPER_AUTO is not set +CONFIG_DRM=y +CONFIG_DRM_RADEON=y +CONFIG_DRM_AST=m +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_OF=y +CONFIG_FB_MATROX=y +CONFIG_FB_MATROX_MILLENIUM=y +CONFIG_FB_MATROX_MYSTIQUE=y +CONFIG_FB_MATROX_G=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_GENERIC is not set +# CONFIG_VGA_CONSOLE is not set +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_USB_HIDDEV=y +CONFIG_USB=y +CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_HCD_PPC_OF is not set +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_STORAGE=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_GENERIC=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VIRTIO_PCI=y +# CONFIG_IOMMU_SUPPORT is not set +CONFIG_EXT4_FS=m +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_XFS_FS=m +CONFIG_XFS_POSIX_ACL=y +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_ISO9660_FS=m +CONFIG_UDF_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +# CONFIG_MISC_FILESYSTEMS is not set +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_CRC16=y +CONFIG_CRC_ITU_T=y +CONFIG_LIBCRC32C=y +CONFIG_PRINTK_TIME=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_STACKOVERFLOW=y +CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_HARDLOCKUP_DETECTOR=y +CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_WQ_WATCHDOG=y +CONFIG_SCHEDSTATS=y +# CONFIG_FTRACE is not set +CONFIG_XMON=y +CONFIG_XMON_DEFAULT=y +CONFIG_SECURITY=y +CONFIG_IMA=y +CONFIG_EVM=y +# CONFIG_CRYPTO_ECHAINIV is not set +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_CMAC=y +CONFIG_CRYPTO_MD4=y +CONFIG_CRYPTO_ARC4=y +CONFIG_CRYPTO_DES=y +# CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 508275bb05d5..e91e115a816f 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -606,7 +606,7 @@ extern void slb_set_size(u16 size); /* 4 bits per slice and we have one slice per 1TB */ #define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) -#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.addr_limit >> 41) +#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.slb_addr_limit >> 41) #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 37fdede5a24c..c9448e19847a 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -93,7 +93,7 @@ typedef struct { #ifdef CONFIG_PPC_MM_SLICES u64 low_slices_psize; /* SLB page size encodings */ unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; - unsigned long addr_limit; + unsigned long slb_addr_limit; #else u16 sllp; /* SLB page size encoding */ #endif diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 42178897a050..849ecaae9e79 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -66,6 +66,28 @@ static inline void hash__flush_tlb_mm(struct mm_struct *mm) { } +static inline void hash__local_flush_all_mm(struct mm_struct *mm) +{ + /* + * There's no Page Walk Cache for hash, so what is needed is + * the same as flush_tlb_mm(), which doesn't really make sense + * with hash. So the only thing we could do is flush the + * entire LPID! Punt for now, as it's not being used. + */ + WARN_ON_ONCE(1); +} + +static inline void hash__flush_all_mm(struct mm_struct *mm) +{ + /* + * There's no Page Walk Cache for hash, so what is needed is + * the same as flush_tlb_mm(), which doesn't really make sense + * with hash. So the only thing we could do is flush the + * entire LPID! Punt for now, as it's not being used. + */ + WARN_ON_ONCE(1); +} + static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index c2115dfcef0c..6a9e68003387 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -22,17 +22,20 @@ extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long sta extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void radix__local_flush_tlb_mm(struct mm_struct *mm); +extern void radix__local_flush_all_mm(struct mm_struct *mm); extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); extern void radix__tlb_flush(struct mmu_gather *tlb); #ifdef CONFIG_SMP extern void radix__flush_tlb_mm(struct mm_struct *mm); +extern void radix__flush_all_mm(struct mm_struct *mm); extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); #else #define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm) +#define radix__flush_all_mm(mm) radix__local_flush_all_mm(mm) #define radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr) #define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p) #endif diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index fcffddbb3102..58b576f654b3 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -58,6 +58,13 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma, return hash__local_flush_tlb_page(vma, vmaddr); } +static inline void local_flush_all_mm(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__local_flush_all_mm(mm); + return hash__local_flush_all_mm(mm); +} + static inline void tlb_flush(struct mmu_gather *tlb) { if (radix_enabled()) @@ -80,9 +87,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, return radix__flush_tlb_page(vma, vmaddr); return hash__flush_tlb_page(vma, vmaddr); } + +static inline void flush_all_mm(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__flush_all_mm(mm); + return hash__flush_all_mm(mm); +} #else #define flush_tlb_mm(mm) local_flush_tlb_mm(mm) #define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr) +#define flush_all_mm(mm) local_flush_all_mm(mm) #endif /* CONFIG_SMP */ /* * flush the page walk cache for the address diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 53b31c2bcdf4..0546663a98db 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -207,7 +207,7 @@ enum { #define CPU_FTR_STCX_CHECKS_ADDRESS LONG_ASM_CONST(0x0004000000000000) #define CPU_FTR_POPCNTB LONG_ASM_CONST(0x0008000000000000) #define CPU_FTR_POPCNTD LONG_ASM_CONST(0x0010000000000000) -#define CPU_FTR_ICSWX LONG_ASM_CONST(0x0020000000000000) +/* Free LONG_ASM_CONST(0x0020000000000000) */ #define CPU_FTR_VMX_COPY LONG_ASM_CONST(0x0040000000000000) #define CPU_FTR_TM LONG_ASM_CONST(0x0080000000000000) #define CPU_FTR_CFAR LONG_ASM_CONST(0x0100000000000000) @@ -216,6 +216,7 @@ enum { #define CPU_FTR_DABRX LONG_ASM_CONST(0x0800000000000000) #define CPU_FTR_PMAO_BUG LONG_ASM_CONST(0x1000000000000000) #define CPU_FTR_POWER9_DD1 LONG_ASM_CONST(0x4000000000000000) +#define CPU_FTR_POWER9_DD2_1 LONG_ASM_CONST(0x8000000000000000) #ifndef __ASSEMBLY__ @@ -452,7 +453,7 @@ enum { CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | \ + CPU_FTR_CFAR | CPU_FTR_HVMODE | \ CPU_FTR_VMX_COPY | CPU_FTR_HAS_PPR | CPU_FTR_DABRX) #define CPU_FTRS_POWER8 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\ @@ -461,7 +462,7 @@ enum { CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ CPU_FTR_DSCR | CPU_FTR_SAO | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ + CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \ CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP) #define CPU_FTRS_POWER8E (CPU_FTRS_POWER8 | CPU_FTR_PMAO_BUG) @@ -478,6 +479,8 @@ enum { CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300) #define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \ (~CPU_FTR_SAO)) +#define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9 +#define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1) #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ @@ -496,7 +499,8 @@ enum { (CPU_FTRS_POWER4 | CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \ CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \ CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \ - CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD1) + CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | \ + CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD2_1) #endif #else enum { diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 9847ae3a12d1..5161c37dd039 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -93,7 +93,7 @@ struct eeh_pe { struct pci_bus *bus; /* Top PCI bus for bus PE */ int check_count; /* Times of ignored error */ int freeze_count; /* Times of froze up */ - struct timeval tstamp; /* Time on first-time freeze */ + time64_t tstamp; /* Time on first-time freeze */ int false_positives; /* Times of reported #ff's */ atomic_t pass_dev_cnt; /* Count of passed through devs */ struct eeh_pe *parent; /* Parent PE */ @@ -200,7 +200,6 @@ enum { struct eeh_ops { char *name; int (*init)(void); - int (*post_init)(void); void* (*probe)(struct pci_dn *pdn, void *data); int (*set_option)(struct eeh_pe *pe, int option); int (*get_pe_addr)(struct eeh_pe *pe); @@ -275,7 +274,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); struct eeh_dev *eeh_dev_init(struct pci_dn *pdn); void eeh_dev_phb_init_dynamic(struct pci_controller *phb); -int eeh_init(void); +void eeh_probe_devices(void); int __init eeh_ops_register(struct eeh_ops *ops); int __exit eeh_ops_unregister(const char *name); int eeh_check_failure(const volatile void __iomem *token); @@ -321,10 +320,7 @@ static inline bool eeh_enabled(void) return false; } -static inline int eeh_init(void) -{ - return 0; -} +static inline void eeh_probe_devices(void) { } static inline void *eeh_dev_init(struct pci_dn *pdn, void *data) { diff --git a/arch/powerpc/include/asm/emulated_ops.h b/arch/powerpc/include/asm/emulated_ops.h index f00e10e2a335..651e1354498e 100644 --- a/arch/powerpc/include/asm/emulated_ops.h +++ b/arch/powerpc/include/asm/emulated_ops.h @@ -55,6 +55,10 @@ extern struct ppc_emulated { struct ppc_emulated_entry mfdscr; struct ppc_emulated_entry mtdscr; struct ppc_emulated_entry lq_stq; + struct ppc_emulated_entry lxvw4x; + struct ppc_emulated_entry lxvh8x; + struct ppc_emulated_entry lxvd2x; + struct ppc_emulated_entry lxvb16x; #endif } ppc_emulated; diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index 334459ad145b..90863245df53 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -508,7 +508,7 @@ static unsigned long epapr_hypercall(unsigned long *in, static inline long epapr_hypercall0_1(unsigned int nr, unsigned long *r2) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; unsigned long r; @@ -520,7 +520,7 @@ static inline long epapr_hypercall0_1(unsigned int nr, unsigned long *r2) static inline long epapr_hypercall0(unsigned int nr) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; return epapr_hypercall(in, out, nr); @@ -528,7 +528,7 @@ static inline long epapr_hypercall0(unsigned int nr) static inline long epapr_hypercall1(unsigned int nr, unsigned long p1) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -538,7 +538,7 @@ static inline long epapr_hypercall1(unsigned int nr, unsigned long p1) static inline long epapr_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -549,7 +549,7 @@ static inline long epapr_hypercall2(unsigned int nr, unsigned long p1, static inline long epapr_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -562,7 +562,7 @@ static inline long epapr_hypercall4(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9a318973af05..b27205297e1d 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -54,6 +54,11 @@ #define EX_SIZE 9 /* size in u64 units */ #endif +/* + * maximum recursive depth of MCE exceptions + */ +#define MAX_MCE_DEPTH 4 + /* * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR * EX_CCR similarly with DSISR, but being 4 byte registers there is a hole diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 93f98239159f..14c9d44f355b 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -41,12 +41,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, return radix__flush_hugetlb_page(vma, vmaddr); } -static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) -{ - if (radix_enabled()) - return radix__local_flush_hugetlb_page(vma, vmaddr); -} #else static inline pte_t *hugepd_page(hugepd_t hpd) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index abd04c36c251..3818fa0164f0 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -32,6 +32,7 @@ #ifndef __ASSEMBLY__ +extern void replay_system_reset(void); extern void __replay_interrupt(unsigned int vector); extern void timer_interrupt(struct pt_regs *); diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 8814a7249ceb..9f3be5c8a4a3 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -103,8 +103,8 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); extern int kprobe_handler(struct pt_regs *regs); extern int kprobe_post_handler(struct pt_regs *regs); -extern int is_current_kprobe_addr(unsigned long addr); #ifdef CONFIG_KPROBES_ON_FTRACE +extern int __is_active_jprobe(unsigned long addr); extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb); #else diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 83596f32f50b..7cea76f11c26 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -104,10 +104,6 @@ struct kvmppc_host_state { u8 napping; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* - * hwthread_req/hwthread_state pair is used to pull sibling threads - * out of guest on pre-ISAv3.0B CPUs where threads share MMU. - */ u8 hwthread_req; u8 hwthread_state; u8 host_ipi; diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 190d69a7f701..3a1226e9b465 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,12 +204,10 @@ struct mce_error_info { extern void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, - uint64_t addr); + uint64_t addr, uint64_t phys_addr); extern int get_mce_event(struct machine_check_event *mce, bool release); extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode); -extern uint64_t get_mce_fault_addr(struct machine_check_event *evt); - #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 492d8140a395..6177d43f0ce8 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -78,6 +78,52 @@ extern void switch_cop(struct mm_struct *next); extern int use_cop(unsigned long acop, struct mm_struct *mm); extern void drop_cop(unsigned long acop, struct mm_struct *mm); +#ifdef CONFIG_PPC_BOOK3S_64 +static inline void inc_mm_active_cpus(struct mm_struct *mm) +{ + atomic_inc(&mm->context.active_cpus); +} + +static inline void dec_mm_active_cpus(struct mm_struct *mm) +{ + atomic_dec(&mm->context.active_cpus); +} + +static inline void mm_context_add_copro(struct mm_struct *mm) +{ + /* + * On hash, should only be called once over the lifetime of + * the context, as we can't decrement the active cpus count + * and flush properly for the time being. + */ + inc_mm_active_cpus(mm); +} + +static inline void mm_context_remove_copro(struct mm_struct *mm) +{ + /* + * Need to broadcast a global flush of the full mm before + * decrementing active_cpus count, as the next TLBI may be + * local and the nMMU and/or PSL need to be cleaned up. + * Should be rare enough so that it's acceptable. + * + * Skip on hash, as we don't know how to do the proper flush + * for the time being. Invalidations will remain global if + * used on hash. + */ + if (radix_enabled()) { + flush_all_mm(mm); + dec_mm_active_cpus(mm); + } +} +#else +static inline void inc_mm_active_cpus(struct mm_struct *mm) { } +static inline void dec_mm_active_cpus(struct mm_struct *mm) { } +static inline void mm_context_add_copro(struct mm_struct *mm) { } +static inline void mm_context_remove_copro(struct mm_struct *mm) { } +#endif + + extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); @@ -119,9 +165,13 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm, { } +#ifndef CONFIG_PPC_BOOK3S_64 static inline void arch_exit_mmap(struct mm_struct *mm) { } +#else +extern void arch_exit_mmap(struct mm_struct *mm); +#endif static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 265bbd7cba73..abddf5830ad5 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -204,7 +204,7 @@ static inline unsigned long pte_update(struct mm_struct *mm, if (!huge) assert_pte_locked(mm, addr); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (old & _PAGE_HASHPTE) hpte_need_flush(mm, addr, ptep, old, huge); #endif diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 450a60b81d2a..233c7504b1f2 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -188,6 +188,7 @@ #define OPAL_XIVE_DUMP 142 #define OPAL_XIVE_RESERVED3 143 #define OPAL_XIVE_RESERVED4 144 +#define OPAL_SIGNAL_SYSTEM_RESET 145 #define OPAL_NPU_INIT_CONTEXT 146 #define OPAL_NPU_DESTROY_CONTEXT 147 #define OPAL_NPU_MAP_LPAR 148 @@ -895,6 +896,8 @@ enum { */ OPAL_REINIT_CPUS_MMU_HASH = (1 << 2), OPAL_REINIT_CPUS_MMU_RADIX = (1 << 3), + + OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED = (1 << 4), }; typedef struct oppanel_line { diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 726c23304a57..0c545f7fc77b 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -281,6 +281,8 @@ int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr); int opal_set_power_shift_ratio(u32 handle, int token, u32 psr); int opal_sensor_group_clear(u32 group_hndl, int token); +s64 opal_signal_system_reset(s32 cpu); + /* Internal functions */ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); @@ -304,11 +306,11 @@ extern void opal_notifier_enable(void); extern void opal_notifier_disable(void); extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val); -extern int __opal_async_get_token(void); extern int opal_async_get_token_interruptible(void); -extern int __opal_async_release_token(int token); extern int opal_async_release_token(int token); extern int opal_async_wait_response(uint64_t token, struct opal_msg *msg); +extern int opal_async_wait_response_interruptible(uint64_t token, + struct opal_msg *msg); extern int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data); struct rtc_time; diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 04b60af027ae..3892db93b837 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -91,14 +91,14 @@ struct paca_struct { u8 cpu_start; /* At startup, processor spins until */ /* this becomes non-zero. */ u8 kexec_state; /* set when kexec down has irqs off */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 struct slb_shadow *slb_shadow_ptr; struct dtl_entry *dispatch_log; struct dtl_entry *dispatch_log_end; -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif u64 dscr_default; /* per-CPU default DSCR */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * Now, starting in cacheline 2, the exception save areas */ @@ -110,7 +110,7 @@ struct paca_struct { u16 vmalloc_sllp; u16 slb_cache_ptr; u32 slb_cache[SLB_CACHE_ENTRIES]; -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_BOOK3E u64 exgen[8] __aligned(0x40); @@ -143,7 +143,7 @@ struct paca_struct { #ifdef CONFIG_PPC_MM_SLICES u64 mm_ctx_low_slices_psize; unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; - unsigned long addr_limit; + unsigned long mm_ctx_slb_addr_limit; #else u16 mm_ctx_user_psize; u16 mm_ctx_sllp; @@ -192,7 +192,7 @@ struct paca_struct { struct stop_sprs stop_sprs; #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* Non-maskable exceptions that are not performance critical */ u64 exnmi[EX_SIZE]; /* used for system reset (nmi) */ u64 exmc[EX_SIZE]; /* used for machine checks */ @@ -210,6 +210,7 @@ struct paca_struct { */ u16 in_mce; u8 hmi_event_available; /* HMI event is available */ + u8 hmi_p9_special_emu; /* HMI P9 special emulation */ #endif /* Stuff for accurate time accounting */ diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index c4d9654bd637..56234c6fcd61 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -117,21 +117,21 @@ extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start, #endif /* __ASSEMBLY__ */ #else #define slice_init() -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #define get_slice_psize(mm, addr) ((mm)->context.user_psize) #define slice_set_user_psize(mm, psize) \ do { \ (mm)->context.user_psize = (psize); \ (mm)->context.sllp = SLB_VSID_USER | mmu_psize_defs[(psize)].sllp; \ } while (0) -#else /* CONFIG_PPC_STD_MMU_64 */ +#else /* !CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_64K_PAGES #define get_slice_psize(mm, addr) MMU_PAGE_64K #else /* CONFIG_PPC_64K_PAGES */ #define get_slice_psize(mm, addr) MMU_PAGE_4K #endif /* !CONFIG_PPC_64K_PAGES */ #define slice_set_user_psize(mm, psize) do { BUG(); } while(0) -#endif /* !CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #define slice_set_range_psize(mm, start, len, psize) \ slice_set_user_psize((mm), (psize)) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 0b8aa1fe2d5f..62ed83db04ae 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -218,6 +218,7 @@ struct pci_dn { #endif struct list_head child_list; struct list_head list; + struct resource holes[PCI_SRIOV_NUM_BARS]; }; /* Get the pointer to a device_node's pci_dn */ diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h index beb6e3e79788..a89c67b62680 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -77,7 +77,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64) +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index cfe89a6fc308..eccb30b38b47 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -50,13 +50,13 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64) +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #include static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h index f62797702300..dc5f6a5d4575 100644 --- a/arch/powerpc/include/asm/powernv.h +++ b/arch/powerpc/include/asm/powernv.h @@ -22,6 +22,8 @@ extern void pnv_npu2_destroy_context(struct npu_context *context, extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, unsigned long *flags, unsigned long *status, int count); + +void pnv_tm_init(void); #else static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, @@ -36,6 +38,8 @@ static inline int pnv_npu2_handle_fault(struct npu_context *context, unsigned long *status, int count) { return -ENODEV; } + +static inline void pnv_tm_init(void) { } #endif #endif /* _ASM_POWERNV_H */ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 36f3e41c9fbe..ae94b3626b6c 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -774,9 +774,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) #ifdef CONFIG_PPC_BOOK3E #define FIXUP_ENDIAN #else +/* + * This version may be used in in HV or non-HV context. + * MSR[EE] must be disabled. + */ #define FIXUP_ENDIAN \ tdi 0,0,0x48; /* Reverse endian of b . + 8 */ \ - b $+44; /* Skip trampoline if endian is good */ \ + b 191f; /* Skip trampoline if endian is good */ \ .long 0xa600607d; /* mfmsr r11 */ \ .long 0x01006b69; /* xori r11,r11,1 */ \ .long 0x00004039; /* li r10,0 */ \ @@ -786,7 +790,26 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) .long 0x14004a39; /* addi r10,r10,20 */ \ .long 0xa6035a7d; /* mtsrr0 r10 */ \ .long 0xa6037b7d; /* mtsrr1 r11 */ \ - .long 0x2400004c /* rfid */ + .long 0x2400004c; /* rfid */ \ +191: + +/* + * This version that may only be used with MSR[HV]=1 + * - Does not clear MSR[RI], so more robust. + * - Slightly smaller and faster. + */ +#define FIXUP_ENDIAN_HV \ + tdi 0,0,0x48; /* Reverse endian of b . + 8 */ \ + b 191f; /* Skip trampoline if endian is good */ \ + .long 0xa600607d; /* mfmsr r11 */ \ + .long 0x01006b69; /* xori r11,r11,1 */ \ + .long 0x05009f42; /* bcl 20,31,$+4 */ \ + .long 0xa602487d; /* mflr r10 */ \ + .long 0x14004a39; /* addi r10,r10,20 */ \ + .long 0xa64b5a7d; /* mthsrr0 r10 */ \ + .long 0xa64b7b7d; /* mthsrr1 r11 */ \ + .long 0x2402004c; /* hrfid */ \ +191: #endif /* !CONFIG_PPC_BOOK3E */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index fab7ff877304..bdab3b74eb98 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -329,6 +329,7 @@ struct thread_struct { */ int dscr_inherit; unsigned long ppr; /* used to save/restore SMT priority */ + unsigned long tidr; #endif #ifdef CONFIG_PPC_BOOK3S_64 unsigned long tar; @@ -340,7 +341,9 @@ struct thread_struct { unsigned long sier; unsigned long mmcr2; unsigned mmcr0; + unsigned used_ebb; + unsigned int used_vas; #endif }; diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index d98ac188cedb..9b8cedf618f4 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -12,6 +12,7 @@ #define __HAVE_ARCH_MEMCMP #define __HAVE_ARCH_MEMCHR #define __HAVE_ARCH_MEMSET16 +#define __HAVE_ARCH_MEMCPY_FLUSHCACHE extern char * strcpy(char *,const char *); extern char * strncpy(char *,const char *, __kernel_size_t); @@ -24,6 +25,7 @@ extern void * memcpy(void *,const void *,__kernel_size_t); extern void * memmove(void *,const void *,__kernel_size_t); extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); +extern void * memcpy_flushcache(void *,const void *,__kernel_size_t); #ifdef CONFIG_PPC64 #define __HAVE_ARCH_MEMSET32 diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index bf820f53e27e..c3ca42cdc9f5 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -92,4 +92,9 @@ static inline void clear_task_ebb(struct task_struct *t) #endif } +extern int set_thread_uses_vas(void); + +extern int set_thread_tidr(struct task_struct *t); +extern void clear_thread_tidr(struct task_struct *t); + #endif /* _ASM_POWERPC_SWITCH_TO_H */ diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index 13dbcd41885e..7d5a157c7832 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -77,7 +77,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) flush_tlb_mm(mm); } -#elif defined(CONFIG_PPC_STD_MMU_64) +#elif defined(CONFIG_PPC_BOOK3S_64) #include #else #error Unsupported MMU type diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h index a8bc72a7f4be..b1658c97047c 100644 --- a/arch/powerpc/include/asm/tm.h +++ b/arch/powerpc/include/asm/tm.h @@ -12,12 +12,13 @@ extern void tm_enable(void); extern void tm_reclaim(struct thread_struct *thread, - unsigned long orig_msr, uint8_t cause); + uint8_t cause); extern void tm_reclaim_current(uint8_t cause); -extern void tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr); +extern void tm_recheckpoint(struct thread_struct *thread); extern void tm_abort(uint8_t cause); extern void tm_save_sprs(struct thread_struct *thread); extern void tm_restore_sprs(struct thread_struct *thread); +extern bool tm_suspend_disabled; + #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 023ff9f17501..88187c285c70 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -97,6 +97,14 @@ static inline int prrn_is_enabled(void) } #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */ +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES) +#if defined(CONFIG_PPC_SPLPAR) +extern int timed_topology_update(int nsecs); +#else +#define timed_topology_update(nsecs) +#endif /* CONFIG_PPC_SPLPAR */ +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_NEED_MULTIPLE_NODES */ + #include #ifdef CONFIG_SMP diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 11f4bd07cce0..51bfeb8777f0 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -174,6 +174,23 @@ do { \ extern long __get_user_bad(void); +/* + * This does an atomic 128 byte aligned load from userspace. + * Upto caller to do enable_kernel_vmx() before calling! + */ +#define __get_user_atomic_128_aligned(kaddr, uaddr, err) \ + __asm__ __volatile__( \ + "1: lvx 0,0,%1 # get user\n" \ + " stvx 0,0,%2 # put kernel\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + EX_TABLE(1b, 3b) \ + : "=r" (err) \ + : "b" (uaddr), "b" (kaddr), "i" (-EFAULT), "0" (err)) + #define __get_user_asm(x, addr, err, op) \ __asm__ __volatile__( \ "1: "op" %1,0(%2) # get_user\n" \ @@ -340,4 +357,9 @@ static inline unsigned long clear_user(void __user *addr, unsigned long size) extern long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); +extern long __copy_from_user_flushcache(void *dst, const void __user *src, + unsigned size); +extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len); + #endif /* _ARCH_POWERPC_UACCESS_H */ diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h index fd5963acd658..771456227496 100644 --- a/arch/powerpc/include/asm/vas.h +++ b/arch/powerpc/include/asm/vas.h @@ -10,6 +10,8 @@ #ifndef _ASM_POWERPC_VAS_H #define _ASM_POWERPC_VAS_H +struct vas_window; + /* * Min and max FIFO sizes are based on Version 1.05 Section 3.1.4.25 * (Local FIFO Size Register) of the VAS workbook. @@ -103,6 +105,15 @@ struct vas_tx_win_attr { bool rx_win_ord_mode; }; +/* + * Helper to map a chip id to VAS id. + * For POWER9, this is a 1:1 mapping. In the future this maybe a 1:N + * mapping in which case, we will need to update this helper. + * + * Return the VAS id or -1 if no matching vasid is found. + */ +int chip_to_vas_id(int chipid); + /* * Helper to initialize receive window attributes to defaults for an * NX window. @@ -156,4 +167,14 @@ int vas_copy_crb(void *crb, int offset); */ int vas_paste_crb(struct vas_window *win, int offset, bool re); +/* + * Return a system-wide unique id for the VAS window @win. + */ +extern u32 vas_win_id(struct vas_window *win); + +/* + * Return the power bus paste address associated with @win so the caller + * can map that address into their address space. + */ +extern u64 vas_win_paste_addr(struct vas_window *win); #endif /* __ASM_POWERPC_VAS_H */ diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h index 50bcb4295de4..540592034740 100644 --- a/arch/powerpc/include/uapi/asm/cputable.h +++ b/arch/powerpc/include/uapi/asm/cputable.h @@ -49,6 +49,7 @@ #define PPC_FEATURE2_HAS_IEEE128 0x00400000 /* VSX IEEE Binary Float 128-bit */ #define PPC_FEATURE2_DARN 0x00200000 /* darn random number insn */ #define PPC_FEATURE2_SCV 0x00100000 /* scv syscall */ +#define PPC_FEATURE2_HTM_NO_SUSPEND 0x00080000 /* TM w/out suspended state */ /* * IMPORTANT! diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 6c6cce937dd8..1b6bc7fba996 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -129,7 +129,7 @@ obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM) += tm.o obj-$(CONFIG_PPC64) += $(obj64-y) obj-$(CONFIG_PPC32) += $(obj32-y) -ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE),) +ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE)(CONFIG_PPC_BOOK3S),) obj-y += ppc_save_regs.o endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 8cfb20e38cfe..9aace433491a 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -185,7 +185,7 @@ int main(void) #ifdef CONFIG_PPC_MM_SLICES OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); - DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit)); + OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit); DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); #endif /* CONFIG_PPC_MM_SLICES */ #endif @@ -208,7 +208,7 @@ int main(void) OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first); #endif /* CONFIG_PPC_BOOK3E */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACASLBCACHE, paca_struct, slb_cache); OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); @@ -230,7 +230,7 @@ int main(void) OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx); OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count); OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ OFFSET(PACAEMERGSP, paca_struct, emergency_sp); #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp); diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 760872916013..1350f49d81a8 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -547,11 +547,31 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check_early = __machine_check_early_realmode_p9, .platform = "power9", }, - { /* Power9 */ + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_features = CPU_FTRS_POWER9_DD2_0, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power9", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .flush_tlb = __flush_tlb_power9, + .machine_check_early = __machine_check_early_realmode_p9, + .platform = "power9", + }, + { /* Power9 DD 2.1 or later (see DD2.0 above) */ .pvr_mask = 0xffff0000, .pvr_value = 0x004e0000, .cpu_name = "POWER9 (raw)", - .cpu_features = CPU_FTRS_POWER9, + .cpu_features = CPU_FTRS_POWER9_DD2_1, .cpu_user_features = COMMON_USER_POWER9, .cpu_user_features2 = COMMON_USER2_POWER9, .mmu_features = MMU_FTRS_POWER9, diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 7275fed271af..602e0fde19b4 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -634,7 +634,7 @@ static struct dt_cpu_feature_match __initdata {"no-execute", feat_enable, 0}, {"strong-access-ordering", feat_enable, CPU_FTR_SAO}, {"cache-inhibited-large-page", feat_enable_large_ci, 0}, - {"coprocessor-icswx", feat_enable, CPU_FTR_ICSWX}, + {"coprocessor-icswx", feat_enable, 0}, {"hypervisor-virtualization-interrupt", feat_enable_hvi, 0}, {"program-priority-register", feat_enable, CPU_FTR_HAS_PPR}, {"wait", feat_enable, 0}, @@ -735,6 +735,8 @@ static __init void cpufeatures_cpu_quirks(void) */ if ((version & 0xffffff00) == 0x004e0100) cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1; + else if ((version & 0xffffefff) == 0x004e0200) + cur_cpu_spec->cpu_features &= ~CPU_FTR_POWER9_DD2_1; } static void __init cpufeatures_setup_finished(void) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 116000b45531..cbca0a667682 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -972,6 +972,18 @@ static struct notifier_block eeh_reboot_nb = { .notifier_call = eeh_reboot_notifier, }; +void eeh_probe_devices(void) +{ + struct pci_controller *hose, *tmp; + struct pci_dn *pdn; + + /* Enable EEH for all adapters */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + pdn = hose->pci_data; + traverse_pci_dn(pdn, eeh_ops->probe, NULL); + } +} + /** * eeh_init - EEH initialization * @@ -987,22 +999,11 @@ static struct notifier_block eeh_reboot_nb = { * Even if force-off is set, the EEH hardware is still enabled, so that * newer systems can boot. */ -int eeh_init(void) +static int eeh_init(void) { struct pci_controller *hose, *tmp; - struct pci_dn *pdn; - static int cnt = 0; int ret = 0; - /* - * We have to delay the initialization on PowerNV after - * the PCI hierarchy tree has been built because the PEs - * are figured out based on PCI devices instead of device - * tree nodes - */ - if (machine_is(powernv) && cnt++ <= 0) - return ret; - /* Register reboot notifier */ ret = register_reboot_notifier(&eeh_reboot_nb); if (ret) { @@ -1028,22 +1029,7 @@ int eeh_init(void) if (ret) return ret; - /* Enable EEH for all adapters */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - pdn = hose->pci_data; - traverse_pci_dn(pdn, eeh_ops->probe, NULL); - } - - /* - * Call platform post-initialization. Actually, It's good chance - * to inform platform that EEH is ready to supply service if the - * I/O cache stuff has been built up. - */ - if (eeh_ops->post_init) { - ret = eeh_ops->post_init(); - if (ret) - return ret; - } + eeh_probe_devices(); if (eeh_enabled()) pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); @@ -1757,10 +1743,6 @@ static int eeh_enable_dbgfs_set(void *data, u64 val) else eeh_add_flag(EEH_FORCE_DISABLED); - /* Notify the backend */ - if (eeh_ops->post_init) - eeh_ops->post_init(); - return 0; } diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 4e1b433f6cb5..4f71e4c9beb7 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -623,7 +623,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, struct eeh_rmv_data *rmv_data) { struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); - struct timeval tstamp; + time64_t tstamp; int cnt, rc; struct eeh_dev *edev; diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 2e8d1b2b5af4..2d4956e97aa9 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -526,16 +526,16 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) */ void eeh_pe_update_time_stamp(struct eeh_pe *pe) { - struct timeval tstamp; + time64_t tstamp; if (!pe) return; if (pe->freeze_count <= 0) { pe->freeze_count = 0; - do_gettimeofday(&pe->tstamp); + pe->tstamp = ktime_get_seconds(); } else { - do_gettimeofday(&tstamp); - if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) { + tstamp = ktime_get_seconds(); + if (tstamp - pe->tstamp > 3600) { pe->tstamp = tstamp; pe->freeze_count = 0; } diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 4a0fd4f40245..3320bcac7192 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -539,7 +539,7 @@ _GLOBAL(_switch) std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 BEGIN_MMU_FTR_SECTION b 2f END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) @@ -588,7 +588,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) slbmte r7,r0 isync 2: -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 1c80bd292e48..e441b469dc8f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -114,6 +114,7 @@ EXC_VIRT_NONE(0x4000, 0x100) cmpwi cr3,r10,2 ; \ BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ + KVMTEST_PR(n) ; \ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #else #define IDLETEST NOTEST @@ -130,6 +131,7 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100) EXC_REAL_END(system_reset, 0x100, 0x100) EXC_VIRT_NONE(0x4100, 0x100) +TRAMP_KVM(PACA_EXNMI, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) @@ -233,7 +235,7 @@ BEGIN_FTR_SECTION addi r10,r10,1 /* increment paca->in_mce */ sth r10,PACA_IN_MCE(r13) /* Limit nested MCE to level 4 to avoid stack overflow */ - cmpwi r10,4 + cmpwi r10,MAX_MCE_DEPTH bgt 2f /* Check if we hit limit of 4 */ std r11,GPR1(r1) /* Save r1 on the stack. */ std r11,0(r1) /* make stack chain pointer */ @@ -542,7 +544,7 @@ EXC_COMMON_BEGIN(instruction_access_common) RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,_NIP(r1) - andis. r4,r12,DSISR_BAD_FAULT_64S@h + andis. r4,r12,DSISR_SRR1_MATCH_64S@h li r5,0x400 std r3,_DAR(r1) std r4,_DSISR(r1) @@ -606,7 +608,7 @@ EXC_COMMON_BEGIN(slb_miss_common) cmpdi cr5,r11,MSR_RI crset 4*cr0+eq -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 BEGIN_MMU_FTR_SECTION bl slb_allocate END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) @@ -888,12 +890,6 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) -#define SYSCALL_FASTENDIAN_TEST \ -BEGIN_FTR_SECTION \ - cmpdi r0,0x1ebe ; \ - beq- 1f ; \ -END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ - /* * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9, * and HMT_MEDIUM. @@ -908,6 +904,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ rfid ; \ b . ; /* prevent speculative execution */ +#ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH +#define SYSCALL_FASTENDIAN_TEST \ +BEGIN_FTR_SECTION \ + cmpdi r0,0x1ebe ; \ + beq- 1f ; \ +END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ + #define SYSCALL_FASTENDIAN \ /* Fast LE/BE switch system call */ \ 1: mfspr r12,SPRN_SRR1 ; \ @@ -916,6 +919,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ mr r13,r9 ; \ rfid ; /* return to userspace */ \ b . ; /* prevent speculative execution */ +#else +#define SYSCALL_FASTENDIAN_TEST +#define SYSCALL_FASTENDIAN +#endif /* CONFIG_PPC_FAST_ENDIAN_SWITCH */ #if defined(CONFIG_RELOCATABLE) /* @@ -1033,6 +1040,8 @@ TRAMP_REAL_BEGIN(hmi_exception_early) EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */ + cmpdi cr0,r3,0 + /* Windup the stack. */ /* Move original HSRR0 and HSRR1 into the respective regs */ ld r9,_MSR(r1) @@ -1049,10 +1058,15 @@ TRAMP_REAL_BEGIN(hmi_exception_early) REST_8GPRS(2, r1) REST_GPR(10, r1) ld r11,_CCR(r1) + REST_2GPRS(12, r1) + bne 1f mtcr r11 REST_GPR(11, r1) - REST_2GPRS(12, r1) - /* restore original r1. */ + ld r1,GPR1(r1) + hrfid + +1: mtcr r11 + REST_GPR(11, r1) ld r1,GPR1(r1) /* @@ -1065,8 +1079,9 @@ hmi_exception_after_realmode: EXCEPTION_PROLOG_0(PACA_EXGEN) b tramp_real_hmi_exception -EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception) - +EXC_COMMON_BEGIN(hmi_exception_common) +EXCEPTION_COMMON(PACA_EXGEN, 0xe60, hmi_exception_common, handle_hmi_exception, + ret_from_except, FINISH_NAP;ADD_NVGPRS;ADD_RECONCILE;RUNLATCH_ON) EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20) EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80) @@ -1505,8 +1520,8 @@ USE_TEXT_SECTION() */ .balign IFETCH_ALIGN_BYTES do_hash_page: - #ifdef CONFIG_PPC_STD_MMU_64 - lis r0,DSISR_BAD_FAULT_64S@h +#ifdef CONFIG_PPC_BOOK3S_64 + lis r0,(DSISR_BAD_FAULT_64S|DSISR_DABRMATCH)@h ori r0,r0,DSISR_BAD_FAULT_64S@l and. r0,r4,r0 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ @@ -1536,7 +1551,7 @@ do_hash_page: /* Reload DSISR into r4 for the DABR check below */ ld r4,_DSISR(r1) -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: @@ -1565,7 +1580,7 @@ handle_dabr_fault: 12: b ret_from_except_lite -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index e1431800bfb9..04ea5c04fd24 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1270,10 +1270,15 @@ static ssize_t fadump_release_memory_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + int input = -1; + if (!fw_dump.dump_active) return -EPERM; - if (buf[0] == '1') { + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + + if (input == 1) { /* * Take away the '/proc/vmcore'. We are releasing the dump * memory, hence it will not be valid anymore. @@ -1307,21 +1312,25 @@ static ssize_t fadump_register_store(struct kobject *kobj, const char *buf, size_t count) { int ret = 0; + int input = -1; if (!fw_dump.fadump_enabled || fdm_active) return -EPERM; + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + mutex_lock(&fadump_mutex); - switch (buf[0]) { - case '0': + switch (input) { + case 0: if (fw_dump.dump_registered == 0) { goto unlock_out; } /* Un-register Firmware-assisted dump */ fadump_unregister_dump(&fdm); break; - case '1': + case 1: if (fw_dump.dump_registered == 1) { ret = -EEXIST; goto unlock_out; diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 8c54166491e7..29b2fed93289 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -388,7 +388,7 @@ DataAccess: EXCEPTION_PROLOG mfspr r10,SPRN_DSISR stw r10,_DSISR(r11) - andis. r0,r10,DSISR_BAD_FAULT_32S@h + andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h bne 1f /* if not, try to put a PTE */ mfspr r4,SPRN_DAR /* into the hash table */ rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index ff8511d6d8ea..aa71a90f5222 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -55,12 +55,18 @@ * * For pSeries or server processors: * 1. The MMU is off & open firmware is running in real mode. - * 2. The kernel is entered at __start + * 2. The primary CPU enters at __start. + * 3. If the RTAS supports "query-cpu-stopped-state", then secondary + * CPUs will enter as directed by "start-cpu" RTAS call, which is + * generic_secondary_smp_init, with PIR in r3. + * 4. Else the secondary CPUs will enter at secondary_hold (0x60) as + * directed by the "start-cpu" RTS call, with PIR in r3. * -or- For OPAL entry: - * 1. The MMU is off, processor in HV mode, primary CPU enters at 0 - * with device-tree in gpr3. We also get OPAL base in r8 and - * entry in r9 for debugging purposes - * 2. Secondary processors enter at 0x60 with PIR in gpr3 + * 1. The MMU is off, processor in HV mode. + * 2. The primary CPU enters at 0 with device-tree in r3, OPAL base + * in r8, and entry in r9 for debugging purposes. + * 3. Secondary CPUs enter as directed by OPAL_START_CPU call, which + * is at generic_secondary_smp_init, with PIR in r3. * * For Book3E processors: * 1. The MMU is on running in AS0 in a state defined in ePAPR diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 1125c9be9e06..01e1c1997893 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -112,12 +112,14 @@ power9_save_additional_sprs: std r4, STOP_HFSCR(r13) mfspr r3, SPRN_MMCRA - mfspr r4, SPRN_MMCR1 + mfspr r4, SPRN_MMCR0 std r3, STOP_MMCRA(r13) - std r4, STOP_MMCR1(r13) + std r4, _MMCR0(r1) - mfspr r3, SPRN_MMCR2 - std r3, STOP_MMCR2(r13) + mfspr r3, SPRN_MMCR1 + mfspr r4, SPRN_MMCR2 + std r3, STOP_MMCR1(r13) + std r4, STOP_MMCR2(r13) blr power9_restore_additional_sprs: @@ -135,11 +137,14 @@ power9_restore_additional_sprs: ld r4, STOP_MMCRA(r13) mtspr SPRN_HFSCR, r3 mtspr SPRN_MMCRA, r4 - /* We have already restored PACA_MMCR0 */ - ld r3, STOP_MMCR1(r13) - ld r4, STOP_MMCR2(r13) - mtspr SPRN_MMCR1, r3 - mtspr SPRN_MMCR2, r4 + + ld r3, _MMCR0(r1) + ld r4, STOP_MMCR1(r13) + mtspr SPRN_MMCR0, r3 + mtspr SPRN_MMCR1, r4 + + ld r3, STOP_MMCR2(r13) + mtspr SPRN_MMCR2, r3 blr /* @@ -319,20 +324,13 @@ enter_winkle: /* * r3 - PSSCR value corresponding to the requested stop state. */ +power_enter_stop: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -power_enter_stop_kvm_rm: - /* - * This is currently unused because POWER9 KVM does not have to - * gather secondary threads into sibling mode, but the code is - * here in case that function is required. - * - * Tell KVM we're entering idle. - */ + /* Tell KVM we're entering idle */ li r4,KVM_HWTHREAD_IN_IDLE /* DO THIS IN REAL MODE! See comment above. */ stb r4,HSTATE_HWTHREAD_STATE(r13) #endif -power_enter_stop: /* * Check if we are executing the lite variant with ESL=EC=0 */ @@ -357,13 +355,15 @@ power_enter_stop: b pnv_wakeup_noloss .Lhandle_esl_ec_set: +BEGIN_FTR_SECTION /* - * POWER9 DD2 can incorrectly set PMAO when waking up after a - * state-loss idle. Saving and restoring MMCR0 over idle is a + * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after + * a state-loss idle. Saving and restoring MMCR0 over idle is a * workaround. */ mfspr r4,SPRN_MMCR0 std r4,_MMCR0(r1) +END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) /* * Check if the requested state is a deep idle state. @@ -496,18 +496,6 @@ pnv_powersave_wakeup_mce: b pnv_powersave_wakeup -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -kvm_start_guest_check: - li r0,KVM_HWTHREAD_IN_KERNEL - stb r0,HSTATE_HWTHREAD_STATE(r13) - /* Order setting hwthread_state vs. testing hwthread_req */ - sync - lbz r0,HSTATE_HWTHREAD_REQ(r13) - cmpwi r0,0 - beqlr - b kvm_start_guest -#endif - /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss @@ -532,9 +520,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) mr r3,r12 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -BEGIN_FTR_SECTION - bl kvm_start_guest_check -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beq 1f + b kvm_start_guest +1: #endif /* Return SRR1 from power7_nap() */ @@ -555,15 +549,17 @@ pnv_restore_hyp_resource_arch300: * then clear bit 60 in MMCRA to ensure the PMU starts running. */ blt cr3,1f +BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT ld r1,PACAR1(r13) + ld r4,_MMCR0(r1) + mtspr SPRN_MMCR0,r4 +END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) mfspr r4,SPRN_MMCRA ori r4,r4,(1 << (63-60)) mtspr SPRN_MMCRA,r4 xori r4,r4,(1 << (63-60)) mtspr SPRN_MMCRA,r4 - ld r4,_MMCR0(r1) - mtspr SPRN_MMCR0,r4 1: /* * POWER ISA 3. Use PSSCR to determine if we diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 4e65bf82f5e0..b7a84522e652 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -143,6 +143,13 @@ notrace unsigned int __check_irq_replay(void) */ unsigned char happened = local_paca->irq_happened; + /* + * We are responding to the next interrupt, so interrupt-off + * latencies should be reset here. + */ + trace_hardirqs_on(); + trace_hardirqs_off(); + if (happened & PACA_IRQ_HARD_DIS) { /* Clear bit 0 which we wouldn't clear otherwise */ local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; @@ -270,6 +277,7 @@ notrace void arch_local_irq_restore(unsigned long en) #endif /* CONFIG_TRACE_IRQFLAGS */ set_soft_enabled(0); + trace_hardirqs_off(); /* * Check if anything needs to be re-emitted. We haven't @@ -279,6 +287,7 @@ notrace void arch_local_irq_restore(unsigned long en) replay = __check_irq_replay(); /* We can soft-enable now */ + trace_hardirqs_on(); set_soft_enabled(1); /* @@ -394,11 +403,19 @@ bool prep_irq_for_idle_irqsoff(void) /* * Take the SRR1 wakeup reason, index into this table to find the * appropriate irq_happened bit. + * + * Sytem reset exceptions taken in idle state also come through here, + * but they are NMI interrupts so do not need to wait for IRQs to be + * restored, and should be taken as early as practical. These are marked + * with 0xff in the table. The Power ISA specifies 0100b as the system + * reset interrupt reason. */ +#define IRQ_SYSTEM_RESET 0xff + static const u8 srr1_to_lazyirq[0x10] = { 0, 0, 0, PACA_IRQ_DBELL, - 0, + IRQ_SYSTEM_RESET, PACA_IRQ_DBELL, PACA_IRQ_DEC, 0, @@ -407,15 +424,43 @@ static const u8 srr1_to_lazyirq[0x10] = { PACA_IRQ_HMI, 0, 0, 0, 0, 0 }; +void replay_system_reset(void) +{ + struct pt_regs regs; + + ppc_save_regs(®s); + regs.trap = 0x100; + get_paca()->in_nmi = 1; + system_reset_exception(®s); + get_paca()->in_nmi = 0; +} +EXPORT_SYMBOL_GPL(replay_system_reset); + void irq_set_pending_from_srr1(unsigned long srr1) { unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + u8 reason = srr1_to_lazyirq[idx]; + + /* + * Take the system reset now, which is immediately after registers + * are restored from idle. It's an NMI, so interrupts need not be + * re-enabled before it is taken. + */ + if (unlikely(reason == IRQ_SYSTEM_RESET)) { + replay_system_reset(); + return; + } /* * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, - * so this can be called unconditionally with srr1 wake reason. + * so this can be called unconditionally with the SRR1 wake + * reason as returned by the idle code, which uses 0 to mean no + * interrupt. + * + * If a future CPU was to designate this as an interrupt reason, + * then a new index for no interrupt must be assigned. */ - local_paca->irq_happened |= srr1_to_lazyirq[idx]; + local_paca->irq_happened |= reason; } #endif /* CONFIG_PPC_BOOK3S */ diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 6c089d9757c9..7a1f99f1b47f 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -25,6 +25,21 @@ #include #include +/* + * This is called from ftrace code after invoking registered handlers to + * disambiguate regs->nip changes done by jprobes and livepatch. We check if + * there is an active jprobe at the provided address (mcount location). + */ +int __is_active_jprobe(unsigned long addr) +{ + if (!preemptible()) { + struct kprobe *p = raw_cpu_read(current_kprobe); + return (p && (unsigned long)p->addr == addr) ? 1 : 0; + } + + return 0; +} + static nokprobe_inline int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, unsigned long orig_nip) @@ -60,11 +75,8 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, { struct kprobe *p; struct kprobe_ctlblk *kcb; - unsigned long flags; - /* Disable irq for emulating a breakpoint and avoiding preempt */ - local_irq_save(flags); - hard_irq_disable(); + preempt_disable(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) @@ -86,13 +98,17 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) __skip_singlestep(p, regs, kcb, orig_nip); - /* - * If pre_handler returns !0, it sets regs->nip and - * resets current kprobe. - */ + else { + /* + * If pre_handler returns !0, it sets regs->nip and + * resets current kprobe. In this case, we should not + * re-enable preemption. + */ + return; + } } end: - local_irq_restore(flags); + preempt_enable_no_resched(); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bebc3007a793..ca5d5a081e75 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -43,12 +43,6 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; -int is_current_kprobe_addr(unsigned long addr) -{ - struct kprobe *p = kprobe_running(); - return (p && (unsigned long)p->addr == addr) ? 1 : 0; -} - bool arch_within_kprobe_blacklist(unsigned long addr) { return (addr >= (unsigned long)__kprobes_text_start && @@ -59,7 +53,7 @@ bool arch_within_kprobe_blacklist(unsigned long addr) kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) { - kprobe_opcode_t *addr; + kprobe_opcode_t *addr = NULL; #ifdef PPC64_ELF_ABI_v2 /* PPC64 ABIv2 needs local entry point */ @@ -91,36 +85,29 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) * Also handle format. */ char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; - const char *modsym; bool dot_appended = false; - if ((modsym = strchr(name, ':')) != NULL) { - modsym++; - if (*modsym != '\0' && *modsym != '.') { - /* Convert to */ - strncpy(dot_name, name, modsym - name); - dot_name[modsym - name] = '.'; - dot_name[modsym - name + 1] = '\0'; - strncat(dot_name, modsym, - sizeof(dot_name) - (modsym - name) - 2); - dot_appended = true; - } else { - dot_name[0] = '\0'; - strncat(dot_name, name, sizeof(dot_name) - 1); - } - } else if (name[0] != '.') { - dot_name[0] = '.'; - dot_name[1] = '\0'; - strncat(dot_name, name, KSYM_NAME_LEN - 2); + const char *c; + ssize_t ret = 0; + int len = 0; + + if ((c = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { + c++; + len = c - name; + memcpy(dot_name, name, len); + } else + c = name; + + if (*c != '\0' && *c != '.') { + dot_name[len++] = '.'; dot_appended = true; - } else { - dot_name[0] = '\0'; - strncat(dot_name, name, KSYM_NAME_LEN - 1); } - addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); - if (!addr && dot_appended) { - /* Let's try the original non-dot symbol lookup */ + ret = strscpy(dot_name + len, c, KSYM_NAME_LEN); + if (ret > 0) + addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); + + /* Fallback to the original non-dot symbol lookup */ + if (!addr && dot_appended) addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); - } #else addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); #endif @@ -239,7 +226,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) } NOKPROBE_SYMBOL(arch_prepare_kretprobe); -int try_to_emulate(struct kprobe *p, struct pt_regs *regs) +static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) { int ret; unsigned int insn = *p->ainsn.insn; @@ -261,9 +248,20 @@ int try_to_emulate(struct kprobe *p, struct pt_regs *regs) */ printk("Can't step on instruction %x\n", insn); BUG(); - } else if (ret == 0) - /* This instruction can't be boosted */ - p->ainsn.boostable = -1; + } else { + /* + * If we haven't previously emulated this instruction, then it + * can't be boosted. Note it down so we don't try to do so again. + * + * If, however, we had emulated this instruction in the past, + * then this is just an error with the current run (for + * instance, exceptions due to a load/store). We return 0 so + * that this is now single-stepped, but continue to try + * emulating it in subsequent probe hits. + */ + if (unlikely(p->ainsn.boostable != 1)) + p->ainsn.boostable = -1; + } return ret; } @@ -639,24 +637,22 @@ NOKPROBE_SYMBOL(setjmp_pre_handler); void __used jprobe_return(void) { - asm volatile("trap" ::: "memory"); + asm volatile("jprobe_return_trap:\n" + "trap\n" + ::: "memory"); } NOKPROBE_SYMBOL(jprobe_return); -static void __used jprobe_return_end(void) -{ -} -NOKPROBE_SYMBOL(jprobe_return_end); - int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - /* - * FIXME - we should ideally be validating that we got here 'cos - * of the "trap" in jprobe_return() above, before restoring the - * saved regs... - */ + if (regs->nip != ppc_kallsyms_lookup_name("jprobe_return_trap")) { + pr_debug("longjmp_break_handler NIP (0x%lx) does not match jprobe_return_trap (0x%lx)\n", + regs->nip, ppc_kallsyms_lookup_name("jprobe_return_trap")); + return 0; + } + memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); /* It's OK to start function graph tracing again */ unpause_graph_tracing(); diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 5c12e21d0d1a..49d34d7271e7 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -360,7 +360,7 @@ void default_machine_kexec(struct kimage *image) /* NOTREACHED */ } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* Values we need to export to the second kernel via the device tree. */ static unsigned long htab_base; static unsigned long htab_size; @@ -402,4 +402,4 @@ static int __init export_htab_values(void) return 0; } late_initcall(export_htab_values); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9b2ea7e71c06..742e4658c5dc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -39,11 +39,21 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); static DEFINE_PER_CPU(int, mce_queue_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); +/* Queue for delayed MCE UE events. */ +static DEFINE_PER_CPU(int, mce_ue_count); +static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], + mce_ue_event_queue); + static void machine_check_process_queued_event(struct irq_work *work); +void machine_check_ue_event(struct machine_check_event *evt); +static void machine_process_ue_event(struct work_struct *work); + static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; +DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); + static void mce_set_error_info(struct machine_check_event *mce, struct mce_error_info *mce_err) { @@ -82,7 +92,7 @@ static void mce_set_error_info(struct machine_check_event *mce, */ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, - uint64_t nip, uint64_t addr) + uint64_t nip, uint64_t addr, uint64_t phys_addr) { int index = __this_cpu_inc_return(mce_nest_count) - 1; struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]); @@ -140,6 +150,11 @@ void save_mce_event(struct pt_regs *regs, long handled, } else if (mce->error_type == MCE_ERROR_TYPE_UE) { mce->u.ue_error.effective_address_provided = true; mce->u.ue_error.effective_address = addr; + if (phys_addr != ULONG_MAX) { + mce->u.ue_error.physical_address_provided = true; + mce->u.ue_error.physical_address = phys_addr; + machine_check_ue_event(mce); + } } return; } @@ -193,6 +208,26 @@ void release_mce_event(void) get_mce_event(NULL, true); } + +/* + * Queue up the MCE event which then can be handled later. + */ +void machine_check_ue_event(struct machine_check_event *evt) +{ + int index; + + index = __this_cpu_inc_return(mce_ue_count) - 1; + /* If queue is full, just return for now. */ + if (index >= MAX_MC_EVT) { + __this_cpu_dec(mce_ue_count); + return; + } + memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt)); + + /* Queue work to process this event later. */ + schedule_work(&mce_ue_event_work); +} + /* * Queue up the MCE event which then can be handled later. */ @@ -215,7 +250,39 @@ void machine_check_queue_event(void) /* Queue irq work to process this event later. */ irq_work_queue(&mce_event_process_work); } +/* + * process pending MCE event from the mce event queue. This function will be + * called during syscall exit. + */ +static void machine_process_ue_event(struct work_struct *work) +{ + int index; + struct machine_check_event *evt; + while (__this_cpu_read(mce_ue_count) > 0) { + index = __this_cpu_read(mce_ue_count) - 1; + evt = this_cpu_ptr(&mce_ue_event_queue[index]); +#ifdef CONFIG_MEMORY_FAILURE + /* + * This should probably queued elsewhere, but + * oh! well + */ + if (evt->error_type == MCE_ERROR_TYPE_UE) { + if (evt->u.ue_error.physical_address_provided) { + unsigned long pfn; + + pfn = evt->u.ue_error.physical_address >> + PAGE_SHIFT; + memory_failure(pfn, SIGBUS, 0); + } else + pr_warn("Failed to identify bad address from " + "where the uncorrectable error (UE) " + "was generated\n"); + } +#endif + __this_cpu_dec(mce_ue_count); + } +} /* * process pending MCE event from the mce event queue. This function will be * called during syscall exit. @@ -223,6 +290,7 @@ void machine_check_queue_event(void) static void machine_check_process_queued_event(struct irq_work *work) { int index; + struct machine_check_event *evt; add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -232,8 +300,8 @@ static void machine_check_process_queued_event(struct irq_work *work) */ while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; - machine_check_print_event_info( - this_cpu_ptr(&mce_event_queue[index]), false); + evt = this_cpu_ptr(&mce_event_queue[index]); + machine_check_print_event_info(evt, false); __this_cpu_dec(mce_queue_count); } } @@ -340,7 +408,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, printk("%s Effective address: %016llx\n", level, evt->u.ue_error.effective_address); if (evt->u.ue_error.physical_address_provided) - printk("%s Physical address: %016llx\n", + printk("%s Physical address: %016llx\n", level, evt->u.ue_error.physical_address); break; case MCE_ERROR_TYPE_SLB: @@ -411,45 +479,6 @@ void machine_check_print_event_info(struct machine_check_event *evt, } EXPORT_SYMBOL_GPL(machine_check_print_event_info); -uint64_t get_mce_fault_addr(struct machine_check_event *evt) -{ - switch (evt->error_type) { - case MCE_ERROR_TYPE_UE: - if (evt->u.ue_error.effective_address_provided) - return evt->u.ue_error.effective_address; - break; - case MCE_ERROR_TYPE_SLB: - if (evt->u.slb_error.effective_address_provided) - return evt->u.slb_error.effective_address; - break; - case MCE_ERROR_TYPE_ERAT: - if (evt->u.erat_error.effective_address_provided) - return evt->u.erat_error.effective_address; - break; - case MCE_ERROR_TYPE_TLB: - if (evt->u.tlb_error.effective_address_provided) - return evt->u.tlb_error.effective_address; - break; - case MCE_ERROR_TYPE_USER: - if (evt->u.user_error.effective_address_provided) - return evt->u.user_error.effective_address; - break; - case MCE_ERROR_TYPE_RA: - if (evt->u.ra_error.effective_address_provided) - return evt->u.ra_error.effective_address; - break; - case MCE_ERROR_TYPE_LINK: - if (evt->u.link_error.effective_address_provided) - return evt->u.link_error.effective_address; - break; - default: - case MCE_ERROR_TYPE_UNKNOWN: - break; - } - return 0; -} -EXPORT_SYMBOL(get_mce_fault_addr); - /* * This function is called in real mode. Strictly no printk's please. * @@ -470,6 +499,34 @@ long hmi_exception_realmode(struct pt_regs *regs) { __this_cpu_inc(irq_stat.hmi_exceptions); +#ifdef CONFIG_PPC_BOOK3S_64 + /* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */ + if (pvr_version_is(PVR_POWER9)) { + unsigned long hmer = mfspr(SPRN_HMER); + + /* Do we have the debug bit set */ + if (hmer & PPC_BIT(17)) { + hmer &= ~PPC_BIT(17); + mtspr(SPRN_HMER, hmer); + + /* + * Now to avoid problems with soft-disable we + * only do the emulation if we are coming from + * user space + */ + if (user_mode(regs)) + local_paca->hmi_p9_special_emu = 1; + + /* + * Don't bother going to OPAL if that's the + * only relevant bit. + */ + if (!(hmer & mfspr(SPRN_HMEER))) + return local_paca->hmi_p9_special_emu; + } + } +#endif /* CONFIG_PPC_BOOK3S_64 */ + wait_for_subcore_guest_exit(); if (ppc_md.hmi_exception_early) @@ -477,5 +534,5 @@ long hmi_exception_realmode(struct pt_regs *regs) wait_for_tb_resync(); - return 0; + return 1; } diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 72f153c6f3fa..644f7040b91c 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -27,6 +27,36 @@ #include #include #include +#include +#include +#include +#include + +/* + * Convert an address related to an mm to a PFN. NOTE: we are in real + * mode, we could potentially race with page table updates. + */ +static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) +{ + pte_t *ptep; + unsigned long flags; + struct mm_struct *mm; + + if (user_mode(regs)) + mm = current->mm; + else + mm = &init_mm; + + local_irq_save(flags); + if (mm == current->mm) + ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL); + else + ptep = find_init_mm_pte(addr, NULL); + local_irq_restore(flags); + if (!ptep || pte_special(*ptep)) + return ULONG_MAX; + return pte_pfn(*ptep); +} static void flush_tlb_206(unsigned int num_sets, unsigned int action) { @@ -128,7 +158,7 @@ void __flush_tlb_power9(unsigned int action) { unsigned int num_sets; - if (radix_enabled()) + if (early_radix_enabled()) num_sets = POWER9_TLB_SETS_RADIX; else num_sets = POWER9_TLB_SETS_HASH; @@ -138,7 +168,7 @@ void __flush_tlb_power9(unsigned int action) /* flush SLBs and reload */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static void flush_and_reload_slb(void) { struct slb_shadow *slb; @@ -185,7 +215,7 @@ static void flush_erat(void) static int mce_flush(int what) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (what == MCE_FLUSH_SLB) { flush_and_reload_slb(); return 1; @@ -421,9 +451,45 @@ static const struct mce_derror_table mce_p9_derror_table[] = { MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0, false, 0, 0, 0, 0 } }; +static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, + uint64_t *phys_addr) +{ + /* + * Carefully look at the NIP to determine + * the instruction to analyse. Reading the NIP + * in real-mode is tricky and can lead to recursive + * faults + */ + int instr; + unsigned long pfn, instr_addr; + struct instruction_op op; + struct pt_regs tmp = *regs; + + pfn = addr_to_pfn(regs, regs->nip); + if (pfn != ULONG_MAX) { + instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK); + instr = *(unsigned int *)(instr_addr); + if (!analyse_instr(&op, &tmp, instr)) { + pfn = addr_to_pfn(regs, op.ea); + *addr = op.ea; + *phys_addr = (pfn << PAGE_SHIFT); + return 0; + } + /* + * analyse_instr() might fail if the instruction + * is not a load/store, although this is unexpected + * for load/store errors or if we got the NIP + * wrong + */ + } + *addr = 0; + return -1; +} + static int mce_handle_ierror(struct pt_regs *regs, const struct mce_ierror_table table[], - struct mce_error_info *mce_err, uint64_t *addr) + struct mce_error_info *mce_err, uint64_t *addr, + uint64_t *phys_addr) { uint64_t srr1 = regs->msr; int handled = 0; @@ -475,8 +541,22 @@ static int mce_handle_ierror(struct pt_regs *regs, } mce_err->severity = table[i].severity; mce_err->initiator = table[i].initiator; - if (table[i].nip_valid) + if (table[i].nip_valid) { *addr = regs->nip; + if (mce_err->severity == MCE_SEV_ERROR_SYNC && + table[i].error_type == MCE_ERROR_TYPE_UE) { + unsigned long pfn; + + if (get_paca()->in_mce < MAX_MCE_DEPTH) { + pfn = addr_to_pfn(regs, regs->nip); + if (pfn != ULONG_MAX) { + *phys_addr = + (pfn << PAGE_SHIFT); + handled = 1; + } + } + } + } return handled; } @@ -489,7 +569,8 @@ static int mce_handle_ierror(struct pt_regs *regs, static int mce_handle_derror(struct pt_regs *regs, const struct mce_derror_table table[], - struct mce_error_info *mce_err, uint64_t *addr) + struct mce_error_info *mce_err, uint64_t *addr, + uint64_t *phys_addr) { uint64_t dsisr = regs->dsisr; int handled = 0; @@ -555,7 +636,17 @@ static int mce_handle_derror(struct pt_regs *regs, mce_err->initiator = table[i].initiator; if (table[i].dar_valid) *addr = regs->dar; - + else if (mce_err->severity == MCE_SEV_ERROR_SYNC && + table[i].error_type == MCE_ERROR_TYPE_UE) { + /* + * We do a maximum of 4 nested MCE calls, see + * kernel/exception-64s.h + */ + if (get_paca()->in_mce < MAX_MCE_DEPTH) + if (!mce_find_instr_ea_and_pfn(regs, addr, + phys_addr)) + handled = 1; + } found = 1; } @@ -592,19 +683,21 @@ static long mce_handle_error(struct pt_regs *regs, const struct mce_ierror_table itable[]) { struct mce_error_info mce_err = { 0 }; - uint64_t addr; + uint64_t addr, phys_addr; uint64_t srr1 = regs->msr; long handled; if (SRR1_MC_LOADSTORE(srr1)) - handled = mce_handle_derror(regs, dtable, &mce_err, &addr); + handled = mce_handle_derror(regs, dtable, &mce_err, &addr, + &phys_addr); else - handled = mce_handle_ierror(regs, itable, &mce_err, &addr); + handled = mce_handle_ierror(regs, itable, &mce_err, &addr, + &phys_addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) handled = mce_handle_ue_error(regs); - save_mce_event(regs, handled, &mce_err, regs->nip, addr); + save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr); return handled; } diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 0b0f89685b67..759104b99f9f 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -429,7 +429,8 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, /* Find this stub, or if that fails, the next avail. entry */ stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; for (i = 0; stub_func_addr(stubs[i].funcdata); i++) { - BUG_ON(i >= num_stubs); + if (WARN_ON(i >= num_stubs)) + return 0; if (stub_func_addr(stubs[i].funcdata) == func_addr(addr)) return (unsigned long)&stubs[i]; diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 91e037ab20a1..8237884ca389 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -115,32 +115,23 @@ static unsigned long can_optimize(struct kprobe *p) static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - unsigned long flags; - /* This is possible if op is under delayed unoptimizing */ if (kprobe_disabled(&op->kp)) return; - local_irq_save(flags); - hard_irq_disable(); + preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { __this_cpu_write(current_kprobe, &op->kp); regs->nip = (unsigned long)op->kp.addr; - kcb->kprobe_status = KPROBE_HIT_ACTIVE; + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - /* - * No need for an explicit __hard_irq_enable() here. - * local_irq_restore() will re-enable interrupts, - * if they were hard disabled. - */ - local_irq_restore(flags); + preempt_enable_no_resched(); } NOKPROBE_SYMBOL(optimized_callback); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 2ff2b8a19f71..d6597038931d 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -90,7 +90,7 @@ static inline void free_lppacas(void) { } #endif /* CONFIG_PPC_BOOK3S */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * 3 persistent SLBs are registered here. The buffer will be zero @@ -135,11 +135,11 @@ static struct slb_shadow * __init init_slb_shadow(int cpu) return s; } -#else /* CONFIG_PPC_STD_MMU_64 */ +#else /* !CONFIG_PPC_BOOK3S_64 */ static void __init allocate_slb_shadows(int nr_cpus, int limit) { } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* The Paca is an array with one entry per processor. Each contains an * lppaca, which contains the information shared between the @@ -170,9 +170,9 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) new_paca->kexec_state = KEXEC_STATE_NONE; new_paca->__current = &init_task; new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 new_paca->slb_shadow_ptr = init_slb_shadow(cpu); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif #ifdef CONFIG_PPC_BOOK3E /* For now -- if we have threads this will be adjusted later */ @@ -262,8 +262,8 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_id = context->id; #ifdef CONFIG_PPC_MM_SLICES - VM_BUG_ON(!mm->context.addr_limit); - get_paca()->addr_limit = mm->context.addr_limit; + VM_BUG_ON(!mm->context.slb_addr_limit); + get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit; get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; memcpy(&get_paca()->mm_ctx_high_slices_psize, &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); @@ -271,7 +271,7 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_user_psize = context->user_psize; get_paca()->mm_ctx_sllp = context->sllp; #endif -#else /* CONFIG_PPC_BOOK3S */ +#else /* !CONFIG_PPC_BOOK3S */ return; #endif } diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 932b9741aa8f..15ce0306b092 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -90,14 +90,14 @@ int pcibios_unmap_io_space(struct pci_bus *bus) * to do an appropriate TLB flush here too */ if (bus->self) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 struct resource *res = bus->resource[0]; #endif pr_debug("IO unmapping for PCI-PCI bridge %s\n", pci_name(bus->self)); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 __flush_hash_table_range(&init_mm, res->start + _IO_BASE, res->end + _IO_BASE + 1); #endif diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a0c74bbf3454..bfdd783e3916 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -77,6 +77,13 @@ extern unsigned long _get_SP(void); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Are we running in "Suspend disabled" mode? If so we have to block any + * sigreturn that would get us into suspended state, and we also warn in some + * other paths that we should never reach with suspend disabled. + */ +bool tm_suspend_disabled __ro_after_init = false; + static void check_if_tm_restore_required(struct task_struct *tsk) { /* @@ -97,9 +104,23 @@ static inline bool msr_tm_active(unsigned long msr) { return MSR_TM_ACTIVE(msr); } + +static bool tm_active_with_fp(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_FP); +} + +static bool tm_active_with_altivec(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_VEC); +} #else static inline bool msr_tm_active(unsigned long msr) { return false; } static inline void check_if_tm_restore_required(struct task_struct *tsk) { } +static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; } +static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ bool strict_msr_control; @@ -232,7 +253,7 @@ EXPORT_SYMBOL(enable_kernel_fp); static int restore_fp(struct task_struct *tsk) { - if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) { + if (tsk->thread.load_fp || tm_active_with_fp(tsk)) { load_fp_state(¤t->thread.fp_state); current->thread.load_fp++; return 1; @@ -314,7 +335,7 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread); static int restore_altivec(struct task_struct *tsk) { if (cpu_has_feature(CPU_FTR_ALTIVEC) && - (tsk->thread.load_vec || msr_tm_active(tsk->thread.regs->msr))) { + (tsk->thread.load_vec || tm_active_with_altivec(tsk))) { load_vr_state(&tsk->thread.vr_state); tsk->thread.used_vr = 1; tsk->thread.load_vec++; @@ -853,6 +874,10 @@ static void tm_reclaim_thread(struct thread_struct *thr, if (!MSR_TM_SUSPENDED(mfmsr())) return; + giveup_all(container_of(thr, struct task_struct, thread)); + + tm_reclaim(thr, cause); + /* * If we are in a transaction and FP is off then we can't have * used FP inside that transaction. Hence the checkpointed @@ -871,10 +896,6 @@ static void tm_reclaim_thread(struct thread_struct *thr, if ((thr->ckpt_regs.msr & MSR_VEC) == 0) memcpy(&thr->ckvr_state, &thr->vr_state, sizeof(struct thread_vr_state)); - - giveup_all(container_of(thr, struct task_struct, thread)); - - tm_reclaim(thr, thr->ckpt_regs.msr, cause); } void tm_reclaim_current(uint8_t cause) @@ -903,6 +924,8 @@ static inline void tm_reclaim_task(struct task_struct *tsk) if (!MSR_TM_ACTIVE(thr->regs->msr)) goto out_and_saveregs; + WARN_ON(tm_suspend_disabled); + TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, " "ccr=%lx, msr=%lx, trap=%lx)\n", tsk->pid, thr->regs->nip, @@ -923,11 +946,9 @@ static inline void tm_reclaim_task(struct task_struct *tsk) tm_save_sprs(thr); } -extern void __tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr); +extern void __tm_recheckpoint(struct thread_struct *thread); -void tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr) +void tm_recheckpoint(struct thread_struct *thread) { unsigned long flags; @@ -946,15 +967,13 @@ void tm_recheckpoint(struct thread_struct *thread, */ tm_restore_sprs(thread); - __tm_recheckpoint(thread, orig_msr); + __tm_recheckpoint(thread); local_irq_restore(flags); } static inline void tm_recheckpoint_new_task(struct task_struct *new) { - unsigned long msr; - if (!cpu_has_feature(CPU_FTR_TM)) return; @@ -973,13 +992,11 @@ static inline void tm_recheckpoint_new_task(struct task_struct *new) tm_restore_sprs(&new->thread); return; } - msr = new->thread.ckpt_regs.msr; /* Recheckpoint to restore original checkpointed register state. */ - TM_DEBUG("*** tm_recheckpoint of pid %d " - "(new->msr 0x%lx, new->origmsr 0x%lx)\n", - new->pid, new->thread.regs->msr, msr); + TM_DEBUG("*** tm_recheckpoint of pid %d (new->msr 0x%lx)\n", + new->pid, new->thread.regs->msr); - tm_recheckpoint(&new->thread, msr); + tm_recheckpoint(&new->thread); /* * The checkpointed state has been restored but the live state has @@ -1119,6 +1136,10 @@ static inline void restore_sprs(struct thread_struct *old_thread, if (old_thread->tar != new_thread->tar) mtspr(SPRN_TAR, new_thread->tar); } + + if (cpu_has_feature(CPU_FTR_ARCH_300) && + old_thread->tidr != new_thread->tidr) + mtspr(SPRN_TIDR, new_thread->tidr); #endif } @@ -1155,7 +1176,7 @@ struct task_struct *__switch_to(struct task_struct *prev, } #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->active) { current_thread_info()->local_flags |= _TLF_LAZY_MMU; @@ -1163,7 +1184,7 @@ struct task_struct *__switch_to(struct task_struct *prev, __flush_tlb_pending(batch); batch->active = 0; } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS switch_booke_debug_regs(&new->thread.debug); @@ -1209,7 +1230,7 @@ struct task_struct *__switch_to(struct task_struct *prev, last = _switch(old_thread, new_thread); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; batch = this_cpu_ptr(&ppc64_tlb_batch); @@ -1223,22 +1244,22 @@ struct task_struct *__switch_to(struct task_struct *prev, * The copy-paste buffer can only store into foreign real * addresses, so unprivileged processes can not see the * data or use it in any way unless they have foreign real - * mappings. We don't have a VAS driver that allocates those - * yet, so no cpabort is required. + * mappings. If the new process has the foreign real address + * mappings, we must issue a cp_abort to clear any state and + * prevent snooping, corruption or a covert channel. + * + * DD1 allows paste into normal system memory so we do an + * unpaired copy, rather than cp_abort, to clear the buffer, + * since cp_abort is quite expensive. */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - /* - * DD1 allows paste into normal system memory, so we - * do an unpaired copy here to clear the buffer and - * prevent a covert channel being set up. - * - * cpabort is not used because it is quite expensive. - */ + if (current_thread_info()->task->thread.used_vas) { + asm volatile(PPC_CP_ABORT); + } else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { asm volatile(PPC_COPY(%0, %1) : : "r"(dummy_copy_buffer), "r"(0)); } } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ return last; } @@ -1434,6 +1455,137 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } +int set_thread_uses_vas(void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -EINVAL; + + current->thread.used_vas = 1; + + /* + * Even a process that has no foreign real address mapping can use + * an unpaired COPY instruction (to no real effect). Issue CP_ABORT + * to clear any pending COPY and prevent a covert channel. + * + * __switch_to() will issue CP_ABORT on future context switches. + */ + asm volatile(PPC_CP_ABORT); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + return 0; +} + +#ifdef CONFIG_PPC64 +static DEFINE_SPINLOCK(vas_thread_id_lock); +static DEFINE_IDA(vas_thread_ida); + +/* + * We need to assign a unique thread id to each thread in a process. + * + * This thread id, referred to as TIDR, and separate from the Linux's tgid, + * is intended to be used to direct an ASB_Notify from the hardware to the + * thread, when a suitable event occurs in the system. + * + * One such event is a "paste" instruction in the context of Fast Thread + * Wakeup (aka Core-to-core wake up in the Virtual Accelerator Switchboard + * (VAS) in POWER9. + * + * To get a unique TIDR per process we could simply reuse task_pid_nr() but + * the problem is that task_pid_nr() is not yet available copy_thread() is + * called. Fixing that would require changing more intrusive arch-neutral + * code in code path in copy_process()?. + * + * Further, to assign unique TIDRs within each process, we need an atomic + * field (or an IDR) in task_struct, which again intrudes into the arch- + * neutral code. So try to assign globally unique TIDRs for now. + * + * NOTE: TIDR 0 indicates that the thread does not need a TIDR value. + * For now, only threads that expect to be notified by the VAS + * hardware need a TIDR value and we assign values > 0 for those. + */ +#define MAX_THREAD_CONTEXT ((1 << 16) - 1) +static int assign_thread_tidr(void) +{ + int index; + int err; + +again: + if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&vas_thread_id_lock); + err = ida_get_new_above(&vas_thread_ida, 1, &index); + spin_unlock(&vas_thread_id_lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > MAX_THREAD_CONTEXT) { + spin_lock(&vas_thread_id_lock); + ida_remove(&vas_thread_ida, index); + spin_unlock(&vas_thread_id_lock); + return -ENOMEM; + } + + return index; +} + +static void free_thread_tidr(int id) +{ + spin_lock(&vas_thread_id_lock); + ida_remove(&vas_thread_ida, id); + spin_unlock(&vas_thread_id_lock); +} + +/* + * Clear any TIDR value assigned to this thread. + */ +void clear_thread_tidr(struct task_struct *t) +{ + if (!t->thread.tidr) + return; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + WARN_ON_ONCE(1); + return; + } + + mtspr(SPRN_TIDR, 0); + free_thread_tidr(t->thread.tidr); + t->thread.tidr = 0; +} + +void arch_release_task_struct(struct task_struct *t) +{ + clear_thread_tidr(t); +} + +/* + * Assign a unique TIDR (thread id) for task @t and set it in the thread + * structure. For now, we only support setting TIDR for 'current' task. + */ +int set_thread_tidr(struct task_struct *t) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -EINVAL; + + if (t != current) + return -EINVAL; + + t->thread.tidr = assign_thread_tidr(); + if (t->thread.tidr < 0) + return t->thread.tidr; + + mtspr(SPRN_TIDR, t->thread.tidr); + + return 0; +} + +#endif /* CONFIG_PPC64 */ + void release_thread(struct task_struct *t) { @@ -1467,7 +1619,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 unsigned long sp_vsid; unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; @@ -1580,6 +1732,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, } if (cpu_has_feature(CPU_FTR_HAS_PPR)) p->thread.ppr = INIT_PPR; + + p->thread.tidr = 0; #endif kregs->nip = ppc_function_entry(f); return 0; @@ -1898,7 +2052,8 @@ unsigned long get_wchan(struct task_struct *p) do { sp = *(unsigned long *)sp; - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) || + p->state == TASK_RUNNING) return 0; if (count > 0) { ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE]; @@ -2046,7 +2201,7 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) unsigned long base = mm->brk; unsigned long ret; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * If we are using 1TB segments and we are allowed to randomise * the heap, we can put it above 1TB so it is backed by a 1TB diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f83056297441..b15bae265c90 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -228,7 +229,7 @@ static void __init check_cpu_pa_features(unsigned long node) ibm_pa_features, ARRAY_SIZE(ibm_pa_features)); } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static void __init init_mmu_slb_size(unsigned long node) { const __be32 *slb_size_ptr; @@ -658,6 +659,38 @@ static void __init early_reserve_mem(void) #endif } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static bool tm_disabled __initdata; + +static int __init parse_ppc_tm(char *str) +{ + bool res; + + if (kstrtobool(str, &res)) + return -EINVAL; + + tm_disabled = !res; + + return 0; +} +early_param("ppc_tm", parse_ppc_tm); + +static void __init tm_init(void) +{ + if (tm_disabled) { + pr_info("Disabling hardware transactional memory (HTM)\n"); + cur_cpu_spec->cpu_user_features2 &= + ~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM); + cur_cpu_spec->cpu_features &= ~CPU_FTR_TM; + return; + } + + pnv_tm_init(); +} +#else +static void tm_init(void) { } +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + void __init early_init_devtree(void *params) { phys_addr_t limit; @@ -767,6 +800,8 @@ void __init early_init_devtree(void *params) powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE; #endif + tm_init(); + DBG(" <- early_init_devtree()\n"); } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2e3bc16d02b2..2075322cd225 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -773,7 +773,7 @@ void arch_setup_pdev_archdata(struct platform_device *pdev) static __init void print_system_info(void) { pr_info("-----------------------------------------------------\n"); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); #endif #ifdef CONFIG_PPC_STD_MMU_32 @@ -800,7 +800,7 @@ static __init void print_system_info(void) pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (htab_address) pr_info("htab_address = 0x%p\n", htab_address); if (htab_hash_mask) @@ -898,7 +898,8 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_PPC_MM_SLICES #ifdef CONFIG_PPC64 - init_mm.context.addr_limit = DEFAULT_MAP_WINDOW_USER64; + if (!radix_enabled()) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; #else #error "context.addr_limit not initialized." #endif diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h index cfba134b3024..21c18071d9d5 100644 --- a/arch/powerpc/kernel/setup.h +++ b/arch/powerpc/kernel/setup.h @@ -45,6 +45,12 @@ void emergency_stack_init(void); static inline void emergency_stack_init(void) { }; #endif +#ifdef CONFIG_PPC64 +void record_spr_defaults(void); +#else +static inline void record_spr_defaults(void) { }; +#endif + /* * Having this in kvm_ppc.h makes include dependencies too * tricky to solve for setup-common.c so have it here. diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index b89c6aac48c9..8956a9856604 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -69,6 +69,8 @@ #include #include +#include "setup.h" + #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) #else @@ -316,6 +318,13 @@ void __init early_setup(unsigned long dt_ptr) /* Initialize the hash table or TLB handling */ early_init_mmu(); + /* + * After firmware and early platform setup code has set things up, + * we note the SPR values for configurable control/performance + * registers, and use those as initial defaults. + */ + record_spr_defaults(); + /* * At this point, we can let interrupts switch to virtual mode * (the MMU has been setup), so adjust the MSR in the PACA to @@ -360,8 +369,16 @@ void early_setup_secondary(void) #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE) static bool use_spinloop(void) { - if (!IS_ENABLED(CONFIG_PPC_BOOK3E)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S)) { + /* + * See comments in head_64.S -- not all platforms insert + * secondaries at __secondary_hold and wait at the spin + * loop. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) + return false; return true; + } /* * When book3e boots from kexec, the ePAPR spin table does diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index e9436c5e1e09..3d7539b90010 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -103,7 +103,7 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, static void do_signal(struct task_struct *tsk) { sigset_t *oldset = sigmask_to_save(); - struct ksignal ksig; + struct ksignal ksig = { .sig = 0 }; int ret; int is32 = is_32bit_task(); diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 92fb1c8dbbd8..16d16583cf11 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -519,6 +519,8 @@ static int save_tm_user_regs(struct pt_regs *regs, { unsigned long msr = regs->msr; + WARN_ON(tm_suspend_disabled); + /* Remove TM bits from thread's MSR. The MSR in the sigcontext * just indicates to userland that we were doing a transaction, but we * don't want to return in transactional state. This also ensures @@ -769,6 +771,8 @@ static long restore_tm_user_regs(struct pt_regs *regs, int i; #endif + if (tm_suspend_disabled) + return 1; /* * restore general registers but not including MSR or SOFTE. Also * take care of keeping r2 (TLS) intact if not a signal. @@ -876,7 +880,7 @@ static long restore_tm_user_regs(struct pt_regs *regs, /* Make sure the transaction is marked as failed */ current->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ - tm_recheckpoint(¤t->thread, msr); + tm_recheckpoint(¤t->thread); /* This loads the speculative FP/VEC state, if used */ msr_check_and_set(msr & (MSR_FP | MSR_VEC)); diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index b2c002993d78..4b9ca3570344 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -214,6 +214,8 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, BUG_ON(!MSR_TM_ACTIVE(regs->msr)); + WARN_ON(tm_suspend_disabled); + /* Remove TM bits from thread's MSR. The MSR in the sigcontext * just indicates to userland that we were doing a transaction, but we * don't want to return in transactional state. This also ensures @@ -430,6 +432,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, BUG_ON(tsk != current); + if (tm_suspend_disabled) + return -EINVAL; + /* copy the GPRs */ err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr)); err |= __copy_from_user(&tsk->thread.ckpt_regs, sc->gp_regs, @@ -558,7 +563,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, /* Make sure the transaction is marked as failed */ tsk->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ - tm_recheckpoint(&tsk->thread, msr); + tm_recheckpoint(&tsk->thread); msr_check_and_set(msr & (MSR_FP | MSR_VEC)); if (msr & MSR_FP) { diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 4437c70c7c2b..b8d4a1dac39f 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -590,6 +590,17 @@ static void sysfs_create_dscr_default(void) if (cpu_has_feature(CPU_FTR_DSCR)) err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default); } + +void __init record_spr_defaults(void) +{ + int cpu; + + if (cpu_has_feature(CPU_FTR_DSCR)) { + dscr_default = mfspr(SPRN_DSCR); + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + paca[cpu].dscr_default = dscr_default; + } +} #endif /* CONFIG_PPC64 */ #ifdef HAS_PPC_PMC_PA6T diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index a3374e8a258c..e3c5f75d137c 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -230,8 +230,7 @@ int __init TAU_init(void) /* first, set up the window shrinking timer */ - init_timer(&tau_timer); - tau_timer.function = tau_timeout_smp; + setup_timer(&tau_timer, tau_timeout_smp, 0UL); tau_timer.expires = jiffies + shrink_timer; add_timer(&tau_timer); diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 1da12f521cb7..b92ac8e711db 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -80,15 +80,12 @@ _GLOBAL(tm_abort) blr /* void tm_reclaim(struct thread_struct *thread, - * unsigned long orig_msr, * uint8_t cause) * * - Performs a full reclaim. This destroys outstanding * transactions and updates thread->regs.tm_ckpt_* with the * original checkpointed state. Note that thread->regs is * unchanged. - * - FP regs are written back to thread->transact_fpr before - * reclaiming. These are the transactional (current) versions. * * Purpose is to both abort transactions of, and preserve the state of, * a transactions at a context switch. We preserve/restore both sets of process @@ -99,9 +96,9 @@ _GLOBAL(tm_abort) * Call with IRQs off, stacks get all out of sync for some periods in here! */ _GLOBAL(tm_reclaim) - mfcr r6 + mfcr r5 mflr r0 - stw r6, 8(r1) + stw r5, 8(r1) std r0, 16(r1) std r2, STK_GOT(r1) stdu r1, -TM_FRAME_SIZE(r1) @@ -109,7 +106,6 @@ _GLOBAL(tm_reclaim) /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */ std r3, STK_PARAM(R3)(r1) - std r4, STK_PARAM(R4)(r1) SAVE_NVGPRS(r1) /* We need to setup MSR for VSX register save instructions. */ @@ -139,8 +135,8 @@ _GLOBAL(tm_reclaim) std r1, PACAR1(r13) /* Clear MSR RI since we are about to change r1, EE is already off. */ - li r4, 0 - mtmsrd r4, 1 + li r5, 0 + mtmsrd r5, 1 /* * BE CAREFUL HERE: @@ -152,7 +148,7 @@ _GLOBAL(tm_reclaim) * to user register state. (FPRs, CCR etc. also!) * Use an sprg and a tm_scratch in the PACA to shuffle. */ - TRECLAIM(R5) /* Cause in r5 */ + TRECLAIM(R4) /* Cause in r4 */ /* ******************** GPRs ******************** */ /* Stash the checkpointed r13 away in the scratch SPR and get the real @@ -243,40 +239,30 @@ _GLOBAL(tm_reclaim) /* ******************** FPR/VR/VSRs ************ - * After reclaiming, capture the checkpointed FPRs/VRs /if used/. - * - * (If VSX used, FP and VMX are implied. Or, we don't need to look - * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.) - * - * We're passed the thread's MSR as the second parameter + * After reclaiming, capture the checkpointed FPRs/VRs. * * We enabled VEC/FP/VSX in the msr above, so we can execute these * instructions! */ - ld r4, STK_PARAM(R4)(r1) /* Second parameter, MSR * */ mr r3, r12 - andis. r0, r4, MSR_VEC@h - beq dont_backup_vec + /* Altivec (VEC/VMX/VR)*/ addi r7, r3, THREAD_CKVRSTATE SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */ mfvscr v0 li r6, VRSTATE_VSCR stvx v0, r7, r6 -dont_backup_vec: + + /* VRSAVE */ mfspr r0, SPRN_VRSAVE std r0, THREAD_CKVRSAVE(r3) - andi. r0, r4, MSR_FP - beq dont_backup_fp - + /* Floating Point (FP) */ addi r7, r3, THREAD_CKFPSTATE SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */ - mffs fr0 stfd fr0,FPSTATE_FPSCR(r7) -dont_backup_fp: /* TM regs, incl TEXASR -- these live in thread_struct. Note they've * been updated by the treclaim, to explain to userland the failure @@ -344,22 +330,19 @@ _GLOBAL(__tm_recheckpoint) */ subi r7, r7, STACK_FRAME_OVERHEAD + /* We need to setup MSR for FP/VMX/VSX register save instructions. */ mfmsr r6 - /* R4 = original MSR to indicate whether thread used FP/Vector etc. */ - - /* Enable FP/vec in MSR if necessary! */ - lis r5, MSR_VEC@h + mr r5, r6 ori r5, r5, MSR_FP - and. r5, r4, r5 - beq restore_gprs /* if neither, skip both */ - +#ifdef CONFIG_ALTIVEC + oris r5, r5, MSR_VEC@h +#endif #ifdef CONFIG_VSX BEGIN_FTR_SECTION - oris r5, r5, MSR_VSX@h + oris r5,r5, MSR_VSX@h END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif - or r5, r6, r5 /* Set MSR.FP+.VSX/.VEC */ - mtmsr r5 + mtmsrd r5 #ifdef CONFIG_ALTIVEC /* @@ -368,28 +351,20 @@ _GLOBAL(__tm_recheckpoint) * thread.fp_state[] version holds the 'live' (transactional) * and will be loaded subsequently by any FPUnavailable trap. */ - andis. r0, r4, MSR_VEC@h - beq dont_restore_vec - addi r8, r3, THREAD_CKVRSTATE li r5, VRSTATE_VSCR lvx v0, r8, r5 mtvscr v0 REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */ -dont_restore_vec: ld r5, THREAD_CKVRSAVE(r3) mtspr SPRN_VRSAVE, r5 #endif - andi. r0, r4, MSR_FP - beq dont_restore_fp - addi r8, r3, THREAD_CKFPSTATE lfd fr0, FPSTATE_FPSCR(r8) MTFSF_L(fr0) REST_32FPRS_VSRS(0, R4, R8) -dont_restore_fp: mtmsr r6 /* FP/Vec off again! */ restore_gprs: diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index b4e2b7165f79..3f3e81852422 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -110,9 +110,9 @@ ftrace_call: /* NIP has not been altered, skip over further checks */ beq 1f - /* Check if there is an active kprobe on us */ + /* Check if there is an active jprobe on us */ subi r3, r14, 4 - bl is_current_kprobe_addr + bl __is_active_jprobe nop /* diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 13c9dcdcba69..f3eb61be0d30 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -699,6 +700,187 @@ void SMIException(struct pt_regs *regs) die("System Management Interrupt", regs, SIGABRT); } +#ifdef CONFIG_VSX +static void p9_hmi_special_emu(struct pt_regs *regs) +{ + unsigned int ra, rb, t, i, sel, instr, rc; + const void __user *addr; + u8 vbuf[16], *vdst; + unsigned long ea, msr, msr_mask; + bool swap; + + if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip)) + return; + + /* + * lxvb16x opcode: 0x7c0006d8 + * lxvd2x opcode: 0x7c000698 + * lxvh8x opcode: 0x7c000658 + * lxvw4x opcode: 0x7c000618 + */ + if ((instr & 0xfc00073e) != 0x7c000618) { + pr_devel("HMI vec emu: not vector CI %i:%s[%d] nip=%016lx" + " instr=%08x\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr); + return; + } + + /* Grab vector registers into the task struct */ + msr = regs->msr; /* Grab msr before we flush the bits */ + flush_vsx_to_thread(current); + enable_kernel_altivec(); + + /* + * Is userspace running with a different endian (this is rare but + * not impossible) + */ + swap = (msr & MSR_LE) != (MSR_KERNEL & MSR_LE); + + /* Decode the instruction */ + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + t = (instr >> 21) & 0x1f; + if (instr & 1) + vdst = (u8 *)¤t->thread.vr_state.vr[t]; + else + vdst = (u8 *)¤t->thread.fp_state.fpr[t][0]; + + /* Grab the vector address */ + ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0); + if (is_32bit_task()) + ea &= 0xfffffffful; + addr = (__force const void __user *)ea; + + /* Check it */ + if (!access_ok(VERIFY_READ, addr, 16)) { + pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, (unsigned long)addr); + return; + } + + /* Read the vector */ + rc = 0; + if ((unsigned long)addr & 0xfUL) + /* unaligned case */ + rc = __copy_from_user_inatomic(vbuf, addr, 16); + else + __get_user_atomic_128_aligned(vbuf, addr, rc); + if (rc) { + pr_devel("HMI vec emu: page fault %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, (unsigned long)addr); + return; + } + + pr_devel("HMI vec emu: emulated vector CI %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, regs->nip, + instr, (unsigned long) addr); + + /* Grab instruction "selector" */ + sel = (instr >> 6) & 3; + + /* + * Check to make sure the facility is actually enabled. This + * could happen if we get a false positive hit. + * + * lxvd2x/lxvw4x always check MSR VSX sel = 0,2 + * lxvh8x/lxvb16x check MSR VSX or VEC depending on VSR used sel = 1,3 + */ + msr_mask = MSR_VSX; + if ((sel & 1) && (instr & 1)) /* lxvh8x & lxvb16x + VSR >= 32 */ + msr_mask = MSR_VEC; + if (!(msr & msr_mask)) { + pr_devel("HMI vec emu: MSR fac clear %i:%s[%d] nip=%016lx" + " instr=%08x msr:%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, msr); + return; + } + + /* Do logging here before we modify sel based on endian */ + switch (sel) { + case 0: /* lxvw4x */ + PPC_WARN_EMULATED(lxvw4x, regs); + break; + case 1: /* lxvh8x */ + PPC_WARN_EMULATED(lxvh8x, regs); + break; + case 2: /* lxvd2x */ + PPC_WARN_EMULATED(lxvd2x, regs); + break; + case 3: /* lxvb16x */ + PPC_WARN_EMULATED(lxvb16x, regs); + break; + } + +#ifdef __LITTLE_ENDIAN__ + /* + * An LE kernel stores the vector in the task struct as an LE + * byte array (effectively swapping both the components and + * the content of the components). Those instructions expect + * the components to remain in ascending address order, so we + * swap them back. + * + * If we are running a BE user space, the expectation is that + * of a simple memcpy, so forcing the emulation to look like + * a lxvb16x should do the trick. + */ + if (swap) + sel = 3; + + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = ((u32 *)vbuf)[3-i]; + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = ((u16 *)vbuf)[7-i]; + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = ((u64 *)vbuf)[1-i]; + break; + case 3: /* lxvb16x */ + for (i = 0; i < 16; i++) + vdst[i] = vbuf[15-i]; + break; + } +#else /* __LITTLE_ENDIAN__ */ + /* On a big endian kernel, a BE userspace only needs a memcpy */ + if (!swap) + sel = 3; + + /* Otherwise, we need to swap the content of the components */ + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = cpu_to_le32(((u32 *)vbuf)[i]); + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = cpu_to_le16(((u16 *)vbuf)[i]); + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = cpu_to_le64(((u64 *)vbuf)[i]); + break; + case 3: /* lxvb16x */ + memcpy(vdst, vbuf, 16); + break; + } +#endif /* !__LITTLE_ENDIAN__ */ + + /* Go to next instruction */ + regs->nip += 4; +} +#endif /* CONFIG_VSX */ + void handle_hmi_exception(struct pt_regs *regs) { struct pt_regs *old_regs; @@ -706,6 +888,21 @@ void handle_hmi_exception(struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_VSX + /* Real mode flagged P9 special emu is needed */ + if (local_paca->hmi_p9_special_emu) { + local_paca->hmi_p9_special_emu = 0; + + /* + * We don't want to take page faults while doing the + * emulation, we just replay the instruction if necessary. + */ + pagefault_disable(); + p9_hmi_special_emu(regs); + pagefault_enable(); + } +#endif /* CONFIG_VSX */ + if (ppc_md.handle_hmi_exception) ppc_md.handle_hmi_exception(regs); @@ -1140,13 +1337,8 @@ void program_check_exception(struct pt_regs *regs) * - A treclaim is attempted when non transactional. * - A tend is illegally attempted. * - writing a TM SPR when transactional. - */ - if (!user_mode(regs) && - report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) { - regs->nip += 4; - goto bail; - } - /* If usermode caused this, it's done something illegal and + * + * If usermode caused this, it's done something illegal and * gets a SIGILL slap on the wrist. We call it an illegal * operand to distinguish from the instruction just being bad * (e.g. executing a 'tend' on a CPU without TM!); it's an @@ -1487,7 +1679,7 @@ void fp_unavailable_tm(struct pt_regs *regs) /* Reclaim didn't save out any FPRs to transact_fprs. */ /* Enable FP for the task: */ - regs->msr |= (MSR_FP | current->thread.fpexc_mode); + current->thread.load_fp = 1; /* This loads and recheckpoints the FP registers from * thread.fpr[]. They will remain in registers after the @@ -1495,15 +1687,7 @@ void fp_unavailable_tm(struct pt_regs *regs) * If VMX is in use, the VRs now hold checkpointed values, * so we don't want to load the VRs from the thread_struct. */ - tm_recheckpoint(¤t->thread, MSR_FP); - - /* If VMX is in use, get the transactional values back */ - if (regs->msr & MSR_VEC) { - msr_check_and_set(MSR_VEC); - load_vr_state(¤t->thread.vr_state); - /* At this point all the VSX state is loaded, so enable it */ - regs->msr |= MSR_VSX; - } + tm_recheckpoint(¤t->thread); } void altivec_unavailable_tm(struct pt_regs *regs) @@ -1516,21 +1700,13 @@ void altivec_unavailable_tm(struct pt_regs *regs) "MSR=%lx\n", regs->nip, regs->msr); tm_reclaim_current(TM_CAUSE_FAC_UNAV); - regs->msr |= MSR_VEC; - tm_recheckpoint(¤t->thread, MSR_VEC); + current->thread.load_vec = 1; + tm_recheckpoint(¤t->thread); current->thread.used_vr = 1; - - if (regs->msr & MSR_FP) { - msr_check_and_set(MSR_FP); - load_fp_state(¤t->thread.fp_state); - regs->msr |= MSR_VSX; - } } void vsx_unavailable_tm(struct pt_regs *regs) { - unsigned long orig_msr = regs->msr; - /* See the comments in fp_unavailable_tm(). This works similarly, * though we're loading both FP and VEC registers in here. * @@ -1544,29 +1720,13 @@ void vsx_unavailable_tm(struct pt_regs *regs) current->thread.used_vsr = 1; - /* If FP and VMX are already loaded, we have all the state we need */ - if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) { - regs->msr |= MSR_VSX; - return; - } - /* This reclaims FP and/or VR regs if they're already enabled */ tm_reclaim_current(TM_CAUSE_FAC_UNAV); - regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode | - MSR_VSX; + current->thread.load_vec = 1; + current->thread.load_fp = 1; - /* This loads & recheckpoints FP and VRs; but we have - * to be sure not to overwrite previously-valid state. - */ - tm_recheckpoint(¤t->thread, regs->msr & ~orig_msr); - - msr_check_and_set(orig_msr & (MSR_FP | MSR_VEC)); - - if (orig_msr & MSR_FP) - load_fp_state(¤t->thread.fp_state); - if (orig_msr & MSR_VEC) - load_vr_state(¤t->thread.vr_state); + tm_recheckpoint(¤t->thread); } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ @@ -1924,6 +2084,10 @@ struct ppc_emulated ppc_emulated = { WARN_EMULATED_SETUP(mfdscr), WARN_EMULATED_SETUP(mtdscr), WARN_EMULATED_SETUP(lq_stq), + WARN_EMULATED_SETUP(lxvw4x), + WARN_EMULATED_SETUP(lxvh8x), + WARN_EMULATED_SETUP(lxvd2x), + WARN_EMULATED_SETUP(lxvb16x), #endif }; diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 1d89163d67f2..87da80ccced1 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -98,8 +98,7 @@ static void wd_lockup_ipi(struct pt_regs *regs) else dump_stack(); - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); + /* Do not panic from here because that can recurse into NMI IPI layer */ } static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) @@ -135,15 +134,18 @@ static void watchdog_smp_panic(int cpu, u64 tb) pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n", cpu, cpumask_pr_args(&wd_smp_cpus_pending)); - /* - * Try to trigger the stuck CPUs. - */ - for_each_cpu(c, &wd_smp_cpus_pending) { - if (c == cpu) - continue; - smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + if (!sysctl_hardlockup_all_cpu_backtrace) { + /* + * Try to trigger the stuck CPUs, unless we are going to + * get a backtrace on all of them anyway. + */ + for_each_cpu(c, &wd_smp_cpus_pending) { + if (c == cpu) + continue; + smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + } + smp_flush_nmi_ipi(1000000); } - smp_flush_nmi_ipi(1000000); /* Take the stuck CPUs out of the watch group */ set_cpumask_stuck(&wd_smp_cpus_pending, tb); @@ -275,9 +277,12 @@ void arch_touch_nmi_watchdog(void) { unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000; int cpu = smp_processor_id(); + u64 tb = get_tb(); - if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks) - watchdog_timer_interrupt(cpu); + if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) { + per_cpu(wd_timer_tb, cpu) = tb; + wd_smp_clear_cpu_pending(cpu, tb); + } } EXPORT_SYMBOL(arch_touch_nmi_watchdog); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8d43cf205d34..40e5857c4b1c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -47,6 +47,7 @@ #include #include +#include #include #include #include @@ -1089,9 +1090,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; - /* HMI is hypervisor interrupt and host has handled it. Resume guest.*/ + /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ case BOOK3S_INTERRUPT_HMI: case BOOK3S_INTERRUPT_PERFMON: + case BOOK3S_INTERRUPT_SYSTEM_RESET: r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: @@ -2117,15 +2119,6 @@ static int kvmppc_grab_hwthread(int cpu) struct paca_struct *tpaca; long timeout = 10000; - /* - * ISA v3.0 idle routines do not set hwthread_state or test - * hwthread_req, so they can not grab idle threads. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - WARN(1, "KVM: can not control sibling threads\n"); - return -EBUSY; - } - tpaca = &paca[cpu]; /* Ensure the thread won't go into the kernel if it wakes */ @@ -2160,12 +2153,10 @@ static void kvmppc_release_hwthread(int cpu) struct paca_struct *tpaca; tpaca = &paca[cpu]; + tpaca->kvm_hstate.hwthread_req = 0; tpaca->kvm_hstate.kvm_vcpu = NULL; tpaca->kvm_hstate.kvm_vcore = NULL; tpaca->kvm_hstate.kvm_split_mode = NULL; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - tpaca->kvm_hstate.hwthread_req = 0; - } static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) @@ -2615,6 +2606,9 @@ static void set_irq_happened(int trap) case BOOK3S_INTERRUPT_HMI: local_paca->irq_happened |= PACA_IRQ_HMI; break; + case BOOK3S_INTERRUPT_SYSTEM_RESET: + replay_system_reset(); + break; } } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 42639fba89e8..68bf0f14a962 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -149,11 +149,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) subf r4, r4, r3 mtspr SPRN_DEC, r4 -BEGIN_FTR_SECTION /* hwthread_req may have got set by cede or no vcpu, so clear it */ li r0, 0 stb r0, HSTATE_HWTHREAD_REQ(r13) -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* * For external interrupts we need to call the Linux @@ -316,7 +314,6 @@ kvm_novcpu_exit: * Relocation is off and most register values are lost. * r13 points to the PACA. * r3 contains the SRR1 wakeup value, SRR1 is trashed. - * This is not used by ISAv3.0B processors. */ .globl kvm_start_guest kvm_start_guest: @@ -435,9 +432,6 @@ kvm_secondary_got_guest: * While waiting we also need to check if we get given a vcpu to run. */ kvm_no_guest: -BEGIN_FTR_SECTION - twi 31,0,0 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r3, HSTATE_HWTHREAD_REQ(r13) cmpwi r3, 0 bne 53f @@ -2546,10 +2540,8 @@ kvm_do_nap: clrrdi r0, r0, 1 mtspr SPRN_CTRLT, r0 -BEGIN_FTR_SECTION li r0,1 stb r0,HSTATE_HWTHREAD_REQ(r13) -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 BEGIN_FTR_SECTION diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index ee279c7f4802..1abe6eb51335 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -644,7 +644,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_PPC_HTM: - r = cpu_has_feature(CPU_FTR_TM_COMP) && hv_enabled; + r = hv_enabled && + (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM_COMP); break; default: r = 0; diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index c66c3626a216..3c29c9009bbf 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -24,7 +24,7 @@ endif obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \ - memcpy_64.o memcmp_64.o + memcpy_64.o memcmp_64.o pmem.o obj64-$(CONFIG_SMP) += locks.o obj64-$(CONFIG_ALTIVEC) += vmx-helper.o diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c new file mode 100644 index 000000000000..53c018762e1c --- /dev/null +++ b/arch/powerpc/lib/pmem.c @@ -0,0 +1,67 @@ +/* + * Copyright(c) 2017 IBM Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include + +#include + +/* + * CONFIG_ARCH_HAS_PMEM_API symbols + */ +void arch_wb_cache_pmem(void *addr, size_t size) +{ + unsigned long start = (unsigned long) addr; + flush_inval_dcache_range(start, start + size); +} +EXPORT_SYMBOL(arch_wb_cache_pmem); + +void arch_invalidate_pmem(void *addr, size_t size) +{ + unsigned long start = (unsigned long) addr; + flush_inval_dcache_range(start, start + size); +} +EXPORT_SYMBOL(arch_invalidate_pmem); + +/* + * CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE symbols + */ +long __copy_from_user_flushcache(void *dest, const void __user *src, + unsigned size) +{ + unsigned long copied, start = (unsigned long) dest; + + copied = __copy_from_user(dest, src, size); + flush_inval_dcache_range(start, start + size); + + return copied; +} + +void *memcpy_flushcache(void *dest, const void *src, size_t size) +{ + unsigned long start = (unsigned long) dest; + + memcpy(dest, src, size); + flush_inval_dcache_range(start, start + size); + + return dest; +} +EXPORT_SYMBOL(memcpy_flushcache); + +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len) +{ + memcpy_flushcache(to, page_to_virt(page) + offset, len); +} +EXPORT_SYMBOL(memcpy_page_flushcache); diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index f208f560aecd..70274b7b4773 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -31,6 +31,8 @@ extern char system_call_common[]; #define XER_SO 0x80000000U #define XER_OV 0x40000000U #define XER_CA 0x20000000U +#define XER_OV32 0x00080000U +#define XER_CA32 0x00040000U #ifdef CONFIG_PPC_FPU /* @@ -962,6 +964,16 @@ static nokprobe_inline void set_cr0(const struct pt_regs *regs, op->ccval |= 0x20000000; } +static nokprobe_inline void set_ca32(struct instruction_op *op, bool val) +{ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (val) + op->xerval |= XER_CA32; + else + op->xerval &= ~XER_CA32; + } +} + static nokprobe_inline void add_with_carry(const struct pt_regs *regs, struct instruction_op *op, int rd, unsigned long val1, unsigned long val2, @@ -985,6 +997,9 @@ static nokprobe_inline void add_with_carry(const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + + set_ca32(op, (unsigned int)val < (unsigned int)val1 || + (carry_in && (unsigned int)val == (unsigned int)val1)); } static nokprobe_inline void do_cmp_signed(const struct pt_regs *regs, @@ -1791,6 +1806,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; case 824: /* srawi */ @@ -1803,6 +1819,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; #ifdef __powerpc64__ @@ -1832,6 +1849,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; case 826: /* sradi with sh_5 = 0 */ @@ -1845,6 +1863,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; #endif /* __powerpc64__ */ @@ -2698,6 +2717,7 @@ void emulate_update_regs(struct pt_regs *regs, struct instruction_op *op) } regs->nip = next_pc; } +NOKPROBE_SYMBOL(emulate_update_regs); /* * Emulate a previously-analysed load or store instruction. diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index a0c327d544d1..76a6b057d454 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -15,11 +15,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o -ifeq ($(CONFIG_PPC_STD_MMU_64),y) +ifeq ($(CONFIG_PPC_BOOK3S_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o endif @@ -32,7 +32,7 @@ obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifeq ($(CONFIG_HUGETLB_PAGE),y) -obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c index 5c4c93dcff19..14cfb11b09d0 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/dump_hashpagetable.c @@ -500,7 +500,7 @@ static void populate_markers(void) address_markers[6].start_address = PHB_IO_END; address_markers[7].start_address = IOREMAP_BASE; address_markers[8].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 address_markers[9].start_address = H_VMEMMAP_BASE; #else address_markers[9].start_address = VMEMMAP_BASE; diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index c9282d27b203..c2e7dea59490 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -112,7 +112,7 @@ struct flag_info { static const struct flag_info flag_array[] = { { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_PRIVILEGED, .val = 0, #else @@ -147,7 +147,7 @@ static const struct flag_info flag_array[] = { .set = "present", .clear = " ", }, { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 .mask = H_PAGE_HASHPTE, .val = H_PAGE_HASHPTE, #else @@ -157,7 +157,7 @@ static const struct flag_info flag_array[] = { .set = "hpte", .clear = " ", }, { -#ifndef CONFIG_PPC_STD_MMU_64 +#ifndef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_GUARDED, .val = _PAGE_GUARDED, .set = "guarded", @@ -174,7 +174,7 @@ static const struct flag_info flag_array[] = { .set = "accessed", .clear = " ", }, { -#ifndef CONFIG_PPC_STD_MMU_64 +#ifndef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_WRITETHRU, .val = _PAGE_WRITETHRU, .set = "write through", @@ -450,7 +450,7 @@ static void populate_markers(void) address_markers[i++].start_address = PHB_IO_END; address_markers[i++].start_address = IOREMAP_BASE; address_markers[i++].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 address_markers[i++].start_address = H_VMEMMAP_BASE; #else address_markers[i++].start_address = VMEMMAP_BASE; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 67ec2e927253..655a5a9a183d 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -21,6 +21,7 @@ #undef DEBUG #undef DEBUG_LOW +#define pr_fmt(fmt) "hash-mmu: " fmt #include #include #include diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c index 558e9d3891bf..2486bee0f93e 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -49,17 +49,22 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + int fixed = (flags & MAP_FIXED); + unsigned long high_limit; struct vm_unmapped_area_info info; - if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE)) - mm->context.addr_limit = TASK_SIZE; + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; if (len & ~huge_page_mask(h)) return -EINVAL; - if (len > mm->task_size) + if (len > high_limit) return -ENOMEM; - if (flags & MAP_FIXED) { + if (fixed) { + if (addr > high_limit - len) + return -ENOMEM; if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; @@ -68,7 +73,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (mm->task_size - len >= addr && + if (high_limit - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -79,12 +84,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = current->mm->mmap_base; + info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; - if (addr > DEFAULT_MAP_WINDOW) - info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW; - return vm_unmapped_area(&info); } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 588a521966ec..a07722531b32 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -68,11 +68,11 @@ #include "mmu_decl.h" -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #if H_PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ phys_addr_t memstart_addr = ~0; EXPORT_SYMBOL_GPL(memstart_addr); @@ -367,11 +367,20 @@ EXPORT_SYMBOL_GPL(realmode_pfn_to_page); #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -#ifdef CONFIG_PPC_STD_MMU_64 -static bool disable_radix; +#ifdef CONFIG_PPC_BOOK3S_64 +static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); + static int __init parse_disable_radix(char *p) { - disable_radix = true; + bool val; + + if (strlen(p) == 0) + val = true; + else if (kstrtobool(p, &val)) + return -EINVAL; + + disable_radix = val; + return 0; } early_param("disable_radix", parse_disable_radix); @@ -444,4 +453,4 @@ void __init mmu_early_init_devtree(void) else hash__early_init_devtree(); } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 5d78b193fec4..d503f344e476 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -106,22 +106,27 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + int fixed = (flags & MAP_FIXED); + unsigned long high_limit; struct vm_unmapped_area_info info; - if (unlikely(addr > mm->context.addr_limit && - mm->context.addr_limit != TASK_SIZE)) - mm->context.addr_limit = TASK_SIZE; + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; - if (len > mm->task_size - mmap_min_addr) + if (len > high_limit) return -ENOMEM; - if (flags & MAP_FIXED) + if (fixed) { + if (addr > high_limit - len) + return -ENOMEM; return addr; + } if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (mm->task_size - len >= addr && addr >= mmap_min_addr && + if (high_limit - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -129,13 +134,9 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; + info.high_limit = high_limit; info.align_mask = 0; - if (unlikely(addr > DEFAULT_MAP_WINDOW)) - info.high_limit = mm->context.addr_limit; - else - info.high_limit = DEFAULT_MAP_WINDOW; - return vm_unmapped_area(&info); } @@ -149,37 +150,37 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, struct vm_area_struct *vma; struct mm_struct *mm = current->mm; unsigned long addr = addr0; + int fixed = (flags & MAP_FIXED); + unsigned long high_limit; struct vm_unmapped_area_info info; - if (unlikely(addr > mm->context.addr_limit && - mm->context.addr_limit != TASK_SIZE)) - mm->context.addr_limit = TASK_SIZE; + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; - /* requested length too big for entire address space */ - if (len > mm->task_size - mmap_min_addr) + if (len > high_limit) return -ENOMEM; - if (flags & MAP_FIXED) + if (fixed) { + if (addr > high_limit - len) + return -ENOMEM; return addr; + } - /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (mm->task_size - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) + if (high_limit - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base; + info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); info.align_mask = 0; - if (addr > DEFAULT_MAP_WINDOW) - info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW; - addr = vm_unmapped_area(&info); if (!(addr & ~PAGE_MASK)) return addr; diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index 0f613bc63c50..d60a62bf4fc7 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -34,15 +34,6 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, struct mm_struct *mm) { } #endif -#ifdef CONFIG_PPC_BOOK3S_64 -static inline void inc_mm_active_cpus(struct mm_struct *mm) -{ - atomic_inc(&mm->context.active_cpus); -} -#else -static inline void inc_mm_active_cpus(struct mm_struct *mm) { } -#endif - void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index a7e998158f37..59c0766ae4e0 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -93,11 +93,11 @@ static int hash__init_new_context(struct mm_struct *mm) return index; /* - * We do switch_slb() early in fork, even before we setup the - * mm->context.addr_limit. Default to max task size so that we copy the - * default values to paca which will help us to handle slb miss early. + * In the case of exec, use the default limit, + * otherwise inherit it from the mm we are duplicating. */ - mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64; + if (!mm->context.slb_addr_limit) + mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; /* * The old code would re-promote on fork, we don't do that when using @@ -216,19 +216,34 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif + if (radix_enabled()) + WARN_ON(process_tb[mm->context.id].prtb0 != 0); + else + subpage_prot_free(mm); + destroy_pagetable_page(mm); + __destroy_context(mm->context.id); + mm->context.id = MMU_NO_CONTEXT; +} + +void arch_exit_mmap(struct mm_struct *mm) +{ if (radix_enabled()) { /* * Radix doesn't have a valid bit in the process table * entries. However we know that at least P9 implementation * will avoid caching an entry with an invalid RTS field, * and 0 is invalid. So this will do. + * + * This runs before the "fullmm" tlb flush in exit_mmap, + * which does a RIC=2 tlbie to clear the process table + * entry. See the "fullmm" comments in tlb-radix.c. + * + * No barrier required here after the store because + * this process will do the invalidate, which starts with + * ptesync. */ process_tb[mm->context.id].prtb0 = 0; - } else - subpage_prot_free(mm); - destroy_pagetable_page(mm); - __destroy_context(mm->context.id); - mm->context.id = MMU_NO_CONTEXT; + } } #ifdef CONFIG_PPC_RADIX_MMU diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 73016451f330..adb6364f4091 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1148,11 +1148,33 @@ struct topology_update_data { int new_nid; }; +#define TOPOLOGY_DEF_TIMER_SECS 60 + static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS]; static cpumask_t cpu_associativity_changes_mask; static int vphn_enabled; static int prrn_enabled; static void reset_topology_timer(void); +static int topology_timer_secs = 1; +static int topology_inited; +static int topology_update_needed; + +/* + * Change polling interval for associativity changes. + */ +int timed_topology_update(int nsecs) +{ + if (vphn_enabled) { + if (nsecs > 0) + topology_timer_secs = nsecs; + else + topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS; + + reset_topology_timer(); + } + + return 0; +} /* * Store the current values of the associativity change counters in the @@ -1246,6 +1268,11 @@ static long vphn_get_associativity(unsigned long cpu, "hcall_vphn() experienced a hardware fault " "preventing VPHN. Disabling polling...\n"); stop_topology_update(); + break; + case H_SUCCESS: + dbg("VPHN hcall succeeded. Reset polling...\n"); + timed_topology_update(0); + break; } return rc; @@ -1323,8 +1350,11 @@ int numa_update_cpu_topology(bool cpus_locked) struct device *dev; int weight, new_nid, i = 0; - if (!prrn_enabled && !vphn_enabled) + if (!prrn_enabled && !vphn_enabled) { + if (!topology_inited) + topology_update_needed = 1; return 0; + } weight = cpumask_weight(&cpu_associativity_changes_mask); if (!weight) @@ -1363,22 +1393,30 @@ int numa_update_cpu_topology(bool cpus_locked) cpumask_andnot(&cpu_associativity_changes_mask, &cpu_associativity_changes_mask, cpu_sibling_mask(cpu)); + dbg("Assoc chg gives same node %d for cpu%d\n", + new_nid, cpu); cpu = cpu_last_thread_sibling(cpu); continue; } for_each_cpu(sibling, cpu_sibling_mask(cpu)) { ud = &updates[i++]; + ud->next = &updates[i]; ud->cpu = sibling; ud->new_nid = new_nid; ud->old_nid = numa_cpu_lookup_table[sibling]; cpumask_set_cpu(sibling, &updated_cpus); - if (i < weight) - ud->next = &updates[i]; } cpu = cpu_last_thread_sibling(cpu); } + /* + * Prevent processing of 'updates' from overflowing array + * where last entry filled in a 'next' pointer. + */ + if (i) + updates[i-1].next = NULL; + pr_debug("Topology update for the following CPUs:\n"); if (cpumask_weight(&updated_cpus)) { for (ud = &updates[0]; ud; ud = ud->next) { @@ -1433,6 +1471,7 @@ int numa_update_cpu_topology(bool cpus_locked) out: kfree(updates); + topology_update_needed = 0; return changed; } @@ -1466,7 +1505,7 @@ static struct timer_list topology_timer; static void reset_topology_timer(void) { - mod_timer(&topology_timer, jiffies + 60 * HZ); + mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ); } #ifdef CONFIG_SMP @@ -1515,15 +1554,14 @@ int start_topology_update(void) if (firmware_has_feature(FW_FEATURE_PRRN)) { if (!prrn_enabled) { prrn_enabled = 1; - vphn_enabled = 0; #ifdef CONFIG_SMP rc = of_reconfig_notifier_register(&dt_update_nb); #endif } - } else if (firmware_has_feature(FW_FEATURE_VPHN) && + } + if (firmware_has_feature(FW_FEATURE_VPHN) && lppaca_shared_proc(get_lppaca())) { if (!vphn_enabled) { - prrn_enabled = 0; vphn_enabled = 1; setup_cpu_associativity_change_counters(); timer_setup(&topology_timer, topology_timer_fn, @@ -1547,7 +1585,8 @@ int stop_topology_update(void) #ifdef CONFIG_SMP rc = of_reconfig_notifier_unregister(&dt_update_nb); #endif - } else if (vphn_enabled) { + } + if (vphn_enabled) { vphn_enabled = 0; rc = del_timer_sync(&topology_timer); } @@ -1610,9 +1649,17 @@ static int topology_update_init(void) if (topology_updates_enabled) start_topology_update(); + if (vphn_enabled) + topology_schedule_update(); + if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops)) return -ENOMEM; + topology_inited = 1; + if (topology_update_needed) + bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask), + nr_cpumask_bits); + return 0; } device_initcall(topology_update_init); diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 39c252b54d16..cfbbee941a76 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -169,6 +169,16 @@ void radix__mark_rodata_ro(void) { unsigned long start, end; + /* + * mark_rodata_ro() will mark itself as !writable at some point. + * Due to DD1 workaround in radix__pte_update(), we'll end up with + * an invalid pte and the system will crash quite severly. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n"); + return; + } + start = (unsigned long)_stext; end = (unsigned long)__init_begin; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 1ec3aee43624..813ea22c3e00 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -57,7 +57,7 @@ #include "mmu_decl.h" -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) #error TASK_SIZE_USER64 exceeds user VSID range #endif diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 906a86fe457b..2cf5ef3fc50d 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -167,7 +167,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) /* * user space make sure we are within the allowed limit */ - ld r11,PACA_ADDR_LIMIT(r13) + ld r11,PACA_SLB_ADDR_LIMIT(r13) cmpld r3,r11 bge- 8f @@ -309,10 +309,6 @@ slb_compare_rr_to_size: srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ rldimi r10,r9,ESID_BITS_1T,0 ASM_VSID_SCRAMBLE(r10,r9,r11,1T) - /* - * bits above VSID_BITS_1T need to be ignored from r10 - * also combine VSID and flags - */ li r10,MMU_SEGSIZE_1T rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 45f6740dd407..564fff06f5c1 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *vma; - if ((mm->task_size - len) < addr) + if ((mm->context.slb_addr_limit - len) < addr) return 0; vma = find_vma(mm, addr); return (!vma || (addr + len) <= vm_start_gap(vma)); @@ -133,10 +133,10 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret) if (!slice_low_has_vma(mm, i)) ret->low_slices |= 1u << i; - if (mm->task_size <= SLICE_LOW_TOP) + if (mm->context.slb_addr_limit <= SLICE_LOW_TOP) return; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) if (!slice_high_has_vma(mm, i)) __set_bit(i, ret->high_slices); } @@ -157,7 +157,7 @@ static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_ma ret->low_slices |= 1u << i; hpsizes = mm->context.high_slices_psize; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) { + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) { mask_index = i & 0x1; index = i >> 1; if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) @@ -169,7 +169,7 @@ static int slice_check_fit(struct mm_struct *mm, struct slice_mask mask, struct slice_mask available) { DECLARE_BITMAP(result, SLICE_NUM_HIGH); - unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit); + unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); bitmap_and(result, mask.high_slices, available.high_slices, slice_count); @@ -219,7 +219,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz mm->context.low_slices_psize = lpsizes; hpsizes = mm->context.high_slices_psize; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) { + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) { mask_index = i & 0x1; index = i >> 1; if (test_bit(i, mask.high_slices)) @@ -329,8 +329,8 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * Only for that request for which high_limit is above * DEFAULT_MAP_WINDOW we should apply this. */ - if (high_limit > DEFAULT_MAP_WINDOW) - addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW; + if (high_limit > DEFAULT_MAP_WINDOW) + addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW; while (addr > PAGE_SIZE) { info.high_limit = addr; @@ -412,25 +412,31 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, struct slice_mask compat_mask; int fixed = (flags & MAP_FIXED); int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long page_size = 1UL << pshift; struct mm_struct *mm = current->mm; unsigned long newaddr; unsigned long high_limit; - /* - * Check if we need to expland slice area. - */ - if (unlikely(addr > mm->context.addr_limit && - mm->context.addr_limit != TASK_SIZE)) { - mm->context.addr_limit = TASK_SIZE; + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; + + if (len > high_limit) + return -ENOMEM; + if (len & (page_size - 1)) + return -EINVAL; + if (fixed) { + if (addr & (page_size - 1)) + return -EINVAL; + if (addr > high_limit - len) + return -ENOMEM; + } + + if (high_limit > mm->context.slb_addr_limit) { + mm->context.slb_addr_limit = high_limit; on_each_cpu(slice_flush_segments, mm, 1); } - /* - * This mmap request can allocate upt to 512TB - */ - if (addr > DEFAULT_MAP_WINDOW) - high_limit = mm->context.addr_limit; - else - high_limit = DEFAULT_MAP_WINDOW; + /* * init different masks */ @@ -446,27 +452,19 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Sanity checks */ BUG_ON(mm->task_size == 0); + BUG_ON(mm->context.slb_addr_limit == 0); VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", addr, len, flags, topdown); - if (len > mm->task_size) - return -ENOMEM; - if (len & ((1ul << pshift) - 1)) - return -EINVAL; - if (fixed && (addr & ((1ul << pshift) - 1))) - return -EINVAL; - if (fixed && addr > (mm->task_size - len)) - return -ENOMEM; - /* If hint, make sure it matches our alignment restrictions */ if (!fixed && addr) { - addr = _ALIGN_UP(addr, 1ul << pshift); + addr = _ALIGN_UP(addr, page_size); slice_dbg(" aligned addr=%lx\n", addr); /* Ignore hint if it's too large or overlaps a VMA */ - if (addr > mm->task_size - len || + if (addr > high_limit - len || !slice_area_is_free(mm, addr, len)) addr = 0; } diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index d304028641a2..884f4b705b57 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -39,6 +39,20 @@ static inline void __tlbiel_pid(unsigned long pid, int set, trace_tlbie(0, 1, rb, rs, ric, prs, r); } +static inline void __tlbie_pid(unsigned long pid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + /* * We use 128 set in radix mode and 256 set in hpt mode. */ @@ -70,22 +84,13 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) static inline void _tlbie_pid(unsigned long pid, unsigned long ric) { - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(53); /* IS = 1 */ - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + __tlbie_pid(pid, ric); asm volatile("eieio; tlbsync; ptesync": : :"memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static inline void _tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) +static inline void __tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -95,14 +100,44 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid, prs = 1; /* process scoped */ r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - asm volatile("ptesync": : :"memory"); trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void _tlbie_va(unsigned long va, unsigned long pid, +static inline void __tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); +} + +static inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbiel_va(va, pid, ap, ric); + asm volatile("ptesync": : :"memory"); +} + +static inline void _tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + __tlbiel_va_range(start, end, pid, page_size, psize); + asm volatile("ptesync": : :"memory"); +} + +static inline void __tlbie_va(unsigned long va, unsigned long pid, unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -113,13 +148,43 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid, prs = 1; /* process scoped */ r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); trace_tlbie(0, 0, rb, rs, ric, prs, r); } +static inline void __tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); +} + +static inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, ric); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbie_pid(pid, RIC_FLUSH_PWC); + __tlbie_va_range(start, end, pid, page_size, psize); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + /* * Base TLB flushing operations: * @@ -144,7 +209,7 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm) EXPORT_SYMBOL(radix__local_flush_tlb_mm); #ifndef CONFIG_SMP -static void radix__local_flush_all_mm(struct mm_struct *mm) +void radix__local_flush_all_mm(struct mm_struct *mm) { unsigned long pid; @@ -154,18 +219,18 @@ static void radix__local_flush_all_mm(struct mm_struct *mm) _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } +EXPORT_SYMBOL(radix__local_flush_all_mm); #endif /* CONFIG_SMP */ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { unsigned long pid; - unsigned long ap = mmu_get_ap(psize); preempt_disable(); - pid = mm ? mm->context.id : 0; + pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } @@ -173,11 +238,10 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd { #ifdef CONFIG_HUGETLB_PAGE /* need the return fix for nohash.c */ - if (vma && is_vm_hugetlb_page(vma)) - return __local_flush_hugetlb_page(vma, vmaddr); + if (is_vm_hugetlb_page(vma)) + return radix__local_flush_hugetlb_page(vma, vmaddr); #endif - radix__local_flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr, - mmu_virtual_psize); + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); } EXPORT_SYMBOL(radix__local_flush_tlb_page); @@ -186,36 +250,35 @@ void radix__flush_tlb_mm(struct mm_struct *mm) { unsigned long pid; - preempt_disable(); pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto no_context; + return; + preempt_disable(); if (!mm_is_thread_local(mm)) _tlbie_pid(pid, RIC_FLUSH_TLB); else _tlbiel_pid(pid, RIC_FLUSH_TLB); -no_context: preempt_enable(); } EXPORT_SYMBOL(radix__flush_tlb_mm); -static void radix__flush_all_mm(struct mm_struct *mm) +void radix__flush_all_mm(struct mm_struct *mm) { unsigned long pid; - preempt_disable(); pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto no_context; + return; + preempt_disable(); if (!mm_is_thread_local(mm)) _tlbie_pid(pid, RIC_FLUSH_ALL); else _tlbiel_pid(pid, RIC_FLUSH_ALL); -no_context: preempt_enable(); } +EXPORT_SYMBOL(radix__flush_all_mm); void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) { @@ -227,28 +290,26 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { unsigned long pid; - unsigned long ap = mmu_get_ap(psize); + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm ? mm->context.id : 0; - if (unlikely(pid == MMU_NO_CONTEXT)) - goto bail; if (!mm_is_thread_local(mm)) - _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB); + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); else - _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); -bail: + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { #ifdef CONFIG_HUGETLB_PAGE - if (vma && is_vm_hugetlb_page(vma)) - return flush_hugetlb_page(vma, vmaddr); + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_page(vma, vmaddr); #endif - radix__flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr, - mmu_virtual_psize); + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); } EXPORT_SYMBOL(radix__flush_tlb_page); @@ -262,17 +323,86 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); +#define TLB_FLUSH_ALL -1UL + /* - * Currently, for range flushing, we just do a full mm flush. Because - * we use this in code path where we don' track the page size. + * Number of pages above which we invalidate the entire PID rather than + * flush individual pages, for local and global flushes respectively. + * + * tlbie goes out to the interconnect and individual ops are more costly. + * It also does not iterate over sets like the local tlbiel variant when + * invalidating a full PID, so it has a far lower threshold to change from + * individual page flushes to full-pid flushes. */ +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; + void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; + unsigned long pid; + unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool local, full; - radix__flush_tlb_mm(mm); +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_tlb_range(vma, start, end); +#endif + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + if (mm_is_thread_local(mm)) { + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } else { + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } + + if (full) { + if (local) + _tlbiel_pid(pid, RIC_FLUSH_TLB); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + bool hflush = false; + unsigned long hstart, hend; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT; + hend = end >> HPAGE_PMD_SHIFT; + if (hstart < hend) { + hstart <<= HPAGE_PMD_SHIFT; + hend <<= HPAGE_PMD_SHIFT; + hflush = true; + } +#endif + + asm volatile("ptesync": : :"memory"); + if (local) { + __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); + if (hflush) + __tlbiel_va_range(hstart, hend, pid, + HPAGE_PMD_SIZE, MMU_PAGE_2M); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); + if (hflush) + __tlbie_va_range(hstart, hend, pid, + HPAGE_PMD_SIZE, MMU_PAGE_2M); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + } + preempt_enable(); } EXPORT_SYMBOL(radix__flush_tlb_range); @@ -291,101 +421,118 @@ static int radix_get_mmu_psize(int page_size) return psize; } +static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize); + void radix__tlb_flush(struct mmu_gather *tlb) { int psize = 0; struct mm_struct *mm = tlb->mm; int page_size = tlb->page_size; - psize = radix_get_mmu_psize(page_size); /* * if page size is not something we understand, do a full mm flush + * + * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush + * that flushes the process table entry cache upon process teardown. + * See the comment for radix in arch_exit_mmap(). */ - if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) - radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); - else if (tlb->need_flush_all) { - tlb->need_flush_all = 0; + if (tlb->fullmm) { radix__flush_all_mm(mm); - } else - radix__flush_tlb_mm(mm); + } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { + if (!tlb->need_flush_all) + radix__flush_tlb_mm(mm); + else + radix__flush_all_mm(mm); + } else { + unsigned long start = tlb->start; + unsigned long end = tlb->end; + + if (!tlb->need_flush_all) + radix__flush_tlb_range_psize(mm, start, end, psize); + else + radix__flush_tlb_pwc_range_psize(mm, start, end, psize); + } + tlb->need_flush_all = 0; } -#define TLB_FLUSH_ALL -1UL -/* - * Number of pages above which we will do a bcast tlbie. Just a - * number at this point copied from x86 - */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, + unsigned long start, unsigned long end, + int psize, bool also_pwc) +{ + unsigned long pid; + unsigned int page_shift = mmu_psize_defs[psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool local, full; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + if (mm_is_thread_local(mm)) { + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } else { + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } + + if (full) { + if (local) + _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + else + _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB); + } else { + if (local) + _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); + else + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); + } + preempt_enable(); +} void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize) { - unsigned long pid; - unsigned long addr; - int local = mm_is_thread_local(mm); - unsigned long ap = mmu_get_ap(psize); - unsigned long page_size = 1UL << mmu_psize_defs[psize].shift; + return __radix__flush_tlb_range_psize(mm, start, end, psize, false); +} - - preempt_disable(); - pid = mm ? mm->context.id : 0; - if (unlikely(pid == MMU_NO_CONTEXT)) - goto err_out; - - if (end == TLB_FLUSH_ALL || - (end - start) > tlb_single_page_flush_ceiling * page_size) { - if (local) - _tlbiel_pid(pid, RIC_FLUSH_TLB); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); - goto err_out; - } - for (addr = start; addr < end; addr += page_size) { - - if (local) - _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); - else - _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); - } -err_out: - preempt_enable(); +static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + __radix__flush_tlb_range_psize(mm, start, end, psize, true); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) { - int local = mm_is_thread_local(mm); - unsigned long ap = mmu_get_ap(mmu_virtual_psize); unsigned long pid, end; - - pid = mm ? mm->context.id : 0; - preempt_disable(); + pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto no_context; + return; /* 4k page size, just blow the world */ if (PAGE_SIZE == 0x1000) { radix__flush_all_mm(mm); - preempt_enable(); return; } - /* Otherwise first do the PWC */ - if (local) - _tlbiel_pid(pid, RIC_FLUSH_PWC); - else - _tlbie_pid(pid, RIC_FLUSH_PWC); - - /* Then iterate the pages */ end = addr + HPAGE_PMD_SIZE; - for (; addr < end; addr += PAGE_SIZE) { - if (local) - _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); - else - _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + + /* Otherwise first do the PWC, then iterate the pages. */ + preempt_disable(); + + if (mm_is_thread_local(mm)) { + _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + } else { + _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); } -no_context: + preempt_enable(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h index 62fa7589db2b..8bdef7ed28a8 100644 --- a/arch/powerpc/net/bpf_jit64.h +++ b/arch/powerpc/net/bpf_jit64.h @@ -23,7 +23,7 @@ * [ nv gpr save area ] 8*8 | * [ tail_call_cnt ] 8 | * [ local_tmp_var ] 8 | - * fp (r31) --> [ ebpf stack space ] 512 | + * fp (r31) --> [ ebpf stack space ] upto 512 | * [ frame header ] 32/112 | * sp (r1) ---> [ stack pointer ] -------------- */ @@ -32,8 +32,8 @@ #define BPF_PPC_STACK_SAVE (8*8) /* for bpf JIT code internal usage */ #define BPF_PPC_STACK_LOCALS 16 -/* Ensure this is quadword aligned */ -#define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + MAX_BPF_STACK + \ +/* stack frame excluding BPF stack, ensure this is quadword aligned */ +#define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + \ BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE) #ifndef __ASSEMBLY__ @@ -103,6 +103,7 @@ struct codegen_context { */ unsigned int seen; unsigned int idx; + unsigned int stack_size; }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index a66e64b0b251..46d74e81aff1 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -69,7 +69,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) static int bpf_jit_stack_local(struct codegen_context *ctx) { if (bpf_has_stack_frame(ctx)) - return STACK_FRAME_MIN_SIZE + MAX_BPF_STACK; + return STACK_FRAME_MIN_SIZE + ctx->stack_size; else return -(BPF_PPC_STACK_SAVE + 16); } @@ -82,8 +82,9 @@ static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx) static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) { if (reg >= BPF_PPC_NVR_MIN && reg < 32) - return (bpf_has_stack_frame(ctx) ? BPF_PPC_STACKFRAME : 0) - - (8 * (32 - reg)); + return (bpf_has_stack_frame(ctx) ? + (BPF_PPC_STACKFRAME + ctx->stack_size) : 0) + - (8 * (32 - reg)); pr_err("BPF JIT is asking about unknown registers"); BUG(); @@ -134,7 +135,7 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) PPC_BPF_STL(0, 1, PPC_LR_STKOFF); } - PPC_BPF_STLU(1, 1, -BPF_PPC_STACKFRAME); + PPC_BPF_STLU(1, 1, -(BPF_PPC_STACKFRAME + ctx->stack_size)); } /* @@ -161,7 +162,7 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) /* Setup frame pointer to point to the bpf stack area */ if (bpf_is_seen_register(ctx, BPF_REG_FP)) PPC_ADDI(b2p[BPF_REG_FP], 1, - STACK_FRAME_MIN_SIZE + MAX_BPF_STACK); + STACK_FRAME_MIN_SIZE + ctx->stack_size); } static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx) @@ -183,7 +184,7 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx /* Tear down our stack frame */ if (bpf_has_stack_frame(ctx)) { - PPC_ADDI(1, 1, BPF_PPC_STACKFRAME); + PPC_ADDI(1, 1, BPF_PPC_STACKFRAME + ctx->stack_size); if (ctx->seen & SEEN_FUNC) { PPC_BPF_LL(0, 1, PPC_LR_STKOFF); PPC_MTLR(0); @@ -1013,6 +1014,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) memset(&cgctx, 0, sizeof(struct codegen_context)); + /* Make sure that the stack is quadword aligned. */ + cgctx.stack_size = round_up(fp->aux->stack_depth, 16); + /* Scouting faux-generate pass 0 */ if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) { /* We hit something illegal or unsupported. */ diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c index c82497a31c54..264b6ab11978 100644 --- a/arch/powerpc/oprofile/op_model_cell.c +++ b/arch/powerpc/oprofile/op_model_cell.c @@ -555,9 +555,7 @@ static void cell_virtual_cntr(unsigned long data) static void start_virt_cntrs(void) { - init_timer(&timer_virt_cntr); - timer_virt_cntr.function = cell_virtual_cntr; - timer_virt_cntr.data = 0UL; + setup_timer(&timer_virt_cntr, cell_virtual_cntr, 0UL); timer_virt_cntr.expires = jiffies + HZ / 10; add_timer(&timer_virt_cntr); } @@ -679,9 +677,7 @@ static void spu_evnt_swap(unsigned long data) static void start_spu_event_swap(void) { - init_timer(&timer_spu_event_swap); - timer_spu_event_swap.function = spu_evnt_swap; - timer_spu_event_swap.data = 0UL; + setup_timer(&timer_spu_event_swap, spu_evnt_swap, 0UL); timer_spu_event_swap.expires = jiffies + HZ / 25; add_timer(&timer_spu_event_swap); } diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 9c88b82f6229..72238eedc360 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -540,7 +540,7 @@ static int memord(const void *d1, size_t s1, const void *d2, size_t s2) { if (s1 < s2) return 1; - if (s2 > s1) + if (s1 > s2) return -1; return memcmp(d1, d2, s1); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index a78f255111f2..ae07470fde3c 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -295,10 +295,6 @@ config PPC_STD_MMU_32 def_bool y depends on PPC_STD_MMU && PPC32 -config PPC_STD_MMU_64 - def_bool y - depends on PPC_STD_MMU && PPC64 - config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 @@ -309,6 +305,19 @@ config PPC_RADIX_MMU is only implemented by IBM Power9 CPUs, if you don't have one of them you can probably disable this. +config PPC_RADIX_MMU_DEFAULT + bool "Default to using the Radix MMU when possible" + depends on PPC_RADIX_MMU + default y + help + When the hardware supports the Radix MMU, default to using it unless + "disable_radix[=yes]" is specified on the kernel command line. + + If this option is disabled, the Hash MMU will be used by default, + unless "disable_radix=no" is specified on the kernel command line. + + If you're unsure, say Y. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION @@ -324,7 +333,7 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if PPC_STD_MMU_64 + default y if PPC_BOOK3S_64 default n config PPC_HAVE_PMU_SUPPORT diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index 70183eb3d5c8..39a1d4225e0f 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -513,9 +513,7 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np) mutex_init(&host->mutex); init_completion(&host->complete); spin_lock_init(&host->lock); - init_timer(&host->timeout_timer); - host->timeout_timer.function = kw_i2c_timeout; - host->timeout_timer.data = (unsigned long)host; + setup_timer(&host->timeout_timer, kw_i2c_timeout, (unsigned long)host); psteps = of_get_property(np, "AAPL,address-step", NULL); steps = psteps ? (*psteps) : 0x10; diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 7a31c26500e6..3732118a0482 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -15,4 +15,5 @@ obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o -obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o +obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o +obj-$(CONFIG_PPC_FTW) += nx-ftw.o diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 8864065eba22..4650fb294e7a 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -41,7 +41,6 @@ #include "powernv.h" #include "pci.h" -static bool pnv_eeh_nb_init = false; static int eeh_event_irq = -EINVAL; static int pnv_eeh_init(void) @@ -197,31 +196,31 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); * been built. If the I/O cache staff has been built, EEH is * ready to supply service. */ -static int pnv_eeh_post_init(void) +int pnv_eeh_post_init(void) { struct pci_controller *hose; struct pnv_phb *phb; int ret = 0; + /* Probe devices & build address cache */ + eeh_probe_devices(); + eeh_addr_cache_build(); + /* Register OPAL event notifier */ - if (!pnv_eeh_nb_init) { - eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); - if (eeh_event_irq < 0) { - pr_err("%s: Can't register OPAL event interrupt (%d)\n", - __func__, eeh_event_irq); - return eeh_event_irq; - } + eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); + if (eeh_event_irq < 0) { + pr_err("%s: Can't register OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return eeh_event_irq; + } - ret = request_irq(eeh_event_irq, pnv_eeh_event, - IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); - if (ret < 0) { - irq_dispose_mapping(eeh_event_irq); - pr_err("%s: Can't request OPAL event interrupt (%d)\n", - __func__, eeh_event_irq); - return ret; - } - - pnv_eeh_nb_init = true; + ret = request_irq(eeh_event_irq, pnv_eeh_event, + IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); + if (ret < 0) { + irq_dispose_mapping(eeh_event_irq); + pr_err("%s: Can't request OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return ret; } if (!eeh_enabled()) @@ -367,6 +366,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + /* Skip if we haven't probed yet */ + if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE) + return NULL; + /* Initialize eeh device */ edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; @@ -1731,7 +1734,6 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) static struct eeh_ops pnv_eeh_ops = { .name = "powernv", .init = pnv_eeh_init, - .post_init = pnv_eeh_post_init, .probe = pnv_eeh_probe, .set_option = pnv_eeh_set_option, .get_pe_addr = pnv_eeh_get_pe_addr, diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 2cb6cbea4b3b..f6cbc1a71472 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -395,6 +395,7 @@ struct npu_context { struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS]; struct mmu_notifier mn; struct kref kref; + bool nmmu_flush; /* Callback to stop translation requests on a given GPU */ struct npu_context *(*release_cb)(struct npu_context *, void *); @@ -545,11 +546,13 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; unsigned long pid = npu_context->mm->context.id; - /* - * Unfortunately the nest mmu does not support flushing specific - * addresses so we have to flush the whole mm. - */ - flush_tlb_mm(npu_context->mm); + if (npu_context->nmmu_flush) + /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm once before + * shooting down the GPU translation. + */ + flush_all_mm(npu_context->mm); /* * Loop over all the NPUs this process is active on and launch @@ -722,6 +725,16 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, return ERR_PTR(-ENODEV); npu_context->npdev[npu->index][nvlink_index] = npdev; + if (!nphb->npu.nmmu_flush) { + /* + * If we're not explicitly flushing ourselves we need to mark + * the thread for global flushes + */ + npu_context->nmmu_flush = false; + mm_context_add_copro(mm); + } else + npu_context->nmmu_flush = true; + return npu_context; } EXPORT_SYMBOL(pnv_npu2_init_context); @@ -731,6 +744,9 @@ static void pnv_npu2_release_context(struct kref *kref) struct npu_context *npu_context = container_of(kref, struct npu_context, kref); + if (!npu_context->nmmu_flush) + mm_context_remove_copro(npu_context->mm); + npu_context->mm->context.npu_context = NULL; mmu_notifier_unregister(&npu_context->mn, npu_context->mm); @@ -819,6 +835,8 @@ int pnv_npu2_init(struct pnv_phb *phb) static int npu_index; uint64_t rc = 0; + phb->npu.nmmu_flush = + of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); for_each_child_of_node(phb->hose->dn, dn) { gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); if (gpdev) { diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index cf33769a7b72..18a355fa15e8 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -1,7 +1,7 @@ /* * PowerNV OPAL asynchronous completion interfaces * - * Copyright 2013 IBM Corp. + * Copyright 2013-2017 IBM Corp. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -23,40 +23,50 @@ #include #include -#define N_ASYNC_COMPLETIONS 64 +enum opal_async_token_state { + ASYNC_TOKEN_UNALLOCATED = 0, + ASYNC_TOKEN_ALLOCATED, + ASYNC_TOKEN_DISPATCHED, + ASYNC_TOKEN_ABANDONED, + ASYNC_TOKEN_COMPLETED +}; + +struct opal_async_token { + enum opal_async_token_state state; + struct opal_msg response; +}; -static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL}; -static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS); static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait); static DEFINE_SPINLOCK(opal_async_comp_lock); static struct semaphore opal_async_sem; -static struct opal_msg *opal_async_responses; static unsigned int opal_max_async_tokens; +static struct opal_async_token *opal_async_tokens; -int __opal_async_get_token(void) +static int __opal_async_get_token(void) { unsigned long flags; - int token; + int i, token = -EBUSY; spin_lock_irqsave(&opal_async_comp_lock, flags); - token = find_first_bit(opal_async_complete_map, opal_max_async_tokens); - if (token >= opal_max_async_tokens) { - token = -EBUSY; - goto out; + + for (i = 0; i < opal_max_async_tokens; i++) { + if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) { + opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED; + token = i; + break; + } } - if (__test_and_set_bit(token, opal_async_token_map)) { - token = -EBUSY; - goto out; - } - - __clear_bit(token, opal_async_complete_map); - -out: spin_unlock_irqrestore(&opal_async_comp_lock, flags); return token; } +/* + * Note: If the returned token is used in an opal call and opal returns + * OPAL_ASYNC_COMPLETION you MUST call one of opal_async_wait_response() or + * opal_async_wait_response_interruptible() at least once before calling another + * opal_async_* function + */ int opal_async_get_token_interruptible(void) { int token; @@ -73,9 +83,10 @@ int opal_async_get_token_interruptible(void) } EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible); -int __opal_async_release_token(int token) +static int __opal_async_release_token(int token) { unsigned long flags; + int rc; if (token < 0 || token >= opal_max_async_tokens) { pr_err("%s: Passed token is out of range, token %d\n", @@ -84,11 +95,26 @@ int __opal_async_release_token(int token) } spin_lock_irqsave(&opal_async_comp_lock, flags); - __set_bit(token, opal_async_complete_map); - __clear_bit(token, opal_async_token_map); + switch (opal_async_tokens[token].state) { + case ASYNC_TOKEN_COMPLETED: + case ASYNC_TOKEN_ALLOCATED: + opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED; + rc = 0; + break; + /* + * DISPATCHED and ABANDONED tokens must wait for OPAL to respond. + * Mark a DISPATCHED token as ABANDONED so that the response handling + * code knows no one cares and that it can free it then. + */ + case ASYNC_TOKEN_DISPATCHED: + opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED; + /* Fall through */ + default: + rc = 1; + } spin_unlock_irqrestore(&opal_async_comp_lock, flags); - return 0; + return rc; } int opal_async_release_token(int token) @@ -96,12 +122,10 @@ int opal_async_release_token(int token) int ret; ret = __opal_async_release_token(token); - if (ret) - return ret; + if (!ret) + up(&opal_async_sem); - up(&opal_async_sem); - - return 0; + return ret; } EXPORT_SYMBOL_GPL(opal_async_release_token); @@ -117,22 +141,83 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) return -EINVAL; } - /* Wakeup the poller before we wait for events to speed things + /* + * There is no need to mark the token as dispatched, wait_event() + * will block until the token completes. + * + * Wakeup the poller before we wait for events to speed things * up on platforms or simulators where the interrupts aren't * functional. */ opal_wake_poller(); - wait_event(opal_async_wait, test_bit(token, opal_async_complete_map)); - memcpy(msg, &opal_async_responses[token], sizeof(*msg)); + wait_event(opal_async_wait, opal_async_tokens[token].state + == ASYNC_TOKEN_COMPLETED); + memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg)); return 0; } EXPORT_SYMBOL_GPL(opal_async_wait_response); +int opal_async_wait_response_interruptible(uint64_t token, struct opal_msg *msg) +{ + unsigned long flags; + int ret; + + if (token >= opal_max_async_tokens) { + pr_err("%s: Invalid token passed\n", __func__); + return -EINVAL; + } + + if (!msg) { + pr_err("%s: Invalid message pointer passed\n", __func__); + return -EINVAL; + } + + /* + * The first time this gets called we mark the token as DISPATCHED + * so that if wait_event_interruptible() returns not zero and the + * caller frees the token, we know not to actually free the token + * until the response comes. + * + * Only change if the token is ALLOCATED - it may have been + * completed even before the caller gets around to calling this + * the first time. + * + * There is also a dirty great comment at the token allocation + * function that if the opal call returns OPAL_ASYNC_COMPLETION to + * the caller then the caller *must* call this or the not + * interruptible version before doing anything else with the + * token. + */ + if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) { + spin_lock_irqsave(&opal_async_comp_lock, flags); + if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) + opal_async_tokens[token].state = ASYNC_TOKEN_DISPATCHED; + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + } + + /* + * Wakeup the poller before we wait for events to speed things + * up on platforms or simulators where the interrupts aren't + * functional. + */ + opal_wake_poller(); + ret = wait_event_interruptible(opal_async_wait, + opal_async_tokens[token].state == + ASYNC_TOKEN_COMPLETED); + if (!ret) + memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg)); + + return ret; +} +EXPORT_SYMBOL_GPL(opal_async_wait_response_interruptible); + +/* Called from interrupt context */ static int opal_async_comp_event(struct notifier_block *nb, unsigned long msg_type, void *msg) { struct opal_msg *comp_msg = msg; + enum opal_async_token_state state; unsigned long flags; uint64_t token; @@ -140,11 +225,17 @@ static int opal_async_comp_event(struct notifier_block *nb, return 0; token = be64_to_cpu(comp_msg->params[0]); - memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg)); spin_lock_irqsave(&opal_async_comp_lock, flags); - __set_bit(token, opal_async_complete_map); + state = opal_async_tokens[token].state; + opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED; spin_unlock_irqrestore(&opal_async_comp_lock, flags); + if (state == ASYNC_TOKEN_ABANDONED) { + /* Free the token, no one else will */ + opal_async_release_token(token); + return 0; + } + memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); wake_up(&opal_async_wait); return 0; @@ -178,32 +269,23 @@ int __init opal_async_comp_init(void) } opal_max_async_tokens = be32_to_cpup(async); - if (opal_max_async_tokens > N_ASYNC_COMPLETIONS) - opal_max_async_tokens = N_ASYNC_COMPLETIONS; + opal_async_tokens = kcalloc(opal_max_async_tokens, + sizeof(*opal_async_tokens), GFP_KERNEL); + if (!opal_async_tokens) { + err = -ENOMEM; + goto out_opal_node; + } err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, &opal_async_comp_nb); if (err) { pr_err("%s: Can't register OPAL event notifier (%d)\n", __func__, err); + kfree(opal_async_tokens); goto out_opal_node; } - opal_async_responses = kzalloc( - sizeof(*opal_async_responses) * opal_max_async_tokens, - GFP_KERNEL); - if (!opal_async_responses) { - pr_err("%s: Out of memory, failed to do asynchronous " - "completion init\n", __func__); - err = -ENOMEM; - goto out_opal_node; - } - - /* Initialize to 1 less than the maximum tokens available, as we may - * require to pop one during emergency through synchronous call to - * __opal_async_get_token() - */ - sema_init(&opal_async_sem, opal_max_async_tokens - 1); + sema_init(&opal_async_sem, opal_max_async_tokens); out_opal_node: of_node_put(opal_node); diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index d78fed728cdf..c9e1a4ff295c 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -1,5 +1,5 @@ /* - * OPAL hypervisor Maintenance interrupt handling support in PowreNV. + * OPAL hypervisor Maintenance interrupt handling support in PowerNV. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index ecdcba9d1220..9d1b8c0aaf93 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -174,8 +174,14 @@ void opal_event_shutdown(void) /* First free interrupts, which will also mask them */ for (i = 0; i < opal_irq_count; i++) { - if (opal_irqs[i]) + if (!opal_irqs[i]) + continue; + + if (in_interrupt()) + disable_irq_nosync(opal_irqs[i]); + else free_irq(opal_irqs[i], NULL); + opal_irqs[i] = 0; } } diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c index 4495f428b500..d9916ea62305 100644 --- a/arch/powerpc/platforms/powernv/opal-memory-errors.c +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -1,5 +1,5 @@ /* - * OPAL asynchronus Memory error handling support in PowreNV. + * OPAL asynchronus Memory error handling support in PowerNV. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index aa267f120033..0a7074bb91dc 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -19,13 +19,10 @@ */ #include -#include #include #include #include -static DEFINE_MUTEX(opal_sensor_mutex); - /* * This will return sensor information to driver based on the requested sensor * handle. A handle is an opaque id for the powernv, read by the driver from the @@ -38,13 +35,9 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) __be32 data; token = opal_async_get_token_interruptible(); - if (token < 0) { - pr_err("%s: Couldn't get the token, returning\n", __func__); - ret = token; - goto out; - } + if (token < 0) + return token; - mutex_lock(&opal_sensor_mutex); ret = opal_sensor_read(sensor_hndl, token, &data); switch (ret) { case OPAL_ASYNC_COMPLETION: @@ -52,7 +45,7 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) if (ret) { pr_err("%s: Failed to wait for the async response, %d\n", __func__, ret); - goto out_token; + goto out; } ret = opal_error_code(opal_get_async_rc(msg)); @@ -73,10 +66,8 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) break; } -out_token: - mutex_unlock(&opal_sensor_mutex); - opal_async_release_token(token); out: + opal_async_release_token(token); return ret; } EXPORT_SYMBOL_GPL(opal_get_sensor_data); diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 8c1ede2d3f7e..6f4b00a2ac46 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -94,7 +94,7 @@ opal_return: * bytes (always BE) since MSR:LE will end up fixed up as a side * effect of the rfid. */ - FIXUP_ENDIAN + FIXUP_ENDIAN_HV ld r2,PACATOC(r13); lwz r4,8(r1); ld r5,PPC_LR_STKOFF(r1); @@ -120,7 +120,7 @@ opal_real_call: hrfid opal_return_realmode: - FIXUP_ENDIAN + FIXUP_ENDIAN_HV ld r2,PACATOC(r13); lwz r11,8(r1); ld r12,PPC_LR_STKOFF(r1) @@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 65c79ecf5a4d..041ddbd1fc57 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -998,6 +998,7 @@ int opal_error_code(int rc) case OPAL_PARAMETER: return -EINVAL; case OPAL_ASYNC_COMPLETION: return -EINPROGRESS; + case OPAL_BUSY: case OPAL_BUSY_EVENT: return -EBUSY; case OPAL_NO_MEM: return -ENOMEM; case OPAL_PERMISSION: return -EPERM; @@ -1037,3 +1038,4 @@ EXPORT_SYMBOL_GPL(opal_write_oppanel_async); /* Export this for KVM */ EXPORT_SYMBOL_GPL(opal_int_set_mfrr); EXPORT_SYMBOL_GPL(opal_int_eoi); +EXPORT_SYMBOL_GPL(opal_error_code); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 57f9e55f4352..749055553064 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1002,9 +1002,12 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) } /* - * After doing so, there would be a "hole" in the /proc/iomem when - * offset is a positive value. It looks like the device return some - * mmio back to the system, which actually no one could use it. + * Since M64 BAR shares segments among all possible 256 PEs, + * we have to shift the beginning of PF IOV BAR to make it start from + * the segment which belongs to the PE number assigned to the first VF. + * This creates a "hole" in the /proc/iomem which could be used for + * allocating other resources so we reserve this area below and + * release when IOV is released. */ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &dev->resource[i + PCI_IOV_RESOURCES]; @@ -1018,7 +1021,22 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", i, &res2, res, (offset > 0) ? "En" : "Dis", num_vfs, offset); + + if (offset < 0) { + devm_release_resource(&dev->dev, &pdn->holes[i]); + memset(&pdn->holes[i], 0, sizeof(pdn->holes[i])); + } + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + + if (offset > 0) { + pdn->holes[i].start = res2.start; + pdn->holes[i].end = res2.start + size * offset - 1; + pdn->holes[i].flags = IORESOURCE_BUS; + pdn->holes[i].name = "pnv_iov_reserved"; + devm_request_resource(&dev->dev, res->parent, + &pdn->holes[i]); + } } return 0; } @@ -2779,7 +2797,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; - if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) + if (!is_power_of_2(window_size)) return -EINVAL; /* Adjust direct table size from window_size and levels */ @@ -3293,8 +3311,7 @@ static void pnv_pci_ioda_fixup(void) pnv_pci_ioda_create_dbgfs(); #ifdef CONFIG_EEH - eeh_init(); - eeh_addr_cache_build(); + pnv_eeh_post_init(); #endif } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index b47f9406d97e..b772d7473896 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -188,6 +188,9 @@ struct pnv_phb { /* Bitmask for MMIO register usage */ unsigned long mmio_atsd_usage; + + /* Do we need to explicitly flush the nest mmu? */ + bool nmmu_flush; } npu; #ifdef CONFIG_CXL_BASE @@ -235,6 +238,7 @@ extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern bool pnv_pci_enable_device_hook(struct pci_dev *dev); extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +extern int pnv_eeh_post_init(void); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index bbb73aa0eb8f..1edfbc1e40f4 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "powernv.h" @@ -290,6 +291,7 @@ static void __init pnv_setup_machdep_opal(void) ppc_md.restart = pnv_restart; pm_power_off = pnv_power_off; ppc_md.halt = pnv_halt; + /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */ ppc_md.machine_check_exception = opal_machine_check; ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; ppc_md.hmi_exception_early = opal_hmi_exception_early; @@ -311,6 +313,28 @@ static int __init pnv_probe(void) return 1; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void __init pnv_tm_init(void) +{ + if (!firmware_has_feature(FW_FEATURE_OPAL) || + !pvr_version_is(PVR_POWER9) || + early_cpu_has_feature(CPU_FTR_TM)) + return; + + if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS) + return; + + pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n"); + cur_cpu_spec->cpu_features |= CPU_FTR_TM; + /* Make sure "normal" HTM is off (it should be) */ + cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM; + /* Turn on no suspend mode, and HTM no SC */ + cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \ + PPC_FEATURE2_HTM_NOSC; + tm_suspend_disabled = true; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + /* * Returns the cpu frequency for 'cpu' in Hz. This is used by * /proc/cpuinfo @@ -319,7 +343,7 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu) { unsigned long ret_freq; - ret_freq = cpufreq_quick_get(cpu) * 1000ul; + ret_freq = cpufreq_get(cpu) * 1000ul; /* * If the backend cpufreq driver does not exist, diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index c17f81e433f7..ba030669eca1 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -49,6 +49,13 @@ static void pnv_smp_setup_cpu(int cpu) { + /* + * P9 workaround for CI vector load (see traps.c), + * enable the corresponding HMI interrupt + */ + if (pvr_version_is(PVR_POWER9)) + mtspr(SPRN_HMEER, mfspr(SPRN_HMEER) | PPC_BIT(17)); + if (xive_enabled()) xive_smp_setup_cpu(); else if (cpu != boot_cpuid) @@ -290,6 +297,54 @@ static void __init pnv_smp_probe(void) } } +static int pnv_system_reset_exception(struct pt_regs *regs) +{ + if (smp_handle_nmi_ipi(regs)) + return 1; + return 0; +} + +static int pnv_cause_nmi_ipi(int cpu) +{ + int64_t rc; + + if (cpu >= 0) { + rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu)); + if (rc != OPAL_SUCCESS) + return 0; + return 1; + + } else if (cpu == NMI_IPI_ALL_OTHERS) { + bool success = true; + int c; + + + /* + * We do not use broadcasts (yet), because it's not clear + * exactly what semantics Linux wants or the firmware should + * provide. + */ + for_each_online_cpu(c) { + if (c == smp_processor_id()) + continue; + + rc = opal_signal_system_reset( + get_hard_smp_processor_id(c)); + if (rc != OPAL_SUCCESS) + success = false; + } + if (success) + return 1; + + /* + * Caller will fall back to doorbells, which may pick + * up the remainders. + */ + } + + return 0; +} + static struct smp_ops_t pnv_smp_ops = { .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */ @@ -308,6 +363,10 @@ static struct smp_ops_t pnv_smp_ops = { /* This is called very early during platform setup_arch */ void __init pnv_smp_init(void) { + if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) { + ppc_md.system_reset_exception = pnv_system_reset_exception; + pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi; + } smp_ops = &pnv_smp_ops; #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c new file mode 100644 index 000000000000..ca22f1eae050 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-debug.c @@ -0,0 +1,209 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include +#include +#include +#include +#include "vas.h" + +static struct dentry *vas_debugfs; + +static char *cop_to_str(int cop) +{ + switch (cop) { + case VAS_COP_TYPE_FAULT: return "Fault"; + case VAS_COP_TYPE_842: return "NX-842 Normal Priority"; + case VAS_COP_TYPE_842_HIPRI: return "NX-842 High Priority"; + case VAS_COP_TYPE_GZIP: return "NX-GZIP Normal Priority"; + case VAS_COP_TYPE_GZIP_HIPRI: return "NX-GZIP High Priority"; + case VAS_COP_TYPE_FTW: return "Fast Thread-wakeup"; + default: return "Unknown"; + } +} + +static int info_dbg_show(struct seq_file *s, void *private) +{ + struct vas_window *window = s->private; + + mutex_lock(&vas_mutex); + + /* ensure window is not unmapped */ + if (!window->hvwc_map) + goto unlock; + + seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop), + window->tx_win ? "Send" : "Receive"); + seq_printf(s, "Pid : %d\n", window->pid); + +unlock: + mutex_unlock(&vas_mutex); + return 0; +} + +static int info_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, info_dbg_show, inode->i_private); +} + +static const struct file_operations info_fops = { + .open = info_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static inline void print_reg(struct seq_file *s, struct vas_window *win, + char *name, u32 reg) +{ + seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name); +} + +static int hvwc_dbg_show(struct seq_file *s, void *private) +{ + struct vas_window *window = s->private; + + mutex_lock(&vas_mutex); + + /* ensure window is not unmapped */ + if (!window->hvwc_map) + goto unlock; + + print_reg(s, window, VREG(LPID)); + print_reg(s, window, VREG(PID)); + print_reg(s, window, VREG(XLATE_MSR)); + print_reg(s, window, VREG(XLATE_LPCR)); + print_reg(s, window, VREG(XLATE_CTL)); + print_reg(s, window, VREG(AMR)); + print_reg(s, window, VREG(SEIDR)); + print_reg(s, window, VREG(FAULT_TX_WIN)); + print_reg(s, window, VREG(OSU_INTR_SRC_RA)); + print_reg(s, window, VREG(HV_INTR_SRC_RA)); + print_reg(s, window, VREG(PSWID)); + print_reg(s, window, VREG(LFIFO_BAR)); + print_reg(s, window, VREG(LDATA_STAMP_CTL)); + print_reg(s, window, VREG(LDMA_CACHE_CTL)); + print_reg(s, window, VREG(LRFIFO_PUSH)); + print_reg(s, window, VREG(CURR_MSG_COUNT)); + print_reg(s, window, VREG(LNOTIFY_AFTER_COUNT)); + print_reg(s, window, VREG(LRX_WCRED)); + print_reg(s, window, VREG(LRX_WCRED_ADDER)); + print_reg(s, window, VREG(TX_WCRED)); + print_reg(s, window, VREG(TX_WCRED_ADDER)); + print_reg(s, window, VREG(LFIFO_SIZE)); + print_reg(s, window, VREG(WINCTL)); + print_reg(s, window, VREG(WIN_STATUS)); + print_reg(s, window, VREG(WIN_CTX_CACHING_CTL)); + print_reg(s, window, VREG(TX_RSVD_BUF_COUNT)); + print_reg(s, window, VREG(LRFIFO_WIN_PTR)); + print_reg(s, window, VREG(LNOTIFY_CTL)); + print_reg(s, window, VREG(LNOTIFY_PID)); + print_reg(s, window, VREG(LNOTIFY_LPID)); + print_reg(s, window, VREG(LNOTIFY_TID)); + print_reg(s, window, VREG(LNOTIFY_SCOPE)); + print_reg(s, window, VREG(NX_UTIL_ADDER)); +unlock: + mutex_unlock(&vas_mutex); + return 0; +} + +static int hvwc_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, hvwc_dbg_show, inode->i_private); +} + +static const struct file_operations hvwc_fops = { + .open = hvwc_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void vas_window_free_dbgdir(struct vas_window *window) +{ + if (window->dbgdir) { + debugfs_remove_recursive(window->dbgdir); + kfree(window->dbgname); + window->dbgdir = NULL; + window->dbgname = NULL; + } +} + +void vas_window_init_dbgdir(struct vas_window *window) +{ + struct dentry *f, *d; + + if (!window->vinst->dbgdir) + return; + + window->dbgname = kzalloc(16, GFP_KERNEL); + if (!window->dbgname) + return; + + snprintf(window->dbgname, 16, "w%d", window->winid); + + d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir); + if (IS_ERR(d)) + goto free_name; + + window->dbgdir = d; + + f = debugfs_create_file("info", 0444, d, window, &info_fops); + if (IS_ERR(f)) + goto remove_dir; + + f = debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops); + if (IS_ERR(f)) + goto remove_dir; + + return; + +free_name: + kfree(window->dbgname); + window->dbgname = NULL; + +remove_dir: + debugfs_remove_recursive(window->dbgdir); + window->dbgdir = NULL; +} + +void vas_instance_init_dbgdir(struct vas_instance *vinst) +{ + struct dentry *d; + + if (!vas_debugfs) + return; + + vinst->dbgname = kzalloc(16, GFP_KERNEL); + if (!vinst->dbgname) + return; + + snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id); + + d = debugfs_create_dir(vinst->dbgname, vas_debugfs); + if (IS_ERR(d)) + goto free_name; + + vinst->dbgdir = d; + return; + +free_name: + kfree(vinst->dbgname); + vinst->dbgname = NULL; + vinst->dbgdir = NULL; +} + +void vas_init_dbgdir(void) +{ + vas_debugfs = debugfs_create_dir("vas", NULL); + if (IS_ERR(vas_debugfs)) + vas_debugfs = NULL; +} diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 5aae845b8cd9..2b3eb01ab110 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -16,7 +16,8 @@ #include #include #include - +#include +#include #include "vas.h" #include "copy-paste.h" @@ -40,6 +41,16 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr); } +u64 vas_win_paste_addr(struct vas_window *win) +{ + u64 addr; + + compute_paste_address(win, &addr, NULL); + + return addr; +} +EXPORT_SYMBOL(vas_win_paste_addr); + static inline void get_hvwc_mmio_bar(struct vas_window *window, u64 *start, int *len) { @@ -145,23 +156,37 @@ static void unmap_paste_region(struct vas_window *window) } /* - * Unmap the MMIO regions for a window. + * Unmap the MMIO regions for a window. Hold the vas_mutex so we don't + * unmap when the window's debugfs dir is in use. This serializes close + * of a window even on another VAS instance but since its not a critical + * path, just minimize the time we hold the mutex for now. We can add + * a per-instance mutex later if necessary. */ static void unmap_winctx_mmio_bars(struct vas_window *window) { int len; + void *uwc_map; + void *hvwc_map; u64 busaddr_start; - if (window->hvwc_map) { + mutex_lock(&vas_mutex); + + hvwc_map = window->hvwc_map; + window->hvwc_map = NULL; + + uwc_map = window->uwc_map; + window->uwc_map = NULL; + + mutex_unlock(&vas_mutex); + + if (hvwc_map) { get_hvwc_mmio_bar(window, &busaddr_start, &len); - unmap_region(window->hvwc_map, busaddr_start, len); - window->hvwc_map = NULL; + unmap_region(hvwc_map, busaddr_start, len); } - if (window->uwc_map) { + if (uwc_map) { get_uwc_mmio_bar(window, &busaddr_start, &len); - unmap_region(window->uwc_map, busaddr_start, len); - window->uwc_map = NULL; + unmap_region(uwc_map, busaddr_start, len); } } @@ -528,6 +553,9 @@ static void vas_window_free(struct vas_window *window) struct vas_instance *vinst = window->vinst; unmap_winctx_mmio_bars(window); + + vas_window_free_dbgdir(window); + kfree(window); vas_release_window_id(&vinst->ida, winid); @@ -552,6 +580,8 @@ static struct vas_window *vas_window_alloc(struct vas_instance *vinst) if (map_winctx_mmio_bars(window)) goto out_free; + vas_window_init_dbgdir(window); + return window; out_free: @@ -568,6 +598,32 @@ static void put_rx_win(struct vas_window *rxwin) atomic_dec(&rxwin->num_txwins); } +/* + * Find the user space receive window given the @pswid. + * - We must have a valid vasid and it must belong to this instance. + * (so both send and receive windows are on the same VAS instance) + * - The window must refer to an OPEN, FTW, RECEIVE window. + * + * NOTE: We access ->windows[] table and assume that vinst->mutex is held. + */ +static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid) +{ + int vasid, winid; + struct vas_window *rxwin; + + decode_pswid(pswid, &vasid, &winid); + + if (vinst->vas_id != vasid) + return ERR_PTR(-EINVAL); + + rxwin = vinst->windows[winid]; + + if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW) + return ERR_PTR(-EINVAL); + + return rxwin; +} + /* * Get the VAS receive window associated with NX engine identified * by @cop and if applicable, @pswid. @@ -581,10 +637,10 @@ static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst, mutex_lock(&vinst->mutex); - if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) - rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL); + if (cop == VAS_COP_TYPE_FTW) + rxwin = get_user_rxwin(vinst, pswid); else - rxwin = ERR_PTR(-EINVAL); + rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL); if (!IS_ERR(rxwin)) atomic_inc(&rxwin->num_txwins); @@ -674,15 +730,18 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->rx_fifo = rxattr->rx_fifo; winctx->rx_fifo_size = rxattr->rx_fifo_size; - winctx->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + winctx->wcreds_max = rxwin->wcreds_max; winctx->pin_win = rxattr->pin_win; winctx->nx_win = rxattr->nx_win; winctx->fault_win = rxattr->fault_win; + winctx->user_win = rxattr->user_win; + winctx->rej_no_credit = rxattr->rej_no_credit; winctx->rx_word_mode = rxattr->rx_win_ord_mode; winctx->tx_word_mode = rxattr->tx_win_ord_mode; winctx->rx_wcred_mode = rxattr->rx_wcred_mode; winctx->tx_wcred_mode = rxattr->tx_wcred_mode; + winctx->notify_early = rxattr->notify_early; if (winctx->nx_win) { winctx->data_stamp = true; @@ -723,7 +782,10 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, static bool rx_win_args_valid(enum vas_cop_type cop, struct vas_rx_win_attr *attr) { - dump_rx_win_attr(attr); + pr_debug("Rxattr: fault %d, notify %d, intr %d, early %d, fifo %d\n", + attr->fault_win, attr->notify_disable, + attr->intr_disable, attr->notify_early, + attr->rx_fifo_size); if (cop >= VAS_COP_TYPE_MAX) return false; @@ -735,6 +797,9 @@ static bool rx_win_args_valid(enum vas_cop_type cop, if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX) return false; + if (attr->wcreds_max > VAS_RX_WCREDS_MAX) + return false; + if (attr->nx_win) { /* cannot be fault or user window if it is nx */ if (attr->fault_win || attr->user_win) @@ -835,6 +900,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, rxwin->nx_win = rxattr->nx_win; rxwin->user_win = rxattr->user_win; rxwin->cop = cop; + rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; if (rxattr->user_win) rxwin->pid = task_pid_vnr(current); @@ -884,21 +950,23 @@ static void init_winctx_for_txwin(struct vas_window *txwin, */ memset(winctx, 0, sizeof(struct vas_winctx)); - winctx->wcreds_max = txattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + winctx->wcreds_max = txwin->wcreds_max; winctx->user_win = txattr->user_win; winctx->nx_win = txwin->rxwin->nx_win; winctx->pin_win = txattr->pin_win; + winctx->rej_no_credit = txattr->rej_no_credit; + winctx->rsvd_txbuf_enable = txattr->rsvd_txbuf_enable; winctx->rx_wcred_mode = txattr->rx_wcred_mode; winctx->tx_wcred_mode = txattr->tx_wcred_mode; winctx->rx_word_mode = txattr->rx_win_ord_mode; winctx->tx_word_mode = txattr->tx_win_ord_mode; + winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count; - if (winctx->nx_win) { + winctx->intr_disable = true; + if (winctx->nx_win) winctx->data_stamp = true; - winctx->intr_disable = true; - } winctx->lpid = txattr->lpid; winctx->pidr = txattr->pidr; @@ -921,6 +989,9 @@ static bool tx_win_args_valid(enum vas_cop_type cop, if (cop > VAS_COP_TYPE_MAX) return false; + if (attr->wcreds_max > VAS_TX_WCREDS_MAX) + return false; + if (attr->user_win && (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count)) return false; @@ -940,6 +1011,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, if (!tx_win_args_valid(cop, attr)) return ERR_PTR(-EINVAL); + /* + * If caller did not specify a vasid but specified the PSWID of a + * receive window (applicable only to FTW windows), use the vasid + * from that receive window. + */ + if (vasid == -1 && attr->pswid) + decode_pswid(attr->pswid, &vasid, NULL); + vinst = find_vas_instance(vasid); if (!vinst) { pr_devel("vasid %d not found!\n", vasid); @@ -958,11 +1037,13 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, goto put_rxwin; } + txwin->cop = cop; txwin->tx_win = 1; txwin->rxwin = rxwin; txwin->nx_win = txwin->rxwin->nx_win; txwin->pid = attr->pid; txwin->user_win = attr->user_win; + txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT; init_winctx_for_txwin(txwin, attr, &winctx); @@ -984,6 +1065,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, } } + /* + * Now that we have a send window, ensure context switch issues + * CP_ABORT for this thread. + */ + rc = -EINVAL; + if (set_thread_uses_vas() < 0) + goto free_window; + set_vinst_win(vinst, txwin); return txwin; @@ -1038,50 +1127,110 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) else rc = -EINVAL; - print_fifo_msg_count(txwin); + pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid, + read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); return rc; } EXPORT_SYMBOL_GPL(vas_paste_crb); +/* + * If credit checking is enabled for this window, poll for the return + * of window credits (i.e for NX engines to process any outstanding CRBs). + * Since NX-842 waits for the CRBs to be processed before closing the + * window, we should not have to wait for too long. + * + * TODO: We retry in 10ms intervals now. We could/should probably peek at + * the VAS_LRFIFO_PUSH_OFFSET register to get an estimate of pending + * CRBs on the FIFO and compute the delay dynamically on each retry. + * But that is not really needed until we support NX-GZIP access from + * user space. (NX-842 driver waits for CSB and Fast thread-wakeup + * doesn't use credit checking). + */ +static void poll_window_credits(struct vas_window *window) +{ + u64 val; + int creds, mode; + + val = read_hvwc_reg(window, VREG(WINCTL)); + if (window->tx_win) + mode = GET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val); + else + mode = GET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val); + + if (!mode) + return; +retry: + if (window->tx_win) { + val = read_hvwc_reg(window, VREG(TX_WCRED)); + creds = GET_FIELD(VAS_TX_WCRED, val); + } else { + val = read_hvwc_reg(window, VREG(LRX_WCRED)); + creds = GET_FIELD(VAS_LRX_WCRED, val); + } + + if (creds < window->wcreds_max) { + val = 0; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(10)); + goto retry; + } +} + +/* + * Wait for the window to go to "not-busy" state. It should only take a + * short time to queue a CRB, so window should not be busy for too long. + * Trying 5ms intervals. + */ static void poll_window_busy_state(struct vas_window *window) { int busy; u64 val; retry: - /* - * Poll Window Busy flag - */ val = read_hvwc_reg(window, VREG(WIN_STATUS)); busy = GET_FIELD(VAS_WIN_BUSY, val); if (busy) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(msecs_to_jiffies(5)); goto retry; } } +/* + * Have the hardware cast a window out of cache and wait for it to + * be completed. + * + * NOTE: It can take a relatively long time to cast the window context + * out of the cache. It is not strictly necessary to cast out if: + * + * - we clear the "Pin Window" bit (so hardware is free to evict) + * + * - we re-initialize the window context when it is reassigned. + * + * We do the former in vas_win_close() and latter in vas_win_open(). + * So, ignoring the cast-out for now. We can add it as needed. If + * casting out becomes necessary we should consider offloading the + * job to a worker thread, so the window close can proceed quickly. + */ static void poll_window_castout(struct vas_window *window) { - int cached; + /* stub for now */ +} + +/* + * Unpin and close a window so no new requests are accepted and the + * hardware can evict this window from cache if necessary. + */ +static void unpin_close_window(struct vas_window *window) +{ u64 val; - /* Cast window context out of the cache */ -retry: - val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL)); - cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val); - if (cached) { - val = 0ULL; - val = SET_FIELD(VAS_CASTOUT_REQ, val, 1); - val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0); - write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val); - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - goto retry; - } + val = read_hvwc_reg(window, VREG(WINCTL)); + val = SET_FIELD(VAS_WINCTL_PIN, val, 0); + val = SET_FIELD(VAS_WINCTL_OPEN, val, 0); + write_hvwc_reg(window, VREG(WINCTL), val); } /* @@ -1098,8 +1247,6 @@ static void poll_window_castout(struct vas_window *window) */ int vas_win_close(struct vas_window *window) { - u64 val; - if (!window) return 0; @@ -1115,11 +1262,9 @@ int vas_win_close(struct vas_window *window) poll_window_busy_state(window); - /* Unpin window from cache and close it */ - val = read_hvwc_reg(window, VREG(WINCTL)); - val = SET_FIELD(VAS_WINCTL_PIN, val, 0); - val = SET_FIELD(VAS_WINCTL_OPEN, val, 0); - write_hvwc_reg(window, VREG(WINCTL), val); + unpin_close_window(window); + + poll_window_credits(window); poll_window_castout(window); @@ -1132,3 +1277,12 @@ int vas_win_close(struct vas_window *window) return 0; } EXPORT_SYMBOL_GPL(vas_win_close); + +/* + * Return a system-wide unique window id for the window @win. + */ +u32 vas_win_id(struct vas_window *win) +{ + return encode_pswid(win->vinst->vas_id, win->winid); +} +EXPORT_SYMBOL_GPL(vas_win_id); diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index 565a4878fefa..c488621dbec3 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -18,15 +18,18 @@ #include #include #include +#include #include "vas.h" -static DEFINE_MUTEX(vas_mutex); +DEFINE_MUTEX(vas_mutex); static LIST_HEAD(vas_instances); +static DEFINE_PER_CPU(int, cpu_vas_id); + static int init_vas_instance(struct platform_device *pdev) { - int rc, vasid; + int rc, cpu, vasid; struct resource *res; struct vas_instance *vinst; struct device_node *dn = pdev->dev.of_node; @@ -74,10 +77,17 @@ static int init_vas_instance(struct platform_device *pdev) "paste_win_id_shift 0x%llx\n", pdev->name, vasid, vinst->paste_base_addr, vinst->paste_win_id_shift); + for_each_possible_cpu(cpu) { + if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn)) + per_cpu(cpu_vas_id, cpu) = vasid; + } + mutex_lock(&vas_mutex); list_add(&vinst->node, &vas_instances); mutex_unlock(&vas_mutex); + vas_instance_init_dbgdir(vinst); + dev_set_drvdata(&pdev->dev, vinst); return 0; @@ -98,6 +108,10 @@ struct vas_instance *find_vas_instance(int vasid) struct vas_instance *vinst; mutex_lock(&vas_mutex); + + if (vasid == -1) + vasid = per_cpu(cpu_vas_id, smp_processor_id()); + list_for_each(ent, &vas_instances) { vinst = list_entry(ent, struct vas_instance, node); if (vinst->vas_id == vasid) { @@ -111,6 +125,17 @@ struct vas_instance *find_vas_instance(int vasid) return NULL; } +int chip_to_vas_id(int chipid) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu_to_chip_id(cpu) == chipid) + return per_cpu(cpu_vas_id, cpu); + } + return -1; +} + static int vas_probe(struct platform_device *pdev) { return init_vas_instance(pdev); @@ -134,6 +159,8 @@ static int __init vas_init(void) int found = 0; struct device_node *dn; + vas_init_dbgdir(); + platform_driver_register(&vas_driver); for_each_compatible_node(dn, NULL, "ibm,vas") { diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 38dee5d50f31..ae0100fd35bb 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include /* * Overview of Virtual Accelerator Switchboard (VAS). @@ -106,8 +108,8 @@ * * TODO: Needs tuning for per-process credits */ -#define VAS_WCREDS_MIN 16 -#define VAS_WCREDS_MAX ((64 << 10) - 1) +#define VAS_RX_WCREDS_MAX ((64 << 10) - 1) +#define VAS_TX_WCREDS_MAX ((4 << 10) - 1) #define VAS_WCREDS_DEFAULT (1 << 10) /* @@ -258,6 +260,16 @@ #define VAS_NX_UTIL_ADDER_OFFSET 0x180 #define VAS_NX_UTIL_ADDER PPC_BITMASK(32, 63) +/* + * VREG(x): + * Expand a register's short name (eg: LPID) into two parameters: + * - the register's short name in string form ("LPID"), and + * - the name of the macro (eg: VAS_LPID_OFFSET), defining the + * register's offset in the window context + */ +#define VREG_SFX(n, s) __stringify(n), VAS_##n##s +#define VREG(r) VREG_SFX(r, _OFFSET) + /* * Local Notify Scope Control Register. (Receive windows only). */ @@ -307,6 +319,9 @@ struct vas_instance { struct mutex mutex; struct vas_window *rxwin[VAS_COP_TYPE_MAX]; struct vas_window *windows[VAS_WINDOWS_PER_CHIP]; + + char *dbgname; + struct dentry *dbgdir; }; /* @@ -322,6 +337,10 @@ struct vas_window { void *hvwc_map; /* HV window context */ void *uwc_map; /* OS/User window context */ pid_t pid; /* Linux process id of owner */ + int wcreds_max; /* Window credits */ + + char *dbgname; + struct dentry *dbgdir; /* Fields applicable only to send windows */ void *paste_kaddr; @@ -383,45 +402,23 @@ struct vas_winctx { enum vas_notify_after_count notify_after_count; }; +extern struct mutex vas_mutex; + extern struct vas_instance *find_vas_instance(int vasid); - -/* - * VREG(x): - * Expand a register's short name (eg: LPID) into two parameters: - * - the register's short name in string form ("LPID"), and - * - the name of the macro (eg: VAS_LPID_OFFSET), defining the - * register's offset in the window context - */ -#define VREG_SFX(n, s) __stringify(n), VAS_##n##s -#define VREG(r) VREG_SFX(r, _OFFSET) - -#ifdef vas_debug -static inline void dump_rx_win_attr(struct vas_rx_win_attr *attr) -{ - pr_err("fault %d, notify %d, intr %d early %d\n", - attr->fault_win, attr->notify_disable, - attr->intr_disable, attr->notify_early); - - pr_err("rx_fifo_size %d, max value %d\n", - attr->rx_fifo_size, VAS_RX_FIFO_SIZE_MAX); -} +extern void vas_init_dbgdir(void); +extern void vas_instance_init_dbgdir(struct vas_instance *vinst); +extern void vas_window_init_dbgdir(struct vas_window *win); +extern void vas_window_free_dbgdir(struct vas_window *win); static inline void vas_log_write(struct vas_window *win, char *name, void *regptr, u64 val) { if (val) - pr_err("%swin #%d: %s reg %p, val 0x%016llx\n", + pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n", win->tx_win ? "Tx" : "Rx", win->winid, name, regptr, val); } -#else /* vas_debug */ - -#define vas_log_write(win, name, reg, val) -#define dump_rx_win_attr(attr) - -#endif /* vas_debug */ - static inline void write_uwc_reg(struct vas_window *win, char *name, s32 reg, u64 val) { @@ -450,18 +447,32 @@ static inline u64 read_hvwc_reg(struct vas_window *win, return in_be64(win->hvwc_map+reg); } -#ifdef vas_debug - -static void print_fifo_msg_count(struct vas_window *txwin) +/* + * Encode/decode the Partition Send Window ID (PSWID) for a window in + * a way that we can uniquely identify any window in the system. i.e. + * we should be able to locate the 'struct vas_window' given the PSWID. + * + * Bits Usage + * 0:7 VAS id (8 bits) + * 8:15 Unused, 0 (3 bits) + * 16:31 Window id (16 bits) + */ +static inline u32 encode_pswid(int vasid, int winid) { - uint64_t read_hvwc_reg(struct vas_window *w, char *n, uint64_t o); - pr_devel("Winid %d, Msg count %llu\n", txwin->winid, - (uint64_t)read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); + u32 pswid = 0; + + pswid |= vasid << (31 - 7); + pswid |= winid; + + return pswid; } -#else /* vas_debug */ -#define print_fifo_msg_count(window) - -#endif /* vas_debug */ +static inline void decode_pswid(u32 pswid, int *vasid, int *winid) +{ + if (vasid) + *vasid = pswid >> (31 - 7) & 0xFF; + if (winid) + *winid = pswid & 0xFFFF; +} #endif /* _VAS_H */ diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index fadb95efbb9e..a7d14aa7bb7c 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -363,6 +363,7 @@ static int dlpar_online_cpu(struct device_node *dn) BUG_ON(get_cpu_current_state(cpu) != CPU_STATE_OFFLINE); cpu_maps_update_done(); + timed_topology_update(1); rc = device_online(get_cpu_device(cpu)); if (rc) goto out; @@ -533,6 +534,7 @@ static int dlpar_offline_cpu(struct device_node *dn) set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); cpu_maps_update_done(); + timed_topology_update(1); rc = device_offline(get_cpu_device(cpu)); if (rc) goto out; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 7c181467d0ad..69921f72e2da 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -55,23 +55,23 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) { - struct iommu_table_group *table_group = NULL; - struct iommu_table *tbl = NULL; - struct iommu_table_group_link *tgl = NULL; + struct iommu_table_group *table_group; + struct iommu_table *tbl; + struct iommu_table_group_link *tgl; table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, node); if (!table_group) - goto fail_exit; + return NULL; tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); if (!tbl) - goto fail_exit; + goto free_group; tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, node); if (!tgl) - goto fail_exit; + goto free_table; INIT_LIST_HEAD_RCU(&tbl->it_group_list); kref_init(&tbl->it_kref); @@ -82,11 +82,10 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) return table_group; -fail_exit: - kfree(tgl); - kfree(table_group); +free_table: kfree(tbl); - +free_group: + kfree(table_group); return NULL; } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 495ba4e7336d..0ee4a469a4ae 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -93,7 +93,7 @@ void vpa_init(int cpu) return; } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. @@ -106,7 +106,7 @@ void vpa_init(int cpu) "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* * Register dispatch trace log, if one has been allocated. @@ -129,7 +129,7 @@ void vpa_init(int cpu) } } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, @@ -824,7 +824,7 @@ void arch_free_page(struct page *page, int order) EXPORT_SYMBOL(arch_free_page); #endif /* CONFIG_PPC_SMLPAR */ -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_TRACEPOINTS #ifdef HAVE_JUMP_LABEL diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 779fc2a1c8f7..b2706c483067 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -485,7 +485,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "shared_processor_mode=%d\n", lppaca_shared_proc(get_lppaca())); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 seq_printf(m, "slb_size=%d\n", mmu_slb_size); #endif parse_em_data(m); diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 12277bc9fd9e..d86938260a86 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1592,6 +1592,8 @@ ATTRIBUTE_GROUPS(vio_dev); void vio_unregister_device(struct vio_dev *viodev) { device_unregister(&viodev->dev); + if (viodev->family == VDEVICE) + irq_dispose_mapping(viodev->irq); } EXPORT_SYMBOL(vio_unregister_device); diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index c60e84e4558d..1b307c80b401 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -184,7 +184,7 @@ static int axon_ram_probe(struct platform_device *device) static int axon_ram_bank_id = -1; struct axon_ram_bank *bank; struct resource resource; - int rc = 0; + int rc; axon_ram_bank_id++; diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 16f1edd78c40..535cf1f6941c 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -846,12 +846,12 @@ void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq) u32 ipic_get_mcp_status(void) { - return ipic_read(primary_ipic->regs, IPIC_SERMR); + return ipic_read(primary_ipic->regs, IPIC_SERSR); } void ipic_clear_mcp_status(u32 mask) { - ipic_write(primary_ipic->regs, IPIC_SERMR, mask); + ipic_write(primary_ipic->regs, IPIC_SERSR, mask); } /* Return an interrupt vector or 0 if no interrupt is pending. */ diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 33351c6704b1..1b2d8cb49abb 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -127,6 +128,7 @@ static void byterev(unsigned char *, int); static void memex(void); static int bsesc(void); static void dump(void); +static void show_pte(unsigned long); static void prdump(unsigned long, long); static int ppc_inst_dump(unsigned long, long, int); static void dump_log_buf(void); @@ -234,6 +236,7 @@ Commands:\n\ #endif "\ dr dump stream of raw bytes\n\ + dv dump virtual address translation \n\ dt dump the tracing buffers (uses printk)\n\ dtc dump the tracing buffers for current CPU (uses printk)\n\ " @@ -278,6 +281,7 @@ Commands:\n\ #elif defined(CONFIG_44x) || defined(CONFIG_PPC_BOOK3E) " u dump TLB\n" #endif +" U show uptime information\n" " ? help\n" " # n limit output to n lines per page (for dp, dpa, dl)\n" " zr reboot\n\ @@ -530,14 +534,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi) waiting: secondary = 1; + spin_begin(); while (secondary && !xmon_gate) { if (in_xmon == 0) { - if (fromipi) + if (fromipi) { + spin_end(); goto leave; + } secondary = test_and_set_bit(0, &in_xmon); } - barrier(); + spin_cpu_relax(); + touch_nmi_watchdog(); } + spin_end(); if (!secondary && !xmon_gate) { /* we are the first cpu to come in */ @@ -568,21 +577,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi) mb(); xmon_gate = 1; barrier(); + touch_nmi_watchdog(); } cmdloop: while (in_xmon) { if (secondary) { + spin_begin(); if (cpu == xmon_owner) { if (!test_and_set_bit(0, &xmon_taken)) { secondary = 0; + spin_end(); continue; } /* missed it */ while (cpu == xmon_owner) - barrier(); + spin_cpu_relax(); } - barrier(); + spin_cpu_relax(); + touch_nmi_watchdog(); } else { cmd = cmds(regs); if (cmd != 0) { @@ -896,6 +909,26 @@ static void remove_cpu_bpts(void) write_ciabr(0); } +/* Based on uptime_proc_show(). */ +static void +show_uptime(void) +{ + struct timespec uptime; + + if (setjmp(bus_error_jmp) == 0) { + catch_memory_errors = 1; + sync(); + + get_monotonic_boottime(&uptime); + printf("Uptime: %lu.%.2lu seconds\n", (unsigned long)uptime.tv_sec, + ((unsigned long)uptime.tv_nsec / (NSEC_PER_SEC/100))); + + sync(); + __delay(200); \ + } + catch_memory_errors = 0; +} + static void set_lpp_cmd(void) { unsigned long lpp; @@ -1031,6 +1064,9 @@ cmds(struct pt_regs *excp) dump_tlb_book3e(); break; #endif + case 'U': + show_uptime(); + break; default: printf("Unrecognized command: "); do { @@ -2279,7 +2315,7 @@ static void dump_tracing(void) static void dump_one_paca(int cpu) { struct paca_struct *p; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 int i = 0; #endif @@ -2320,7 +2356,7 @@ static void dump_one_paca(int cpu) DUMP(p, hw_cpu_id, "x"); DUMP(p, cpu_start, "x"); DUMP(p, kexec_state, "x"); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 for (i = 0; i < SLB_NUM_BOLTED; i++) { u64 esid, vsid; @@ -2351,6 +2387,7 @@ static void dump_one_paca(int cpu) #endif DUMP(p, __current, "p"); DUMP(p, kstack, "lx"); + printf(" kstack_base = 0x%016lx\n", p->kstack & ~(THREAD_SIZE - 1)); DUMP(p, stab_rr, "lx"); DUMP(p, saved_r1, "lx"); DUMP(p, trap_save, "x"); @@ -2475,6 +2512,11 @@ static void dump_xives(void) unsigned long num; int c; + if (!xive_enabled()) { + printf("Xive disabled on this system\n"); + return; + } + c = inchar(); if (c == 'a') { dump_all_xives(); @@ -2574,6 +2616,9 @@ dump(void) dump_log_buf(); } else if (c == 'o') { dump_opal_msglog(); + } else if (c == 'v') { + /* dump virtual to physical translation */ + show_pte(adrs); } else if (c == 'r') { scanhex(&ndump); if (ndump == 0) @@ -2907,6 +2952,116 @@ static void show_task(struct task_struct *tsk) tsk->comm); } +#ifdef CONFIG_PPC_BOOK3S_64 +void format_pte(void *ptep, unsigned long pte) +{ + printf("ptep @ 0x%016lx = 0x%016lx\n", (unsigned long)ptep, pte); + printf("Maps physical address = 0x%016lx\n", pte & PTE_RPN_MASK); + + printf("Flags = %s%s%s%s%s\n", + (pte & _PAGE_ACCESSED) ? "Accessed " : "", + (pte & _PAGE_DIRTY) ? "Dirty " : "", + (pte & _PAGE_READ) ? "Read " : "", + (pte & _PAGE_WRITE) ? "Write " : "", + (pte & _PAGE_EXEC) ? "Exec " : ""); +} + +static void show_pte(unsigned long addr) +{ + unsigned long tskv = 0; + struct task_struct *tsk = NULL; + struct mm_struct *mm; + pgd_t *pgdp, *pgdir; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + if (!scanhex(&tskv)) + mm = &init_mm; + else + tsk = (struct task_struct *)tskv; + + if (tsk == NULL) + mm = &init_mm; + else + mm = tsk->active_mm; + + if (setjmp(bus_error_jmp) != 0) { + catch_memory_errors = 0; + printf("*** Error dumping pte for task %p\n", tsk); + return; + } + + catch_memory_errors = 1; + sync(); + + if (mm == &init_mm) { + pgdp = pgd_offset_k(addr); + pgdir = pgd_offset_k(0); + } else { + pgdp = pgd_offset(mm, addr); + pgdir = pgd_offset(mm, 0); + } + + if (pgd_none(*pgdp)) { + printf("no linux page table for address\n"); + return; + } + + printf("pgd @ 0x%016lx\n", pgdir); + + if (pgd_huge(*pgdp)) { + format_pte(pgdp, pgd_val(*pgdp)); + return; + } + printf("pgdp @ 0x%016lx = 0x%016lx\n", pgdp, pgd_val(*pgdp)); + + pudp = pud_offset(pgdp, addr); + + if (pud_none(*pudp)) { + printf("No valid PUD\n"); + return; + } + + if (pud_huge(*pudp)) { + format_pte(pudp, pud_val(*pudp)); + return; + } + + printf("pudp @ 0x%016lx = 0x%016lx\n", pudp, pud_val(*pudp)); + + pmdp = pmd_offset(pudp, addr); + + if (pmd_none(*pmdp)) { + printf("No valid PMD\n"); + return; + } + + if (pmd_huge(*pmdp)) { + format_pte(pmdp, pmd_val(*pmdp)); + return; + } + printf("pmdp @ 0x%016lx = 0x%016lx\n", pmdp, pmd_val(*pmdp)); + + ptep = pte_offset_map(pmdp, addr); + if (pte_none(*ptep)) { + printf("no valid PTE\n"); + return; + } + + format_pte(ptep, pte_val(*ptep)); + + sync(); + __delay(200); + catch_memory_errors = 0; +} +#else +static void show_pte(unsigned long addr) +{ + printf("show_pte not yet implemented\n"); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + static void show_tasks(void) { unsigned long tskv; @@ -3224,7 +3379,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid, printf("%s", after); } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 void dump_segments(void) { int i; diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index ed6531f075c6..e06605b21841 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -384,9 +384,9 @@ static int powernv_add_idle_states(void) * Firmware passes residency and latency values in ns. * cpuidle expects it in us. */ - exit_latency = latency_ns[i] / 1000; + exit_latency = DIV_ROUND_UP(latency_ns[i], 1000); if (!rc) - target_residency = residency_ns[i] / 1000; + target_residency = DIV_ROUND_UP(residency_ns[i], 1000); else target_residency = 0; diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c index 0f20f5ec9617..f2246a5abcf6 100644 --- a/drivers/crypto/nx/nx-842-powernv.c +++ b/drivers/crypto/nx/nx-842-powernv.c @@ -46,7 +46,6 @@ struct nx842_workmem { ktime_t start; - struct vas_window *txwin; /* Used with VAS function */ char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */ } __packed __aligned(WORKMEM_ALIGN); @@ -65,7 +64,7 @@ struct nx842_coproc { * Send the request to NX engine on the chip for the corresponding CPU * where the process is executing. Use with VAS function. */ -static DEFINE_PER_CPU(struct nx842_coproc *, coproc_inst); +static DEFINE_PER_CPU(struct vas_window *, cpu_txwin); /* no cpu hotplug on powernv, so this list never changes after init */ static LIST_HEAD(nx842_coprocs); @@ -586,16 +585,11 @@ static int nx842_exec_vas(const unsigned char *in, unsigned int inlen, ccw = SET_FIELD(CCW_FC_842, ccw, fc); crb->ccw = cpu_to_be32(ccw); - txwin = wmem->txwin; - /* shoudn't happen, we don't load without a coproc */ - if (!txwin) { - pr_err_ratelimited("NX-842 coprocessor is not available"); - return -ENODEV; - } - do { wmem->start = ktime_get(); preempt_disable(); + txwin = this_cpu_read(cpu_txwin); + /* * VAS copy CRB into L2 cache. Refer . * @crb and @offset. @@ -689,25 +683,6 @@ static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc, list_add(&coproc->list, &nx842_coprocs); } -/* - * Identify chip ID for each CPU and save coprocesor adddress for the - * corresponding NX engine in percpu coproc_inst. - * coproc_inst is used in crypto_init to open send window on the NX instance - * for the corresponding CPU / chip where the open request is executed. - */ -static void nx842_set_per_cpu_coproc(struct nx842_coproc *coproc) -{ - unsigned int i, chip_id; - - for_each_possible_cpu(i) { - chip_id = cpu_to_chip_id(i); - - if (coproc->chip_id == chip_id) - per_cpu(coproc_inst, i) = coproc; - } -} - - static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc) { struct vas_window *txwin = NULL; @@ -725,15 +700,58 @@ static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc) * Open a VAS send window which is used to send request to NX. */ txwin = vas_tx_win_open(coproc->vas.id, coproc->ct, &txattr); - if (IS_ERR(txwin)) { + if (IS_ERR(txwin)) pr_err("ibm,nx-842: Can not open TX window: %ld\n", PTR_ERR(txwin)); - return NULL; - } return txwin; } +/* + * Identify chip ID for each CPU, open send wndow for the corresponding NX + * engine and save txwin in percpu cpu_txwin. + * cpu_txwin is used in copy/paste operation for each compression / + * decompression request. + */ +static int nx842_open_percpu_txwins(void) +{ + struct nx842_coproc *coproc, *n; + unsigned int i, chip_id; + + for_each_possible_cpu(i) { + struct vas_window *txwin = NULL; + + chip_id = cpu_to_chip_id(i); + + list_for_each_entry_safe(coproc, n, &nx842_coprocs, list) { + /* + * Kernel requests use only high priority FIFOs. So + * open send windows for these FIFOs. + */ + + if (coproc->ct != VAS_COP_TYPE_842_HIPRI) + continue; + + if (coproc->chip_id == chip_id) { + txwin = nx842_alloc_txwin(coproc); + if (IS_ERR(txwin)) + return PTR_ERR(txwin); + + per_cpu(cpu_txwin, i) = txwin; + break; + } + } + + if (!per_cpu(cpu_txwin, i)) { + /* shoudn't happen, Each chip will have NX engine */ + pr_err("NX engine is not availavle for CPU %d\n", i); + return -EINVAL; + } + } + + return 0; +} + static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id, int vasid) { @@ -819,14 +837,6 @@ static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id, coproc->vas.id = vasid; nx842_add_coprocs_list(coproc, chip_id); - /* - * Kernel requests use only high priority FIFOs. So save coproc - * info in percpu coproc_inst which will be used to open send - * windows for crypto open requests later. - */ - if (coproc->ct == VAS_COP_TYPE_842_HIPRI) - nx842_set_per_cpu_coproc(coproc); - return 0; err_out: @@ -847,24 +857,12 @@ static int __init nx842_powernv_probe_vas(struct device_node *pn) return -EINVAL; } - for_each_compatible_node(dn, NULL, "ibm,power9-vas-x") { - if (of_get_ibm_chip_id(dn) == chip_id) - break; - } - - if (!dn) { - pr_err("Missing VAS device node\n"); + vasid = chip_to_vas_id(chip_id); + if (vasid < 0) { + pr_err("Unable to map chip_id %d to vasid\n", chip_id); return -EINVAL; } - if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) { - pr_err("Missing ibm,vas-id device property\n"); - of_node_put(dn); - return -EINVAL; - } - - of_node_put(dn); - for_each_child_of_node(pn, dn) { if (of_device_is_compatible(dn, "ibm,p9-nx-842")) { ret = vas_cfg_coproc_info(dn, chip_id, vasid); @@ -928,6 +926,19 @@ static int __init nx842_powernv_probe(struct device_node *dn) static void nx842_delete_coprocs(void) { struct nx842_coproc *coproc, *n; + struct vas_window *txwin; + int i; + + /* + * close percpu txwins that are opened for the corresponding coproc. + */ + for_each_possible_cpu(i) { + txwin = per_cpu(cpu_txwin, i); + if (txwin) + vas_win_close(txwin); + + per_cpu(cpu_txwin, i) = 0; + } list_for_each_entry_safe(coproc, n, &nx842_coprocs, list) { if (coproc->vas.rxwin) @@ -954,46 +965,6 @@ static struct nx842_driver nx842_powernv_driver = { .decompress = nx842_powernv_decompress, }; -static int nx842_powernv_crypto_init_vas(struct crypto_tfm *tfm) -{ - struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm); - struct nx842_workmem *wmem; - struct nx842_coproc *coproc; - int ret; - - ret = nx842_crypto_init(tfm, &nx842_powernv_driver); - - if (ret) - return ret; - - wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem, WORKMEM_ALIGN); - coproc = per_cpu(coproc_inst, smp_processor_id()); - - ret = -EINVAL; - if (coproc && coproc->vas.rxwin) { - wmem->txwin = nx842_alloc_txwin(coproc); - if (!IS_ERR(wmem->txwin)) - return 0; - - ret = PTR_ERR(wmem->txwin); - } - - return ret; -} - -void nx842_powernv_crypto_exit_vas(struct crypto_tfm *tfm) -{ - struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm); - struct nx842_workmem *wmem; - - wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem, WORKMEM_ALIGN); - - if (wmem && wmem->txwin) - vas_win_close(wmem->txwin); - - nx842_crypto_exit(tfm); -} - static int nx842_powernv_crypto_init(struct crypto_tfm *tfm) { return nx842_crypto_init(tfm, &nx842_powernv_driver); @@ -1044,9 +1015,13 @@ static __init int nx842_powernv_init(void) nx842_powernv_exec = nx842_exec_icswx; } else { + ret = nx842_open_percpu_txwins(); + if (ret) { + nx842_delete_coprocs(); + return ret; + } + nx842_powernv_exec = nx842_exec_vas; - nx842_powernv_alg.cra_init = nx842_powernv_crypto_init_vas; - nx842_powernv_alg.cra_exit = nx842_powernv_crypto_exit_vas; } ret = crypto_register_alg(&nx842_powernv_alg); diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c index da3cb8c35ec7..d94e25df503b 100644 --- a/drivers/crypto/nx/nx-842.c +++ b/drivers/crypto/nx/nx-842.c @@ -116,7 +116,7 @@ int nx842_crypto_init(struct crypto_tfm *tfm, struct nx842_driver *driver) spin_lock_init(&ctx->lock); ctx->driver = driver; - ctx->wmem = kzalloc(driver->workmem_size, GFP_KERNEL); + ctx->wmem = kmalloc(driver->workmem_size, GFP_KERNEL); ctx->sbounce = (u8 *)__get_free_pages(GFP_KERNEL, BOUNCE_BUFFER_ORDER); ctx->dbounce = (u8 *)__get_free_pages(GFP_KERNEL, BOUNCE_BUFFER_ORDER); if (!ctx->wmem || !ctx->sbounce || !ctx->dbounce) { diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index a0c44d16bf30..7c11bad5cded 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "cxl.h" @@ -331,9 +332,12 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed, /* ensure this mm_struct can't be freed */ cxl_context_mm_count_get(ctx); - /* decrement the use count */ - if (ctx->mm) + if (ctx->mm) { + /* decrement the use count from above */ mmput(ctx->mm); + /* make TLBIs for this context global */ + mm_context_add_copro(ctx->mm); + } } /* @@ -342,13 +346,19 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed, */ cxl_ctx_get(); + /* See the comment in afu_ioctl_start_work() */ + smp_mb(); + if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) { put_pid(ctx->pid); ctx->pid = NULL; cxl_adapter_context_put(ctx->afu->adapter); cxl_ctx_put(); - if (task) + if (task) { cxl_context_mm_count_put(ctx); + if (ctx->mm) + mm_context_remove_copro(ctx->mm); + } goto out; } diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 8c32040b9c09..12a41b2753f0 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -267,6 +268,8 @@ int __detach_context(struct cxl_context *ctx) /* Decrease the mm count on the context */ cxl_context_mm_count_put(ctx); + if (ctx->mm) + mm_context_remove_copro(ctx->mm); ctx->mm = NULL; return 0; diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index b1afeccbb97f..e46a4062904a 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -100,9 +100,12 @@ static const cxl_p1_reg_t CXL_XSL_FEC = {0x0158}; static const cxl_p1_reg_t CXL_XSL_DSNCTL = {0x0168}; /* PSL registers - CAIA 2 */ static const cxl_p1_reg_t CXL_PSL9_CONTROL = {0x0020}; +static const cxl_p1_reg_t CXL_XSL9_INV = {0x0110}; +static const cxl_p1_reg_t CXL_XSL9_DBG = {0x0130}; +static const cxl_p1_reg_t CXL_XSL9_DEF = {0x0140}; static const cxl_p1_reg_t CXL_XSL9_DSNCTL = {0x0168}; static const cxl_p1_reg_t CXL_PSL9_FIR1 = {0x0300}; -static const cxl_p1_reg_t CXL_PSL9_FIR2 = {0x0308}; +static const cxl_p1_reg_t CXL_PSL9_FIR_MASK = {0x0308}; static const cxl_p1_reg_t CXL_PSL9_Timebase = {0x0310}; static const cxl_p1_reg_t CXL_PSL9_DEBUG = {0x0320}; static const cxl_p1_reg_t CXL_PSL9_FIR_CNTL = {0x0348}; @@ -112,6 +115,7 @@ static const cxl_p1_reg_t CXL_PSL9_TRACECFG = {0x0368}; static const cxl_p1_reg_t CXL_PSL9_APCDEDALLOC = {0x0378}; static const cxl_p1_reg_t CXL_PSL9_APCDEDTYPE = {0x0380}; static const cxl_p1_reg_t CXL_PSL9_TNR_ADDR = {0x0388}; +static const cxl_p1_reg_t CXL_PSL9_CTCCFG = {0x0390}; static const cxl_p1_reg_t CXL_PSL9_GP_CT = {0x0398}; static const cxl_p1_reg_t CXL_XSL9_IERAT = {0x0588}; static const cxl_p1_reg_t CXL_XSL9_ILPP = {0x0590}; @@ -414,6 +418,9 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0}; #define CXL_CARD_MINOR(adapter) (adapter->adapter_num * CXL_DEV_MINORS) #define CXL_DEVT_ADAPTER(dev) (MINOR(dev) / CXL_DEV_MINORS) +#define CXL_PSL9_TRACEID_MAX 0xAU +#define CXL_PSL9_TRACESTATE_FIN 0x3U + enum cxl_context_status { CLOSED, OPENED, @@ -938,8 +945,6 @@ int cxl_debugfs_adapter_add(struct cxl *adapter); void cxl_debugfs_adapter_remove(struct cxl *adapter); int cxl_debugfs_afu_add(struct cxl_afu *afu); void cxl_debugfs_afu_remove(struct cxl_afu *afu); -void cxl_stop_trace_psl9(struct cxl *cxl); -void cxl_stop_trace_psl8(struct cxl *cxl); void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir); void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir); void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir); @@ -975,14 +980,6 @@ static inline void cxl_debugfs_afu_remove(struct cxl_afu *afu) { } -static inline void cxl_stop_trace_psl9(struct cxl *cxl) -{ -} - -static inline void cxl_stop_trace_psl8(struct cxl *cxl) -{ -} - static inline void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir) { @@ -1070,7 +1067,8 @@ u64 cxl_calculate_sr(bool master, bool kernel, bool real_mode, bool p9); void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx); void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx); -void cxl_native_err_irq_dump_regs(struct cxl *adapter); +void cxl_native_err_irq_dump_regs_psl8(struct cxl *adapter); +void cxl_native_err_irq_dump_regs_psl9(struct cxl *adapter); int cxl_pci_vphb_add(struct cxl_afu *afu); void cxl_pci_vphb_remove(struct cxl_afu *afu); void cxl_release_mapping(struct cxl_context *ctx); diff --git a/drivers/misc/cxl/debugfs.c b/drivers/misc/cxl/debugfs.c index eae9d749f967..1643850d2302 100644 --- a/drivers/misc/cxl/debugfs.c +++ b/drivers/misc/cxl/debugfs.c @@ -15,28 +15,6 @@ static struct dentry *cxl_debugfs; -void cxl_stop_trace_psl9(struct cxl *adapter) -{ - /* Stop the trace */ - cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x4480000000000000ULL); -} - -void cxl_stop_trace_psl8(struct cxl *adapter) -{ - int slice; - - /* Stop the trace */ - cxl_p1_write(adapter, CXL_PSL_TRACE, 0x8000000000000017LL); - - /* Stop the slice traces */ - spin_lock(&adapter->afu_list_lock); - for (slice = 0; slice < adapter->slices; slice++) { - if (adapter->afu[slice]) - cxl_p1n_write(adapter->afu[slice], CXL_PSL_SLICE_TRACE, 0x8000000000000000LL); - } - spin_unlock(&adapter->afu_list_lock); -} - /* Helpers to export CXL mmaped IO registers via debugfs */ static int debugfs_io_u64_get(void *data, u64 *val) { @@ -62,9 +40,14 @@ static struct dentry *debugfs_create_io_x64(const char *name, umode_t mode, void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir) { debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR1)); - debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR2)); + debugfs_create_io_x64("fir_mask", 0400, dir, + _cxl_p1_addr(adapter, CXL_PSL9_FIR_MASK)); debugfs_create_io_x64("fir_cntl", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR_CNTL)); debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_TRACECFG)); + debugfs_create_io_x64("debug", 0600, dir, + _cxl_p1_addr(adapter, CXL_PSL9_DEBUG)); + debugfs_create_io_x64("xsl-debug", 0600, dir, + _cxl_p1_addr(adapter, CXL_XSL9_DBG)); } void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir) diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index f17f72ea0545..70dbb6de102c 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -220,22 +220,11 @@ static bool cxl_is_segment_miss(struct cxl_context *ctx, u64 dsisr) static bool cxl_is_page_fault(struct cxl_context *ctx, u64 dsisr) { - u64 crs; /* Translation Checkout Response Status */ - if ((cxl_is_power8()) && (dsisr & CXL_PSL_DSISR_An_DM)) return true; - if (cxl_is_power9()) { - crs = (dsisr & CXL_PSL9_DSISR_An_CO_MASK); - if ((crs == CXL_PSL9_DSISR_An_PF_SLR) || - (crs == CXL_PSL9_DSISR_An_PF_RGC) || - (crs == CXL_PSL9_DSISR_An_PF_RGP) || - (crs == CXL_PSL9_DSISR_An_PF_HRH) || - (crs == CXL_PSL9_DSISR_An_PF_STEG) || - (crs == CXL_PSL9_DSISR_An_URTCH)) { - return true; - } - } + if (cxl_is_power9()) + return true; return false; } diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 4bfad9f6dc9f..76c0b0ca9388 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -220,9 +221,12 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, /* ensure this mm_struct can't be freed */ cxl_context_mm_count_get(ctx); - /* decrement the use count */ - if (ctx->mm) + if (ctx->mm) { + /* decrement the use count from above */ mmput(ctx->mm); + /* make TLBIs for this context global */ + mm_context_add_copro(ctx->mm); + } /* * Increment driver use count. Enables global TLBIs for hash @@ -230,6 +234,20 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, */ cxl_ctx_get(); + /* + * A barrier is needed to make sure all TLBIs are global + * before we attach and the context starts being used by the + * adapter. + * + * Needed after mm_context_add_copro() for radix and + * cxl_ctx_get() for hash/p8. + * + * The barrier should really be mb(), since it involves a + * device. However, it's only useful when we have local + * vs. global TLBIs, i.e SMP=y. So keep smp_mb(). + */ + smp_mb(); + trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr); if ((rc = cxl_ops->attach_process(ctx, false, work.work_element_descriptor, @@ -240,6 +258,8 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, ctx->pid = NULL; cxl_ctx_put(); cxl_context_mm_count_put(ctx); + if (ctx->mm) + mm_context_remove_copro(ctx->mm); goto out; } diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 4a82c313cf71..02b6b45b4c20 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -897,6 +897,14 @@ int cxl_attach_dedicated_process_psl9(struct cxl_context *ctx, u64 wed, u64 amr) if (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes) afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx); + ctx->elem->software_state = cpu_to_be32(CXL_PE_SOFTWARE_STATE_V); + /* + * Ideally we should do a wmb() here to make sure the changes to the + * PE are visible to the card before we call afu_enable. + * On ppc64 though all mmios are preceded by a 'sync' instruction hence + * we dont dont need one here. + */ + result = cxl_ops->afu_reset(afu); if (result) return result; @@ -1077,13 +1085,11 @@ static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info) void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx) { - u64 fir1, fir2, serr; + u64 fir1, serr; fir1 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR1); - fir2 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR2); dev_crit(&ctx->afu->dev, "PSL_FIR1: 0x%016llx\n", fir1); - dev_crit(&ctx->afu->dev, "PSL_FIR2: 0x%016llx\n", fir2); if (ctx->afu->adapter->native->sl_ops->register_serr_irq) { serr = cxl_p1n_read(ctx->afu, CXL_PSL_SERR_An); cxl_afu_decode_psl_serr(ctx->afu, serr); @@ -1257,14 +1263,23 @@ static irqreturn_t native_slice_irq_err(int irq, void *data) return IRQ_HANDLED; } -void cxl_native_err_irq_dump_regs(struct cxl *adapter) +void cxl_native_err_irq_dump_regs_psl9(struct cxl *adapter) +{ + u64 fir1; + + fir1 = cxl_p1_read(adapter, CXL_PSL9_FIR1); + dev_crit(&adapter->dev, "PSL_FIR: 0x%016llx\n", fir1); +} + +void cxl_native_err_irq_dump_regs_psl8(struct cxl *adapter) { u64 fir1, fir2; fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1); fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2); - - dev_crit(&adapter->dev, "PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n", fir1, fir2); + dev_crit(&adapter->dev, + "PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n", + fir1, fir2); } static irqreturn_t native_irq_err(int irq, void *data) diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 3ba04f371380..bb7fd3f4edab 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -401,7 +401,8 @@ int cxl_calc_capp_routing(struct pci_dev *dev, u64 *chipid, *capp_unit_id = get_capp_unit_id(np, *phb_index); of_node_put(np); if (!*capp_unit_id) { - pr_err("cxl: invalid capp unit id\n"); + pr_err("cxl: invalid capp unit id (phb_index: %d)\n", + *phb_index); return -ENODEV; } @@ -475,37 +476,37 @@ static int init_implementation_adapter_regs_psl9(struct cxl *adapter, psl_fircntl |= 0x1ULL; /* ce_thresh */ cxl_p1_write(adapter, CXL_PSL9_FIR_CNTL, psl_fircntl); - /* vccredits=0x1 pcklat=0x4 */ - cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x0000000000001810ULL); - - /* - * For debugging with trace arrays. - * Configure RX trace 0 segmented mode. - * Configure CT trace 0 segmented mode. - * Configure LA0 trace 0 segmented mode. - * Configure LA1 trace 0 segmented mode. + /* Setup the PSL to transmit packets on the PCIe before the + * CAPP is enabled */ - cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000000ULL); - cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000003ULL); - cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000005ULL); - cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000006ULL); + cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x0001001000002A10ULL); /* * A response to an ASB_Notify request is returned by the * system as an MMIO write to the address defined in - * the PSL_TNR_ADDR register + * the PSL_TNR_ADDR register. + * keep the Reset Value: 0x00020000E0000000 */ - /* PSL_TNR_ADDR */ - /* NORST */ - cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x8000000000000000ULL); + /* Enable XSL rty limit */ + cxl_p1_write(adapter, CXL_XSL9_DEF, 0x51F8000000000005ULL); - /* allocate the apc machines */ - cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x40000003FFFF0000ULL); + /* Change XSL_INV dummy read threshold */ + cxl_p1_write(adapter, CXL_XSL9_INV, 0x0000040007FFC200ULL); - /* Disable vc dd1 fix */ - if (cxl_is_power9_dd1()) - cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0400000000000001ULL); + if (phb_index == 3) { + /* disable machines 31-47 and 20-27 for DMA */ + cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x40000FF3FFFF0000ULL); + } + + /* Snoop machines */ + cxl_p1_write(adapter, CXL_PSL9_APCDEDALLOC, 0x800F000200000000ULL); + + if (cxl_is_power9_dd1()) { + /* Disabling deadlock counter CAR */ + cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0020000000000001ULL); + } else + cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x4000000000000000ULL); return 0; } @@ -1746,6 +1747,44 @@ static void cxl_deconfigure_adapter(struct cxl *adapter) pci_disable_device(pdev); } +static void cxl_stop_trace_psl9(struct cxl *adapter) +{ + int traceid; + u64 trace_state, trace_mask; + struct pci_dev *dev = to_pci_dev(adapter->dev.parent); + + /* read each tracearray state and issue mmio to stop them is needed */ + for (traceid = 0; traceid <= CXL_PSL9_TRACEID_MAX; ++traceid) { + trace_state = cxl_p1_read(adapter, CXL_PSL9_CTCCFG); + trace_mask = (0x3ULL << (62 - traceid * 2)); + trace_state = (trace_state & trace_mask) >> (62 - traceid * 2); + dev_dbg(&dev->dev, "cxl: Traceid-%d trace_state=0x%0llX\n", + traceid, trace_state); + + /* issue mmio if the trace array isn't in FIN state */ + if (trace_state != CXL_PSL9_TRACESTATE_FIN) + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, + 0x8400000000000000ULL | traceid); + } +} + +static void cxl_stop_trace_psl8(struct cxl *adapter) +{ + int slice; + + /* Stop the trace */ + cxl_p1_write(adapter, CXL_PSL_TRACE, 0x8000000000000017LL); + + /* Stop the slice traces */ + spin_lock(&adapter->afu_list_lock); + for (slice = 0; slice < adapter->slices; slice++) { + if (adapter->afu[slice]) + cxl_p1n_write(adapter->afu[slice], CXL_PSL_SLICE_TRACE, + 0x8000000000000000LL); + } + spin_unlock(&adapter->afu_list_lock); +} + static const struct cxl_service_layer_ops psl9_ops = { .adapter_regs_init = init_implementation_adapter_regs_psl9, .invalidate_all = cxl_invalidate_all_psl9, @@ -1762,6 +1801,7 @@ static const struct cxl_service_layer_ops psl9_ops = { .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9, .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9, .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9, + .err_irq_dump_registers = cxl_native_err_irq_dump_regs_psl9, .debugfs_stop_trace = cxl_stop_trace_psl9, .write_timebase_ctrl = write_timebase_ctrl_psl9, .timebase_read = timebase_read_psl9, @@ -1785,7 +1825,7 @@ static const struct cxl_service_layer_ops psl8_ops = { .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl8, .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl8, .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl8, - .err_irq_dump_registers = cxl_native_err_irq_dump_regs, + .err_irq_dump_registers = cxl_native_err_irq_dump_regs_psl8, .debugfs_stop_trace = cxl_stop_trace_psl8, .write_timebase_ctrl = write_timebase_ctrl_psl8, .timebase_read = timebase_read_psl8, diff --git a/drivers/mtd/devices/powernv_flash.c b/drivers/mtd/devices/powernv_flash.c index f5396f26ddb4..26f9feaa5d17 100644 --- a/drivers/mtd/devices/powernv_flash.c +++ b/drivers/mtd/devices/powernv_flash.c @@ -47,6 +47,11 @@ enum flash_op { FLASH_OP_ERASE, }; +/* + * Don't return -ERESTARTSYS if we can't get a token, the MTD core + * might have split up the call from userspace and called into the + * driver more than once, we'll already have done some amount of work. + */ static int powernv_flash_async_op(struct mtd_info *mtd, enum flash_op op, loff_t offset, size_t len, size_t *retlen, u_char *buf) { @@ -63,7 +68,8 @@ static int powernv_flash_async_op(struct mtd_info *mtd, enum flash_op op, if (token < 0) { if (token != -ERESTARTSYS) dev_err(dev, "Failed to get an async token\n"); - + else + token = -EINTR; return token; } @@ -78,32 +84,53 @@ static int powernv_flash_async_op(struct mtd_info *mtd, enum flash_op op, rc = opal_flash_erase(info->id, offset, len, token); break; default: - BUG_ON(1); - } - - if (rc != OPAL_ASYNC_COMPLETION) { - dev_err(dev, "opal_flash_async_op(op=%d) failed (rc %d)\n", - op, rc); + WARN_ON_ONCE(1); opal_async_release_token(token); return -EIO; } - rc = opal_async_wait_response(token, &msg); + if (rc == OPAL_ASYNC_COMPLETION) { + rc = opal_async_wait_response_interruptible(token, &msg); + if (rc) { + /* + * If we return the mtd core will free the + * buffer we've just passed to OPAL but OPAL + * will continue to read or write from that + * memory. + * It may be tempting to ultimately return 0 + * if we're doing a read or a write since we + * are going to end up waiting until OPAL is + * done. However, because the MTD core sends + * us the userspace request in chunks, we need + * it to know we've been interrupted. + */ + rc = -EINTR; + if (opal_async_wait_response(token, &msg)) + dev_err(dev, "opal_async_wait_response() failed\n"); + goto out; + } + rc = opal_get_async_rc(msg); + } + + /* + * OPAL does mutual exclusion on the flash, it will return + * OPAL_BUSY. + * During firmware updates by the service processor OPAL may + * be (temporarily) prevented from accessing the flash, in + * this case OPAL will also return OPAL_BUSY. + * Both cases aren't errors exactly but the flash could have + * changed, userspace should be informed. + */ + if (rc != OPAL_SUCCESS && rc != OPAL_BUSY) + dev_err(dev, "opal_flash_async_op(op=%d) failed (rc %d)\n", + op, rc); + + if (rc == OPAL_SUCCESS && retlen) + *retlen = len; + + rc = opal_error_code(rc); +out: opal_async_release_token(token); - if (rc) { - dev_err(dev, "opal async wait failed (rc %d)\n", rc); - return -EIO; - } - - rc = opal_get_async_rc(msg); - if (rc == OPAL_SUCCESS) { - rc = 0; - if (retlen) - *retlen = len; - } else { - rc = -EIO; - } - return rc; } @@ -220,21 +247,20 @@ static int powernv_flash_probe(struct platform_device *pdev) int ret; data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); - if (!data) { - ret = -ENOMEM; - goto out; - } + if (!data) + return -ENOMEM; + data->mtd.priv = data; ret = of_property_read_u32(dev->of_node, "ibm,opal-id", &(data->id)); if (ret) { dev_err(dev, "no device property 'ibm,opal-id'\n"); - goto out; + return ret; } ret = powernv_flash_set_driver_info(dev, &data->mtd); if (ret) - goto out; + return ret; dev_set_drvdata(dev, data); @@ -243,10 +269,7 @@ static int powernv_flash_probe(struct platform_device *pdev) * with an ffs partition at the start, it should prove easier for users * to deal with partitions or not as they see fit */ - ret = mtd_device_register(&data->mtd, NULL, 0); - -out: - return ret; + return mtd_device_register(&data->mtd, NULL, 0); } /** diff --git a/tools/testing/selftests/powerpc/benchmarks/context_switch.c b/tools/testing/selftests/powerpc/benchmarks/context_switch.c index f4241339edd2..87f1f0252299 100644 --- a/tools/testing/selftests/powerpc/benchmarks/context_switch.c +++ b/tools/testing/selftests/powerpc/benchmarks/context_switch.c @@ -10,6 +10,7 @@ */ #define _GNU_SOURCE +#include #include #include #include @@ -75,6 +76,7 @@ static void touch(void) static void start_thread_on(void *(*fn)(void *), void *arg, unsigned long cpu) { + int rc; pthread_t tid; cpu_set_t cpuset; pthread_attr_t attr; @@ -82,14 +84,23 @@ static void start_thread_on(void *(*fn)(void *), void *arg, unsigned long cpu) CPU_ZERO(&cpuset); CPU_SET(cpu, &cpuset); - pthread_attr_init(&attr); + rc = pthread_attr_init(&attr); + if (rc) { + errno = rc; + perror("pthread_attr_init"); + exit(1); + } - if (pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset)) { + rc = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset); + if (rc) { + errno = rc; perror("pthread_attr_setaffinity_np"); exit(1); } - if (pthread_create(&tid, &attr, fn, arg)) { + rc = pthread_create(&tid, &attr, fn, arg); + if (rc) { + errno = rc; perror("pthread_create"); exit(1); } diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c index 17fb1b43c320..1899bd85121f 100644 --- a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c +++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c @@ -53,6 +53,8 @@ static int check_all_cpu_dscr_defaults(unsigned long val) } while ((dp = readdir(sysfs))) { + int len; + if (!(dp->d_type & DT_DIR)) continue; if (!strcmp(dp->d_name, "cpuidle")) @@ -60,7 +62,9 @@ static int check_all_cpu_dscr_defaults(unsigned long val) if (!strstr(dp->d_name, "cpu")) continue; - sprintf(file, "%s%s/dscr", CPU_PATH, dp->d_name); + len = snprintf(file, LEN_MAX, "%s%s/dscr", CPU_PATH, dp->d_name); + if (len >= LEN_MAX) + continue; if (access(file, F_OK)) continue; diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index 2f1f7b013293..241a4a4ee0e4 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -12,3 +12,4 @@ tm-signal-context-chk-gpr tm-signal-context-chk-vmx tm-signal-context-chk-vsx tm-vmx-unavail +tm-unavailable diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index fca7c7f5e640..8ed6f8c57230 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -3,7 +3,7 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu tm-signal-context-chk-vmx tm-signal-context-chk-vsx TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \ - tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail \ + tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable \ $(SIGNAL_CONTEXT_CHK_TESTS) include ../../lib.mk @@ -17,6 +17,7 @@ $(OUTPUT)/tm-syscall: CFLAGS += -I../../../../../usr/include $(OUTPUT)/tm-tmspr: CFLAGS += -pthread $(OUTPUT)/tm-vmx-unavail: CFLAGS += -pthread -m64 $(OUTPUT)/tm-resched-dscr: ../pmu/lib.o +$(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -mvsx SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS)) $(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S diff --git a/tools/testing/selftests/powerpc/tm/tm-unavailable.c b/tools/testing/selftests/powerpc/tm/tm-unavailable.c new file mode 100644 index 000000000000..96c37f84ce54 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-unavailable.c @@ -0,0 +1,371 @@ +/* + * Copyright 2017, Gustavo Romero, Breno Leitao, Cyril Bur, IBM Corp. + * Licensed under GPLv2. + * + * Force FP, VEC and VSX unavailable exception during transaction in all + * possible scenarios regarding the MSR.FP and MSR.VEC state, e.g. when FP + * is enable and VEC is disable, when FP is disable and VEC is enable, and + * so on. Then we check if the restored state is correctly set for the + * FP and VEC registers to the previous state we set just before we entered + * in TM, i.e. we check if it corrupts somehow the recheckpointed FP and + * VEC/Altivec registers on abortion due to an unavailable exception in TM. + * N.B. In this test we do not test all the FP/Altivec/VSX registers for + * corruption, but only for registers vs0 and vs32, which are respectively + * representatives of FP and VEC/Altivec reg sets. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include "tm.h" + +#define DEBUG 0 + +/* Unavailable exceptions to test in HTM */ +#define FP_UNA_EXCEPTION 0 +#define VEC_UNA_EXCEPTION 1 +#define VSX_UNA_EXCEPTION 2 + +#define NUM_EXCEPTIONS 3 + +struct Flags { + int touch_fp; + int touch_vec; + int result; + int exception; +} flags; + +bool expecting_failure(void) +{ + if (flags.touch_fp && flags.exception == FP_UNA_EXCEPTION) + return false; + + if (flags.touch_vec && flags.exception == VEC_UNA_EXCEPTION) + return false; + + /* + * If both FP and VEC are touched it does not mean that touching VSX + * won't raise an exception. However since FP and VEC state are already + * correctly loaded, the transaction is not aborted (i.e. + * treclaimed/trecheckpointed) and MSR.VSX is just set as 1, so a TM + * failure is not expected also in this case. + */ + if ((flags.touch_fp && flags.touch_vec) && + flags.exception == VSX_UNA_EXCEPTION) + return false; + + return true; +} + +/* Check if failure occurred whilst in transaction. */ +bool is_failure(uint64_t condition_reg) +{ + /* + * When failure handling occurs, CR0 is set to 0b1010 (0xa). Otherwise + * transaction completes without failure and hence reaches out 'tend.' + * that sets CR0 to 0b0100 (0x4). + */ + return ((condition_reg >> 28) & 0xa) == 0xa; +} + +void *ping(void *input) +{ + + /* + * Expected values for vs0 and vs32 after a TM failure. They must never + * change, otherwise they got corrupted. + */ + uint64_t high_vs0 = 0x5555555555555555; + uint64_t low_vs0 = 0xffffffffffffffff; + uint64_t high_vs32 = 0x5555555555555555; + uint64_t low_vs32 = 0xffffffffffffffff; + + /* Counter for busy wait */ + uint64_t counter = 0x1ff000000; + + /* + * Variable to keep a copy of CR register content taken just after we + * leave the transactional state. + */ + uint64_t cr_ = 0; + + /* + * Wait a bit so thread can get its name "ping". This is not important + * to reproduce the issue but it's nice to have for systemtap debugging. + */ + if (DEBUG) + sleep(1); + + printf("If MSR.FP=%d MSR.VEC=%d: ", flags.touch_fp, flags.touch_vec); + + if (flags.exception != FP_UNA_EXCEPTION && + flags.exception != VEC_UNA_EXCEPTION && + flags.exception != VSX_UNA_EXCEPTION) { + printf("No valid exception specified to test.\n"); + return NULL; + } + + asm ( + /* Prepare to merge low and high. */ + " mtvsrd 33, %[high_vs0] ;" + " mtvsrd 34, %[low_vs0] ;" + + /* + * Adjust VS0 expected value after an TM failure, + * i.e. vs0 = 0x5555555555555555555FFFFFFFFFFFFFFFF + */ + " xxmrghd 0, 33, 34 ;" + + /* + * Adjust VS32 expected value after an TM failure, + * i.e. vs32 = 0x5555555555555555555FFFFFFFFFFFFFFFF + */ + " xxmrghd 32, 33, 34 ;" + + /* + * Wait an amount of context switches so load_fp and load_vec + * overflow and MSR.FP, MSR.VEC, and MSR.VSX become zero (off). + */ + " mtctr %[counter] ;" + + /* Decrement CTR branch if CTR non zero. */ + "1: bdnz 1b ;" + + /* + * Check if we want to touch FP prior to the test in order + * to set MSR.FP = 1 before provoking an unavailable + * exception in TM. + */ + " cmpldi %[touch_fp], 0 ;" + " beq no_fp ;" + " fadd 10, 10, 10 ;" + "no_fp: ;" + + /* + * Check if we want to touch VEC prior to the test in order + * to set MSR.VEC = 1 before provoking an unavailable + * exception in TM. + */ + " cmpldi %[touch_vec], 0 ;" + " beq no_vec ;" + " vaddcuw 10, 10, 10 ;" + "no_vec: ;" + + /* + * Perhaps it would be a better idea to do the + * compares outside transactional context and simply + * duplicate code. + */ + " tbegin. ;" + " beq trans_fail ;" + + /* Do we do FP Unavailable? */ + " cmpldi %[exception], %[ex_fp] ;" + " bne 1f ;" + " fadd 10, 10, 10 ;" + " b done ;" + + /* Do we do VEC Unavailable? */ + "1: cmpldi %[exception], %[ex_vec] ;" + " bne 2f ;" + " vaddcuw 10, 10, 10 ;" + " b done ;" + + /* + * Not FP or VEC, therefore VSX. Ensure this + * instruction always generates a VSX Unavailable. + * ISA 3.0 is tricky here. + * (xxmrghd will on ISA 2.07 and ISA 3.0) + */ + "2: xxmrghd 10, 10, 10 ;" + + "done: tend. ;" + + "trans_fail: ;" + + /* Give values back to C. */ + " mfvsrd %[high_vs0], 0 ;" + " xxsldwi 3, 0, 0, 2 ;" + " mfvsrd %[low_vs0], 3 ;" + " mfvsrd %[high_vs32], 32 ;" + " xxsldwi 3, 32, 32, 2 ;" + " mfvsrd %[low_vs32], 3 ;" + + /* Give CR back to C so that it can check what happened. */ + " mfcr %[cr_] ;" + + : [high_vs0] "+r" (high_vs0), + [low_vs0] "+r" (low_vs0), + [high_vs32] "=r" (high_vs32), + [low_vs32] "=r" (low_vs32), + [cr_] "+r" (cr_) + : [touch_fp] "r" (flags.touch_fp), + [touch_vec] "r" (flags.touch_vec), + [exception] "r" (flags.exception), + [ex_fp] "i" (FP_UNA_EXCEPTION), + [ex_vec] "i" (VEC_UNA_EXCEPTION), + [ex_vsx] "i" (VSX_UNA_EXCEPTION), + [counter] "r" (counter) + + : "cr0", "ctr", "v10", "vs0", "vs10", "vs3", "vs32", "vs33", + "vs34", "fr10" + + ); + + /* + * Check if we were expecting a failure and it did not occur by checking + * CR0 state just after we leave the transaction. Either way we check if + * vs0 or vs32 got corrupted. + */ + if (expecting_failure() && !is_failure(cr_)) { + printf("\n\tExpecting the transaction to fail, %s", + "but it didn't\n\t"); + flags.result++; + } + + /* Check if we were not expecting a failure and a it occurred. */ + if (!expecting_failure() && is_failure(cr_)) { + printf("\n\tUnexpected transaction failure 0x%02lx\n\t", + failure_code()); + return (void *) -1; + } + + /* + * Check if TM failed due to the cause we were expecting. 0xda is a + * TM_CAUSE_FAC_UNAV cause, otherwise it's an unexpected cause. + */ + if (is_failure(cr_) && !failure_is_unavailable()) { + printf("\n\tUnexpected failure cause 0x%02lx\n\t", + failure_code()); + return (void *) -1; + } + + /* 0x4 is a success and 0xa is a fail. See comment in is_failure(). */ + if (DEBUG) + printf("CR0: 0x%1lx ", cr_ >> 28); + + /* Check FP (vs0) for the expected value. */ + if (high_vs0 != 0x5555555555555555 || low_vs0 != 0xFFFFFFFFFFFFFFFF) { + printf("FP corrupted!"); + printf(" high = %#16" PRIx64 " low = %#16" PRIx64 " ", + high_vs0, low_vs0); + flags.result++; + } else + printf("FP ok "); + + /* Check VEC (vs32) for the expected value. */ + if (high_vs32 != 0x5555555555555555 || low_vs32 != 0xFFFFFFFFFFFFFFFF) { + printf("VEC corrupted!"); + printf(" high = %#16" PRIx64 " low = %#16" PRIx64, + high_vs32, low_vs32); + flags.result++; + } else + printf("VEC ok"); + + putchar('\n'); + + return NULL; +} + +/* Thread to force context switch */ +void *pong(void *not_used) +{ + /* Wait thread get its name "pong". */ + if (DEBUG) + sleep(1); + + /* Classed as an interactive-like thread. */ + while (1) + sched_yield(); +} + +/* Function that creates a thread and launches the "ping" task. */ +void test_fp_vec(int fp, int vec, pthread_attr_t *attr) +{ + int retries = 2; + void *ret_value; + pthread_t t0; + + flags.touch_fp = fp; + flags.touch_vec = vec; + + /* + * Without luck it's possible that the transaction is aborted not due to + * the unavailable exception caught in the middle as we expect but also, + * for instance, due to a context switch or due to a KVM reschedule (if + * it's running on a VM). Thus we try a few times before giving up, + * checking if the failure cause is the one we expect. + */ + do { + /* Bind 'ping' to CPU 0, as specified in 'attr'. */ + pthread_create(&t0, attr, ping, (void *) &flags); + pthread_setname_np(t0, "ping"); + pthread_join(t0, &ret_value); + retries--; + } while (ret_value != NULL && retries); + + if (!retries) { + flags.result = 1; + if (DEBUG) + printf("All transactions failed unexpectedly\n"); + + } +} + +int main(int argc, char **argv) +{ + int exception; /* FP = 0, VEC = 1, VSX = 2 */ + pthread_t t1; + pthread_attr_t attr; + cpu_set_t cpuset; + + /* Set only CPU 0 in the mask. Both threads will be bound to CPU 0. */ + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + + /* Init pthread attribute. */ + pthread_attr_init(&attr); + + /* Set CPU 0 mask into the pthread attribute. */ + pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset); + + pthread_create(&t1, &attr /* Bind 'pong' to CPU 0 */, pong, NULL); + pthread_setname_np(t1, "pong"); /* Name it for systemtap convenience */ + + flags.result = 0; + + for (exception = 0; exception < NUM_EXCEPTIONS; exception++) { + printf("Checking if FP/VEC registers are sane after"); + + if (exception == FP_UNA_EXCEPTION) + printf(" a FP unavailable exception...\n"); + + else if (exception == VEC_UNA_EXCEPTION) + printf(" a VEC unavailable exception...\n"); + + else + printf(" a VSX unavailable exception...\n"); + + flags.exception = exception; + + test_fp_vec(0, 0, &attr); + test_fp_vec(1, 0, &attr); + test_fp_vec(0, 1, &attr); + test_fp_vec(1, 1, &attr); + + } + + if (flags.result > 0) { + printf("result: failed!\n"); + exit(1); + } else { + printf("result: success\n"); + exit(0); + } +} diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h index 0ffff04433c5..df4204247d45 100644 --- a/tools/testing/selftests/powerpc/tm/tm.h +++ b/tools/testing/selftests/powerpc/tm/tm.h @@ -47,6 +47,11 @@ static inline bool failure_is_syscall(void) return (failure_code() & TM_CAUSE_SYSCALL) == TM_CAUSE_SYSCALL; } +static inline bool failure_is_unavailable(void) +{ + return (failure_code() & TM_CAUSE_FAC_UNAV) == TM_CAUSE_FAC_UNAV; +} + static inline bool failure_is_nesting(void) { return (__builtin_get_texasru() & 0x400000);